def publish(self):
    """Publish a run to the OpenML server.

    Uploads the results of a run to OpenML. Sets the run_id on self.

    Returns
    -------
    self : OpenMLRun
    """
    if self.model is None:
        raise PyOpenMLError("OpenMLRun obj does not contain a model. "
                            "(This should never happen.)")
    if self.flow_id is None:
        raise PyOpenMLError("OpenMLRun obj does not contain a flow id. "
                            "(Should have been added while executing the task.)")

    description_xml = self._create_description_xml()
    file_elements = {'description': ("description.xml", description_xml)}
    if self.error_message is None:
        predictions = arff.dumps(self._generate_arff_dict())
        file_elements['predictions'] = ("predictions.arff", predictions)
    if self.trace_content is not None:
        trace_arff = arff.dumps(self._generate_trace_arff_dict())
        file_elements['trace'] = ("trace.arff", trace_arff)

    return_value = _perform_api_call("/run/", file_elements=file_elements)
    run_id = int(xmltodict.parse(return_value)['oml:upload_run']['oml:run_id'])
    self.run_id = run_id
    return self
def publish(self) -> 'OpenMLRun':
    """Publish a run (and if necessary, its flow) to the OpenML server.

    Uploads the results of a run to OpenML. If the run is of an unpublished
    OpenMLFlow, the flow will be uploaded too. Sets the run_id on self.

    Returns
    -------
    self : OpenMLRun
    """
    if self.model is None:
        raise PyOpenMLError("OpenMLRun obj does not contain a model. "
                            "(This should never happen.)")
    if self.flow_id is None:
        if self.flow is None:
            raise PyOpenMLError(
                "OpenMLRun object does not contain a flow id or reference to OpenMLFlow "
                "(these should have been added while executing the task)."
            )
        else:
            # Publish the linked flow before publishing the run.
            self.flow.publish()
            self.flow_id = self.flow.flow_id

    if self.parameter_settings is None:
        if self.flow is None:
            self.flow = openml.flows.get_flow(self.flow_id)
        self.parameter_settings = self.flow.extension.obtain_parameter_values(
            self.flow,
            self.model,
        )

    description_xml = self._create_description_xml()
    file_elements = {'description': ("description.xml", description_xml)}
    if self.error_message is None:
        predictions = arff.dumps(self._generate_arff_dict())
        file_elements['predictions'] = ("predictions.arff", predictions)
    if self.trace is not None:
        trace_arff = arff.dumps(self.trace.trace_to_arff())
        file_elements['trace'] = ("trace.arff", trace_arff)
    if self.additional_information is not None:
        for name, (file, contents) in self.additional_information.items():
            file_elements[name] = (file, contents)

    return_value = openml._api_calls._perform_api_call(
        "/run/", 'post', file_elements=file_elements)
    result = xmltodict.parse(return_value)
    self.run_id = int(result['oml:upload_run']['oml:run_id'])
    return self
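# --- Usage sketch (illustrative, not from the original source) ---
# A minimal example of how the publish() method above is typically reached,
# assuming an OpenML API key is already configured. The task id and classifier
# are hypothetical, and the run_model_on_task(task, model) argument order
# mirrors the do_run() example further below; it may differ between openml
# package versions.
import openml
from sklearn.tree import DecisionTreeClassifier

task = openml.tasks.get_task(31)                               # hypothetical task id
run = openml.runs.run_model_on_task(task, DecisionTreeClassifier())
run = run.publish()                                            # uploads the flow (if needed) and the run
print("Uploaded run with id:", run.run_id)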
def toArffString(self):
    data = dict()
    data.update({"description": self.getDescription()})
    data.update({"relation": self.relationName()})
    attributes = []
    for i in range(self.numAttributes()):
        t = []
        t.append(self.attribute(i).name())
        if self.attribute(i).type() == Attribute.NUMERIC:
            t.append("REAL")
        else:
            t.append(self.attribute(i).m_AttributeInfo.m_Values)
        attributes.append(tuple(t))
    data.update({"attributes": attributes})
    datas = []
    for i in range(self.numInstances()):
        val = []
        for j in range(self.numAttributes()):
            if self.instance(i).isMissing(j):
                val.append(None)
            elif self.attribute(j).isNominal():
                val.append(self.attribute(j).value(self.instance(i).value(j)))
            else:
                val.append(str(self.instance(i).value(j)))
        datas.append(val)
    data.update({"data": datas})
    text = arff.dumps(data)
    return text
def store_as_arff(data, labels, relation, path, description=u'', attributes=ATTRIBUTES):
    """
    Writes feature data to an ARFF file.

    :param data: The data set (without class labels).
    :param labels: Class labels in the same order as the data instances in ``data``.
    :param relation: The name of the relation.
    :param path: The full path to the file to write to.
    :param description: A description of the relation.
    :param attributes: The attributes in the data set and their levels.
    """
    attributes.append(('@@TRUTH@@', ['Y', 'N']))
    new_data = []
    for i in range(len(data)):
        row = list(data[i])
        row += labels[i]
        new_data.append(row)
    arff_data = {
        'description': description,
        'relation': relation,
        'attributes': attributes,
        'data': new_data,
    }
    with open(path, 'w') as fh:
        fh.write(liac_arff.dumps(arff_data))
def partg():
    data = arff.load(open(filename))
    # Find the attribute index; the else-branch only runs if no attribute matched.
    for attribute_index in range(len(data['attributes'])):
        if data['attributes'][attribute_index][0] == attribute:
            break
    else:
        print("The given attribute name does not exist.")
        exit(1)
    attribute_type = data['attributes'][attribute_index][1]
    # Add the attribute 10 times
    for i in range(10):
        # Add a new attribute
        data['attributes'].insert(0, (attribute + str(i), attribute_type))
        # Append a copy to each piece of data
        for j in range(len(data['data'])):
            data['data'][j].insert(0, data['data'][j][attribute_index])
    # Write a new file
    with open('partg-' + filename, 'w') as p4g_file:
        p4g_file.write(arff.dumps(data))
def vectors_to_arff_format():
    # To be used for Weka analysis
    data['data'] = read_feat()
    print(arff.dumps(data))
    # compute_features_vector()
def export(self, paintings):
    """Export the analysed data."""
    data = {'description': self.__class__.__name__,
            'relation': 'year',
            'attributes': self.get_attributes(),
            'data': self.get_values(paintings)}
    return arff.dumps(data)
def findMissingValues(filename):
    with open(filename, 'r+') as af:
        arffFile = arff.load(af)
        data = arffFile['data']
        attributes = arffFile['attributes']
        numExamples = len(data)
        averages = []
        # loop over each attribute
        for index in range(len(attributes)):
            attr = attributes[index]
            average = '?'
            if isinstance(attr[1], list):
                # find mode if attr is nominal (a classifier)
                words_to_count = (row[index] for row in data if row[index] is not None)
                c = Counter(words_to_count)
                average = c.most_common(1)[0][0]
            else:
                # find mean
                average = sum(row[index] for row in data if row[index] is not None) / numExamples
            averages.append(average)
        # update the missing values
        for row in data:
            for index in range(len(row)):
                if row[index] is None:
                    row[index] = averages[index]
        # overwrite the file
        af.seek(0)
        af.write(arff.dumps(arffFile))
        af.truncate()
    return data
def cccCentring(ra, combnk, files, aRa, rGoldIndiv):
    for i in range(len(v.eName)):
        for f, fname in enumerate(files[i][0]):
            meanByF = []
            wghRater = []
            csv = rGoldIndiv[v.eName[i]][f]
            # First we compute the mean of all raters for each file
            for a in range(v.nAn):
                # We get the mean
                meanRatersF = np.nanmean(csv[:, a + 1])
                meanByF.append(meanRatersF)
                # We take the weight of the rater in this file
                wghRater.append(aRa[a][i][f])
            # Now we calculate the weighted mean of all raters
            pondMean = np.sum(np.multiply(meanByF, wghRater)) / np.sum(aRa[:, i, f])
            # We have the mean of all raters; we need the total mean of the file
            meanF = np.nanmean(csv[:, 1:])
            # Now we will center each prediction according to the mean
            output = []
            # We prepare the ARFF file, we get the template
            data = arff.load(open(v.arffTempPath, 'rb'))
            for line in range(len(csv) - 1):
                meanLine = np.nanmean(csv[line + 1, 1:])
                newGs = meanLine - meanF + pondMean
                # We replace the values in the ARFF template
                data["data"][line][0] = fname.replace(".csv", "")
                data["data"][line][1] = round(csv[line + 1, 0], 2)
                data["data"][line][2] = round(newGs, 6)
            # We write the ARFF file in the Gold Standard folder
            f = open(v.agsc[i] + fname.replace(".csv", ".arff"), "w")
            f.write(arff.dumps(data))
    return None
def splitFile(filename):
    numClasses = 0
    classData = dict()
    with open(filename, 'rb') as af:
        arffFile = arff.load(af)
        attributes = arffFile['attributes']
        classes = attributes[-1][1]
        # replaces empty list for Firstyrcumgpa
        template['attributes'][-1] = (template['attributes'][-1][0], classes)
        numClasses = len(classes)
        for c in classes:
            classData.setdefault(c, [])
        arffData = arffFile['data']
        for row in arffData:
            if row[-1] is not None:
                cList = classData[row[-1]]
                cList.append(row)
    # save each key of classData to a separate arff
    filenum = 0
    for key, data in classData.items():
        template['data'] = data
        with open(temp_dir + '\\o%g.arff' % filenum, 'w') as arffFile:
            arffFile.write(arff.dumps(template))
        filenum += 1
    return numClasses
def make_arff(symbol, preparer, attributes):
    symbol_quotes = list(quotes.find({'Symbol': symbol}))
    data = [preparer(instance, i, symbol_quotes)
            for i, instance in enumerate(symbol_quotes[:-1])]
    return arff.dumps(data, relation=RELATION_NAME % symbol, names=attributes)
def do_run(task, optimizer, output_dir, internet_access=True, publish=False):
    if internet_access:
        run = openml.runs.run_model_on_task(task, optimizer)
        score = run.get_metric_fn(sklearn.metrics.accuracy_score)
        print('%s [SCORE] Data: %s; Accuracy: %0.2f' % (openmlpimp.utils.get_time(),
                                                        task.get_dataset().name,
                                                        score.mean()))
        if publish:
            run = run.publish()

        run_xml = run._create_description_xml()
        predictions_arff = arff.dumps(run._generate_arff_dict())

        with open(output_dir + '/run.xml', 'w') as f:
            f.write(run_xml)
        with open(output_dir + '/predictions.arff', 'w') as f:
            f.write(predictions_arff)
        if run.trace_content is not None:
            trace_arff = arff.dumps(run._generate_trace_arff_dict())
            with open(output_dir + '/trace.arff', 'w') as f:
                f.write(trace_arff)
        return run
    else:
        res = openml.runs.functions._run_task_get_arffcontent(
            optimizer, task, task.class_labels)
        run = openml.runs.OpenMLRun(task_id=task.task_id, dataset_id=None,
                                    flow_id=None, model=optimizer)
        run.data_content, run.trace_content, run.trace_attributes, run.fold_evaluations, _ = res
        score = run.get_metric_fn(sklearn.metrics.accuracy_score)
        print('%s [SCORE] Data: %s; Accuracy: %0.2f' % (openmlpimp.utils.get_time(),
                                                        task.get_dataset().name,
                                                        score.mean()))

        if run.trace_content is not None:
            trace_arff = arff.dumps(run._generate_trace_arff_dict())
            with open(output_dir + '/trace.arff', 'w') as f:
                f.write(trace_arff)

        predictions_arff = arff.dumps(run._generate_arff_dict())
        with open(output_dir + '/predictions.arff', 'w') as f:
            f.write(predictions_arff)
        return run
def dumps(df):
    """
    dump DataFrame to str

    :param DataFrame df:
    :rtype: str
    :return: dumped arff
    """
    arff = __dump(df)
    return liacarff.dumps(arff)
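# --- Usage sketch (illustrative, not from the original source) ---
# Calling the DataFrame wrapper above; it assumes the module-private __dump()
# helper maps DataFrame columns to liac-arff attributes, so the exact attribute
# types in the output are an assumption here.
import pandas as pd

df = pd.DataFrame({'sepal_length': [5.1, 4.9], 'species': ['setosa', 'setosa']})
arff_text = dumps(df)   # returns the whole ARFF document as one string
print(arff_text)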
def parth():
    data = arff.load(open(filename))
    for i in range(20):
        data['attributes'].insert(0, ('RANDOM' + str(i), 'NUMERIC'))
        for j in range(len(data['data'])):
            data['data'][j].insert(0, random.random())
    with open('parth-' + filename, 'w') as p4h_file:
        p4h_file.write(arff.dumps(data))
def test_encode_destiny(self):
    src = ARFF_DESTINY
    count = 0
    while count < 10:
        count += 1
        obj = arff.loads(src)
        src = arff.dumps(obj)
        self.assertEqual(src, ARFF_DESTINY)
def VTiter(self, *parsedArgs, **envars): import arff largs, dictargs = self.full_parse(parsedArgs) self.nonames = True self.names = [] self.types = [] data = {} if 'query' not in dictargs: raise functions.OperatorError( __name__.rsplit('.')[-1], "No query argument ") query = dictargs['query'] cur = envars['db'].cursor() c = cur.execute(query) schema = cur.getdescriptionsafe() raw = [] try: first_row = c.next() raw.append(first_row) except: f = open('input.arff', 'w') f.write( "@RELATION hour-weka.filters.unsupervised.attribute.Remove-R1-2\n\n" ) for j in schema: f.write("@ATTRIBUTE %s NUMERIC\n" % (j[0], )) f.write("\n@DATA\n") yield (('result', ), ) yield (1, ) return updated_schema = [] for i in range(len(schema)): t = (schema[i][0], "NUMERIC") updated_schema.append(t) data[u'attributes'] = updated_schema for row in c: raw.append(row) data[u'data'] = raw data[u'description'] = u'' data[ u'relation'] = u'hour-weka.filters.unsupervised.attribute.Remove-R1-2' f = open('input.arff', 'w') f.write(arff.dumps(data)) yield (('result', ), ) yield (1, )
def run_experiment(rscv, task, args): try: count = 1 while count <= 100: try: print("%s Started classifier %s, condition %s, parameter '%s', deftype '%s', RS seed %s on task %s, dataset '%s'." % (hyperimp.utils.get_time(), args.classifier, args.condition, args.param, args.deftype, args.seed, args.task_id, task.get_dataset().name)) # train model run = train_model(task, rscv) break except openml.exceptions.OpenMLServerError as e: if count == 100: print("%s OpenMLServerError in run, I tried this 100 times already, so I'm just going to continue to the next run." % (hyperimp.utils.get_time())) raise sleeptime = randint(5,60) print("%s Error in run, trying again in %d seconds. Message: %s" % (hyperimp.utils.get_time(), sleeptime, e)) count += 1 sleep(sleeptime) run.tags.append('study_%s' %str(args.study_id)) score = run.get_metric_fn(sklearn.metrics.accuracy_score) print('%s [SCORE] Accuracy: %0.2f.' % (hyperimp.utils.get_time(), score.mean())) if args.log: # log xml, predictions output_dir = args.output_dir + '/' + args.classifier + '/task_' + str(args.task_id) + '/' + str(args.condition) os.makedirs(output_dir) run_xml = run._create_description_xml() predictions_arff = arff.dumps(run._generate_arff_dict()) with open(output_dir + '/run.xml', 'w') as f: f.write(run_xml) with open(output_dir + '/predictions.arff', 'w') as f: f.write(predictions_arff) else: None count_run = 1 while count_run <= 100: try: # publish run on OpenML run.publish() break except openml.exceptions.OpenMLServerError as e: if count_run == 100: print("%s OpenMLServerError in run, I tried uploading this 100 times already, so I'm just going to continue to the next run." % (hyperimp.utils.get_time())) raise sleeptime_run = randint(5,60) print("%s Error in uploading run trying again in %d seconds. Message: %s" % (hyperimp.utils.get_time(), sleeptime_run, e)) count_run += 1 sleep(sleeptime_run) print("%s Uploaded run condition %s, parameter %s, RS seed %s, task %s, with run id %d." % (hyperimp.utils.get_time(), args.condition, args.param, args.seed, args.task_id, run.run_id)) except TimeoutError as e: print("%s Run timed out." % (hyperimp.utils.get_time())) except Exception as e: print("%s Error in run: %s" % (hyperimp.utils.get_time(), e)) traceback.print_exc() return
def csv_to_arff(X, label_i, savePath, datatype, isTrain=True): # get attributes if datatype == 'real': attributes = [(X.columns[i], u"REAL") for i in range(len(X.columns))] attributes.append(('label_' + label_i.name, ['0', '1'])) data = [] i = 0 while i < len(label_i): attr_data = [j for j in list(X.iloc[i, :])] label_data = [str(label_i[i])] row_data = attr_data + label_data data.append(row_data) i += 1 # set obj obj = { 'description': u'', 'relation': 'relation', 'attributes': attributes, 'data': data, } elif datatype == "nominal": attributes = [('attr_' + X.columns[i], ['0', '1']) for i in range(len(X.columns))] attributes.append(('label_' + label_i.name, ['0', '1'])) data = [] i = 0 while i < len(label_i): attr_data = [str(int(j)) for j in list(X.iloc[i, :])] label_data = [str(label_i[i])] row_data = attr_data + label_data data.append(row_data) i += 1 # set obj obj = { 'description': u'', 'relation': 'relation', 'attributes': attributes, 'data': data, } else: raise TypeError("datatype.") arff_data = arff.dumps(obj) if isTrain: #w_file = open(savePath+label_i.name+"_train.arff", "w") w_file = open(savePath + "/train.arff", "w") w_file.write(arff_data) w_file.close() elif not isTrain: w_file = open(savePath + "/test.arff", "w") w_file.write(arff_data) w_file.close() else: raise (ValueError, "what type of dataset?")
def test_simple(self):
    dumps = self.get_dumps()
    s = dumps(OBJ)
    self.assertEqual(s, ARFF)

    count = 0
    while count < 10:
        count += 1
        obj = arff.loads(s)
        src = arff.dumps(obj)
        self.assertEqual(src, ARFF)
def publish(self):
    """Publish a run to the OpenML server.

    Uploads the results of a run to OpenML.
    """
    predictions = arff.dumps(self._generate_arff())
    description_xml = self._create_description_xml()
    data = {'predictions': ("predictions.csv", predictions),
            'description': ("description.xml", description_xml)}
    return_code, return_value = _perform_api_call("/run/", file_elements=data)
    return return_code, return_value
def _get_file_elements(self) -> Dict:
    """Get file_elements to upload to the server.

    Derived child classes should overwrite this method as necessary.
    The description field will be populated automatically if not provided.
    """
    if self.model is None:
        raise PyOpenMLError(
            "OpenMLRun obj does not contain a model. "
            "(This should never happen.)"
        )
    if self.flow_id is None:
        if self.flow is None:
            raise PyOpenMLError(
                "OpenMLRun object does not contain a flow id or reference to OpenMLFlow "
                "(these should have been added while executing the task)."
            )
        else:
            # Publish the linked flow before publishing the run.
            self.flow.publish()
            self.flow_id = self.flow.flow_id
    if self.parameter_settings is None:
        if self.flow is None:
            self.flow = openml.flows.get_flow(self.flow_id)
        self.parameter_settings = self.flow.extension.obtain_parameter_values(
            self.flow,
            self.model,
        )

    file_elements = {'description': ("description.xml", self._to_xml())}
    if self.error_message is None:
        predictions = arff.dumps(self._generate_arff_dict())
        file_elements['predictions'] = ("predictions.arff", predictions)
    if self.trace is not None:
        trace_arff = arff.dumps(self.trace.trace_to_arff())
        file_elements['trace'] = ("trace.arff", trace_arff)
    return file_elements
def store(self, graph_builder, **kwargs): """ Stores the graph content in Conll format into the object file. :param graph: The graph to store. :param kwargs: Unused """ import arff data = [] entities = graph_builder.get_all_coref_entities() self.logger.debug("entities: %s", len(entities)) for entity in entities: mentions = graph_builder.get_all_entity_mentions(entity) self.logger.debug("Mentions: %s", len(mentions)) for mention in mentions: surface_learn = mention.get("surface_learn", {}) self.logger.debug("links: %s", len(surface_learn)) for link in surface_learn: data.append(surface_learn[link]) self.logger.debug("link: %s: ", surface_learn[link]) boolean = [str(True), str(False)] arff_file = { 'attributes': [ ('relax_match', boolean), ('mention_enumeration', boolean), ('candidate_enumeration', boolean), ('mention_appositive', boolean), ('candidate_appositive', boolean), ('equal_names', 'REAL'), ('equal_adjectives', 'REAL'), ('equal_rest', 'REAL'), ('extra_mention_names', 'REAL'), ('extra_mention_adjectives', 'REAL'), ('extra_mention_rest', 'REAL'), ('extra_candidate_names', 'REAL'), ('extra_candidate_adjectives', 'REAL'), ('extra_candidate_rest', 'REAL'), ('mention', 'STRING'), ('candidate', 'STRING'), ('sentence_distance', 'INTEGER'), ('linked', boolean)], 'description': 'surface relations between mentions', 'relation': 'surface', 'data': data } if len(data): self.file.write(arff.dumps(arff_file)) else: self.logger.info("Empty data")
def export_arff(file_name: str, data: DataFrame, attributes, description: str):
    if description is None:
        description = ''
    if attributes is None:
        attributes = [(attribute, "STRING") for attribute in data.columns]
    arff_dict = {
        'description': description,
        'relation': file_name,
        'attributes': attributes,
        'data': data.values.tolist()
    }
    return "data:text/arff;charset=utf-8," + \
        urllib.parse.quote(arff.dumps(arff_dict))
def arffGenerator(self, dataList, rawNamesList, group):
    namesList = []
    for name in rawNamesList:
        namesList.append((name, 'REAL'))

    arffDict = {}
    arffDict['description'] = u''
    arffDict['relation'] = 'perfEvents'
    arffDict['attributes'] = namesList
    arffDict['data'] = dataList
    # print arff.dumps(arffDict)

    outFile = os.path.join(self.dumpTo, group + '.arff')
    with open(outFile, 'w') as f:
        f.write(arff.dumps(arffDict))
def transform(file, dont_care_category, a, attributes_file): """input example [ {id: 123, text: "this is text body", category: ["dont_care"]} ] output example @relation game_media_bot @attribute :return: """ classes = set() data = json.load(open(file, 'r')) master_vector = Counter() for tweet in data: classes.add(tweet['category'][0]) if tweet['category'][0] != dont_care_category: master_vector += get_word_vector(tweet) print(master_vector) attrs = None if not attributes_file: # most common words in the text of the target category attrs = [(word, 'INTEGER') for word, _ in master_vector.most_common(a)] attrs.append(('class', [value for value in classes])) else: # load attributes from this file arff_data = arff.load(open(attributes_file, 'r')) attrs = arff_data['attributes'] arff_data = { 'attributes': attrs, 'data': [], 'description': '', 'relation': '{}'.format(dont_care_category) } for tweet in data: word_vector = get_word_vector(tweet) tweet_data = [word_vector[attr[0]] for attr in attrs[:-1]] tweet_data.append(tweet['category'][0]) arff_data['data'].append(tweet_data) out_file = file.replace('.json', '.arff') data = arff.dumps(arff_data) with open(out_file, 'w') as f: f.write(data) return out_file
def md2arff(): ri_obj_list = get_repo_info(to_dict=False, combine_star_events=True) repo_set = set() data = dict() data['attributes'] = attrs data['description'] = '' data['relation'] = 'readme' readme_file_set = set() inline_data = list() for ri in ri_obj_list: if (ri.repo_owner, ri.repo_name) in repo_set: continue repo_set.add((ri.repo_owner, ri.repo_name)) paper_repo_owner = getattr(ri, 'paper_repo_owner') paper_repo_name = getattr(ri, 'paper_repo_name') repo_path = os.path.join(conf.repo_path, paper_repo_owner, paper_repo_name) assert os.path.exists(repo_path) file_list = os.listdir(repo_path) readme_path = '' for f in file_list: if f.lower().startswith('readme.'): readme_path = os.path.join(repo_path, f) break if readme_path == '': readme_content = '' else: with open(readme_path, 'r', encoding='utf-8', errors='ignore') as readme_f: readme_content = readme_f.read() if readme_path != '' and f.lower() == 'readme.md': readme_content = parse_markdown(readme_content) readme_content = readme_content.lower() readme_content = readme_content.replace('\n', ' ') readme_content = readme_content.replace('\"', ' ') readme_content = readme_content.replace('\'', ' ') inline_data_unit = list() if ri.stars_count >= threshold: inline_data_unit.append('popular') else: inline_data_unit.append('unpopular') inline_data_unit.append(readme_content) inline_data.append(inline_data_unit) data['data'] = inline_data file_content = arff.dumps(data) arff_path = os.path.join(conf.root_path, 'text_analysis.arff') with open(arff_path, 'w', encoding='utf-8') as f: f.write(file_content)
def _to_filesystem(self, file_path):
    """Serialize the trace object to the filesystem.

    Serialize the trace object as an arff.

    Parameters
    ----------
    file_path: str
        File path where the trace arff will be stored.
    """
    trace_arff = arff.dumps(self.trace_to_arff())
    with open(os.path.join(file_path, 'trace.arff'), 'w') as f:
        f.write(trace_arff)
def dump_dataset(features, feature_format, evaluations, train_context,
                 format='arff', positive_class=None):
    if format == 'arff':
        data = {
            'attributes': [],
            'data': [],
            'description': '',
            'relation': 'default'
        }
        n_features = len(features.splitlines())
        for i in range(1, n_features + 1):
            feature = ('f%d' % i, ['+', '-'])
            data['attributes'].append(feature)

        target = train_context.target_table
        if target not in train_context.orng_tables:
            raise Exception(
                'Target table is not preloaded in memory! Please select the '
                '`dump data` parameter in the converter widget.')

        if feature_format == 'aleph':
            target_vals = ('negative', 'positive')
        else:
            orng_target = train_context.orng_tables[target]
            target_vals = tuple(sorted(orng_target.domain.classVar.values))
        class_attr = ('class', target_vals)
        data['attributes'].append(class_attr)

        for line in evaluations.splitlines():
            values = line.strip().split()
            if feature_format == 'aleph':
                class_val = values[-1]
                if class_val == positive_class:
                    values[-1] = 'positive'
                else:
                    values[-1] = 'negative'
            data['data'].append(values)
        return arff.dumps(data)
    elif format == 'csv':
        data = ''
        for line in evaluations.splitlines():
            values = line.strip().split()
            data = data + ','.join(values) + '\n'
        return data
    return 'unsupported format'
def to_filesystem(
    self,
    directory: str,
    store_model: bool = True,
) -> None:
    """
    The inverse of the from_filesystem method. Serializes a run
    on the filesystem, to be uploaded later.

    Parameters
    ----------
    directory : str
        a path leading to the folder where the results
        will be stored. Should be empty

    store_model : bool, optional (default=True)
        if True, a model will be pickled as well. As this is the most
        storage expensive part, it is often desirable to not store the
        model.
    """
    if self.data_content is None or self.model is None:
        raise ValueError('Run should have been executed (and contain '
                         'model / predictions)')

    os.makedirs(directory, exist_ok=True)
    if not os.listdir(directory) == []:
        raise ValueError('Output directory {} should be empty'.format(
            os.path.abspath(directory)))

    run_xml = self._create_description_xml()
    predictions_arff = arff.dumps(self._generate_arff_dict())

    # It seems like typing does not allow to define the same variable multiple times
    with open(os.path.join(directory, 'description.xml'), 'w') as fh:  # type: TextIO
        fh.write(run_xml)
    with open(os.path.join(directory, 'predictions.arff'), 'w') as fh:
        fh.write(predictions_arff)

    if store_model:
        with open(os.path.join(directory, 'model.pkl'), 'wb') as fh_b:  # type: IO[bytes]
            pickle.dump(self.model, fh_b)

    if self.flow_id is None:
        self.flow.to_filesystem(directory)

    if self.trace is not None:
        self.trace._to_filesystem(directory)
def normalizeFunction(self, data, technique, folderName, arffs): t = time.time() # Choose Standardization Technique if technique == 'MinMaxScaler': scaler = MinMaxScaler() if technique == 'StandardScaler': scaler = StandardScaler() if technique == 'MaxAbsScaler': scaler = MaxAbsScaler() if technique == 'RobustScaler': scaler = RobustScaler() print("Convertendo para Normalização {}..".format(technique)) print("Transformando em Dataframe..") # Convert data into DataFrame df = pd.DataFrame(data['data']) # Get features number length = df.iloc[0, :].values print("Número de Features: {}".format(len(length) - 1)) # Create a label column labels = df.iloc[:, len(length) - 1].values # Save features without labels data_aux = df.iloc[:, 0:(len(length) - 1)].values # Normalize data data_normalized = scaler.fit_transform(data_aux) # Adding the labels to normalized data data_normalized = np.concatenate((data_normalized, np.vstack(labels)), axis=1) # Replacing data with normalized samples data['data'] = data_normalized # Creating Folder if doesnt exists try: os.mkdir(folderName + "Normalized_Arffs", 755) print("Criando pasta onde será salvo os arquivos arffs..") except: print( "Pasta já existente, apenas sobrescrevendo os arquivos arffs.." ) # Saving arff in text file print("Salvando arff..") newArffFile = open( folderName + "Normalized_Arffs/" + arffs[:-5] + "_" + technique + ".arff", "w") newArffFile.write(arff.dumps(data)) newArffFile.close() print("Processo para o arquivo {} terminado. (Tempo de execução: {})". format(arffs, time.time() - t)) print("")
def test_files(self):
    fname = os.path.join(SRC_DIR, 'example.arff')
    data = [
        ['blonde', 17.2, 1],
        ['blue', 27.2, 2],
        ['blue', 18.2, 3],
    ]
    arff.dump(fname, data, relation='diabetics_data',
              names=('hair_color', 'age', 'patno'))
    data = list(arff.load(os.path.join(SRC_DIR, fname)))
    arff_rows = arff.dumps(data)
    reparsed_data = list(arff.loads(arff_rows))
    data = [list(row) for row in data]
    reparsed_data = [list(row) for row in reparsed_data]
    self.assertEqual(data, reparsed_data)
def generate(arff_file):
    ou = open(arff_file, "w")
    dataset = {
        'description': 'Motion sensor dataset',
        'relation': 'whatever',
        'attributes': [
            ('chair_prev', 'REAL'),
            ('bath_prev', 'REAL'),
            ('down_prev', 'REAL'),
            ('up_prev', 'REAL'),
            ('chair_post', 'REAL'),
            ('bath_post', 'REAL'),
            ('down_post', 'REAL'),
            ('up_post', 'REAL'),
            ('a_prev', 'REAL'),
            ('a_post', 'REAL'),
            ('tag', ['walk', 'chair', 'bath', 'down', 'up'])
        ]
    }
    sql = """select * from motion order by event_timestamp asc;"""
    m.execute(sql)
    data = []
    counter = 0
    for record in m:
        # print(record)
        row = []
        ts = float(record[5])
        prev = get_prev_obj(ts)
        post = get_post_obj(ts)
        for item in prev:
            row.append(item)
        for item in post:
            row.append(item)
        a = get_a(ts)
        for item in a:
            row.append(item)
        if record[-1] is None:
            row.append('?')
        else:
            row.append(record[-1])
        data.append(row)
        counter += 1
        print(counter)
    dataset['data'] = data
    ou.write(arff.dumps(dataset))
    ou.close()
def write_cv(self, splits):
    ''' write cv.arff '''
    content = {
        "relation": "CV_Folds",
        "attributes": [("instance_id", "STRING"),
                       ("repetition", "NUMERIC"),
                       ("fold", "NUMERIC")],
        "data": []
    }

    for split, idx in zip(splits, range(1, self.FOLDS + 1)):
        for inst_name in split:
            content["data"].append([inst_name, 1, idx])

    print(arff.dumps(content))
def to_filesystem(self, output_directory, store_model=True):
    """
    The inverse of the from_filesystem method. Serializes a run
    on the filesystem, to be uploaded later.

    Parameters
    ----------
    output_directory : str
        a path leading to the folder where the results
        will be stored. Should be empty

    store_model : bool
        if True, a model will be pickled as well. As this is the most
        storage expensive part, it is often desirable to not store the
        model.
    """
    if self.data_content is None or self.model is None:
        raise ValueError('Run should have been executed (and contain '
                         'model / predictions)')

    try:
        os.makedirs(output_directory)
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise e

    if not os.listdir(output_directory) == []:
        raise ValueError('Output directory should be empty')

    run_xml = self._create_description_xml()
    predictions_arff = arff.dumps(self._generate_arff_dict())

    with open(os.path.join(output_directory, 'description.xml'), 'w') as f:
        f.write(run_xml)
    with open(os.path.join(output_directory, 'predictions.arff'), 'w') as f:
        f.write(predictions_arff)
    if store_model:
        with open(os.path.join(output_directory, 'model.pkl'), 'wb') as f:
            pickle.dump(self.model, f)

    if self.trace is not None:
        self.trace._to_filesystem(output_directory)
def export_arff(file, export_file, conf):
    """Takes a JSON list of incidents, processes them, and exports an ARFF file"""
    conf = json.load(open(conf))
    data = prepare.prepare(file, conf)

    # Structure of export
    export_data = {
        'attributes': [],
        'data': [],
        'relation': 'TrafficData'
    }

    paths = [jp_parse(path[1]) for path in conf['attributes']]

    # print data[0]
    for incident in data:
        entry = []
        for path in paths:
            results = path.find(incident)
            if len(results) > 0:
                if isinstance(results[0].value, basestring):
                    entry.append(results[0].value.replace(" ", "-"))
                else:
                    entry.append(results[0].value)
            else:
                entry.append(None)
        export_data['data'].append(entry)

    for index, attr in enumerate(conf['attributes']):
        if attr[2] in ['NUMERIC', 'REAL', 'INTEGER']:
            export_data['attributes'].append((attr[0], attr[2]))
        else:
            # Otherwise assume discrete
            vals = list(set([incident[index] for incident in export_data['data']]))
            export_data['attributes'].append((attr[0], vals))

    export_data['description'] = "\n".join(
        [str(datetime.datetime.now()),
         json.dumps(conf, indent=2, separators=(',', ': '))])

    with open(export_file, "w") as f:
        f.write(arff.dumps(export_data))
def save_to_arff(cls, X, y, endian="big", save_sparse=False):
    X = X.todok()
    y = y.todok()
    x_prefix = 0
    y_prefix = 0

    x_attributes = [(u'X{}'.format(i), u'NUMERIC') for i in xrange(X.shape[1])]
    y_attributes = [(u'y{}'.format(i), [unicode(0), unicode(1)]) for i in xrange(y.shape[1])]

    if endian == "big":
        y_prefix = X.shape[1]
        relation_sign = -1
        attributes = x_attributes + y_attributes
    elif endian == "little":
        x_prefix = y.shape[1]
        relation_sign = 1
        attributes = y_attributes + x_attributes
    else:
        raise ValueError("Endian not in {big, little}")

    if save_sparse:
        data = [{} for r in xrange(X.shape[0])]
    else:
        data = [[0 for c in xrange(X.shape[1] + y.shape[1])] for r in xrange(X.shape[0])]

    for keys, value in X.iteritems():
        data[keys[0]][x_prefix + keys[1]] = value
    for keys, value in y.iteritems():
        data[keys[0]][y_prefix + keys[1]] = value

    dataset = {
        u'description': u'traindata',
        u'relation': u'traindata: -C {}'.format(y.shape[1] * relation_sign),
        u'attributes': attributes,
        u'data': data
    }

    return arff.dumps(dataset)
def ARFFCreation():
    dataSet = ARFFDataset(Stripped, nonStripped)
    attList = [
        ('Gazetteer', ['TRUE', 'FALSE']),
        ('CapitalLetter', ['TRUE', 'FALSE']),
        ('Preposition', ['TRUE', 'FALSE']),
        ('FollowingWord', ['TRUE', 'FALSE']),
        ('Place', ['yes', 'no'])
    ]
    obj = {
        'description': u'',
        'relation': 'PlaceNames',
        'attributes': attList,
        'data': dataSet,
    }
    with open('.\\CreatedCSVs\\TestData.arff', 'a') as f:
        f.write(arff.dumps(obj))
def save_to_arff(cls, X, y, endian="little", save_sparse=True): """Method for loading ARFF files as numpy array Parameters ---------- filename : string Path to ARFF file labelcount: integer Number of labels in the ARFF file endian: string{"big", "little"} Whether the ARFF file contains labels at the beginning of the attributes list ("big" endianness, MEKA format) or at the end ("little" endianness, MULAN format) input_feature_type: numpy.type as string The desire type of the contents of the return 'X' array-likes, default 'i8', should be a numpy type, see http://docs.scipy.org/doc/numpy/user/basics.types.html encode_nominal: boolean Whether convert categorical data into numeric factors - required for some scikit classifiers that can't handle non-numeric input featuers. save_sparse: boolean Whether to read arff file as a sparse file format, liac-arff breaks if sparse reading is enabled for non-sparse ARFFs. Returns ------- data: dictionary {'X': scipy sparse matrix with input_feature_type elements, 'y': scipy sparse matrix of binary (int8) label vectors } The dictionary containing the data frame, with 'X' key storing the input space array-like of input feature vectors and 'y' storing labels assigned to each input vector, as a binary indicator vector (i.e. if 5th position has value 1 then the input vector has label no. 5) """ X = X.todok() y = y.todok() x_prefix = 0 y_prefix = 0 x_attributes = [(u'X{}'.format(i), u'NUMERIC') for i in xrange(X.shape[1])] y_attributes = [(u'y{}'.format(i), [unicode(0), unicode(1)]) for i in xrange(y.shape[1])] if endian == "big": y_prefix = X.shape[1] relation_sign = -1 attributes = x_attributes + y_attributes elif endian == "little": x_prefix = y.shape[1] relation_sign = 1 attributes = y_attributes + x_attributes else: raise ValueError("Endian not in {big, little}") if save_sparse: data = [{} for r in xrange(X.shape[0])] else: data = [[0 for c in xrange(X.shape[1] + y.shape[1])] for r in xrange(X.shape[0])] for keys, value in X.iteritems(): data[keys[0]][x_prefix + keys[1]] = value for keys, value in y.iteritems(): data[keys[0]][y_prefix + keys[1]] = value dataset = { u'description': u'traindata', u'relation': u'traindata: -C {}'.format(y.shape[1] * relation_sign), u'attributes': attributes, u'data': data } return arff.dumps(dataset)
        if(surgeryType != ""):
            surgeryCount += 1
        elif(radiationType != ""):
            radiationCount += 1

        behavior = line[223]            # BEHAVIOR RECODE FOR ANALYSIS
        histology = line[225:227]       # HISTOLOGY RECODE - BROAD GROUPINGS
        causeOfDeath = line[271]        # SEER OTHER CAUSE OF DEATH CLASS
        survivalMonths = line[305:309]  # Survival months - presumed alive

        data['data'].append([type, maritalStatus, race, tumorSize, survivalMonths])

# Write arff
arffContents = arff.dumps(data)
output = open('cancer.arff', 'w')
output.write(arffContents)

# Count things
print "Number of records: " + str(numLines)
print "Number of patients: " + str(len(patientIDs))

numRecordCounts = {}
for patientID in patientIDs:
    if(patientIDs[patientID] in numRecordCounts):
        numRecordCounts[patientIDs[patientID]] += 1
    else:
        numRecordCounts[patientIDs[patientID]] = 1

print "Number of patients with X number of records: "
def hbp_submit_search_criteria(input_dict): import orange import tempfile import arff import os dset = arff.load(open(os.path.dirname(os.path.abspath(__file__)) + "/new_adni.arff", "rb")) b = {} query = input_dict["query"] b["attributes"] = [] if query["classification"] != None: b["attributes"].append(("Classification", [u"CN", u"AD", u"LMCI", u"EMCI", u"SMC"])) else: query["classification"] = [None] if query["geo"] != None: b["attributes"].append( ("geo", [u"Africa", u"Asia", u"Australia", u"Europe", u"North America", u"South America"]) ) else: query["geo"] = [None] if query["age"] != None: b["attributes"].append(("age", [u"1-18", u"18-24", u"25-34", u"35-44", u"45-54", u"55-64", u"65+"])) else: query["age"] = [None] fields = [] for c in query["classification"]: for g in query["geo"]: for a in query["age"]: fields.append((c, g, a)) b["attributes"].append(("count", "NUMERIC")) b["description"] = "" b["relation"] = "HBP" b["data"] = [] for field in fields: d = [] if field[0] != None: d.append(field[0]) if field[1] != None: d.append(field[1]) if field[2] != None: d.append(field[2]) counts = 0 for i in dset["data"]: if ( (i[-1] == field[0] or field[0] == None) and (i[0] == field[2] or field[2] == None) and (i[1] == field[1] or field[1] == None) ): counts = counts + 1 d.append(counts) b["data"].append(d) # a = arff.load_data() f = tempfile.NamedTemporaryFile(delete=False, suffix=".arff") f.write(arff.dumps(b)) f.close() output_dict = {} output_dict["results"] = orange.ExampleTable(f.name) return output_dict
def handle(self, *args, **options): params = json.load(open(args[0])) source_name = params['database'] label_table_name = None label_column_name = None for key,value in params['label'].iteritems(): label_table_name = key label_column_name = value # Add parameters for setting ranges... past = datetime.datetime(datetime.MINYEAR, 1, 1) future = datetime.datetime(datetime.MAXYEAR, 12, 31) for ds in DataSource.objects.filter(name__contains=source_name): table_names = ds.table_names() table_columns = {} for name in table_names: table_columns[name] = ds.table_columns(name) # print(json.dumps(table_columns, indent=2)) points = ds.fetch_data(label_table_name, label_column_name, past, future) label_value_name = label_table_name + '_' + label_column_name categorical_values = {} rows = [] row_keys = [] for point in points: row_dict = {} point_time = point[0] label_value = point[1] row_dict[label_value_name] = label_value for table, columns in table_columns.iteritems(): fetched = ds.fetch_nearest(point_time, table, columns) if len(fetched) > 0: for i in range(0, len(columns)): column = columns[i] column_key = table + '_' + column[0] if row_keys.count(column_key) == 0: row_keys.append(column_key) if column[1] == 'text' or column[1] == 'boolean': column_values = set(['?']) try: column_values = categorical_values[column_key] except KeyError: categorical_values[column_key] = column_values column_values.add(slugify(unicode(str(fetched[i])))) if fetched[i] == True or fetched[i] == False: row_dict[column_key] = slugify(unicode(str(fetched[i]))) elif column[1] == 'text': row_dict[column_key] = slugify(unicode(fetched[i])) else: row_dict[column_key] = fetched[i] rows.append(row_dict) row_keys.sort() data = { 'relation': label_table_name + '_' + label_column_name, 'description': '' } attributes = [] ignore = [] for row_key in row_keys: value_def = 'REAL' if row_key in categorical_values: value_def = [] for value in categorical_values[row_key]: value_def.append(value) if value_def == 'REAL' or len(value_def) > 1: attributes.append((row_key, value_def)) else: ignore.append(row_key) data['attributes'] = attributes data_rows = [] for row_dict in rows: data_row = [] for row_key in row_keys: if ignore.count(row_key) == 0: try: data_row.append(row_dict[row_key]) except KeyError: data_row.append(None) data_rows.append(data_row) data['data'] = data_rows print('dumping....') print(arff.dumps(data)) print('done.')
def write_to_arff(data, filename):
    with open(filename, 'w') as f:
        f.write(arff.dumps(data))
def main(): lno=1 word_tot=0 corr=0 init_corr=0 beng_statistics = {} beng_statistics = ng.ngram_prof("./beng_train.txt",beng_statistics) eng_statistics = {} ''' text = " ".join(brown.words()) tokenizer = RegexpTokenizer("[a-zA-Z'`]+") text = tokenizer.tokenize(text) text = " ".join(text) brown_words=open("./brown_words.txt",'w') brown_words.write(text) brown_words.close() ''' eng_statistics = ng.ngram_prof("./brown_words.txt",eng_statistics) lang_stats={} lang_stats.update({'e':eng_statistics}) lang_stats.update({'b':beng_statistics}) #fin=open("./beng_corpus.txt",'r') fin=open("./BanglaEnglish_FIRE2013_AnnotatedDev.txt",'r') #fout_pred=open("./predicted_tags_arff.txt",'w') #fout_corr=open("./corrected_tags_arff.txt",'w') word_list=[] eng_dic=[] beng_dic=[] ngram=[] surround=[] corr_tag=[] data=[] sent=fin.readline() while(sent): ############ Only for Facebook corpus (COMMENT for FIRE) #sent = re.sub(r'[^\w\s]','',sent) ########################################## words=[] sent=sent.split() for elem in sent: ############ Only for FIRE CORPUS (COMMENT for Facebook) elem=elem.split('\\') corr_tag.append(elem[1][0]) elem=elem[0] ################################# elem.strip() words.append(elem) type_map = defaultdict(str) type_count = defaultdict(int) word_count=0 for word in words: word=word.strip(" ") word_list.append(word) word_count+=1 beng_rat=0 eng_rat=0 if(beng.beng_word(word)): beng_rat=1 if(eng.eng_search(word)): eng_rat=1 ############ REVERT 1/6 #eng_dic.append(eng_rat) #beng_dic.append(beng_rat) ############## word_statistics={} lang_ratio = {} grams = ngrams(word,4,pad_left=True,pad_right=True,left_pad_symbol=' ',right_pad_symbol=' ') grams=list(grams) ng_list=[] for j in range (len(grams)): ng_list.append(''.join(grams[j])) word_statistics = ng.ngram_hash(ng_list,word_statistics) word_statistics = sorted(word_statistics.items(), key=operator.itemgetter(1), reverse=True) for lang, ngrams_statistics in lang_stats.items(): distance = ng.compare_ng_prof(ngrams_statistics,word_statistics) lang_ratio.update({lang:distance}) ng_lang_result = min(lang_ratio, key = lang_ratio.get).upper() ngram.append(ng_lang_result) if(beng_rat and not eng_rat): ############# REVERT 2/6 beng_dic.append('2') ################### type_word="Bengali word" elif(eng_rat and not beng_rat): ############# REVERT 3/6 beng_dic.append('0') ################### type_word="English word" else: ############# REVERT 4/6 beng_dic.append('1') ################### if(ng_lang_result=='B'): type_word="Bengali word" else: type_word="English word" type_map[word]=type_word type_count[type_word]+=1 #print(str(word)+"(Detect:"+str(type_map[word])+")") if((type_count["English word"])>(type_count["Bengali word"])): default="e" else: default="b" print(str(lno)+default) lno+=1 for i in range(len(words)): word_count=0 type_count["English word"]=0 type_count["Bengali word"]=0 if(i>1 and i<(len(words)-2)): word_count=4 for j in range(i-2,min((len(words)-1),(i+3))): if(j!=i): type_count[type_map[words[j]]]+=1 elif (i<=1): word_count=min(i+2,len(words)-1) for j in range(min((len(words)-1),(i+3))): if(j!=i): type_count[type_map[words[j]]]+=1 elif(i>=(len(words)-2)): word_count= (len(words)-i)+1 for j in range(i-2,len(words)): if(j!=i): type_count[type_map[words[j]]]+=1 if(word_count): surround.append(str(type_count["Bengali word"]/word_count)) #if((type_count["Bengali word"]/word_count)>=0.5): # surround.append('1') #else: # surround.append('0') else: surround.append('0') sent=fin.readline() ############ Only for Facebook corpus (COMMENT for 
FIRE) #real_tag = open("./real_tags.txt",'r') #line = real_tag.readline().split() #for word in line: # tags=word.split('\\') # corr_tag.append(tags[1]) #real_tag.close() ###################################### for i in range(len(word_list)): vector=[] # vector.append(word_list[i]) ########REVERT 5/6 #vector.append(int(eng_dic[i])) ############## vector.append(int(beng_dic[i])) vector.append(ngram[i]) vector.append(float(surround[i])) vector.append(corr_tag[i]) data.append(vector) obj={ 'description': u'', 'relation': 'langid', 'attributes': [ #############REVERT 6/6 #('eng_dict','NUMERIC'), #################### ('beng_dict','NUMERIC'), ('ngram','STRING'), ('beng_surr','REAL'), ('real_tag','STRING') ], 'data':data, } final = arff.dumps(obj) #FIRE final_file=open("./lang_id_fire.arff",'w') # #Facebook #final_file=open("./lang_id.arff",'w') # final_file.write(final) final_file.close() fin.close()
import arff
import pickle

attributes = []
nFFT = 64
baseStr = "FFT_"
for i in range(nFFT):
    name = baseStr + str(i)
    attributes.append((name, u'REAL'))
attributes.append(("Ang", u'REAL'))
# print(attributes)

relation = "Dist_FFT"
dataF = []
samples = 120
for i in range(samples):
    filename = str(i + 1) + ".p"
    f = open(filename, "rb")
    p = pickle.load(f)
    f.close()
    p.pop(-2)
    dataF.append(p)

data = {u'attributes': attributes,
        u'data': dataF,
        u'description': u'',
        u'relation': relation}

f = open("myData2.arff", "w")
f.write(arff.dumps(data))
f.close()
def test_encode_source(self):
    obj = arff.loads(ARFF_SOURCE)
    result = arff.dumps(obj)
    expected = ARFF_DESTINY
    self.assertEqual(result, expected)
def save_to_arff(X, y, label_location="end", save_sparse=True, filename=None):
    """Method for dumping data to ARFF files

    Parameters
    ----------
    X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
        input feature matrix
    y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
        binary indicator matrix with label assignments
    label_location : string {"start", "end"} (default is "end")
        whether the ARFF file will contain labels at the beginning of the
        attributes list ("start", MEKA format) or at the end ("end", MULAN format)
    save_sparse : boolean
        whether to save in ARFF's sparse dictionary-like format instead of listing
        all zeroes within file, very useful in multi-label classification
    filename : str or None
        path to ARFF file, if None, the ARFF representation is returned as string

    Returns
    -------
    str or None
        the ARFF dump string, if filename is None
    """
    X = X.todok()
    y = y.todok()
    x_prefix = 0
    y_prefix = 0

    x_attributes = [(u'X{}'.format(i), u'NUMERIC') for i in range(X.shape[1])]
    y_attributes = [(u'y{}'.format(i), [str(0), str(1)]) for i in range(y.shape[1])]

    if label_location == "end":
        y_prefix = X.shape[1]
        relation_sign = -1
        attributes = x_attributes + y_attributes
    elif label_location == "start":
        x_prefix = y.shape[1]
        relation_sign = 1
        attributes = y_attributes + x_attributes
    else:
        raise ValueError("Label location not in {start, end}")

    if save_sparse:
        data = [{} for r in range(X.shape[0])]
    else:
        data = [[0 for c in range(X.shape[1] + y.shape[1])] for r in range(X.shape[0])]

    for keys, value in list(X.items()):
        data[keys[0]][x_prefix + keys[1]] = value
    for keys, value in list(y.items()):
        data[keys[0]][y_prefix + keys[1]] = value

    dataset = {
        u'description': u'traindata',
        u'relation': u'traindata: -C {}'.format(y.shape[1] * relation_sign),
        u'attributes': attributes,
        u'data': data
    }

    arff_data = arff.dumps(dataset)

    if filename is None:
        return arff_data

    with open(filename, 'w') as fp:
        fp.write(arff_data)
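# --- Usage sketch (illustrative, not from the original source) ---
# Toy call of the save_to_arff() function above with two samples, two features
# and two labels; assumes scipy and liac-arff are installed. Because filename
# is None, the ARFF document is returned as a string instead of being written
# to disk.
import scipy.sparse as sp

X = sp.csr_matrix([[1.0, 0.0],
                   [0.0, 2.5]])
y = sp.csr_matrix([[0, 1],
                   [1, 0]])

arff_text = save_to_arff(X, y, label_location="end", save_sparse=True)
print(arff_text)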
def hbp_interactive_analysis_post(postdata, input_dict, output_dict): import orange import tempfile import arff import os import numpy d = input_dict["Dataset"] dset = arff.load(open(os.path.dirname(os.path.abspath(__file__)) + "/new_adni.arff", "rb")) b = {} b["attributes"] = [] feature_names = [x.name for x in d.domain] if "Classification" in feature_names: b["attributes"].append(("Classification", [u"CN", u"AD", u"LMCI", u"EMCI", u"SMC"])) if "geo" in feature_names: b["attributes"].append( ("geo", [u"Africa", u"Asia", u"Australia", u"Europe", u"North America", u"South America"]) ) if "age" in feature_names: b["attributes"].append(("age", [u"1-18", u"18-24", u"25-34", u"35-44", u"45-54", u"55-64", u"65+"])) for v in postdata["variables"]: b["attributes"].append((v + "_avg", "NUMERIC")) b["attributes"].append((v + "_stdev", "NUMERIC")) b["attributes"].append(("count", "NUMERIC")) b["description"] = "" b["relation"] = "HBP" b["data"] = [] for i in d: new_i = [] if "Classification" in feature_names: clas = i[feature_names.index("Classification")].value new_i.append(clas) else: clas = None if "geo" in feature_names: geo = i[feature_names.index("geo")].value new_i.append(geo) else: geo = None if "age" in feature_names: age = i[feature_names.index("age")].value new_i.append(age) else: age = None sums = {} for v in postdata["variables"]: sums[v] = 0 vcounts = {} for v in postdata["variables"]: vcounts[v] = 0 vvalues = {} for v in postdata["variables"]: vvalues[v] = [] count = 0 attrs = [x[0] for x in dset["attributes"]] for ins in dset["data"]: if (ins[-1] == clas or clas == None) and (ins[0] == geo or geo == None) and (ins[1] == age or age == None): count = count + 1 for k, v in sums.items(): try: sums[k] = v + ins[attrs.index(k)] vcounts[k] = vcounts[k] + 1 vvalues[k].append(v) except: pass for v in postdata["variables"]: print vvalues[v] stdev = numpy.std(vvalues[v]) try: avg = sums[v] / vcounts[v] except: avg = -2 new_i.append(avg) new_i.append(stdev) new_i.append(count) b["data"].append(new_i) f = tempfile.NamedTemporaryFile(delete=False, suffix=".arff") f.write(arff.dumps(b)) f.close() output_dict = {} output_dict["results"] = orange.ExampleTable(f.name) return output_dict
def csv_to_arff(fileinput, type_list, relation_name, selected_attrs):
    with open(fileinput, 'r') as inputfile:
        data = csv.reader(inputfile, delimiter=',')
        arff_content = data_to_dict(data, type_list, relation_name, selected_attrs)
    return arff.dumps(arff_content)
    # (u'Firstyrcumgpa', u'REAL')
]

if isTestSet:
    template_attr = []

# Create ARFF template
template = {
    u'attributes': template_attr,
    u'data': data,  # list
    u'description': u'',
    u'relation': u'admission_stats'
}

# Save ARFF file
with open(output_arffFile, 'w') as arffFile:
    arffFile.write(arff.dumps(template))
print 'Finished Classifying...'

input_arff = output_arffFile
if not useConverts:
    with open(input_arff, 'r+') as af:
        arffFile = arff.load(af)
        data = arffFile['data']
        datalist = []
        for row in data:
            replaceByConversion(row)
            datalist.append(row)
        print 'Finished Converting...'
        arffFile['data'] = datalist
def handle(self, *args, **options): count = 0 for job in ReportJob.objects.filter(job_start=None): params = json.loads(job.parameters) source_name = params['database'] label_table_name = None label_column_name = None for key,value in params['label'].iteritems(): label_table_name = key label_column_name = value job.job_start = datetime.datetime.now() job.save() # Add parameters for setting ranges... past = datetime.datetime(datetime.MINYEAR, 1, 1) future = datetime.datetime(datetime.MAXYEAR, 12, 31) for ds in DataSource.objects.filter(name__contains=source_name): table_names = ds.table_names() table_columns = {} for name in table_names: table_columns[name] = ds.table_columns(name) points = ds.fetch_data(label_table_name, label_column_name, past, future) label_value_name = label_table_name + '_' + label_column_name categorical_values = {} rows = [] row_keys = [] for point in points: row_dict = {} point_time = point[0] label_value = point[1] row_dict[label_value_name] = label_value for table, columns in table_columns.iteritems(): fetched = ds.fetch_nearest(point_time, table, columns) if len(fetched) > 0: for i in range(0, len(columns)): column = columns[i] column_key = slugify(table + '_' + column[0]) if row_keys.count(column_key) == 0: row_keys.append(column_key) if column[1] == 'text' or column[1] == 'boolean': column_values = set([None]) try: column_values = categorical_values[column_key] except KeyError: categorical_values[column_key] = column_values str_value = slugify(unicode(str(fetched[i]))) if str_value.strip() == '': str_value = 'empty_string' column_values.add(str_value) if fetched[i] == True or fetched[i] == False: row_dict[column_key] = slugify(unicode(str(fetched[i]))) elif column[1] == 'text': str_value = slugify(unicode(fetched[i])) if str_value.strip() == '': str_value = 'empty_string' row_dict[column_key] = str_value else: row_dict[column_key] = fetched[i] rows.append(row_dict) row_keys.sort() data = { 'relation': slugify(label_table_name + '_' + label_column_name), 'description': '' } attributes = [] ignore = [] for row_key in row_keys: value_def = 'REAL' if row_key in categorical_values: value_def = [] for value in categorical_values[row_key]: value_def.append(value) if ('_sensor_dt_' in row_key) == False and ('randomnoiseprobe' in row_key) == False and (value_def == 'REAL' or len(value_def) > 2 or (((None in value_def) == False) and len(value_def) > 1)): attributes.append((row_key, value_def)) else: ignore.append(row_key) data['attributes'] = attributes data_rows = [] for row_dict in rows: data_row = [] for row_key in row_keys: if ignore.count(row_key) == 0: try: data_row.append(row_dict[row_key]) except KeyError: data_row.append(None) data_rows.append(data_row) data['data'] = data_rows job.result_file.save(str(job.pk) + '_' + str(uuid.uuid4()), ContentFile(arff.dumps(data))) job.result_type = 'text/arff' job.job_end = datetime.datetime.now() job.save() count += 1 print(str(count) + ' job(s) run.')