class TrainTestSets(object):

    def __init__(self, *args):
        self.train = DataSet()
        self.test = DataSet()
        self.setOptions(args)

    def getTrainingSet(self):
        return self.train

    def getTestingSet(self):
        return self.test

    def setTrainingSet(self, inTrain):
        self.train = inTrain

    def setTestingSet(self, inTest):
        self.test = inTest

    def setOptions(self, arguments):
        newDataSet = DataSet()
        for num in range(0, len(arguments)):
            if arguments[num] == "-t":
                newDataSet.load(arguments[num + 1])
                self.setTestingSet(newDataSet)
                break
            elif arguments[num] == "-T":
                newDataSet.load(arguments[num + 1])
                self.setTrainingSet(newDataSet)
                break

    def __print__(self):
        self.train.__print__()
        self.test.__print__()
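# Hedged usage sketch for TrainTestSets above; the flag values and .arff
# file names are illustrative assumptions. Because setOptions stops at the
# first "-t"/"-T" flag it finds, loading both sets takes two calls.
tts = TrainTestSets("-T", "train.arff")
tts.setOptions(("-t", "test.arff"))
tts.__print__()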
def __init__(self, sess):
    self.sess = sess
    self.CATEGORY_NUM = 151
    self.IMAGE_SIZE = [224, 224]
    self.IMAGE_CHANNEL = 3
    data_dir = '/home/give/Documents/dataset/ADEChallengeData2016'
    self.dataset = DataSet(data_dir)
    self.learning_rate = 1e-5
    self.iterator_number = int(1e+5)
    self.BATCH_SIZE = 80
    self.imgs = tf.placeholder(
        tf.float32,
        shape=[
            self.BATCH_SIZE,
            self.IMAGE_SIZE[0],
            self.IMAGE_SIZE[1],  # was IMAGE_SIZE[0] twice; height x width
            self.IMAGE_CHANNEL
        ]
    )
    self.y_ = tf.placeholder(
        tf.float32,
        shape=[
            self.BATCH_SIZE,
            self.IMAGE_SIZE[0],
            self.IMAGE_SIZE[1],
            self.CATEGORY_NUM
        ]
    )
    self.vgg = vgg16(self.imgs, self.sess, skip_layers=['fc6', 'fc7', 'fc8'])
    self.inference()
def ReadCSV_OneChannel(self, iChannel):
    setTest = DataSet(homeCSV=self.homeCSV, listCSV=self.listCSV4Test,
                      labels=self.labels, numClasses=self.numClasses,
                      branch4Train=self.branch4Train, resize=self.resize,
                      numProcess=0)
    data = None
    label = None
    uid = None
    counterCSV = 0
    for iCSV in tqdm(self.listCSV4Test[iChannel]):
        counterCSV += 1
        if (self.numFilesCut > 0) and (counterCSV > self.numFilesCut):
            continue
        iReadCSV = self.homeCSV + iChannel + '/' + iCSV
        iReadClass = self.labels[iChannel]
        iData, iLabel, iUid, iClass = setTest.ReadCSV_OneFile(iClass=iReadClass,
                                                              iCSV=iReadCSV)
        if data is None:
            data = iData
            label = iLabel
            uid = iUid
        else:
            data = np.r_[data, iData]
            label = np.r_[label, iLabel]
            uid = np.r_[uid, iUid]
    setTest.SetDataLabel(data=data, label=label, uid=uid)
    return (setTest, iChannel)
class Data:
    training = None
    testing = None

    def __init__(self, path, featureCols, labelCol, trainSize, preProc=False, addbias=False):
        '''path is an absolute path to a csv file'''
        self.data = pd.read_csv(filepath_or_buffer=path, usecols=featureCols + labelCol)
        self.data = self.data.sample(frac=1).reset_index(drop=True)  # frac=1 shuffles the whole dataset
        if preProc:
            self.features = self.data[featureCols].to_numpy(dtype='float64')
            self.labels = self.data[labelCol].to_numpy(dtype='float64')
        else:
            self.features = self.data[featureCols].to_numpy()
            self.labels = self.data[labelCol].to_numpy()
        self.labels = self.labels.flatten()
        # number of features, counting the bias column when one is added
        # (parenthesized: the unparenthesized conditional evaluated to 0 whenever addbias was False)
        self.n = self.features.shape[1] + (1 if addbias else 0)
        self.m = self.features.shape[0]
        self.training = DataSet(self.features[:int(trainSize * self.m)],
                                self.labels[:int(trainSize * self.m)],
                                preProc, addbias)
        self.testing = DataSet(self.features[int(trainSize * self.m):],
                               self.labels[int(trainSize * self.m):],
                               False, addbias)
        if preProc:
            self.testing.applyNormalization(self.training.normFeatures,
                                            self.training.normLabels)
def read_img_sets(image_dir, image_size, validation_size=0):
    class DataSets:
        pass
    data_sets = DataSets()

    images, labels, ids, cls, cls_map = load_data(image_dir, image_size)
    images, labels, ids, cls = shuffle(images, labels, ids, cls)

    if isinstance(validation_size, float):
        validation_size = int(validation_size * images.shape[0])

    test_images = images[:validation_size]
    test_labels = labels[:validation_size]
    test_ids = ids[:validation_size]
    test_cls = cls[:validation_size]

    train_images = images[validation_size:]
    train_labels = labels[validation_size:]
    train_ids = ids[validation_size:]
    train_cls = cls[validation_size:]

    data_sets.train = DataSet(train_images, train_labels, train_ids, train_cls)
    data_sets.test = DataSet(test_images, test_labels, test_ids, test_cls)

    return data_sets, cls_map
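# Hedged usage sketch for read_img_sets above; the directory path, image
# size, and split fraction are illustrative assumptions.
data_sets, cls_map = read_img_sets("images/", image_size=64, validation_size=0.2)
# data_sets.train and data_sets.test are DataSet instances holding the
# shuffled 80/20 split; cls_map maps label indices back to class names.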
def init_isbi2012_train():
    isbi2012_train = DataSet(meta_folder, "isbi2012_train")

    raw_path = os.path.join(data_path, "raw/train-volume.h5")
    raw_key = "data"
    # nasims baseline prob map
    inp_path = os.path.join(
        data_path, "probabilities/old_probs/nasims_oldbaseline_train.h5")
    inp_key = "exported_data"

    isbi2012_train.add_raw(raw_path, raw_key)
    isbi2012_train.add_input(inp_path, inp_key)

    # 2d wsdt on nasims pmap
    seg_path0 = os.path.join(
        data_path, "watersheds/old_watersheds/ws_dt_nasims_baseline_train.h5")
    seg_key = "superpixel"
    isbi2012_train.add_seg(seg_path0, seg_key)

    # layerwise gt
    gt_path = os.path.join(data_path, "groundtruth/gt_cleaned.h5")
    isbi2012_train.add_gt(gt_path, "data")

    meta.add_dataset("isbi2012_train", isbi2012_train)
def __init__(self, args):
    self.dataName = args.dataName
    self.dataSet = DataSet(self.dataName)
    self.shape = self.dataSet.shape
    self.maxRate = self.dataSet.maxRate

    self.train = self.dataSet.train
    self.test = self.dataSet.test

    self.negNum = args.negNum
    self.testNeg = self.dataSet.getTestNeg(self.test, 99)
    self.add_embedding_matrix()

    self.add_placeholders()
    self.userLayer = args.userLayer
    self.itemLayer = args.itemLayer
    self.add_model()

    self.add_loss()

    self.lr = args.lr
    self.add_train_step()

    self.checkPoint = args.checkPoint
    self.init_sess()

    self.maxEpochs = args.maxEpochs
    self.batchSize = args.batchSize

    self.topK = args.topK
    self.earlyStop = args.earlyStop
def setUp(self):
    self.DataSet = DataSet("./data/199801.txt.tmp")
    self.DataSet.load_dict()
    self.hashindexer = HashIndexer(self.DataSet)
    self.hashindexer.build_indexer()
    self.trieindexer = TrieIndexer(self.DataSet)
    self.trieindexer.build_indexer()
def __init__(self, args, density):
    self.dataset = DataSet(args.dataType, density)
    self.dataType = self.dataset.dataType
    self.density = self.dataset.density
    self.shape = self.dataset.shape
    self.train = self.dataset.train
    self.test = self.dataset.test
    self.epochNum = args.epochNum
    self.batchSize = args.batchSize
    self.layers = args.layers
    self.regLayers = args.regLayers
    self.lr = args.lr
    self.decay = args.decay
    self.optimizer = args.optimizer
    self.verbose = args.verbose
    self.store = args.store
    self.modelPath = args.modelPath
    self.resultPath = args.resultPath
    self.model = self.compile_model()
    self.run()
def read_data_sets(data_dir):
    filename = "cifar-100-python.tar.gz"
    print("getting data")
    SOURCE_URL = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz'
    local_file = base.maybe_download(filename, data_dir, SOURCE_URL)
    print('Extracting', filename)

    train_images, train_labels = [], []
    test_images, test_labels = [], []
    with gfile.Open(data_dir + "/" + filename, 'rb') as f, tarfile.open(fileobj=f) as tar:
        for x in tar.getnames():
            # the CIFAR-100 archive ships single "train"/"test" members;
            # the original "data_batch"/"test_batch" checks belong to
            # CIFAR-10 and never matched here
            if x.endswith("train"):
                i, l = _get_data(tar.extractfile(x))
                train_images.extend(i.reshape((i.shape[0], 32, 32, 3)))
                train_labels.extend(l)
            if x.endswith("test"):
                i, l = _get_data(tar.extractfile(x))
                test_images.extend(i.reshape((i.shape[0], 32, 32, 3)))
                test_labels.extend(l)

    train_images = np.array(train_images)
    test_images = np.array(test_images)
    train_labels = np.array(train_labels)
    test_labels = np.array(test_labels)

    train = DataSet(train_images, train_labels, dtype=dtypes.uint8, depth=100)
    test = DataSet(test_images, test_labels, dtype=dtypes.uint8, depth=100)
    return base.Datasets(train=train, validation=None, test=test)
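# Hedged usage sketch for read_data_sets above; "cifar_data" is an
# illustrative download directory, and the .images attribute assumes the
# usual tf.contrib.learn DataSet interface.
cifar100 = read_data_sets("cifar_data")
print(cifar100.train.images.shape)  # (50000, 32, 32, 3) if the full set downloads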
def _create_trees(self, X_train, y_train):
    dataset = DataSet(X_train, y_train)
    self._trees = [
        DecisionTree(dataset.subset_from_ratio(self.ratio_samples),
                     self.coefficient, self.values, self._max_depth)
        for _ in range(self._num_trees)
    ]
def parse(self, dsId, maxResults=0):
    job = self.gate.getDs(dsId, maxResults)
    if not job:
        raise TypeError('Parser received illegal dataset ID')
    job = job[0]

    # get data as a list of rows instead of columns
    colData = map(None, *job["data"])

    dataset = DataSet(
        job["title"].encode('utf-8'),
        job["id"].encode('utf-8'),
        job["columns"][0]["time_granularity"].encode('utf-8'))

    for col, item in enumerate(colData):
        # if type(colData[0]) != tuple:
        #     raise TypeError('Data is not of type: Tuple')
        # else:
        tl = TimeLine(
            job["columns"][col]["title"].encode('utf-8'),
            job["columns"][col]["cid"].encode('utf-8'),
            colData[col])
        dataset.append(tl)
    return dataset
def readFile(self, file):
    ds = DataSet([])
    names = file.readline()
    names = names[1:]  # drop the leading separator before the first name
    curName = ""
    # for every char: on ";" or newline, create a node named by the chars
    # collected so far; otherwise keep accumulating the name
    for char in names:
        if char == ";" or char == "\n":
            node = Node(curName, [])
            ds.addNode(node)  # was addNote, presumably a typo for addNode given getNodes below
            curName = ""
        else:
            curName = curName + char
    numNode = 0
    nodes = ds.getNodes()
    # the header line was already consumed by readline above
    for line in file:
        # remove everything up to and including the first ";"
        line = line[len(nodes[numNode].getName()) + 1:]
        count = 0
        i = 0
        while i < len(line) - 1:
            if int(line[i]) != 0:
                nodes[numNode].addLink([nodes[count], int(line[i])])
            count += 1
            i += 2  # step over the weight digit and the following ";"
        numNode += 1
    return ds
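# Hedged illustration of the input format readFile above expects; the node
# names and weights here are made up. The header carries one throwaway
# leading character, node names are single characters, weights are single
# digits, and every field is ";"-separated:
#
#   ;A;B;C
#   A;0;1;0
#   B;1;0;2
#   C;0;2;0
#
# ds = reader.readFile(open("matrix.txt"))  # reader: assumed owner of readFile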
def get_data_loader(self, train_path_list, valid_path_list, test_path_list):
    train_data_set = DataSet(train_path_list)
    valid_data_set = DataSet(valid_path_list)
    test_data_set = DataSet(test_path_list)

    train_data_loader = DataLoader(
        train_data_set,
        pin_memory=True,
        batch_size=self.h_params.train.batch_size,
        shuffle=True,
        num_workers=self.h_params.resource.num_workers,
        drop_last=True)

    valid_data_loader = DataLoader(
        valid_data_set,
        pin_memory=True,
        batch_size=self.h_params.train.batch_size,
        shuffle=False,
        num_workers=self.h_params.resource.num_workers,
        drop_last=True)

    test_data_loader = DataLoader(
        test_data_set,
        batch_size=self.h_params.train.batch_size,
        shuffle=False,
        num_workers=self.h_params.resource.num_workers,
        drop_last=False)

    return train_data_loader, valid_data_loader, test_data_loader
def __init__(self, args, density):
    self.dataSet = DataSet(args, density)
    self.dataType = self.dataSet.dataType
    self.density = self.dataSet.density
    self.shape = self.dataSet.shape
    self.train = self.dataSet.train
    self.test = self.dataSet.test
    self.epochNum = args.epochNum
    self.batchSize = args.batchSize
    self.gruLayers = args.gruLayers
    self.gtfLayers = args.gtfLayers
    self.regLayers = args.regLayers
    self.dropLayers = args.dropLayers
    self.lr = args.lr
    self.decay = args.decay
    self.optimizer = args.optimizer
    self.verbose = args.verbose
    self.preTraining = args.preTraining
    self.store = args.store
    self.modelPath = args.modelPath
    self.imagePath = args.imagePath
    self.resultPath = args.resultPath
    self.model = self.load_model()
    self.run()
class OType:
    def __init__(self, topics_n, id):
        self.id = id
        self.dSet = DataSet(O_TYPES_STR[id])
        self.N = topics_n
        self.topics = []
        for i in range(self.N):
            self.topics.append(Topic(i))

    def add_item(self, probabilities, cand):
        # officer = db.officer[line]
        # if officer == 1:
        #     y = 2
        self.dSet.add_item(cand)
        for i in range(self.N):
            self.topics[i].add_item(probabilities[i], cand)

    def print_topics(self):
        output = "Officer Type {}\n".format(O_TYPES_STR[self.id])
        output = "{}\n{}\n".format(output, self.dSet.print_data())
        for i in range(self.N):
            output = "{}{}\n".format(output, self.topics[i].print_datasets())
        return output
def __init__(self, plotName, fileName=None, dataClass=None, showPlots=True):
    DataSet.__init__(self, fileName=fileName, dataClass=dataClass)
    print('-' * 70)
    print('Establishing {} Plot'.format(plotName))
    print('-' * 70)

    self.plotName = plotName
    # TODO: fix this to accommodate non-colorbar plots
    self.figure = plt.figure(figsize=(6, 5))
    self.axes = self.figure.add_axes([0.1, 0.1, 0.825 * 5 / 6, 0.825])
    self.colorBarAxes = self.figure.add_axes([0.825, 0.1, 0.05, 0.825])
    self.animationImages = [[] for _ in range(len(self.blockData.keys()))]
    if not showPlots:
        matplotlib.use('Agg')

    time = min(self.blockData.keys())
    self.blocks = self.blockData[time].keys()
    self.zones = self.zoneData[time].keys()
    self.corners = self.cornerData[time].keys()
    self.contacts = self.contactData[time].keys()
    self.gridPoints = self.gridPointData[time].keys()
    self.domains = self.domainData[time].keys()
def setOptions(self, arguments): for num in range(0, len(arguments[0])): if arguments[0][num] == "-k": self.k = int(arguments[0][num+1]) elif arguments[0][num] == "-t": newDataSet = DataSet() newDataSet.load(arguments[0][num+1]) self.instances = newDataSet
def deriveGeneStructure(all_gene_file):
    """
    copy of structures/main.py
    find out how to load object from another script
    """
    # might need to switch directory to structures
    from DataSet import DataSet
    from GeneFamily import GeneFamily

    """Create all Gene objects"""
    data = DataSet()
    for l in all_gene_file[1:]:
        data.addGene(l.split())

    """Create a list of Species"""
    def generateSpeciesDict():
        for g in data.genesDict:
            currentSpecies = data.genesDict[g].species
            if currentSpecies not in data.speciesDict:
                data.addSpecies(currentSpecies)
    generateSpeciesDict()

    """Make Family Dictionary"""
    def generateFamilyDict():
        for g in data.genesDict:
            currentGene = data.genesDict[g]
            if currentGene.family not in data.familiesDict:
                currentFamily = GeneFamily(currentGene.family, data.speciesDict)
                data.addFamily(currentFamily.familyName, currentFamily)
            else:
                currentFamily = data.familiesDict[currentGene.family]
            currentFamily.addToFamily(currentGene)
    generateFamilyDict()

    """Make a dictionary which gives information about gene order"""
    geneOrder = {}
    for s in data.speciesDict:
        geneOrder[s] = {}
    for g in data.genesDict:
        currentGene = data.genesDict[g]
        try:
            geneOrder[currentGene.species][currentGene.ctg].append(currentGene)  # pass gene object
        except KeyError:
            geneOrder[currentGene.species][currentGene.ctg] = [currentGene]

    # sort genes in their contig
    for s in geneOrder:
        for c in geneOrder[s]:
            geneOrder[s][c].sort(key=lambda gene: int(gene.start))
    return data, geneOrder
def __init__(self, point_len, dense=True):
    self.weights = [1.] * point_len
    self.data = DataSet(point_len, dense)
    for i in range(point_len):
        self.data.append()
    self._keys = xrange(point_len)
    self._len = point_len
    self.outs_counter = Counter()
    self.point_ins = defaultdict(set)
def discretization_test(raw, filename):
    with open(filename, "w") as f:
        f.write("n;precision;recall;accuracy;f1\n")
        dataset = DataSet(raw)
        for i in [5, 7, 10, 15, 20, 40]:
            dataset.discretize_values(i)
            res = cross_validation(dataset)
            res = [str(i)] + [str(a) for a in res]
            f.write(";".join(res) + "\n")
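# Hedged usage sketch for discretization_test above; raw_records and the
# output path are illustrative assumptions. Each row of the resulting CSV
# holds the bin count followed by the cross-validated metrics.
discretization_test(raw_records, "discretization_results.csv")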
def setUp(self):
    self.data_set = DataSet("../input")
    self.data_set.load_train(
        os.path.join(
            self.data_set.cache_dir,
            "segment_numseg23_target@AB@CD@E@_ratio0.2_rand0_TRAIN.csv"))
    self.data_set.load_test(
        os.path.join(
            self.data_set.cache_dir,
            "segment_numseg23_target@AB@CD@E@_ratio0.2_rand0_TEST.csv"))
def __init__(self, centre, radius, dataClass=None, fileName=None):
    DataSet.__init__(self, dataClass=dataClass, fileName=fileName)
    self.centre = centre
    self.radius = radius
    self.singleBlock = False
    if len(self.contactData) == 0:
        self.singleBlock = True
    self.calculateHomogenizationParameters()
def composer(dict_data, sub_index, offset=0):
    x = dict_data['train'].data[sub_index['train'], :] + offset
    y = dict_data['train'].labels[sub_index['train'], :]
    train = DataSet(x, y, onehot=False)

    x = dict_data['validation'].data[sub_index['validation'], :] + offset
    y = dict_data['validation'].labels[sub_index['validation'], :]
    validation = DataSet(x, y, onehot=False)

    x = dict_data['test'].data[sub_index['test'], :] + offset
    y = dict_data['test'].labels[sub_index['test'], :]
    test = DataSet(x, y, onehot=False)

    return {'train': train, 'validation': validation, 'test': test}
def __init__(self):
    """
    Constructor.
    @ In, None
    @ Out, None
    """
    DataSet.__init__(self)
    self.name = 'PointSet'
    self.type = 'PointSet'
    self.printTag = self.name
    self._neededForReload = []  # PointSet doesn't need anything to reload
def _readMoreXML(self, xmlNode):
    """
    Initializes data object based on XML input.
    @ In, xmlNode, xml.etree.ElementTree.Element or InputData.ParameterInput, input specification
    @ Out, None
    """
    DataSet._readMoreXML(self, xmlNode)
    # default to taking last point if no other spec was used
    # TODO throw a warning here, once we figure out how to give message handler in all cases
    if self._selectInput is None:
        self._selectInput = ('inputRow', -1)
class DataSetTester(unittest.TestCase):
    def setUp(self):
        self.dataSet = DataSet("Atvinnuleysi", "v28", "Year")
        self.timeline = TimeLine("Title1", "id1")

    def testInsertItem(self):
        self.dataSet.append(self.timeline)
        self.dataSet.append(self.timeline)
        self.assertEqual(len(self.dataSet), 2)
def test_data_to_records():
    from DataSet import DataSet
    d = DataSet(FILE='demo/sample.csv')
    r = None
    for row in d.iterate():
        r = row
        break
    for row in d.to_records():
        assert r[1] == row[1].split(';')
        break
def __init__(self):
    """
    Constructor.
    @ In, None
    @ Out, None
    """
    DataSet.__init__(self)
    self.name = 'HistorySet'
    self.type = 'HistorySet'
    self.printTag = self.name
    self._tempPivotParam = None
    # HistorySet doesn't need anything special to load, since it's written
    # in cluster-by-sample CSV format
    self._neededForReload = []
def __init__(self):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    self._set_args()
    self.args.dim = 6144
    self.args.disLayer = [6144, 6144]
    self.args.geneLayer = [6144, 6144]
    self.args.disLayer_s = [4096, 4096]
    self.args.geneLayer_s = [4096, 4096]
    self.args.maxEpochs = 100
    self.args.negNum = 50
    self.args.l2_weight = 1e-5
    self.data_set = DataSet()
    self.train()
def rules_generated_test(raw, filename):
    with open(filename, "w") as f:
        f.write("n;rules\n")
        dataset = DataSet(raw)
        for i in [3, 4, 5, 6, 7, 8, 9, 10]:
            dataset.discretize_values(i)
            results = []
            for j in range(3):  # number of trials
                values = dataset.cross_validation(5)
                for v in values:
                    bayes = ILA(v["train"])
                    results.append(len(bayes.rules))
            f.write(str(i) + ";" + str(mean(results)) + "\n")
def run_test():
    dictionary = DP.read_dict(dict_file)
    raw_test, choices = DP.read_test(test_file, choices_file)
    test = DataSet(raw_test, len(dictionary), cut=False)

    # RNN Parameters
    N_input = test.datalen
    N_class = len(dictionary)
    N_iter = N_epoch * N_input

    # Input
    x = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])
    y = tf.placeholder(dtype=tf.int32, shape=[batch_size, None])
    embeddings = tf.Variable(tf.random_uniform([N_class, N_hidden], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, x)
    y_reshape = tf.reshape(y, [-1])

    # Weights
    w = tf.Variable(tf.random_normal([N_hidden, N_class]))
    b = tf.Variable(tf.random_normal([N_class]))

    # RNN
    pred = RNN(embed, w, b)

    # accuracy
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.cast(y_reshape, tf.int64))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    init = tf.global_variables_initializer()
    ans = []
    # with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    with tf.Session() as sess:
        sess.run(init)
        saver = tf.train.Saver()
        saver.restore(sess, model_file)
        for i in range(N_input):
            batch_x, _ = test.next_batch(batch_size=1)
            spaceID = np.argwhere(batch_x[0] == SPACE)[0, 0]
            prob = sess.run(pred, feed_dict={x: batch_x})  # was {x: batch}, an undefined name
            best_choice = np.argmax(prob[spaceID - 1, choices[i]])
            ans.append(best_choice)
    return np.array(ans)
def init_testds():
    ds = DataSet("/home/consti/Work/data_neuro/cache/testds", "ds_test")
    raw_path = "/home/consti/Work/data_neuro/test_block/test-raw.h5"
    seg_path = "/home/consti/Work/data_neuro/test_block/test-seg.h5"
    prob_path = "/home/consti/Work/data_neuro/test_block/test-probs.h5"
    gt_path = "/home/consti/Work/data_neuro/test_block/test-gt.h5"
    ds.add_raw(raw_path, "data")
    ds.add_input(prob_path, "data")
    ds.add_seg(seg_path, "data")
    ds.add_gt(gt_path, "data")
    meta.add_dataset("ds_test", ds)  # was ds_test, an undefined name
def main():
    m = MtGox()
    d = DataSet()
    while True:
        m.updateDataSet(d)
        if m.getSell() > SELL_THRESHOLD:
            m.sellBTC(m.getBTC() * PERCENT_SELL, m.getSell())
        if m.getBuy() < BUY_THRESHOLD:
            m.buyBTC((m.getUSD() * PERCENT_BUY) / m.getBuy(), m.getBuy())
        m.updateDataSet(d)
        os.system("clear")
        m.printTitle()
        d.printData()
        m.printFunds()
        m.printOrders()
        m.printStatus()
        time.sleep(UPDATE_TIME)
def per_sentence_bionlp_fscores(test_filename, test_dir, gold_dir):
    import tempfile
    from DataSet import DataSet
    from cStringIO import StringIO
    test_dir = path(test_dir)
    gold_dir = path(gold_dir)
    sentences = DataSet.from_filenames(test_filename)
    for docid, sentences_in_doc in sentences.group_by_metadata('DOC'):
        # if docid != '9015187':
        # if docid != '9081693':
        # if docid != '9257843':
        # if docid != '8108127':
        # if docid != '9115366':
        # if docid != '9361029':
        #     continue
        print 'DOC:', docid
        our_total_proposed = 0
        our_total_matched = 0
        for sentence in sentences_in_doc:
            parse = sentence.gold_parse
            sentence.parses = [parse]
            our_score_components = parse.bionlp_fscore_components(sentence)
            matched, gold, proposed = our_score_components
            our_total_proposed += proposed
            our_total_matched += matched

            conll_version = StringIO()
            parse.write_conll(conll_version, include_metadata=False, sentence=sentence)
            conll_version.seek(0)
            conll_version = conll_version.read()

            import BioNLPConversionDB
            converter = BioNLPConversionDB.get_converter()
            bionlp_events_string = converter.convert(conll_version)
            if 1:
                print 'Events ---'
                print bionlp_events_string
                print 'Events ---'
            print "Ours:", our_score_components

            temp_test_dir = path(tempfile.mkdtemp(prefix=docid + '-'))
            temp_test_filename = path(temp_test_dir / docid + '.a2.t1')
            temp_test_file = file(temp_test_filename, 'w')
            temp_test_file.write(bionlp_events_string)
            temp_test_file.close()

            real_score_components = real_evaluation_bionlp_components(temp_test_dir, gold_dir)
            print 'Real:', real_score_components
            if our_score_components != real_score_components:
                real_evaluation_bionlp_components(temp_test_dir, gold_dir, show_output=True)
            temp_test_dir.rmtree()
            if our_score_components != real_score_components:
                raise Exception('mismatch')  # was raise 'mismatch'; string exceptions are invalid
def per_sentence_bionlp_fscores_nbest(test_filenames, gold_dir):
    import tempfile
    from DataSet import DataSet
    from cStringIO import StringIO
    gold_dir = path(gold_dir)
    sentences = DataSet.from_filenames(*test_filenames)
    for docid, sentences_in_doc in sentences.group_by_metadata('DOC'):
        # if docid != '9361029':
        #     continue
        print 'DOC:', docid
        our_total_proposed = 0
        our_total_matched = 0
        for i, sentence in enumerate(sentences_in_doc):
            for j, parse in enumerate(sentence):
                print "DOC:", docid, 'Sentence:', i, 'Parse:', j
                our_score_components = parse.bionlp_fscore_components(sentence)
                matched, gold, proposed = our_score_components
                our_total_proposed += proposed
                our_total_matched += matched

                conll_version = StringIO()
                parse.write_conll(conll_version, include_metadata=False, sentence=sentence)
                conll_version.seek(0)
                conll_version = conll_version.read()

                import BioNLPConversionDB
                converter = BioNLPConversionDB.get_converter()
                bionlp_events_string = converter.convert(conll_version)
                if 0:
                    print 'Events ---'
                    print bionlp_events_string
                    print 'Events ---'

                temp_test_dir = path(tempfile.mkdtemp(prefix=docid + '-'))
                temp_test_filename = path(temp_test_dir / docid + '.a2.t1')
                temp_test_file = file(temp_test_filename, 'w')
                temp_test_file.write(bionlp_events_string)
                temp_test_file.close()

                real_score_components = real_evaluation_bionlp_components(temp_test_dir, gold_dir)
                if our_score_components != real_score_components:
                    real_evaluation_bionlp_components(temp_test_dir, gold_dir, show_output=True)
                temp_test_dir.rmtree()
                if our_score_components != real_score_components:
                    print "Ours:", our_score_components
                    print 'Real:', real_score_components
                    print 'Events ---'
                    print bionlp_events_string
                    print 'Events ---'
                    raise Exception('mismatch')  # was raise 'mismatch'; string exceptions are invalid
def setOptions(self, arguments): for num in range(0, len(arguments[0])): if arguments[0][num] == "-t": newDataSet = DataSet() newDataSet.load(arguments[0][num+1]) self.train(newDataSet)
class knn(Classifier):
    def __init__(self, *args):
        super(knn, self).__init__(*args)
        self.k = 3
        self.instances = DataSet()
        self.setOptions(args)

    def train(self, inDataSet):
        self.instances = inDataSet

    def classify(self, input):
        neighbors = []
        if type(input) == Example:
            classIndex = self.instances.getAttributes().getClassIndex()
            domain = self.instances.getAttributes().getAttributesList()[classIndex].domain
            for item in self.instances.getExamples().getExamplesList():
                if len(neighbors) < self.k:
                    tempNeighbor = neighbor()
                    tempNeighbor.setNeighbor(domain[item.values[classIndex]],
                                             self.distance(input, item))
                    neighbors.append(tempNeighbor)
                else:
                    # replace the current farthest neighbor if this example is
                    # closer (the original scan compared neighbor objects to
                    # floats and tracked the maximum incorrectly)
                    highestIndex = max(range(len(neighbors)),
                                       key=lambda n: neighbors[n].distance)
                    if self.distance(input, item) < neighbors[highestIndex].distance:
                        newNeighbor = neighbor()
                        newNeighbor.setNeighbor(domain[item.values[classIndex]],
                                                self.distance(input, item))
                        neighbors[highestIndex] = newNeighbor
            return self.vote(neighbors)
        elif type(input) == DataSet:
            # evaluate accuracy over the passed-in set (the original iterated
            # self.instances, measuring training accuracy regardless of input)
            examples = input.getExamples().getExamplesList()
            classIndex = input.getAttributes().getClassIndex()
            rightCount = 0
            for example in examples:
                if self.classify(example) == input.getAttributes().getClassAttribute().domain[example.values[classIndex]]:
                    rightCount += 1
            performance = Performance()
            performance.setPerf(rightCount, len(examples))
            return performance

    def setOptions(self, arguments):
        for num in range(0, len(arguments[0])):
            if arguments[0][num] == "-k":
                self.k = int(arguments[0][num + 1])
            elif arguments[0][num] == "-t":
                newDataSet = DataSet()
                newDataSet.load(arguments[0][num + 1])
                self.instances = newDataSet

    def distance(self, observation, example):
        # Hamming distance over all non-class attribute values
        total = 0
        for num in range(0, len(observation.attributes.getAttributesList()) - 1):
            if observation.values[num] != example.values[num]:
                total += 1
        return total

    def vote(self, neighbors):
        voteDict = {}
        for items in neighbors:
            if items.classifier in voteDict.keys():
                voteDict[items.classifier] += 1
            else:
                voteDict[items.classifier] = 1
        return max(voteDict, key=voteDict.get)
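# Hedged usage sketch for the knn classifier above. setOptions indexes
# arguments[0], so the flags must arrive as one sequence; the file name and
# k value are illustrative assumptions, and someExample/testSet stand in
# for an Example and a DataSet built elsewhere.
classifier = knn(["-k", "5", "-t", "train.data"])
label = classifier.classify(someExample)  # single Example -> majority label
perf = classifier.classify(testSet)       # DataSet -> Performance (accuracy)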
def evaluate(self, Classifier, *args):
    proportion = 0
    average = 0.0
    accuracies = []
    performance = Performance()
    trainingSet = DataSet()
    for num in range(0, len(args[0])):
        if args[0][num] == "-t":
            trainingSet.load(args[0][num + 1])
        if args[0][num] == "-T":
            testSet = DataSet()
            testSet.load(args[0][num + 1])
        if args[0][num] == "-p":
            proportion = float(args[0][num + 1])
            for items in range(0, int(proportion * len(trainingSet.getExamples().getExamplesList()))):
                trainingSet.getExamples().add(trainingSet.getExamples().getExamplesList()[items])
            trainingSet.setAttributes(trainingSet.getAttributes())
            if type(Classifier) == ID3:
                Classifier.train(trainingSet)
                performance = Classifier.classify(testSet)
                return str(performance)
            else:
                print "Error in Evaluator:evaluate"
                performance = Classifier.classify(testSet)
                return str(performance)
    for num in range(0, self.folds):
        testSet = DataSet()
        trainSet = DataSet()
        for items in trainingSet.getExamples().getExamplesList():
            randomNum = random.randint(0, self.folds - 1)
            # hold out roughly 1/folds of the examples for testing; the
            # original put the larger share in the test set and appended
            # examples back onto trainingSet while iterating over it
            if randomNum == num:
                testSet.getExamples().add(items)
            else:
                trainSet.getExamples().add(items)
        testSet.setAttributes(trainingSet.getAttributes())
        trainSet.setAttributes(trainingSet.getAttributes())
        if (len(trainingSet.attributes.attributes) > 0):
            trainSet = trainingSet
        Classifier.train(trainSet)
        tempPerformance = Classifier.classify(testSet)
        accuracies.append(tempPerformance.accuracy)
        average += tempPerformance.accuracy
        performance += tempPerformance
    return str(performance) + " +- " + str(self.stdDev(accuracies, average))
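# Hedged usage sketch for evaluate above; the owning evaluator class and
# the data file name are illustrative assumptions.
ev = Evaluator()  # assumed owner providing self.folds and self.stdDev
print ev.evaluate(knn(["-k", "3"]), ["-t", "train.data"])  # runs k-fold cross-validation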
    @return integer indicating the class of the unknown data.
    """
    types = {"levenshtein": Distance().levenshtein, "l": Distance().levenshtein, 0: Distance().levenshtein,
             "hamming":     Distance().hamming,     "h": Distance().hamming,     1: Distance().hamming,
             "euclidean":   Distance().euclidean,   "e": Distance().euclidean,   2: Distance().euclidean,
             "manhattan":   Distance().manhattan,   "m": Distance().manhattan,   3: Distance().manhattan,
             "chebyshev":   Distance().chebyshev,   "c": Distance().chebyshev,   4: Distance().chebyshev}

    results = [types[distanceType](x.getValue(), data) for x in self.trainset]
    results = [(i, x) for i, x in enumerate(results)]
    kernels = sorted(results, key=lambda x: x[1])[:3]
    kernels = [self.trainset[i].getLabel() for i, x in kernels]
    kernels = [(n, kernels.count(n)) for n in set(kernels)]
    return sorted(kernels, key=lambda x: x[1], reverse=True)[0][0]


if __name__ == "__main__":
    from DataSet import DataSet

    ds = DataSet("C:\\Users\\a5rjqzz\\Desktop\\Python\\pyClassifiers\\data\\IBk\\sample_set_lang.gla")
    bk = IBk()
    bk.train(ds.getExamples())
    kn = ds.convert("y n n")
    cl = bk.classify(kn, 3)

    print cl
    print ds.getAttributes(1)[-1].getLabel(cl)
    tp = [(p, t) for p, t in accuracyResults if p == 1 and t == 1]
    tn = [(p, t) for p, t in accuracyResults if p == 0 and t == 0]
    # fp/fn were missing from this fragment; reconstructed by analogy,
    # assuming (predicted, actual) pairs
    fp = [(p, t) for p, t in accuracyResults if p == 1 and t == 0]
    fn = [(p, t) for p, t in accuracyResults if p == 0 and t == 1]
    precision = float(len(tp)) / (len(tp) + len(fp))
    recall = float(len(tp)) / (len(tp) + len(fn))
    # comprehension variables renamed: in Python 2 they leak and the
    # original "a" would have shadowed the weighting parameter used below
    return 1. / ((a * (1 / precision)) + ((1 - a) * 1 / recall))


if __name__ == "__main__":
    from DataSet import DataSet
    from NaiveBayes import NaiveBayes
    from IBk import IBk

    fileIn = "C:\\Users\\a5rjqzz\\Desktop\\Python\\pyClassifiers\\data\\IBk\\sample_set_life.gla"

    ds = DataSet(fileIn)
    nb = NaiveBayes()
    es = Estimator()
    ib = IBk()

    for i in xrange(30):
        train, test = ds.getTrainTestSet()
        crossValida = ds.getCrossValidationSet(2)
        #nb.train(ds)
        #results = nb.test(test)
        #print es.accuracy(results)
        #ib.train(train)
        #results = ib.test(test)
f = dataPath + "IBk\\sample_set_cars.gla"
f = dataPath + "IBk\\sample_set_tennis.gla"
#f = dataPath + "IBk\\sample_set_numbers.gla"
#f = dataPath + "IBk\\sample_set_fish.gla"
#f = dataPath + "IBk\\sample_set_life.gla"
#f = dataPath + "IBk\\sample_set_word.gla"
#f = dataPath + "DataSet_Client Document Preparation for Engine Tuning.gla"
f = dataPath + "HospitalDocuments.gla"
f = dataPath + "DataSets\\20160126_1501_ClientSiteData.gla"
f = dataPath + "DataSets\\20160129_1322_ClientSiteData.gla"
f = dataPath + "DataSets\\20160129_1358_ClientSiteData.gla"
f = dataPath + "DataSets\\20160201_1530_ClientSiteData.gla"  # last assignment wins

ds = DataSet(f)
dt = DecisionTree()
es = Estimator()
pr = Prune()

a = ds.getAttributes()
b, c, d = ds.getTrainValidateTestSet(.7)
#b, d, c = ds.getTrainValidateTestSet(.7)
#b, d = ds.getTrainTestSet()
#print len(b), len(c), len(d)

dt.train(b, a, 4, 3)
output = dt.test(d)

print "Single DT on c: {0}%".format(round(es.accuracy(output) * 100, 2))
print "train\t\t", len(b), b.getAllLabels()
    append = results.append
    for example in examples:
        append((self.classify(example.getData()), example.getLabel()))  # , example.getLabel(), example.getValues()
    print results
    return results


if __name__ == "__main__":
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D
    import random
    import os

    print os.getcwd()

    ds = DataSet("..//..//data//ml//test_weather.gla")
    p = Perceptron(dataset=ds, epochs=10)
    print "Perceptron test:", p.classify([0, 0, 1, 1])
    p.test(ds.getExamples())

    attribute1 = [n for n in xrange(10)]
    attribute2a = [random.sample(range(50)[:35], 1)[0] for n in xrange(5)]
    attribute2b = [random.sample(range(50)[20:], 1)[0] for n in xrange(5)]

    class0examples = [[attribute1[n], attribute2a[n], 0] for n in xrange(5)]
    class1examples = [[attribute1[n], attribute2b[n], 1] for n in xrange(5)]

    for exs in class1examples + class0examples:
        pass  # print exs  (loop body was truncated in the fragment)
def generateMapFile(self):
    """
    Creates the MapFile object that encodes a map file publishing the
    complex outputs and writes it to disk.

    :returns: string with the path to the map file generated. None if no
        map file was generated (no complex outputs present).
    """
    if (self.outputs is None) or (len(self.outputs) != len(self.execution.processOutputs)):
        self.logger.error(self.ERR_08)
        raise Exception(self.ERR_08)

    #self.map = UMN.MapFile(self.processId)
    self.map = MapFile(self.processId)
    self.map.shapePath = self.pathFilesGML
    self.map.epsgCode = self.epsg
    self.map.mapTemplate = self.mapTemplate
    self.map.imagePath = self.imagePath
    self.map.imageURL = self.imageURL
    self.map.mapServerURL = self.mapServerURL
    self.map.mapFilesPath = self.mapFilesPath
    self.map.otherProjs = self.otherProjs

    self.map.meta_fees = self.meta_fees
    self.map.meta_accessconstraints = self.meta_accessconstraints
    self.map.meta_keywordlist = self.meta_keywordlist
    self.map.meta_addresstype = self.meta_addresstype
    self.map.meta_address = self.meta_address
    self.map.meta_city = self.meta_city
    self.map.meta_stateorprovince = self.meta_stateorprovince
    self.map.meta_postcode = self.meta_postcode
    self.map.meta_country = self.meta_country
    self.map.meta_contactelectronicmailaddress = self.meta_contactelectronicmailaddress
    self.map.meta_contactperson = self.meta_contactperson
    self.map.meta_contactorganization = self.meta_contactorganization
    self.map.meta_contactposition = self.meta_contactposition
    self.map.meta_role = self.meta_role
    self.map.meta_contactvoicetelephone = self.meta_contactvoicetelephone
    self.map.meta_contactfacsimiletelephone = self.meta_contactfacsimiletelephone
    self.map.meta_contactinstructions = self.meta_contactinstructions
    self.map.meta_hoursofservice = self.meta_hoursofservice

    for output in self.execution.processOutputs:
        output.writeToDisk(self.pathFilesGML)
        providedTitle = self.outputs[output.identifier]
        dataSet = DataSet(output.filePath, providedTitle, output.identifier)
        self.dataSets.append(dataSet)

        layerEPSG = dataSet.getEPSG()
        if layerEPSG is None:
            layerEPSG = self.map.epsgCode

        if dataSet.dataType == dataSet.TYPE_VECTOR:
            #style = UMN.MapStyle()
            style = MapStyle()
            #layer = UMN.VectorLayer(
            layer = VectorLayer(
                output.filePath,
                dataSet.getBBox(),
                layerEPSG,
                output.identifier,
                providedTitle)
            type = str(dataSet.getGeometryType())
            if type != None:  # was the deprecated Python 2 "<>" operator
                layer.layerType = type
            else:
                layer.layerType = "Polygon"
            self.logger.debug("The layer type: " + str(dataSet.getGeometryType()))
            layer.addStyle(style)
            self.map.addLayer(layer)
            self.logger.debug("Generated layer " + layer.name + " of type " + layer.layerType + ".")
        elif dataSet.dataType == dataSet.TYPE_RASTER:
            #layer = UMN.RasterLayer(
            layer = RasterLayer(
                output.filePath,
                dataSet.getBBox(),
                layerEPSG,
                output.identifier,
                providedTitle)
            layer.setBounds(dataSet.getMaxValue(), dataSet.getMinValue())
            self.map.addLayer(layer)
            self.logger.debug("Generated layer " + layer.name + " of type raster.")
        else:
            self.logger.warning(self.WARN_02 + output.identifier + self.WARN_03)

        self.logger.debug("Guessed mime type for this layer: " + str(dataSet.getMimeType()))
        print "The pixel res: " + str(dataSet.getPixelRes())

    if len(self.map.layers) > 0:
        try:
            self.map.writeToDisk()
        except Exception, e:
            self.logger.error(self.ERR_07 + str(e))
            raise Exception(self.ERR_07 + str(e))
        self.logger.info(self.SUCC_02 + self.map.filePath())
        return self.map.filePath()
def __init__(self, mainDataFile, mediaTypeDataFile):
    self.mainDataFile = mainDataFile
    self.mediaTypeDataFile = mediaTypeDataFile
    DataSet.__init__(self)
def translate_email_data(self, email_data: EmailData):
    rv = DataSet()
    for email in email_data.emails:
        rv.add_data(self.translate_email(email))
    return rv
    arg = sys.argv[i]
    # all args should have an equal sign
    equal_index = arg.find('=')
    if equal_index != -1:
        first = arg[0:equal_index]
        second = arg[equal_index + 1:len(arg)]
        if first == '--ratio':
            trainingRatio = float(second)
        elif first == '--method':
            method = second
        elif first == '--n':
            N = int(second)
        elif first == '--iterations':
            iterations = int(second)
        else:
            dataset = DataSet(first, second)
            datasets.append(dataset)
            print("Finished adding composer: %s, with data: %s" % (dataset.composerName, dataset.dataPath))
    else:
        print "Couldn't parse arg", arg

if method == "ngram":
    print "Using Ngram method with N =", N
    numCorrectMozart = 0
    numPredictedMozart = 0
    numCorrectHaydn = 0
    numPredictedHaydn = 0
    for j in range(0, iterations):
        training, testing, composers = zip(*[dataset.getTrainingAndTestingSets(trainingRatio, 20) for dataset in datasets])