def main():
    (ppiF, goAnnotation, outF) = sys.argv[1:]
    ppis = utils.readData(ppiF, np.array([0, 1]), np.array([0]))
    prot2GO = utils.readData(goAnnotation, np.array([1]), np.array([0]))
    out = getGoldStandard(ppis, prot2GO)
    outFH = open(outF, "w")
    outFH.write("IDA\tIDB\tLabel\n" + "\n".join(out))
    outFH.close()
def runExperiments(features, es, logFile):
    # Reading the data into an array
    data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)

    # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE)
    data = m.conceptPreprocessing(data, es.removeDeniedConcepts, es.splitDeniedConcepts,
                                  es.removeUncertainConcepts, es.splitUncertainConcepts,
                                  es.removeFamilyConcepts, es.splitFamilyConcepts)

    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED, cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts,
                                                es.removeUncertainConcepts, es.splitUncertainConcepts,
                                                es.removeFamilyConcepts, es.splitFamilyConcepts)

    # Looping over different feature parameters
    for featTypes in features:
        # for x in [True, False]:
        #     es.fs_confidence = x
        logFile.write('Executing for ' + ','.join(featTypes) + ' model.\n')
        es.featTypes = featTypes

        if es.svmParamSweep:
            result_params = m.param_sweep_svm(data, es, gammaSweep=False, nFolds=10,
                                              verbose=False, random_seed=44)
            for name in result_params:
                logFile.write(str(name) + ": " + str(result_params[name]) + '\n')
        else:
            estimator = m.getEstimator(es)
            if es.bootstrap:
                results = m.eval_bootstrapped_crossVal(estimator, data, bootstrap_data, es, 10, printTree=False)
            else:
                results = m.evalCrossval(estimator, data, es, 10, printTree=False)
            for name in results:
                logFile.write(str(name) + ": " + str(results[name]) + '\n')
def __init__(self, name):
    self.con = lite.connect(name)
    with self.con:
        cur = self.con.cursor()
        print "Database", name, "loaded"
        sql = utils.readData(PATH + "/sqlScripts/player_save.sql")
        cur.executescript(sql)
def OnPlayerSelect(self, e):
    index = e.GetSelection()
    if index == wx.NOT_FOUND:
        return
    player = self.GetSelectedPlayer()
    if player is None:
        print "PlayerPanel -> OnPlayerSelect: player == None"
    else:
        stats = player.GetStats()
        value = stats_string.format(stats['AC'], stats['Listen'], stats['Spot'],
                                    stats['Search'], stats['Move_Silently'], stats['Hide'])
        self.plstats_text.SetValue(value)
        items = player.GetItems()
        self.UpdateItemLbox(items)
        bstory_filename = path + '/../backstories/' + player.GetName() + '.txt'
        if os.path.isfile(bstory_filename):
            bstory_text = utils.readData(bstory_filename)
            self.bstory_tctrl.SetValue(bstory_text)
        else:
            self.bstory_tctrl.Clear()
def runMain():
    print('Running ...')
    try:
        data = readData(INPUT_FILE_NAME)
    except Exception as e:
        print('ERROR 1: File I/O Error While Reading Input Data! App terminating ...')
        return
    try:
        dataOutputList = processData(HEADERS_LIST, data)
    except Exception as f:
        print('ERROR 2: Error Processing Data! App terminating ...')
        return
    try:
        writeOutputToFile(OUTPUT_FILE_NAME, HEADERS_LIST, dataOutputList)
        writeData(HEADERS_LIST, 1)
        for element in dataOutputList:
            writeData(element, 0)
    except Exception as g:
        print('ERROR 3: File I/O Error While Writing Output Data! App terminating ...')
        print(g)
        return
def main():
    (rawDataF, mappingF, outF) = sys.argv[1:]
    mapData = utils.readData(mappingF, np.array([0]), np.array([1]))
    mapping = {}
    mappingFH = open(mappingF)
    mappingFH.readline()
    for line in mappingFH:
        line = line.rstrip()
        if len(line.split("\t")) != 2:
            continue
        # The map is built in both directions below, so the column order is irrelevant;
        # the original redundant second unpacking of the same line has been dropped.
        (geneName, wormID) = line.split("\t")
        if geneName not in mapping:
            mapping[geneName] = set([])
        if wormID not in mapping:
            mapping[wormID] = set([])
        mapping[geneName].add(wormID)
        mapping[wormID].add(geneName)
    mappingFH.close()
    outFH = open(outF, "w")
    dataFH = open(rawDataF)
    outFH.write(dataFH.readline())
    for line in dataFH:
        line = line.rstrip()
        lineSplit = line.split("\t")
        idA = lineSplit[0]
        mapA = mapID(idA, mapping)
        # mapB = mapID(idB, mapping)
        if mapA == "":
            continue
        print >> outFH, "%s\t%s" % (mapA, "\t".join(lineSplit[1:]))
    dataFH.close()
    outFH.close()
def train_test_data_read(train_label_file, train_text_file, test_label_file,
                         test_text_file, word_ebd_file, all_text_file):
    trainCorpus, testCorpus = utils.readData(train_label_file, train_text_file,
                                             test_label_file, test_text_file,
                                             word_ebd_file, all_text_file)
    train_x, train_y = corpus_read(trainCorpus)
    test_x, test_y = corpus_read(testCorpus)
    return train_x, train_y, test_x, test_y
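# A minimal sketch of what corpus_read could look like, assuming each corpus entry
# exposes an `emb` vector and a `label` field as in the LR() snippet further down;
# the real corpus_read is not shown here, so treat this as an illustration only.
import numpy as np

def corpus_read_sketch(corpus):
    # Stack embeddings into a feature matrix and cast labels to ints
    x = np.array([item.emb for item in corpus])
    y = np.array([int(item.label) for item in corpus])
    return x, y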
def main():
    (proteinComplexes, goAnnotation, outF) = sys.argv[1:]
    complexes2prot = utils.readData(proteinComplexes, np.array([1]), np.array([0]))
    prot2GO = utils.readData(goAnnotation, np.array([1]), np.array([0]))
    complexes2prot = filterComplexes(complexes2prot, 2, 40)
    print len(complexes2prot)
    complexes2prot = mergeComplexes(complexes2prot)
    print len(complexes2prot)
    prot2complexes = getProtsToComplex(complexes2prot)
    out = getGoldStandard(prot2complexes, prot2GO)
    outFH = open(outF, "w")
    outFH.write("IDA\tIDB\tLabel\n" + "\n".join(out))
    outFH.close()
def evaluateForFeats(feats):
    log = ''
    for run in feats:
        run = ','.join(run)
        data = utils.readData(cfg.PATH_OUTPUT + run + '/', cfg.PATH_PREPROCESSED_TEST)
        gs = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)
        log += str([x.key for x in data]) + '\n'
        log += str([x.key for x in gs]) + '\n'
        labels = [x.severity for x in data]
        labels_gs = [x.severity for x in gs]
        log += str(labels) + '\n'
        log += str(labels_gs) + '\n'
        log += "Scores for " + run + ": \n"
        log += m.getScores(labels_gs, labels)
        log += '\n\n'
    return log
def increaseDataSize(bacteria_name, fasta_file, p_bed_path, n_bed_path, genome_file_path, output_folder, both_sides_padding_size=20): print("INCREASING BED SEQ SIZES: ", bacteria_name, "\n", 10 * "__") if not os.path.exists(output_folder): print("CREATING: ", output_folder) os.mkdir(output_folder) output_folder = "{}/{}".format(output_folder, bacteria_name) if not os.path.exists(output_folder): os.mkdir(output_folder) input_p_bed_df = BedTool(p_bed_path).to_dataframe() input_n_bed_df = BedTool(n_bed_path).to_dataframe() print("INPUT - P {} N {}".format(input_p_bed_df.shape, input_n_bed_df.shape)) print("BED SLOP POSITIVE") os.system("bedtools slop -b {} -i {} -g {} > {}/positive.bed".format( both_sides_padding_size, p_bed_path, genome_file_path, output_folder)) print("BED SLOP NEGATIVES") os.system("bedtools slop -b {} -i {} -g {} > {}/negative.bed".format( both_sides_padding_size, n_bed_path, genome_file_path, output_folder)) print("GET FASTA POSITIVE") os.system( "bedtools getfasta -fi {} -bed {}/positive.bed -s -fo {}/positive.fasta" .format(fasta_file, output_folder, output_folder)) print("GET FASTA NEGATIVES") os.system( "bedtools getfasta -fi {} -bed {}/negative.bed -s -fo {}/negative.fasta" .format(fasta_file, output_folder, output_folder)) p_df, n_df = readData( "{}/positive.fasta".format(output_folder), "{}/negative.fasta".format(output_folder), ) p_bed_df = BedTool("{}/positive.bed".format(output_folder)).to_dataframe() n_bed_df = BedTool("{}/negative.bed".format(output_folder)).to_dataframe() print("P: ", p_df.shape, " N: ", n_df.shape, "P: ", p_bed_df.shape, " N: ", n_bed_df.shape) p_bed_df["sequence"] = p_df.values n_bed_df["sequence"] = n_df.values p_bed_df["label"] = [1] * len(p_df) n_bed_df["label"] = [0] * len(n_df) dataset_df = pd.concat([p_bed_df, n_bed_df]) print("SAVING DATASET: P {} + N {} = {}".format(p_bed_df.shape, n_bed_df.shape, dataset_df.shape)) p_bed_df.to_csv("{}/positive.csv".format(output_folder)) n_bed_df.to_csv("{}/negative.csv".format(output_folder)) dataset_df.to_csv("{}/dataset.csv".format(output_folder)) print(dataset_df.head())
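# Hypothetical usage sketch for increaseDataSize() above: the bacteria name, file paths
# and padding size below are placeholders, not values taken from the original project.
increaseDataSize(
    bacteria_name="ecoli",
    fasta_file="data/ecoli/genome.fasta",
    p_bed_path="data/ecoli/positive.bed",
    n_bed_path="data/ecoli/negative.bed",
    genome_file_path="data/ecoli/ecoli.genome",
    output_folder="data/padded",
    both_sides_padding_size=20,
)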
def LR(train_label_file, train_text_file, test_label_file, test_text_file, word_ebd_file, all_text_file): # sent=utils.Sent() train_set, test_set = utils.readData(train_label_file, train_text_file, test_label_file, test_text_file, word_ebd_file, all_text_file) print("len of train data: ", len(train_set)) len_train = len(train_set) print("len of test data: ", len(test_set)) len_test = len(test_set) # print("label of train data=",train_set[0].label) # print("embedding of train data=",train_set[0].emb) # Training dataset train_x = [] train_y = [] for i in range(0, len_train): x = train_set[i].emb train_x.append(x) y = train_set[i].label #if you use 20ng remember to us y=np.int(y) y = np.int(y) train_y.append(y) train_x = np.array(train_x) train_y = np.array(train_y) # Test dataset test_x = [] test_y = [] for i in range(0, len_test): x = test_set[i].emb test_x.append(x) y = test_set[i].label y = np.int(y) test_y.append(y) test_x = np.array(test_x) test_y = np.array(test_y) # Logistic Regression if (len(test_set) < 7000): logreg = LogisticRegression() logreg.fit(train_x, train_y) y_pred = logreg.predict(test_x) else: logreg = LogisticRegression(multi_class="multinomial", solver='newton-cg') logreg.fit(train_x, train_y) y_pred = logreg.predict(test_x) # with open('y_pred.pkl', 'wb') as f: # pickle.dump(y_pred, f) # with open('test_y.pkl', 'wb') as f: # pickle.dump(test_y, f) return y_pred, test_y
def main(useAnnotatorWeighing=True):
    '''
    This script evaluates the system performance for the different runs on test data,
    created with testSetup.py
    '''
    # runs = ['DSM+1,DIST_HIER','CONCEPTSwithoutContext','CONCEPTS+CONTEXT', 'BOW', 'DSM+2','CATEGORICAL_QUESTIONSET,QUESTIONSET,LONG_QUESTIONSET','DSM','SNOMED+1','DSM+1']
    runs = ['DSM+1']

    for run in runs:
        data = utils.readData(cfg.PATH_OUTPUT + run + '/', cfg.PATH_PREPROCESSED_TEST)
        gs = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)
        print([x.key for x in data])
        print([x.key for x in gs])
        labels = [x.severity for x in data]
        labels_gs = [x.severity for x in gs]
        print(labels)
        print(labels_gs)
        print("Scores for", run, ": ")
        m.getScores(labels_gs, labels)
def __init__(self, filePath, actor, joints):
    """
    filePath - path to file of FoB data
    joints - collection of joints, ordered by heaviness
    """
    self.filePath = filePath
    self.actor = actor
    self.joints = joints
    self.controlJoints = [self.actor.controlJoint(None, "modelRoot", joint)
                          for joint in self.joints]
    self.exposedJoints = [self.actor.exposeJoint(None, "modelRoot", joint)
                          for joint in self.joints]
    self.data = utils.readData(filePath, config.num_sensors)
    self.dataStream = izip(*self.data)
    self.prevFobSegments = segmentsFromJoints(self.exposedJoints[:-1], self.exposedJoints[1:])
    self.prevJointSegments = segmentsFromJoints(self.exposedJoints[:-1], self.exposedJoints[1:])
    self.stopped = False
    self.paused = False
def main(): # src/AddAnotationtoPPI_data.py data/WORM/final_PPI_ratio5.ids.txt data/WORM/worm_gene_uniprot.map.txt data/WORM/final_PPI_ratio5.gprofiler_out.map.txt data/WORM/Human_elegans_uniprot_names.txt data/WORM/uniprotGeneNameMap.txt data/WORM/human.var.txt data/WORM/gene.omim.combined data/WORM/allComplexes.tab test/test.out (protIDsF, worm_gene2UniprotF, goMapF, othologMapF, human_gene2UniprotF, snpF, disF, complexF, outF) = sys.argv[1:] protIDs = utils.readData(protIDsF, np.array([0]), np.array([0])) worm_gene2uniprot = utils.readData(worm_gene2UniprotF, np.array([0]), np.array([1])) goData = utils.readData(goMapF, np.array([0]), np.array([1,2])) orthmap = utils.readData(othologMapF, np.array([1]), np.array([0])) mapData = utils.readData(human_gene2UniprotF, np.array([1]), np.array([0])) snpData = utils.readData(snpF, np.array([0]), np.array([1,2,3]), mapData) disData = utils.readData(disF, np.array([0]), np.array([1]), primKeyMap= mapData) complexData = utils.readData(complexF, np.array([0]), np.array([2])) header = "GeneName\tUniprotIDs\tHumanOrthologIDs\tEnriched_GO_terms\tSNP\tDisease\tComplexe" cats = header.split("\t") counts = {"wAnno" : set([])} for cat in cats: counts[cat] = set([]) outFH = open(outF, "w") print >> outFH, header for protID in protIDs: line = protID[0] line += "\t" + annoToString(getAnnotation(protID, worm_gene2uniprot)) line += "\t" + annoToString(getAnnotation(protID, orthmap, [worm_gene2uniprot])) line += "\t" + annoToString(getAnnotation(protID, goData)) line += "\t" + annoToString(getAnnotation(protID, snpData, [worm_gene2uniprot, orthmap])) line += "\t" + annoToString(getAnnotation(protID, disData, [worm_gene2uniprot, orthmap])) line += "\t" + annoToString(getAnnotation(protID, complexData, [worm_gene2uniprot, orthmap])) lineSplit = line.split("\t") for i in range(len(lineSplit)): col = lineSplit[i] if col != "-": counts[cats[i]].add(protID[0]) for i in range(3,len(lineSplit)): col = lineSplit[i] if col != "-": counts["wAnno"].add(protID[0]) print >> outFH, line for cat in counts: print "%s\t%i" % (cat, len(counts[cat]))
def reSaveTextData(in_data, out_data): dataset, batch_num = utils.readData(in_data, batch_size=7002) # read all data sample = dataset[0] text_dict = utils.sampletxt2data(sample) text = list(text_dict.values()) img_name = list(text_dict.keys()) # label_name_dict = {'overall_sentiment_int':4} y_dict, y = utils.sampley2data(sample) df = pd.DataFrame(list( zip(img_name, text, list(y[0]), list(y[1]), list(y[2]), list(y[3]), list(y[4]))), columns=[ 'img_name', 'text', 'Humour', 'Sarcasm', 'offensive', 'Motivational', 'Overall_Sentiment' ]) df.to_csv(out_data, sep=',', encoding='utf-8', index=False)
item3 = c * x**2 / (1. + (x / Is3)**3) y = A * np.exp(-(item1 + item2 + item3) * L) return y def otherOptimization(func, theta0, datas, labels): result = scipy.optimize.minimize(func, theta0, (datas, labels), method='Powell', options={'maxiter': 100}) #result = scipy.optimize.minimize(func, theta0, (datas, labels), method='Nelder-Mead') return result datas, labels = utils.readData('datas.xlsx') datas = np.array(datas) labels = np.array(labels) #theta0 = (.2, 200., 3., 1., .01, 20., 500.0) theta0 = (.2, 100000., 100., 1., 0.01, 20., 500.0 ) # Try so many initializations, total bad result = otherOptimization(objectFunction, theta0, datas, labels) print(result) minVal = np.min(datas) maxVal = np.max(datas) theta = [a for a in result.x]
def main(): ( clusterF, worm_gene2UniprotF, othologMapF, disF, human_gene2UniprotF, oboF, omim_doid_F, omim2NameF, gene2DOIDF, outF, ) = sys.argv[1:] obo = readOntology(oboF, False) omim_doid = utils.readData(omim_doid_F, np.array([2]), np.array([0])) omim_name = utils.readData(omim2NameF, np.array([0]), np.array([1])) orthmap = utils.readData(othologMapF, np.array([1]), np.array([0])) worm_gene2uniprot = utils.readData(worm_gene2UniprotF, np.array([0]), np.array([1])) doid2Name = utils.readData(oboF, np.array([0]), np.array([1])) mapData = utils.readData(human_gene2UniprotF, np.array([1]), np.array([0])) disData = utils.readData(disF, np.array([0]), np.array([1]), primKeyMap=mapData) stats = importr("stats") doid2Gene = {} gene2DOIDFH = open(gene2DOIDF) for line in gene2DOIDFH: line = line.rstrip() (gene, thisDOID) = line.split("\t") if not thisDOID.startswith("DOID"): continue allParents = set([]) # allParents = getAll(thisDOID, obo) allParents.add(thisDOID) for doid in allParents: if doid not in doid2Gene: doid2Gene[doid] = set([]) doid2Gene[doid].add(gene) gene2DOIDFH.close() allGenes = 20313 allProts = 1713 # TOUPDATE when new complexes are used outFH = open(outF, "w") clusterFH = open(clusterF) for line in clusterFH: line = line.rstrip("\n") cluster_genes = line.split("\t")[0].split(",") cluster_prots = set([]) cluster_omim = set([]) doid_cluster_counts = {} for gene in cluster_genes: cluster_prots.update(getAnnotation((gene,), orthmap, [worm_gene2uniprot])) cluster_omim.update(getAnnotation((gene,), disData, [worm_gene2uniprot, orthmap])) doidIDs = mapIDs(cluster_omim, omim_doid) if len(doidIDs) > 0: for doidID in doidIDs: parentDOID = set([]) # parentDOID = getAll(doidID[0], obo) parentDOID.add(doidID[0]) for allIDs in parentDOID: if allIDs not in doid_cluster_counts: doid_cluster_counts[allIDs] = set([]) doid_cluster_counts[allIDs].add(gene) pvals = [] ids = [] for doidID in doid_cluster_counts: if doidID == "DOID:0000004" or doidID == "---": continue if len(doid_cluster_counts[doidID]) == 1: continue mat = [[allProts, allGenes], [len(doid_cluster_counts[doidID]), len(doid2Gene[doidID])]] pval = fisher_exact(mat)[1] pvals.append(pval) ids.append(doidID) pvals = stats.p_adjust(FloatVector(pvals), method="BH") enrichedDOIDs = set([]) for i in range(len(pvals)): if pvals[i] <= 0.05: enrichedDOIDs.add( "%s,%s,%i,%.4f" % (ids[i], doid2Name[(ids[i],)][0], len(doid_cluster_counts[ids[i]]), pvals[i]) ) tmp = set() for mim in cluster_omim: if mim in omim_name: tmp.add((mim[0], omim_name[mim][0][0])) doidIDs = mapIDs(cluster_omim, omim_doid) cluster_omim = tmp print >> outFH, "%s\t%s\t%s\t%s" % ( line, annoToString(cluster_prots, sepB=","), annoToString(cluster_omim), ";".join(enrichedDOIDs), ) clusterFH.close() outFH.close()
init = tf.global_variables_initializer()
saver_latest = tf.train.Saver(max_to_keep=1)
saver_acc = tf.train.Saver(max_to_keep=1)
saver_loss = tf.train.Saver(max_to_keep=1)

with sess.as_default():
    init.run()

if min_indices > max_indices:
    raise AssertionError('min_indices cannot be larger than max_indices')

if not train_labels_path:
    train_labels_path = os.path.join(os.path.dirname(train_images_path), 'labels')

src_files, src_labels_list, total_frames = readData(train_images_path, train_images_ext,
                                                    train_labels_path, train_labels_ext)

if start_id < 0:
    if end_id < 0:
        raise AssertionError('end_id must be non negative for random selection')
    elif end_id >= total_frames:
        raise AssertionError('end_id must be less than total_frames for random selection')
    print('Using {} random images for training'.format(end_id + 1))
    img_ids = np.random.choice(total_frames, end_id + 1, replace=False)
else:
    if end_id < start_id:
        end_id = total_frames - 1
    print('Using all {} images for training'.format(end_id - start_id + 1))
    img_ids = range(start_id, end_id + 1)

if start_id < 0:
    log_template = '{:d}_{}_{}_{}_random_{}_{}'.format(
    {'centroidDet': 'pretretedInputDataInitialCentroidsNear', 'recombMet': 'medianNewCentroids', 'distance': 'euclideanDistance'},
    {'centroidDet': 'pretretedInputDataInitialCentroidsNear', 'recombMet': 'medianNewCentroids', 'distance': 'manhattanDistance'},
    {'centroidDet': 'randomInitialCentroids', 'recombMet': 'averageNewCentroids', 'distance': 'euclideanDistance'},
    {'centroidDet': 'randomInitialCentroids', 'recombMet': 'averageNewCentroids', 'distance': 'manhattanDistance'},
    {'centroidDet': 'randomInitialCentroids', 'recombMet': 'medianNewCentroids', 'distance': 'euclideanDistance'},
    {'centroidDet': 'randomInitialCentroids', 'recombMet': 'medianNewCentroids', 'distance': 'manhattanDistance'}
]

# Names of the program's input and output files
INPUT_FILE_NAME = "irisData.txt"
OUTPUT_FILE_NAME = "resultatsKMeans.txt"

# Function that applies one test configuration
def applyTest(tst, rawData):
    k = KMeans(tst['centroidDet'], tst['recombMet'], tst['distance'])
    return k.setupClusters(rawData)

if __name__ == "__main__":
    data = readData(INPUT_FILE_NAME)
    resultTest = ''
    testNum = 1
    for tst in tests:
        resultTest += '------------------------------------ Test N ' + str(testNum) + ' ------------------------------------\n'
        resultTest += 'Centroid determination method: ' + tst['centroidDet'] + '\n'
        resultTest += 'New centroid determination method: ' + tst['recombMet'] + '\n'
        resultTest += 'Distance method: ' + tst['distance'] + '\n'
        rslt, iterations = applyTest(tst, data)
        resultTest += 'Quantite iterations: ' + str(iterations) + '\n'
        resultTest += printTestSummary(rslt)
        testNum += 1
        plotResults(rslt, tst['centroidDet'] + '_' + tst['recombMet'] + '_' + tst['distance'])
    printToFile(resultTest)
print 'Training model for single tags' clf = Pipeline([('vectorizer', DictVectorizer(sparse=False)),('classifier', LogisticRegression(solver='lbfgs',multi_class='multinomial'))]) clf.fit(X, y1) self.MaxEntClassifier = clf filename1 = self.modelFileMaxEnt pickle.dump(clf, open(filename1, 'wb')) print 'Training model for pairs of tags' clf2 = Pipeline([('vectorizer', DictVectorizer(sparse=False)),('classifier', LogisticRegression(solver='lbfgs',multi_class='multinomial'))]) clf2.fit(X, y2) self.TwoLabelClassifier = clf2 filename2 = self.modelFileMaxEntPair pickle.dump(clf2, open(filename2, 'wb')) self.tags = self.MaxEntClassifier.classes_ self.tagPairs = self.TwoLabelClassifier.classes_ else: print('Cannot fit in test mode') exit(0) if __name__ == '__main__': df_train, df_test, corpus, tags = readData('../data/') posTagger = MEMM() X_train, X_test, y_train1, y_test1, y_train2, y_test2 = posTagger.preprocess(df_train[:50000], df_test[:50000]) #print("Fitting model") #posTagger.fit(X_train, y_train1, y_train2) print ("Sample Tags using Viterbi decoding") posTagger.viterbiDecoding(df_test[:46])
from utils import readData
from aco import ACO, World
from plot import plot

if __name__ == '__main__':
    # noCities, cost_matrix, points = readData("D:\\UBB_info_sem_4\\AI\\LAB\\Teme\\Lab5\\ulysses16.txt")
    noCities, cost_matrix, points = readData(
        "D:\\UBB_info_sem_4\\AI\\LAB\\Teme\\Lab5\\data.txt")
    # noCities, cost_matrix, points = readData("D:\\UBB_info_sem_4\\AI\\LAB\\Teme\\Lab5\\data2.txt")
    # noCities, cost_matrix, points = readData("D:\\UBB_info_sem_4\\AI\\LAB\\Teme\\Lab5\\data3.txt")

    paramACO = {
        "ant_count": 10,
        "generations": 100,
        "alpha": 1.0,
        "beta": 10.0,
        "rho": 0.5,
        "q": 10
    }
    paramWorld = {"cost_matrix": cost_matrix, "noCities": noCities}

    aco = ACO(paramACO)
    graph = World(paramWorld)
    path, cost = aco.solve(graph)
    print("\n")
    print("\n")
    print("BEST: Cost: " + str(cost) + " \nPath: " + str(path))
    plot(points, path)
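# Illustrative sketch (not the project's readData): one common way to build the cost
# matrix that ACO/World expect is a pairwise Euclidean distance matrix over the city
# coordinates, which is presumably what the `points` list above contains.
import math

def build_cost_matrix(points):
    n = len(points)
    # cost_matrix[i][j] holds the Euclidean distance between city i and city j
    cost_matrix = [[0.0] * n for _ in range(n)]
    for i in range(n):
        for j in range(n):
            cost_matrix[i][j] = math.dist(points[i], points[j])
    return cost_matrix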
def main(): (protAnnoF, oboQueryF, oboF, omim_doid_F, gene2DOIDF, outF) = sys.argv[1:] obo = readOntology(oboF, False) omim_doid = utils.readData(omim_doid_F, np.array([2]), np.array([0])) qnames = utils.readData(oboQueryF, np.array([0,1]), np.array([0])) protAnnoFH = open(protAnnoF) protAnnoFH.readline() prot2DOID = {} omimperOBO = {} COUNTperQTerm = {} for line in protAnnoFH: line = line.rstrip() prot = line.split("\t")[0] for m in re.finditer('OMIM:\d{6}', line): omimID = m.group(0) if (omimID, ) not in omim_doid: continue for doidID in omim_doid[(omimID, )]: parentDOID = getAll(doidID[0], obo) parentDOID.add(doidID[0]) for (queryDOID, name) in qnames: if queryDOID in parentDOID: if queryDOID not in COUNTperQTerm: COUNTperQTerm[queryDOID] = set([]) if queryDOID not in omimperOBO: omimperOBO[queryDOID] = set([]) COUNTperQTerm[queryDOID].add(prot) omimperOBO[queryDOID].add((prot, omimID)) protAnnoFH.close() allProts = len(utils.readData(protAnnoF, np.array([0]), np.array([0]))) doid2Gene= {} gene2DOIDFH = open(gene2DOIDF) for line in gene2DOIDFH: line = line.rstrip() (gene, thisDOID) = line.split("\t") if not thisDOID.startswith("DOID"): continue allParents = getAll(thisDOID, obo) allParents.add(thisDOID) for doid in allParents: if doid not in doid2Gene: doid2Gene[doid] = set([]) doid2Gene[doid].add(gene) gene2DOIDFH.close() allGenes = 20313 #len(allGenes) pvals = [] names = [] for (doid, name) in qnames: if doid not in doid2Gene: continue if doid not in COUNTperQTerm: continue mat = [[allProts, allGenes],[len(COUNTperQTerm[doid]), len(doid2Gene[doid])]] pval = fisher_exact(mat)[1] pvals.append(pval) names.append((doid, name)) stats = importr('stats') selNames = {} pvals = stats.p_adjust(FloatVector(pvals), method = 'BH') for i in range(len(pvals)): # if pvals[i]>0.05: continue selNames[names[i]] = pvals[i] catCounts = {} for selname in selNames: pval = selNames[selname] doid, name = selname if len(COUNTperQTerm[doid]) == 0: continue counts = len(COUNTperQTerm[doid]) omimCounts = len(omimperOBO[doid]) if counts not in catCounts: catCounts[counts] = set([]) catCounts[counts].add("%s\t%s\t%i\t%i\t%f" % (name, doid, counts, omimCounts, pval)) outFH = open(outF + ".dat", "w") print >> outFH, "Name\tDOID\tCounts\nAll sites\tNA\t%i" % (allProts) for counts in sorted(catCounts.keys(), reverse=True): print >> outFH, "\n".join(catCounts[counts]) print "\n".join(catCounts[counts]) outFH.close()
(options, args) = parser.parse_args() highestScore = 0.0 eId = 0 train_id2sent, train_id2pos, train_id2ner, train_id2nerBILOU, train_id2arg2rel, test_id2sent, test_id2pos, test_id2ner, test_id2nerBILOU, test_id2arg2rel = readCoNLL2004_prepared_corpus( ) words, w2i, c2i, nertags, postagCount = vocabNER(train_id2sent, train_id2pos, train_id2nerBILOU) id2arg2rel, rels, classweights = getRelVocab(train_id2arg2rel, train_id2nerBILOU) fulltrain_data = readData(train_id2sent, train_id2pos, c2i) test_data = readData(test_id2sent, test_id2pos, c2i) #print w2i #print c2i #print nertags #print postags train_data, train_id2arg2rel_train = {}, {} numInstances = len(fulltrain_data) / 5 * 4 count = 0 for index in fulltrain_data: train_data[index] = fulltrain_data[index] train_id2arg2rel_train[index] = train_id2arg2rel[index] count += 1
store = [data, 1] elif btw_hold(data, store[0]): store = ave_func(store, [data, 1]) else: if store[1] <= winsize: win.append(store) else: if len(win) > 0: for w in win: store[1] += w[1] win = [] fs.append(map(int, store)) store = [data, 1] if store is not None: fs.append(map(int, store)) results = [] index = 0 for f in fs: if len(results) == 0 or f[0] != results[-1][1]: results.append((indexes[index], f[0])) index += f[1] return results if __name__ == "__main__": if len(sys.argv) != 5: utils.printUsage(("datafile", "threshold", "winsize", "outputfile")) results = smooth(utils.readData(sys.argv[1], int), float(sys.argv[2]), int(sys.argv[3])) utils.writeData(sys.argv[4], results, '%d\t%d\n')
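# Direct (non-CLI) usage sketch of the smoothing entry point above; the file names and
# numeric values are placeholders, and the positional arguments mirror the sys.argv
# call in the __main__ block (data, threshold, window size), assuming utils.readData
# returns a list of integer samples when given a converter.
samples = utils.readData("signal.dat", int)
results = smooth(samples, 2.5, 3)
utils.writeData("smoothed.out", results, '%d\t%d\n')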
from utils import readData

readData()
def __init__(self, filePath, points):
    self.filePath = filePath
    self.data = utils.readData(filePath, config.num_sensors)
    self.points = points
    self.dataStream = izip(*self.data)
#!/usr/bin/env python
# coding: utf-8

# In[4]:

import sys
import numpy as np
import open3d as o3d

sys.path.insert(1, "../data")
from utils import readData, readPointCloud

# In[5]:

ground_truth = readData("../data/01.txt")
ground_truth = ground_truth[:77][:]

# In[11]:

def computeTransformation(point_ind):
    # final_pose = []
    T = ground_truth[point_ind][:]
    T = np.reshape(T, (3, 4))
    b = np.array([0, 0, 0, 1])
    T = np.vstack((T, b))
    # print(T.shape)
    return T

# In[8]:
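# In[ ]:

# Usage sketch (an assumption, not a cell from the original notebook): apply the 4x4
# pose returned by computeTransformation() to an N x 3 point cloud by moving the
# points into homogeneous coordinates first.
def transform_points(points, point_ind):
    T = computeTransformation(point_ind)
    # Append a column of ones so each point becomes [x, y, z, 1]
    homogeneous = np.hstack((points, np.ones((points.shape[0], 1))))
    # Transform and drop the homogeneous coordinate again
    return (T @ homogeneous.T).T[:, :3]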
def driver():
    data = utils.readData()
    clusterDataPairs(data, config.MIN_CLUSTERS, config.MAX_CLUSTERS)
results.append(item) else: item = QAPair() item.question = q item.answers = a item.begQue = ind[0] item.endQue = ind[1] item.begAns = ind[1] + 1 item.endAns = ind[2] results.append(item) return results if __name__ == '__main__': #TEST CODE data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN) s = Segment() for d in data: text = d.getTextObject() segments = s.segment(text) # # print(segments) # for segment in segments: # # print("Question in TextObject:", text.get_covered_tokens(segment.begQue, segment.endQue)) # print("Question concepts:", text.get_covered_concepts(segment.begQue, segment.endQue)) # print("Question in segment:", segment.question) # # print("Answer in TextObject:", text.get_covered_tokens(segment.begAns, segment.endAns)) # print("Answer concepts:", text.get_covered_concepts(segment.begAns, segment.endAns))
def runForExperimentSettings(features, es): # Reading the train/test_data into an array train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN) test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST) # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE) train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts, es.removeFamilyConcepts, es.splitFamilyConcepts) test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts, es.removeFamilyConcepts, es.splitFamilyConcepts) # Reading in bootstrap data as well when enabled if es.bootstrap: bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED, cfg.PATH_PREPROCESSED_UNANNOTATED) bootstrap_data = m.conceptPreprocessing( bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts, es.removeFamilyConcepts, es.splitFamilyConcepts) # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE) # train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts,es.removeFamilyConcepts,es.splitFamilyConcepts) # test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts,es.removeFamilyConcepts,es.splitFamilyConcepts) vectorizer = DictVectorizer() min_max_scalar = MinMaxScaler() # Looping over different feature parameters for featTypes in features: utils.out('Executing for ' + ','.join(featTypes) + ' model.') es.featTypes = featTypes estimator = m.getEstimator(es) m.generatePrimaryFeats(train_data, es) m.generatePrimaryFeats(test_data, es) utils.out('Generated primary features for train and test_data!') y_train = [d.severity for d in train_data] #else argument added here to not override the train_data/y_train setting, otherwise we can only do one featType at a time if es.bootstrap: m.generatePrimaryFeats(bootstrap_data, es) (train_datac, y_trainc) = m.get_bootstrapped_trainset(train_data, y_train, bootstrap_data, es, estimator, th_bs=0.6) else: train_datac = train_data y_trainc = y_train concatenated_data = [] concatenated_data.extend(train_datac) concatenated_data.extend(test_data) m.generateDataDrivenFeats(train_datac, concatenated_data, es) featurized = m.featurize(concatenated_data) train_feats = featurized[0:len(train_datac)] test_feats = featurized[len(train_datac):len(featurized)] # Do feature selection on train data train_feats = fs.runFeatureSelection(train_feats, y_trainc, es) train_feats, y_trainc, train_bucket = ss.runSampleSelection( train_feats, y_trainc, [i for i in range(len(train_datac))], es) x_train = vectorizer.fit_transform(train_feats) x_test = vectorizer.transform(test_feats) if es.scaleData: x_train = min_max_scalar.fit_transform(x_train.toarray()) x_test = min_max_scalar.transform(x_test.toarray()) weights_train = m.getWeights(train_datac, train_bucket, es.weighInterAnnot) model = m.train(estimator, x_train, y_trainc, weights_train, model=None) y_pred = m.test(x_test, estimator=model) # print(y_pred) for i, cur_data in enumerate(test_data): cur_data.predSev = y_pred[i] out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/' if not os.path.exists(out_dir): os.makedirs(out_dir) 
utils.genOutput(data=test_data, outDir=out_dir, dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
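# Hedged usage sketch for runForExperimentSettings() above, mirroring how the other
# driver scripts in this collection build their inputs; the chosen feature sets are
# just example values.
if __name__ == '__main__':
    es = ExperimentSettings()
    features = [["DSM+1"], ["BOW"]]
    runForExperimentSettings(features, es)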
for i in range(pcd.shape[0]): x = int(pcd[i, 0] - x_min) z = int(pcd[i, 2] - z_min) occupancy[x, z] += 1 occupancy /= pcd.shape[0] return occupancy > THRESH def numpy_to_image(arr: np.ndarray, path: str): cv2.imwrite(f'{path}.png',(arr * 255).astype(np.uint8)) if __name__ == "__main__": transf = utils.readData(DATASET_TR_PATH) if os.path.exists(RESULT_PATH_1): shutil.rmtree(RESULT_PATH_1) os.makedirs(RESULT_PATH_1) for filename in os.listdir(DATASET_PATH): arr = utils.readPointCloud(DATASET_PATH + filename)[:, :3] arr = utils.lidar_to_world(arr) ind = int(filename[:-4]) arr = utils.make_homogenous_and_transform(arr, transf[ind].reshape(3, 4)) pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(arr) pcd = pcd.voxel_down_sample(voxel_size = 1) arr = np.asarray(pcd.points)
if optim_type == "rmsprop": optimizer_D = optim.RMSprop(discriminator.parameters(), lr=lr) optimizer_G = optim.RMSprop(generator.parameters(), lr=lr) elif optim_type == "adam": optimizer_D = optim.Adam(discriminator.parameters(), lr=lr, betas=(0.001, 0.8)) optimizer_G = optim.Adam(generator.parameters(), lr=lr, betas=(0.001, 0.8)) elif optim_type == "sgd": optimizer_D = optim.SGD(discriminator.parameters(), lr=lr, momentum=0.1) optimizer_G = optim.SGD(generator.parameters(), lr=lr, momentum=0.1) else: raise TypeError("optim type not found %s" % (optim_type)) #calculate background x = utils.readData(filepath) lx = np.floor(np.min(x[:, 0])) hx = np.ceil(np.max(x[:, 0])) ly = np.floor(np.min(x[:, 1])) hy = np.ceil(np.min(x[:, 1])) x_aix = np.arange(lx, hx, 0.01) y_aix = np.arange(ly, hy, 0.01) xx, yy = np.meshgrid(x_aix, y_aix) print(xx.shape, yy.shape) xx = torch.from_numpy(xx) yy = torch.from_numpy(yy) bc = torch.stack((xx, yy), dim=2) bc = bc.view(-1, 2) bc_cuda = bc.view(-1, 2).cuda().float()
end_id = args.end_id save_seg = args.save_seg save_stitched = args.save_stitched gpu_id = args.gpu_id loss_type = args.loss_type stitch_labels = args.stitch_labels show_img = args.show_img save_raw = args.save_raw psi_act_type = args.psi_act_type normalize_labels = args.normalize_labels n_layers = args.n_layers src_files, src_labels_list, total_frames = readData(images_path, images_ext) if end_id < start_id: end_id = total_frames - 1 eval_mode = False if labels_path and labels_ext: _, labels_list, labels_total_frames = readData(labels_path=labels_path, labels_ext=labels_ext) if labels_total_frames != total_frames: raise SystemError( 'Mismatch between no. of frames in GT and seg labels') eval_mode = True else: save_seg = True
                              name='train_labels')
global_steps = tf.Variable(0, name="global_step", trainable=False)
phPredction = ASRCNN(phTrainInput)
loss = utils.computeLoss(phTrainTarget, phPredction)
curr_lr_op = tf.train.exponential_decay(lr, global_steps, decay_step, decay_ratio, staircase=True)
train_op = tf.train.AdamOptimizer(learning_rate=curr_lr_op).minimize(loss, global_step=global_steps)
gpu_options = tf.GPUOptions(allow_growth=allow_growth)

# data
trainData1, testData, trainData2, trainTarget2, testTarget, trainTarget1, minNDVI, maxNDVI, perm = utils.readData(
    data_file, rcstart, rcend, opt.mode, data_scale)
trainData = [trainData1, trainData2]
trainTarget = [trainTarget1, trainTarget2]

num_patches_x = (image_size - patch_size + patch_stride) // patch_stride
num_patches_y = (image_size - patch_size + patch_stride) // patch_stride
num_patches = num_patches_x * num_patches_y

print(f'Extracting 80% for training, and 20% for validation ...')
pos = np.int(np.ceil(num_patches * 2 * 0.2 / batch_size) * batch_size)
valPerm = perm[:pos]
trainPerm = perm[pos:]

start_time = time.time()

def train_one_epoch(sess, n_epoch, saver):
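# Worked example (illustrative numbers, not the project's configuration): with
# image_size=256, patch_size=32 and patch_stride=16 the patch-count formula above gives
#   num_patches_x = (256 - 32 + 16) // 16 = 15
# so num_patches = 15 * 15 = 225 patches per image, and with batch_size=32 the
# validation cut-off becomes
#   pos = ceil(225 * 2 * 0.2 / 32) * 32 = 3 * 32 = 96
# i.e. the first 96 shuffled patch indices go to validation and the rest to training.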
def main(useAnnotatorWeighing=True): """ This script allows for 10-fold cross validation over the data in the training set. Experiments only yield results, they don't yield annotated files. The standard deviation seen over the different folds for each metric are reported as well. Configure your model settings by modifying the ExperimentSettings object in the script. """ # Making folders from config # cfg.makeFolders() # Here, you can specify the feature sets you would like to use. It is arranged in an array of arrays, to enable combinations features = [["DSM+2"], ["BOW"], ["DSM+1"], ["DSM"], ["SNOMED"], ["SNOMED+1"], ["DSM+2"], ["CONCEPTS"]] #features = [["DSM"],["DSM+1","DIST_HIER"],["DSM+1"], ["CATEGORICAL_QUESTIONSET","QUESTIONSET","LONG_QUESTIONSET"]] # Options: # 'CONCEPTS', 'DSM+1', 'DSM', 'DSM_HIER', 'MED', 'BOW', 'BOW_ANSWERS', 'CATEGORICAL_QUESTIONSET', 'QUESTIONSET' # 'WORD_VECTOR', 'WORD_VECTOR_ANSWERS', 'CONCEPT_VECTOR', 'DIST_WORDVECTOR', 'DIST_CONCEPTVECTOR' # 'CONCEPT_CLUSTERS', 'PREAMBLE_CLUSTERS' # if you want anything set differently than default, please change the corresponding parameter in es (ExperimentSettings) es = ExperimentSettings() es.fs_varianceFilter = True es.bootstrap = False es.ss_prototyping = False es.weighInterAnnot = False #es.ml_algorithm='XGBOOST' #es.ml_algorithm = 'RANDOM' '''es.removeDeniedConcepts=True es.removeUncertainConcepts=False es.splitDeniedConcepts=False es.splitFamilyConcepts=True''' es.removeDeniedConcepts = False es.splitDeniedConcepts = False es.splitUncertainConcepts = False es.splitFamilyConcepts = False #es.fs_confidence=True #es.fs_confidenceValueDistinction = True #es.fs_chiSquare = False #es.fs_varianceFilter = True #es.fs_varianceThreshold = 0.05 #es.fs_confidence = True #es.fs_informationGain = False #es.fs_confidenceWithCoverage = True #es.fs_confidenceTopK = 100 #es.fs_confidenceCoverageOverlap = 3 #es.fs_confidenceCutOff = 0.05''' # Reading the data into an array data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN) # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE) data = m.conceptPreprocessing(data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts, es.removeFamilyConcepts, es.splitFamilyConcepts) if es.bootstrap: bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED, cfg.PATH_PREPROCESSED_UNANNOTATED) bootstrap_data = m.conceptPreprocessing( bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts, es.removeFamilyConcepts, es.splitFamilyConcepts) # Looping over different feature parameters for featTypes in features: #for x in [True, False]: #es.fs_confidence = x utils.out('Executing for ' + ','.join(featTypes) + ' model.') es.featTypes = featTypes if es.svmParamSweep: result_params = m.param_sweep_svm(data, es, gammaSweep=False, nFolds=10, verbose=False, random_seed=44) for name in result_params: print(str(name) + ":", result_params[name]) else: estimator = m.getEstimator(es) if es.bootstrap: results = m.eval_bootstrapped_crossVal(estimator, data, bootstrap_data, es, 10, printTree=False) else: results = m.evalCrossval(estimator, data, es, 10, printTree=False) for name in results: print(str(name) + ":", results[name])
# coding: utf-8 import sys from utils import readRelations, readData, wordStatForQuestion, wordStatForRelation, \ convert_data from data import data_static, argConfig, dataMgr from train import train_model if __name__ == "__main__": print "Start to read relations..." relation_list_seg, relation_list_seg_all = \ readRelations("KBQA_data/sq_relations/relation.2M.list") print "\n" print "Start to read training data..." training_data = readData( "KBQA_data/sq_relations/train.replace_ne.withpool") print "Start to read testing data..." testing_data = readData("KBQA_data/sq_relations/test.replace_ne.withpool", False) print "Start to read validation data" valid_data = readData("KBQA_data/sq_relations/valid.replace_ne.withpool", False) print "\n" print "start to get word dictionary for questions and relations" question_words = wordStatForQuestion(training_data) relation_words = wordStatForRelation(relation_list_seg, relation_list_seg_all, training_data) print "\n" print "Start to convert data to vectors..."
m = re.findall ( '<concepts_FILEUMLS(.*?)\/>', text, re.DOTALL) for n in m: text = text.replace("<concepts_FILEUMLS"+n+"/>\n","") text = text.replace("<concepts_FILEUMLS"+n+"/>","") return text if __name__ == '__main__': #cfg.makeFolders() #texts = utils.readData(cfg.PATH_INPUT) #for text in texts: # texts[text]["tokens"] = utils.dumbTokeniser(texts[text]["note"]) basic = cfg.PATH_TEST preprocessed = cfg.PATH_PREPROCESSED_TEST data = utils.readData(basic, preprocessed) #TODO reset this to 0.80 idf threshold matcher = DictionaryMatcher(4, 0.80) matcher.loadLibrary('FILEUMLS') #matcher.loadLibrary('DSM') # matcher.loadDefinitions() #matcher.saveModel() #matcher = matcher.loadModel('FILEUMLS') for d in data: text = d.getTextObject() #for line in matcher.processText(text, True): # print(line)
import Model import matplotlib.pyplot as plt from matplotlib.legend_handler import HandlerLine2D ''' a = '12 , 34 ' re_splitA = re.compile(r'[\s\,]+') print(re_splitA.split(a)) print(re.split(r'[\s\,]+', a)) print(re_splitA.split(a)[2] == '') # True b = [a for a in re_splitA.split(a) if a.isdigit()] print(b) # work ! ! ! ''' print('Test...') X, y = utils.readData('datas.xlsx') X = np.atleast_2d(X).T y = np.atleast_2d(y).T print('Data info\n') print('X = ', X.dtype, X.shape) print('y = ', y.dtype, y.shape) # Test datas x = np.atleast_2d(np.linspace(X.min(), X.max(), 2000)).T y_pred, sigma_pred = Model.gpSklearn(X, y, x) fig = plt.figure(1) plotf, = plt.plot(X, y, label='Origin')
def main(useAnnotatorWeighing=True): ''' This script runs the experiments by training on a trainset and testing on a test set. Also allows bootstrapping (which is hard coded in this script as well) Configure your model settings by modifying the ExperimentSettings object in the script. The output of these models are annotated files in the output folder, which can be evaluated (in metrics) using testEval.py ''' # Making folders from config # cfg.makeFolders() # Here, you can specify the feature sets you would like to use. It is arranged in an array of arrays, to enable combinations features = [["DSM+1"]] #features = [["CONCEPTS"]]#['BOW'], # features = [["CONCEPTS"]] # if you want anything set differently than default, please change the corresponding parameter in es (ExperimentSettings) es = ExperimentSettings() # es.fs_varianceFilter = True # es.bootstrap = True # es.ss_prototyping = True # es.weighInterAnnot = False # es.ml_algorithm='RF' #remove these! # es.removeDeniedConcepts=False # es.splitFamilyConcepts=False # es.splitUncertainConcepts=False # Reading the train/test_data into an array train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN) test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST) # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE) train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts, es.removeFamilyConcepts, es.splitFamilyConcepts) test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts, es.removeFamilyConcepts, es.splitFamilyConcepts) # Reading in bootstrap data as well when enabled if es.bootstrap: bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED, cfg.PATH_PREPROCESSED_UNANNOTATED) bootstrap_data = m.conceptPreprocessing( bootstrap_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts, es.removeFamilyConcepts, es.splitFamilyConcepts) # Doing modifications on the concepts, based on the segmentation settings that are defined (ONLY PERFORM ONCE) # train_data = m.conceptPreprocessing(train_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts,es.removeFamilyConcepts,es.splitFamilyConcepts) # test_data = m.conceptPreprocessing(test_data, es.removeDeniedConcepts, es.splitDeniedConcepts, es.removeUncertainConcepts, es.splitUncertainConcepts,es.removeFamilyConcepts,es.splitFamilyConcepts) vectorizer = DictVectorizer() min_max_scalar = MinMaxScaler() # Looping over different feature parameters for featTypes in features: utils.out('Executing for ' + ','.join(featTypes) + ' model.') es.featTypes = featTypes estimator = m.getEstimator(es) m.generatePrimaryFeats(train_data, es) m.generatePrimaryFeats(test_data, es) utils.out('Generated primary features for train and test_data!') y_train = [d.severity for d in train_data] if es.bootstrap: m.generatePrimaryFeats(bootstrap_data, es) (train_data, y_train) = m.get_bootstrapped_trainset(train_data, y_train, bootstrap_data, es, estimator, th_bs=0.6) concatenated_data = [] concatenated_data.extend(train_data) concatenated_data.extend(test_data) m.generateDataDrivenFeats(train_data, concatenated_data, es) featurized = m.featurize(concatenated_data) train_feats = featurized[0:len(train_data)] test_feats = featurized[len(train_data):len(featurized)] # 
Do feature selection on train data train_feats = fs.runFeatureSelection(train_feats, y_train, es) train_feats, y_train, train_bucket = ss.runSampleSelection( train_feats, y_train, [i for i in range(len(train_data))], es) x_train = vectorizer.fit_transform(train_feats) x_test = vectorizer.transform(test_feats) if es.scaleData: x_train = min_max_scalar.fit_transform(x_train.toarray()) x_test = min_max_scalar.transform(x_test.toarray()) weights_train = m.getWeights(train_data, train_bucket, es.weighInterAnnot) model = m.train(estimator, x_train, y_train, weights_train, model=None) y_pred = m.test(x_test, estimator=model) # print(y_pred) for i, cur_data in enumerate(test_data): cur_data.predSev = y_pred[i] out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/' if not os.path.exists(out_dir): os.makedirs(out_dir) utils.genOutput(data=test_data, outDir=out_dir, dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
def generateData( bacteria_index, save_csv=False, save_data=True, out_dir="./data/promoters/", nextflow_path='./nextflow', nextflow_pipeline="pipeline_unbalanced.nf", # 'pipeline_without_docker.nf' manually_balance_data=False): if (using_unbalanced): print("GENERATING UNBALANCED DATA WITH RATIO 1:10") else: print("GENERATE DATA") # bacteriaDir = "./bacteria" bacteria_report = {} if bacteria_index is None: index = createIndex() else: index = bacteria_index data_root = "./data/" if not os.path.exists(data_root): os.makedirs(data_root) w = csv.writer(open(data_root + "report.csv", "w")) vocab_size = None tokenizer = None hot_encoded_train_features = np.empty((0, 160), int) hot_encoded_train_labels = np.empty((0, ), int) hot_encoded_test_features = np.empty((0, 160), int) hot_encoded_test_labels = np.empty((0, ), int) hot_encoded_val_features = np.empty((0, 160), int) hot_encoded_val_labels = np.empty((0, ), int) tetra_freq_train_features = np.empty((0, 256), int) tetra_freq_train_labels = np.empty((0, ), int) tetra_freq_test_features = np.empty((0, 256), int) tetra_freq_test_labels = np.empty((0, ), int) tetra_freq_val_features = np.empty((0, 256), int) tetra_freq_val_labels = np.empty((0, ), int) rnn_token_train_features = np.empty((0, 37), int) rnn_token_train_labels = np.empty((0, ), int) rnn_token_test_features = np.empty((0, 37), int) rnn_token_test_labels = np.empty((0, ), int) rnn_token_val_features = np.empty((0, 37), int) rnn_token_val_labels = np.empty((0, ), int) global_rnn_complete = np.empty((0, 37), int) start_time = datetime.datetime.now().time().strftime('%H:%M:%S') bar = progressbar.ProgressBar(max_value=len(index)) for i, row in index.iterrows(): bacteria_start_time = datetime.datetime.now().time().strftime( '%H:%M:%S') # print("\n\n", 20*"*", i+1, f". 
{row['BACTERIA']}", 20*"*" ) print("\n\n {} {} {} {}".format(20 * "*", i + 1, row['BACTERIA'], 20 * "*")) #nextflow run main_pipeline.nf --bacteria ecoli && rsync outDir/ outDirOriginal/ -a --copy-links -v print("\n\n {} {} {} {}".format(20 * "*", i + 1, "NEXTFLOW DATA GENERATION", 20 * "*")) # print("\n\n", 10*"*", "NEXTFLOW DATA GENERATION",10*"*" ) stderr = None stdout = None if (nextflow_path is not None): print("\n\nGENERATING NEXTFLOW DATA USING PIPELINE: ", nextflow_pipeline, "\n\n") out = subprocess.Popen( [ nextflow_path, 'run', nextflow_pipeline, #'pipeline_without_docker.nf', # pipeline_unbalanced_without_docker.nf 'main_pipeline.nf', '--bacteria', str(row['BACTERIA']), ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stdout, stderr = out.communicate() error_msg = "" print("\n\nOUT: \n\n", stdout) print("\n\nERRORS: \n\n ", stderr) bacteria_report[row['BACTERIA']] = { 'stdout': stdout, 'stderr': stderr } else: print("NEXTFLOW GENERATION SKIPPED.") if stderr == None: # print("\n\nConverting symlink to copy of file", row['BACTERIA']) # cmd = f"rsync outDir/{row['BACTERIA']} outDirOriginal/ -a --copy-links -v" if (nextflow_path is not None): cmd = "rsync " + out_dir + str( row['BACTERIA']) + " outDirOriginal/ -a --copy-links -v" out = os.popen(cmd).read() try: p_df, n_df = readData( out_dir + str(row['BACTERIA']) + "/positive.fasta", out_dir + str(row['BACTERIA']) + "/negative.fasta") p_bed_df = BedTool(out_dir + str(row['BACTERIA']) + "/positive.bed").to_dataframe() n_bed_df = BedTool(out_fir + str(row['BACTERIA']) + "/negative.bed").to_dataframe() p_bed_df["sequence"] = p_df.values n_bed_df["sequence"] = n_df.values p_bed_df["label"] = [1] * len(p_df) n_bed_df["label"] = [0] * len(n_df) dataset_df = pd.concat([p_bed_df, n_bed_df]) print("SAVING DATASET: P {} + N {} = {}".format( p_bed_df.shape, n_bed_df.shape, dataset_df.shape)) p_bed_df.to_csv(out_dir + str(row['BACTERIA']) + "/positive.csv") n_bed_df.to_csv(out_dir + str(row['BACTERIA']) + "/negative.csv") dataset_df.to_csv(out_dir + str(row['BACTERIA']) + "/dataset.csv") print("\n\n" + 10 * "*" + "FASTA TO HOT ENCODING" + 10 * "*") print("P: {} N: {}".format(len(p_df), len(n_df))) if (manually_balance_data and len(p_df) < len(n_df)): print( "Manually balancing Positives and Negatives. Decreasing Negatives from {} -> {}. 
Ratio {}:{}" .format(len(n_df), len(p_df), 1, len(p_df) * 100 / len(n_df))) n_df = n_df.sample(n=len(p_df)) print("FINAL DATA SHAPES -> P: {} N : {}".format( p_df.shape, n_df.shape)) hot_p_data, hot_n_data = fastaToHotEncoding(p_df, n_df) hot_encoded_dataset_df = joinPositiveAndNegative( hot_p_data, hot_n_data) print("\n\n", hot_encoded_dataset_df.head(), "\n\n") X_hot_train, X_hot_test, y_hot_train, y_hot_test = generateTrainAndTestSplit( hot_encoded_dataset_df.values) print(""" X: {} Y: {} TX: {} TY: {} """.format(X_hot_train.shape, y_hot_train.shape, X_hot_test.shape, y_hot_test.shape)) if (row["IS_TRAINING"] == True): hot_encoded_train_features = np.append( hot_encoded_train_features, X_hot_train, axis=0) hot_encoded_train_labels = np.append( hot_encoded_train_labels, y_hot_train, axis=0) hot_encoded_test_features = np.append( hot_encoded_test_features, X_hot_test, axis=0) hot_encoded_test_labels = np.append( hot_encoded_test_labels, y_hot_test, axis=0) else: print("\nAPPENDING TO VALIDATION DATA") hot_encoded_val_features = np.append( hot_encoded_val_features, X_hot_train, axis=0) hot_encoded_val_labels = np.append(hot_encoded_val_labels, y_hot_train, axis=0) hot_encoded_val_features = np.append( hot_encoded_val_features, X_hot_test, axis=0) hot_encoded_val_labels = np.append(hot_encoded_val_labels, y_hot_test, axis=0) print("\n\n", 10 * "*", "FASTA TO TETRA-NUCLEOTDE FRECUENCY", 10 * "*") tetra_n_array_positive = fastaToTetraNucletideDic( p_df.values, 1) tetra_n_array_negative = fastaToTetraNucletideDic( n_df.values, 0) joined_df = joinPositiveAndNegative(tetra_n_array_positive, tetra_n_array_negative) joined_df = joined_df.fillna(0) print("\nHEAD-FASTA TO TETRA-NUCLEOTDE FRECUENCY") print("\n\n", joined_df.head(), "\n\n") X_train, X_test, y_train, y_test = generateTrainAndTestSplit( joined_df.values) print(""" X: {} Y: {} TX: {} TY: {} """.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape)) if (row["IS_TRAINING"] == True): tetra_freq_train_features = np.append( tetra_freq_train_features, X_train, axis=0) tetra_freq_train_labels = np.append( tetra_freq_train_labels, y_train, axis=0) tetra_freq_test_features = np.append( tetra_freq_test_features, X_test, axis=0) tetra_freq_test_labels = np.append(tetra_freq_test_labels, y_test, axis=0) else: print("APPENDING TO VALIDATION DATA") tetra_freq_val_features = np.append( tetra_freq_val_features, X_train, axis=0) tetra_freq_val_labels = np.append(tetra_freq_val_labels, y_train, axis=0) tetra_freq_val_features = np.append( tetra_freq_val_features, X_test, axis=0) tetra_freq_val_labels = np.append(tetra_freq_val_labels, y_test, axis=0) print("\n\n", 10 * "*", "RNN DATA PROCESSING", 10 * "*") tetran_word_dataset = fasta_to_tetranucleotide_list( p_df.values, n_df.values) tetran_word_dataset = tetran_word_dataset.dropna() print("\n\n", tetran_word_dataset.head(), "\n\n") X_tetran_train, X_tetran_test, y_tetran_train, y_tetran_test = generateTrainAndTestSplit( tetran_word_dataset.values) print("""\n X: {} Y: {} TX: {} TY: {} COMPLETE: {} COMPLETE+LABELS: {} """.format( np.array(X_tetran_train).shape, np.array(y_tetran_train).shape, np.array(X_tetran_test).shape, np.array(y_tetran_test).shape, np.array(tetran_word_dataset.iloc[:, :-1].values).shape, np.array(tetran_word_dataset.values).shape)) if (row["IS_TRAINING"] == True): rnn_token_train_features = np.append( rnn_token_train_features, X_tetran_train, axis=0) rnn_token_train_labels = np.append(rnn_token_train_labels, y_tetran_train, axis=0) rnn_token_test_features = np.append( 
rnn_token_test_features, X_tetran_test, axis=0) rnn_token_test_labels = np.append(rnn_token_test_labels, y_tetran_test, axis=0) else: print("APPENDING TO VALIDATION DATA") rnn_token_val_features = np.append(rnn_token_val_features, X_tetran_train, axis=0) rnn_token_val_labels = np.append(rnn_token_val_labels, y_tetran_train, axis=0) rnn_token_val_features = np.append(rnn_token_val_features, X_tetran_test, axis=0) rnn_token_val_labels = np.append(rnn_token_val_labels, y_tetran_test, axis=0) global_rnn_complete = np.append( global_rnn_complete, tetran_word_dataset.iloc[:, :-1].values, axis=0) except Exception as e: print('\n\nFAILED : \n\n' + str(e)) print(traceback.format_exc()) error_msg = str(e) if (nextflow_path is not None): w.writerow([row['BACTERIA'], stdout, stderr, error_msg]) bar.update(i) bacteria_end_time = datetime.datetime.now().time().strftime('%H:%M:%S') bacteria_total_time = ( datetime.datetime.strptime(bacteria_end_time, '%H:%M:%S') - datetime.datetime.strptime(bacteria_start_time, '%H:%M:%S')) print("\n\nBACTERIA: ", row['BACTERIA'], " - TOTAL ELAPSED TIME: ", bacteria_total_time) print("\n\nTOKENIZING RNN DATASET\n\n") str_global_rnn_complete = tetranucleotide_list_to_string_list( global_rnn_complete) str_rnn_token_train_features = tetranucleotide_list_to_string_list( rnn_token_train_features) str_rnn_token_test_features = tetranucleotide_list_to_string_list( rnn_token_test_features) str_rnn_token_val_features = tetranucleotide_list_to_string_list( rnn_token_val_features) tokenizer = Tokenizer() tokenizer.fit_on_texts(str_global_rnn_complete) vocab_size = len(tokenizer.word_index) + 1 print("\nTokenizer Summary") print("\n document_count: ", tokenizer.document_count) print("\n vocab size: ", vocab_size) rnn_token_train_features = tokenizer.texts_to_sequences( str_rnn_token_train_features) rnn_token_test_features = tokenizer.texts_to_sequences( str_rnn_token_test_features) rnn_token_val_features = tokenizer.texts_to_sequences( str_rnn_token_val_features) # X_train_pad = pad_sequences(rnn_token_train_features, maxlen=37, padding="post") # X_test_pad = pad_sequences(rnn_token_test_features, maxlen=37, padding="post") rnn_token_train_features = np.array(rnn_token_train_features) rnn_token_test_features = np.array(rnn_token_test_features) rnn_token_val_features = np.array(rnn_token_val_features) print("\n\nTOTAL HOT ENCODING FEATURES" "\nHOT ENCODED FEATURE TRAIN", hot_encoded_train_features.shape, "\nHOT ENCODED LABELS TRAIN", hot_encoded_train_labels.shape, "\nHOT ENCODED FEATURE TEST", hot_encoded_test_features.shape, "\nHOT ENCODED LABELS TEST", hot_encoded_test_labels.shape, "\nHOT ENCODED FEATURE VAL", hot_encoded_val_features.shape, "\nHOT ENCODED LABELS VAL", hot_encoded_val_labels.shape, "\n") print( "\n\nTOTAL TETRA-NUCLEOTDE FRECUENCY FEATURES" "\nTETRA-FREQ FEATURE TRAIN", tetra_freq_train_features.shape, "\nTETRA-FREQ LABELS TRAIN", tetra_freq_train_labels.shape, "\nTETRA-FREQ FEATURE TEST", tetra_freq_test_features.shape, "\nTETRA-FREQ LABELS TEST", tetra_freq_test_labels.shape, "\nTETRA-FREQ FEATURE VAL", tetra_freq_val_features.shape, "\nTETRA-FREQ LABELS VAL", tetra_freq_val_labels.shape, "\n") print( "\n\nTOTAL RNN TETRANUCLEOTIDE STRING TOKEN SEQUENCES FEATURES" "\nRNN TOKEN FEATURE TRAIN", rnn_token_train_features.shape, "\nRNN TOKEN LABELS TRAIN", rnn_token_train_labels.shape, "\nRNN TOKEN FEATURE TEST", rnn_token_test_features.shape, "\nRNN TOKEN LABELS TEST", rnn_token_test_labels.shape, "\nRNN TOKEN FEATURE VAL", rnn_token_val_features.shape, "\nRNN 
TOKEN LABELS VAL", rnn_token_val_labels.shape, "\nRNN TOKEN ALL", global_rnn_complete.shape, "\nVocab", vocab_size, "\n") # Save files if (save_data): saveData(hot_encoded_train_features, hot_encoded_train_labels, hot_encoded_test_features, hot_encoded_test_labels, hot_encoded_val_features, hot_encoded_val_labels, tetra_freq_train_features, tetra_freq_train_labels, tetra_freq_test_features, tetra_freq_test_labels, tetra_freq_val_features, tetra_freq_val_labels, rnn_token_train_features, rnn_token_train_labels, rnn_token_test_features, rnn_token_test_labels, rnn_token_val_features, rnn_token_val_labels, vocab_size, tokenizer, save_csv) try: print("\n\nDeleting Temporary Files\n\n") os.system('rm -rf __pycache__') os.system('rm -rf .nextflow') #os.system('rm -rf outDirOriginal') #os.system('rm -rf work') #os.system('rm .nextflow.*') #os.system('mv -v *.genome ./data') #os.system('mkdir -p ./data/bacteria') #os.system('mv ./outDir/* ./data/bacteria') #os.system('rm -rf ./outDir') except Exception as e: print("\n\nError deleting temporary data. " + str(e)) else: print("NOT SAVING BINARY DATA") end_time = datetime.datetime.now().time().strftime('%H:%M:%S') total_time = (datetime.datetime.strptime(end_time, '%H:%M:%S') - datetime.datetime.strptime(start_time, '%H:%M:%S')) print("\n\nTOTAL ELAPSED TIME: ", total_time) return hot_encoded_train_features, \ hot_encoded_train_labels, \ hot_encoded_test_features, \ hot_encoded_test_labels, \ hot_encoded_val_features, \ hot_encoded_val_labels, \ tetra_freq_train_features, \ tetra_freq_train_labels, \ tetra_freq_test_features, \ tetra_freq_test_labels, \ tetra_freq_val_features, \ tetra_freq_val_labels, \ rnn_token_train_features, \ rnn_token_train_labels, \ rnn_token_test_features, \ rnn_token_test_labels, \ rnn_token_val_features, \ rnn_token_val_labels, \ vocab_size, tokenizer