def exportMorginFingerprint():
    # Build a drug-name -> Morgan fingerprint map from the DrugBank dump.
    fin = open("%s/DrugBank/DrugBankNames.txt" % params.DATA_DIR)
    fsmileMissing = open("%s/DrugBank/MissingSMILEs.txt" % params.DATA_DIR, "w")
    dName2Morgan = dict()
    while True:
        line = fin.readline()
        if line == "":
            break
        parts = line.strip().split("||")
        drugName = parts[0].lower()
        drugType = parts[1]
        smile = parts[-2]
        if drugType == "small molecule" and len(smile) > 2:
            try:
                dName2Morgan[drugName] = genMorganBitVecFromSmiles(smile)
            except Exception:
                # SMILES failed to parse; record the drug for manual curation.
                fsmileMissing.write("%s\n" % drugName)
    fin.close()
    fsmileMissing.close()
    print(len(dName2Morgan))
    utils.save_obj(dName2Morgan, "%s/DrugBank/DrugMorganDes" % params.DATA_DIR)
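# genMorganBitVecFromSmiles is not defined in this snippet. A minimal sketch
# using RDKit (an assumption about the intended implementation; radius and
# bit count are illustrative defaults):
from rdkit import Chem
from rdkit.Chem import AllChem

def genMorganBitVecFromSmiles(smiles, radius=2, n_bits=2048):
    # Parse the SMILES string; raise on invalid input so callers can log it.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError("Invalid SMILES: %s" % smiles)
    # Hashed Morgan (ECFP-like) fingerprint as a fixed-length bit vector.
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)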
def make_transfer_graph():
    # Count, per poet iteration j, how many network transfers happened at that
    # iteration, and record (directory, network id parsed from the filename).
    c = {}
    toFromTracker = {}
    for d in os.listdir('.'):
        if not os.path.isdir(d):
            continue
        try:
            _net = int(d)
        except ValueError:
            continue
        for i in range(10, 5010, 10):
            j = i - 1
            if j not in c:
                c[j] = 0
                toFromTracker[j] = []
            for f in os.listdir(f'./{_net}'):
                if f'poet{j}_network' in f:
                    c[j] += 1
                    toFromTracker[j].append(
                        (_net, int(f.split("network_")[1].split("_")[0])))
    df = pd.DataFrame.from_dict(c, orient='index')
    df.to_csv("transfers_per_attempt.csv")
    save_obj(toFromTracker, '.', 'exactTransfers')
def exportSubG2():
    # Split JADER records into files by combination size (SUB/G<n>), and expand
    # every 3..20-drug combination into its drug pairs, appended to SUB/G2.
    fin = open("%s/JADER.txt" % params.JADER_OUT)
    foutDict = dict()
    dlen2SeCount = dict()
    nA = 0
    print("Reading...")
    while True:
        line = fin.readline()
        if line == "":
            break
        nA += 1
        print("\r%s" % nA, end="")
        parts = line.strip().split("$")
        drugCmb, ses = parts[0], parts[1]
        drugs = sorted(drugCmb.split(","))
        nD = len(drugs)
        sortNames = ",".join(drugs)
        fO = utils.get_dict(foutDict, nD, -1)
        if fO == -1:
            fO = open("%s/SUB/G%s" % (params.JADER_OUT, nD), "w")
            foutDict[nD] = fO
        fO.write("%s$%s\n" % (sortNames, ses))
        if 2 < nD <= 20:
            f2 = utils.get_dict(foutDict, 2, -1)
            if f2 == -1:
                f2 = open("%s/SUB/G%s" % (params.JADER_OUT, 2), "w")
                foutDict[2] = f2
            for i in range(nD):
                for j in range(i + 1, nD):
                    f2.write("%s,%s$%s\n" % (drugs[i], drugs[j], ses))
        len2SeCount = utils.get_insert_key_dict(dlen2SeCount, nD, dict())
        for se in ses.split(","):
            utils.add_dict_counter(len2SeCount, se)
    fin.close()
    for _, fO in foutDict.items():
        fO.close()
    dSize2Ses = dict()
    for k, seCounter in dlen2SeCount.items():
        # Keep side effects ordered by frequency within each combination size.
        dSize2Ses[k] = [se for se, _ in utils.sort_dict(seCounter)]
    utils.save_obj(dSize2Ses, "%s/SUB/drugSize2CommonSEs" % params.JADER_OUT)
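# The `utils` dictionary helpers used throughout these functions are not shown
# here. Judging from their call sites, they behave roughly like this minimal
# sketch (an assumption, not the original implementation):

def get_dict(d, k, default=-1):
    # Plain lookup with a default, without inserting the key.
    return d.get(k, default)

def get_insert_key_dict(d, k, default_value):
    # Insert-if-absent, then return the stored value (dict.setdefault).
    return d.setdefault(k, default_value)

def add_dict_counter(d, k, inc=1):
    # Increment a counter keyed by k, starting from 0.
    d[k] = d.get(k, 0) + inc

def sort_dict(d):
    # Items sorted by count, most frequent first.
    return sorted(d.items(), key=lambda kv: kv[1], reverse=True)

def get_update_dict_index(d, k):
    # Assign the next integer index to an unseen key; return the key's index.
    if k not in d:
        d[k] = len(d)
    return d[k]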
def fillMissingSMILEs():
    # Patch DrugMorganDes with manually curated SMILES from MissingSMILEsF.txt.
    fin = open("%s/DrugBank/MissingSMILEsF.txt" % params.DATA_DIR)
    lines = fin.readlines()
    fin.close()
    d = utils.load_obj("%s/DrugBank/DrugMorganDes" % params.DATA_DIR)
    for line in lines:
        parts = line.strip().split("||")
        try:
            v = genMorganBitVecFromSmiles(parts[1])
        except Exception:
            # Still unparsable; report and skip instead of reusing a stale v.
            print(parts[1])
            continue
        d[parts[0].lower()] = v
    utils.save_obj(d, "%s/DrugBank/DrugMorganDes" % params.DATA_DIR)
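# utils.save_obj / utils.load_obj are not shown; the usage (a bare path, no
# extension) suggests a thin pickle wrapper along these lines (an assumption;
# note the POET-style functions below use a different three-argument variant):
import pickle

def save_obj(obj, path):
    with open(path + ".pkl", "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    with open(path + ".pkl", "rb") as f:
        return pickle.load(f)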
def getAllDrugSet():
    # Collect drug-name and drug-combination counts across all data dirs.
    dirs = glob.glob("%s/*" % params.FADER_DIR)
    drugNameSet = dict()
    drugCombSet = dict()
    dMap = loadValidDrugMap()
    nSize = len(dMap)
    print("DMAP SIZE: ", nSize)
    for caseDir in dirs:
        path = getDrugFile(caseDir)
        assert os.path.isfile(path)
        getDrugSet(path, drugNameSet, drugCombSet, dMap)
    print("Saving...")
    utils.save_obj(drugNameSet, "%s/FDrugNameCount_%s" % (params.FADER_OUT, nSize))
    utils.save_obj(drugCombSet, "%s/FDrugCombCount_%s" % (params.FADER_OUT, nSize))
def dieAndKillChildren(parent, pairs, stats):
    from utils.utils import save_obj
    save_obj(stats,
             os.path.join(args.result_prefix, f"results_{unique_run_id}"),
             "pinsky_stats")
    # [pair.env.close() for pair in pairs]
    path = os.path.join(parent.root, parent.subfolders['alive_signals'])
    for a in os.listdir(path):
        os.remove(os.path.join(path, a))
        # Replace each alive signal with a <name>.done flag so children exit.
        parent.placeChildFlag(os.path.join(path, a) + '.done')
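# The POET-style code above uses a three-argument save_obj(obj, folder, name),
# unlike utils.save_obj(obj, path) elsewhere in this dump. A plausible sketch
# inferred from the call sites (an assumption, not the original):
import os
import pickle

def save_obj(obj, folder, name):
    os.makedirs(folder, exist_ok=True)
    with open(os.path.join(folder, name + ".pkl"), "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)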
def statsCommonSes():
    # Keep side effects reported more than 20 times as the "top" SE list.
    fin = open("%s/CADER.txt" % params.CAD_OUT)
    dSeCount = dict()
    while True:
        line = fin.readline()
        if line == "":
            break
        parts = line.strip().split("$")
        for se in parts[-1].split(","):
            utils.add_dict_counter(dSeCount, se)
    fin.close()
    ks = [k for k, v in utils.sort_dict(dSeCount) if v > 20]
    utils.save_obj(ks, "%s/SeTopList.txt" % params.CAD_OUT)
def exportSub():
    # Split FADER records into files by combination size (SUB/<n>) and count
    # side effects per combination size; same layout as exportSubG2 above,
    # minus the pair expansion.
    fin = open("%s/FDrug2SeList_19814.txt" % params.FADER_OUT)
    foutDict = dict()
    dlen2SeCount = dict()
    nA = 0
    print("Reading...")
    while True:
        line = fin.readline()
        if line == "":
            break
        nA += 1
        print("\r%s" % nA, end="")
        parts = line.strip().split("$")
        drugCmb, ses = parts[0], parts[1]
        drugs = drugCmb.split(",")
        nD = len(drugs)
        sortNames = ",".join(sorted(drugs))
        fO = utils.get_dict(foutDict, nD, -1)
        if fO == -1:
            fO = open("%s/SUB/%s" % (params.FADER_OUT, nD), "w")
            foutDict[nD] = fO
        fO.write("%s$%s\n" % (sortNames, ses))
        len2SeCount = utils.get_insert_key_dict(dlen2SeCount, nD, dict())
        for se in ses.split(","):
            utils.add_dict_counter(len2SeCount, se)
    fin.close()
    for _, fO in foutDict.items():
        fO.close()
    dSize2Ses = dict()
    for k, seCounter in dlen2SeCount.items():
        dSize2Ses[k] = [se for se, _ in utils.sort_dict(seCounter)]
    utils.save_obj(dSize2Ses, "%s/SUB/drugSize2CommonSEs" % params.FADER_OUT)
def exportDrugCom2Side():
    # Aggregate, for each drug combination, how often each side effect is
    # reported, and write "combo:count$nSEs$se1:c1,se2:c2,..." lines.
    fin = open("%s/JADER.txt" % params.JADER_OUT)
    fout = open("%s/JADER2AllSeList.txt" % params.JADER_OUT, "w")
    dDrugComb2Se = dict()
    dDrugCombCount = dict()
    dDrugCom2Length = dict()
    drugCount = dict()
    seCount = dict()
    while True:
        line = fin.readline()
        if line == "":
            break
        parts = line.strip().split("$")
        drugCom = parts[0]
        dDrugCom2Length[drugCom] = len(drugCom.split(","))
        ses = parts[1].split(",")
        utils.add_dict_counter(dDrugCombCount, drugCom, 1)
        for drug in drugCom.split(","):
            utils.add_dict_counter(drugCount, drug, 1)
        sesComb = utils.get_insert_key_dict(dDrugComb2Se, drugCom, dict())
        for se in ses:
            utils.add_dict_counter(sesComb, se, 1)
            utils.add_dict_counter(seCount, se)
    fin.close()
    for drugCom, nReports in utils.sort_dict(dDrugCombCount):
        seStrings = ["%s:%s" % (se, count)
                     for se, count in utils.sort_dict(dDrugComb2Se[drugCom])]
        fout.write("%s:%s$%s$%s\n" % (drugCom, nReports, len(seStrings),
                                      ",".join(seStrings)))
    fout.close()
    utils.save_obj(seCount, "%s/JADERSeCountFX" % params.JADER_OUT)
    utils.save_obj(dDrugCom2Length, "%s/DrugCombLength" % params.JADER_OUT)
    print(len(drugCount), len(seCount))
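# Each output line looks like "d1,d2:12$3$seA:5,seB:4,seC:3". A minimal
# parser for that format (illustrative; parseComboLine is not in the original):
def parseComboLine(line):
    head, nSe, seField = line.rstrip("\n").split("$")
    # rsplit tolerates commas and colons inside drug / side-effect names.
    drugCom, nReports = head.rsplit(":", 1)
    seCounts = {}
    for item in seField.split(","):
        se, c = item.rsplit(":", 1)
        seCounts[se] = int(c)
    assert int(nSe) == len(seCounts)
    return drugCom, int(nReports), seCounts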
def cluster_experiments(learner, criterion, name):
    # Run the active-learning experiment with instances from 0..4 of the 4
    # clusters initially labeled.
    for clusters in range(0, 5):
        print('-' * 8,
              'instances from {}/4 clusters initially labeled'.format(clusters),
              '-' * 8)
        torch.manual_seed(manual_seed)
        # Reseed so the initially labeled sets are equal in each round.
        random.seed(manual_seed)
        initially_labeled = [initial_indices(clusters)
                             for _ in range(kwargs['rounds'])]
        out = experiment(learner, criterion=criterion,
                         initially_labeled=initially_labeled, **kwargs)
        utils.save_obj(out, result_folder + name + '{}'.format(clusters))
def createLearningDeltas(exp):
    # For each level directory, measure how many poet iterations passed
    # between the level's first optimization score and its first recorded win.
    deltas = {}
    for d in os.listdir('.'):
        try:
            lvl = int(d)
        except ValueError:
            continue
        files = os.listdir(f'./{lvl}')
        opt = [f for f in files if 'scores' in f]
        wins = [f for f in files if 'win' in f]
        sortedWins = sorted(wins,
                            key=lambda x: int(x.split('poet')[1].split('.')[0]))
        sortedOpt = sorted(opt,
                           key=lambda x: int(x.split('poet')[1].split('_')[0]))
        birth = int(sortedOpt[0].split('poet')[1].split('_')[0])
        try:
            firstVictory = int(sortedWins[0].split('poet')[1].split('.')[0])
        except IndexError:
            # This level was never solved.
            firstVictory = np.inf
        deltas[lvl] = firstVictory - birth
    save_obj(deltas, '.', f'{exp}.learningDelta')
def getAllDrugSEMap():
    # Map each case's drugs to its validated side effects, writing one line
    # per case and counting side-effect frequencies.
    dirs = glob.glob("%s/*" % params.FADER_DIR)
    validSes = loadValidSEs()
    nSE = len(validSes)
    fout = open("%s/FDrug2SeList_%s.txt" % (params.FADER_OUT, nSE), "w")
    dMap = loadValidDrugMap()
    assert len(dMap) > 0
    nSize = len(dMap)
    print("DMAP SIZE: ", nSize)
    seCount = dict()
    for caseDir in dirs:
        pathDrug = getDrugFile(caseDir)
        assert os.path.isfile(pathDrug)
        pathSE = getSEFile(caseDir)
        caseSEMap = getSideEffectSet(pathSE, seCount, validSes)
        getDrugSEMappingFile(pathDrug, fout, dMap, caseSEMap)
    print("Saving...")
    utils.save_obj(seCount, "%s/FSECount_%s_%s" % (params.FADER_OUT, nSize, nSE))
    fout.close()
def exportOData():
    # Convert JADER records (drugs, indications, side effects) to id triples.
    dDrug2Id, _ = loadDictName2Id("%s/%sADrugs.txt" % (OUT_DIR, PREF))
    dInd2Id, _ = loadDictName2Id("%s/%sAInd.txt" % (OUT_DIR, PREF))
    dSe2Id, _ = loadDictName2Id("%s/%sASe.txt" % (OUT_DIR, PREF))
    fin = open("%s/JADERInd.txt" % OUT_DIR)
    dList = []
    while True:
        line = fin.readline()
        if line == "":
            break
        parts = line.strip().split("$")
        drugs = parts[0].split(",")
        inds = parts[2].split(",")
        ses = parts[-1].split(",")
        if len(drugs) > 20:
            continue
        # Keep only names present in the id dictionaries.
        drugIds, indIds, seIds = [], [], []
        for drug in drugs:
            drugId = utils.get_dict(dDrug2Id, drug, -1)
            if drugId != -1:
                drugIds.append(drugId)
        for ind in inds:
            indId = utils.get_dict(dInd2Id, ind, -1)
            if indId != -1:
                indIds.append(indId)
        for se in ses:
            seId = utils.get_dict(dSe2Id, se, -1)
            if seId != -1:
                seIds.append(seId)
        dList.append([drugIds, indIds, seIds])
    fin.close()
    utils.save_obj(dList, "%s/DataDump.o" % OUT_DIR)
def createChildTask(self, run_id, work_dict, worker_id, task_id,
                    poet_loop_counter, **kwargs):
    """
    :param work_dict: dict of nns, envs, nn_ids, env_ids
    :param worker_id: child id (int)
    :param task_id: ADP task type
    :param poet_loop_counter: poet loop number
    """
    work = {
        'run_id': run_id,
        'nns': work_dict['nn'],
        'lvls': work_dict['env'],
        'task_id': task_id,
        'chromosome_ids': work_dict['nn_id'],
        'env_ids': work_dict['env_id'],
        'diff': work_dict['diff'],
        'poet': poet_loop_counter,
        'kwargs': kwargs
    }
    save_obj(work,
             os.path.join(self.root, self.subfolders['send_to_child']),
             f'child{worker_id}')
    # Consume the child's availability signal; wait once if it is not
    # there yet.
    available = os.path.join(self.root, self.subfolders['available_signals'],
                             f'{worker_id}.txt')
    if not os.path.exists(available):
        time.sleep(10)
    if os.path.exists(available):
        os.remove(available)
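# A hypothetical sketch of the matching child-side loop (the worker that
# consumes child<worker_id> tasks and replies via returnAnswer below). The
# helper names, load_obj signature, and polling interval are illustrative
# assumptions, not the original code:
def childLoop(self):
    inbox = os.path.join(self.root, self.subfolders['send_to_child'])
    signals = os.path.join(self.root, self.subfolders['available_signals'])
    while True:
        task_file = os.path.join(inbox, f'child{self.id}.pkl')
        if not os.path.exists(task_file):
            time.sleep(1)
            continue
        work = load_obj(inbox, f'child{self.id}')  # mirrors 3-arg save_obj
        os.remove(task_file)
        answer = self.runTask(work)  # e.g. evaluate 'nns' against 'lvls'
        self.returnAnswer(answer)
        # Signal the parent that this worker is free again.
        open(os.path.join(signals, f'{self.id}.txt'), 'w').close()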
output_path = "/home/upf/corpora/IberLEF2019/multitask"
irosva_path = "/home/upf/corpora/IberLEF2019/IroSva/preprocessed_data"
mexa3t_path = "/home/upf/corpora/IberLEF2019/MEX-A3T/preprocessed_data"
haha_path = "/home/upf/corpora/IberLEF2019/HAHA/preprocessed_data"
tass_path = "/home/upf/corpora/IberLEF2019/TASS/preprocessed_data"

tasks = ["irosva", "haha", "mexa3t"]
tag = "_".join(sorted(tasks))

# Merge the per-task word indexes into a single multitask vocabulary.
word_index_files = [
    os.path.join(irosva_path, "word_index_all.txt"),
    os.path.join(haha_path, "word_index_train.txt"),
    os.path.join(mexa3t_path, "word_index_all.txt"),
]
word_index = get_word_index_from_files(word_index_files)

word_index_all_path = os.path.join(output_path, "word_index_" + tag + ".txt")
write_word_index(word_index, word_index_all_path)

# Pickle the merged index, then reload it (round-trip check).
word_index_all_path = os.path.join(output_path, "word_index_" + tag + ".pkl")
save_obj(word_index, word_index_all_path)
word_index = load_obj(word_index_all_path)
def returnAnswer(self, answer):
    path = os.path.join(self.root, self.subfolders['send_to_parent'])
    save_obj(answer, path, f'answer{self.id}')
# emb_matrix_filename = "emb_matrix_fb_"
# emb_matrix_path = os.path.join(output_path, emb_matrix_filename + ".pkl")
# emb_matrix = load_obj(emb_matrix_path)

word_index_path = os.path.join(output_path, "word_index_all.pkl")
word_index = load_obj(word_index_path)

# Regional Spanish embeddings (es-MX, 100d); swap in the 300d SWM model
# below if needed.
dim = 100
w2v_path = "/home/abravo/corpora/IberLEF2019/regional_emb/es-MX-100d.vec"
# dim = 300
# w2v_path = "/home/abravo/corpora/IberLEF2019/regional_emb/model_swm_300-6-10-low_es.w2v"
emb_matrix = get_embedding_matrix(word_index, dim, w2v_path)

emb_matrix_path = os.path.join(output_path, "emb_matrix" + ".pkl")
save_obj(emb_matrix, emb_matrix_path)

MODE_BOTH = True
if MODE_BOTH:
    data_path = os.path.join(output_path, "data_train.pkl")
    data = load_obj(data_path)
    labels_path = os.path.join(output_path, "labels_train.pkl")
    labels = load_obj(labels_path)
    scores_path = os.path.join(output_path, "scores_train.pkl")
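# get_embedding_matrix is not shown. A minimal sketch of the standard pattern
# (assumptions: word_index maps token -> 1-based row, row 0 stays zero for
# padding, and the .vec file is whitespace-separated "token v1 ... vdim"):
import numpy as np

def get_embedding_matrix(word_index, dim, w2v_path):
    emb_matrix = np.zeros((len(word_index) + 1, dim), dtype=np.float32)
    with open(w2v_path, encoding="utf-8") as f:
        for line in f:
            values = line.rstrip().split(" ")
            if len(values) != dim + 1:
                continue  # skip the header line some .vec files start with
            idx = word_index.get(values[0])
            if idx is not None:
                emb_matrix[idx] = np.asarray(values[1:], dtype=np.float32)
    return emb_matrix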
join_emb.eval()

dataset = TextDataset(args.data_path, args.dictionary)
print("Dataset size: ", len(dataset))
dataset_loader = DataLoader(dataset, batch_size=args.batch_size,
                            num_workers=3, pin_memory=True,
                            collate_fn=collate_fn_cap_padded)

caps_enc = list()
print("### Starting sentence embedding ###")
end = time.time()
for i, (caps, length) in enumerate(dataset_loader, 0):
    input_caps = caps.to(device)
    with torch.no_grad():
        # Text-only forward pass: the image branch gets None.
        _, output_emb = join_emb(None, input_caps, length)
    caps_enc.append(output_emb.cpu().data.numpy())
    if i % 100 == 99:
        print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) +
              " captions encoded - Time per batch: " +
              str(time.time() - end) + "s")
        end = time.time()

print("Processing done -> saving")
caps_stack = np.vstack(caps_enc)
save_obj(caps_stack, args.output_path)
print("The data has been saved to ", args.output_path)
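# collate_fn_cap_padded is not shown. A minimal sketch of a padding collate
# function matching the (caps, length) unpacking above (an assumption:
# each dataset item is a 1-D LongTensor of token ids):
import torch

def collate_fn_cap_padded(batch):
    lengths = [cap.size(0) for cap in batch]
    # Zero-pad every caption to the longest one in the batch.
    padded = torch.zeros(len(batch), max(lengths), dtype=torch.long)
    for i, cap in enumerate(batch):
        padded[i, :cap.size(0)] = cap
    return padded, lengths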
COUNT_OF_FILES = 40

def get_reader():
    # Stream protobuf documents from stdin.
    return DocumentStreamReader(sys.stdin)

def create_indexes(encoding):
    reader = get_reader()
    indexer = Indexer(COUNT_OF_FILES, encoding)
    for doc in reader:
        if doc.HasField('text'):
            indexer.add_document_indexes(text=doc.text, url=doc.url)
    return indexer

if __name__ == "__main__":
    encoding = sys.argv[1]
    indexer = create_indexes(encoding)
    # Persist each partial reverse index, then the document table.
    for key, r_index in indexer.full_index.iteritems():
        if key != "encoding":
            save_obj(r_index, "indexer_" + str(key))
    save_obj(indexer.documents, "documents")
nlp = None
tweet_col = 1
label_col = 2

if SAVE_WORD_INDEX:
    if not nlp:
        nlp = get_spacy_nlp('es_core_news_md', True)
    all_files = [preproc_train_path, preproc_test_path]
    word_index = get_word_index(nlp, all_files, True, tweet_col)
    word_index_all_path = os.path.join(output_path, "word_index_all.txt")
    write_word_index(word_index, word_index_all_path)
    word_index_all_path = os.path.join(output_path, "word_index_all.pkl")
    save_obj(word_index, word_index_all_path)

word_index_path = os.path.join(output_path, "word_index_all.pkl")
word_index = load_obj(word_index_path)

if SAVE_CHAR_INDEX:
    if not nlp:
        nlp = get_spacy_nlp('es_core_news_md', True)
    all_files = [preproc_train_path]
    char_index = get_word_index(nlp, all_files, True, tweet_col, True)
    char_index_all_path = os.path.join(output_path, "char_index_train.txt")
    write_word_index(char_index, char_index_all_path)
    char_index_all_path = os.path.join(output_path, "char_index_train.pkl")
    save_obj(char_index, char_index_all_path)
                            batch_size=args.batch_size,
                            num_workers=6, pin_memory=True)

imgs_enc = list()
print("### Starting image embedding ###")
end = time.time()
for i, imgs in enumerate(dataset_loader, 0):
    input_imgs = imgs.to(device)
    with torch.no_grad():
        # Image-only forward pass: the caption branch gets None.
        output_emb, _ = join_emb(input_imgs, None, None)
    imgs_enc.append(output_emb.cpu().data.numpy())
    if i % 100 == 99:
        print(str((i + 1) * args.batch_size) + "/" + str(len(dataset)) +
              " images encoded - Time per batch: " +
              str(time.time() - end) + "s")
        end = time.time()

print("Processing done -> saving")
imgs_stack = np.vstack(imgs_enc)
save_obj((imgs_stack, dataset.get_image_list()), args.output_path)
print("The data has been saved to ", args.output_path)
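# Example downstream use of the saved embeddings (illustrative only: the
# paths are hypothetical, and it assumes the caption and image embeddings
# live in the same joint space and are L2-normalized by the model):
import numpy as np

imgs_stack, image_list = load_obj("imgs_embeddings.pkl")  # hypothetical path
caps_stack = load_obj("caps_embeddings.pkl")              # hypothetical path

# For normalized vectors, cosine similarity reduces to a dot product.
sims = caps_stack @ imgs_stack.T
best_img_per_cap = sims.argmax(axis=1)
print(image_list[best_img_per_cap[0]])  # top-ranked image for caption 0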
tweet_col = 0
label_col = 1

if SAVE_WORD_INDEX:
    if not nlp:
        nlp = get_spacy_nlp('es_core_news_md', True)
    all_files = [preproc_train_path, preproc_test_path]
    word_index = get_word_index(nlp, all_files, True, tweet_col, False)
    word_index_all_path = os.path.join(output_path, "word_index_all.txt")
    write_word_index(word_index, word_index_all_path)
    word_index_all_path = os.path.join(output_path, "word_index_all.pkl")
    save_obj(word_index, word_index_all_path)
    print("WORD INDEX PROCESSED!")

word_index_all_path = os.path.join(output_path, "word_index_all.pkl")
word_index = load_obj(word_index_all_path)

if SAVE_CHAR_INDEX:
    if not nlp:
        nlp = get_spacy_nlp('es_core_news_md', True)
    all_files = [preproc_train_path, preproc_test_path]
    char_index = get_word_index(nlp, all_files, True, tweet_col, True)
    char_index_all_path = os.path.join(output_path, "char_index_all.txt")
    write_word_index(char_index, char_index_all_path)
def exportPolySes():
    # Index drugs and side effects, drop combinations larger than MAX_N_DRUG,
    # and write K_FOLD train/validation/test splits.
    dDrug = dict()
    dSe = dict()
    dDrugComb2Ses = dict()
    fin = open("%s/PolySes.txt" % params.FADER_OUT)
    while True:
        line = fin.readline()
        if line == "":
            break
        parts = line.strip().split("\t")
        drugCom = parts[0]
        drugs = drugCom.split(",")
        if len(drugs) > params.MAX_N_DRUG:
            continue
        for drug in drugs:
            utils.get_update_dict_index(dDrug, drug)
        ses = parts[1].split(",")
        for se in ses:
            utils.get_update_dict_index(dSe, se)
        dDrugComb2Ses[drugCom] = ses
    fin.close()
    nDrug, nSe, nComb = len(dDrug), len(dSe), len(dDrugComb2Ses)
    print("Drugs, Ses, Comb: ", nDrug, nSe, nComb)

    fout = open("%s/PolySe_%s" % (params.FADER_OUT, params.MAX_N_DRUG), "w")
    kvs = []
    for drugCom, ses in dDrugComb2Ses.items():
        fout.write("%s\t%s\n" % (drugCom, ",".join(ses)))
        kvs.append([drugCom.split(","), ses])
    fout.close()

    random.shuffle(kvs)
    SEG_SIZE = int(nComb / params.K_FOLD)
    for iFold in range(params.K_FOLD):
        print("Generating fold...", iFold)
        tests, validates, trains = [], [], []
        startTest = iFold * SEG_SIZE
        endTest = min((iFold + 1) * SEG_SIZE, nComb)
        # The validation segment follows the test segment, wrapping back to
        # the start of the list for the last fold.
        startValid = 0 if iFold == params.K_FOLD - 1 else endTest
        endValid = min(startValid + SEG_SIZE, nComb)
        for j, kv in enumerate(kvs):
            if startTest <= j < endTest:
                tests.append(kv)
            elif startValid <= j < endValid:
                validates.append(kv)
            else:
                trains.append(kv)
        utils.save_obj((dDrug, dSe, trains, tests, validates),
                       "%s/_%s" % (params.FADER_KFOLD, iFold))
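# Quick illustration of the fold boundaries above (hypothetical numbers:
# nComb = 10, K_FOLD = 3, so SEG_SIZE = 3); the test/validation windows are
#   fold 0: test [0, 3)  valid [3, 6)
#   fold 1: test [3, 6)  valid [6, 9)
#   fold 2: test [6, 9)  valid [0, 3)   (last fold wraps validation to 0)
# with everything else going to train. Note index 9 is never tested when
# nComb is not divisible by K_FOLD, a quirk of the integer SEG_SIZE.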