# Assumes module-level imports available in the original module: os, time, db_util,
# parse_features_rfex, getPairwiseInfo, smtp, and the helper process_feature_alias().
def process_pairwise_edges(dataset_label, matrixfile, pairwised_file, pvlambda, config, results_path, do_pubcrawl, contacts, keep_unmapped, featureInterestingFile):
    """
    Include edges whose nodes are in the original feature set. Direction does not
    matter, so the edge A->B is not populated if B->A is already in the hash.
    Expected tab-delimited columns are nodeA nodeB pvalue correlation numNonNA.
    """
    edges_hash = {}
    max_pv = -1000.0
    max_pv_corr = -1000.0
    mydb = db_util.getDBSchema(config)  #config.get("mysql_jdbc_configs", "db")
    myuser = db_util.getDBUser(config)  #config.get("mysql_jdbc_configs", "username")
    mypw = db_util.getDBPassword(config)  #config.get("mysql_jdbc_configs", "password")
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    mysolr = db_util.getSolrPath(config)
    edges_file = open(pairwised_file)
    fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)
    edge_table = mydb + ".mv_" + dataset_label + "_feature_networks"
    efshout = open(results_path + 'load_edges_' + dataset_label + '.sh', 'w')
    solrshout = open(results_path + 'load_solr_' + dataset_label + '.sh', 'w')
    edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_pw_re.tsv', 'w')
    edges_out_pc = open(results_path + 'edges_out_' + dataset_label + '_pw_pc.tsv', 'w')
    edges_meta_json = open(results_path + 'edges_out_' + dataset_label + '_meta.json', 'w')
    unmappedPath = results_path + 'edges_out_' + dataset_label + '_pw_unmapped.tsv'
    unmappedout = open(unmappedPath, 'w')
    features_file = open(results_path + dataset_label + '_features_out.tsv', 'r')
    features_hash = {}
    for fl in features_file.readlines():
        ftk = fl.strip().split("\t")
        features_hash[ftk[1]] = ftk
    features_file.close()
    validEdgeId = 1
    invalidEdges = 0
    dupeEdges = 0
    totalEdges = 0
    cnan = 0
    pcc = 0
    unMapped = 0
    for line in edges_file:
        totalEdges += 1
        line = line.strip()
        tokens = line.split('\t')
        if (len(tokens) < 11):
            if (validEdgeId == 1):
                print "Skipping header/line 1 for insufficient token reasons"
                continue
            print "ERROR: requires 11 tokens, found:" + str(len(tokens)) + " Skipping line\n" + line
            continue
        nodeA = tokens[0]
        nodeB = tokens[1]
        try:
            f1genescore = fIntHash[nodeA]
        except KeyError:
            f1genescore = 0
        try:
            f2genescore = fIntHash[nodeB]
        except KeyError:
            f2genescore = 0
        if (db_util.isUnmappedAssociation(nodeA, nodeB) and keep_unmapped == 0):
            unmappedout.write(nodeA + "\t" + nodeB + "\n")
            unMapped += 1
            continue
        #nodeA = nodeA.replace('|', '_')
        #nodeB = nodeB.replace('|', '_')
        try:
            features_hash[nodeA]
        except KeyError:
            print "key error in resolving featureId for " + nodeA + " skipping edge."
            continue
        try:
            features_hash[nodeB]
        except KeyError:
            print "key error in resolving featureId for " + nodeB + " skipping edge."
            continue
        if (features_hash[nodeA] and features_hash[nodeB]):
            # Direction does not matter: skip this edge if either A_B or B_A was already stored
            if (not edges_hash.get(nodeA + "_" + nodeB) and not edges_hash.get(nodeB + "_" + nodeA)):
                feature1id = ""  #str(features_hash[nodeA])
                feature2id = ""  #str(features_hash[nodeB])
                # This will need to be improved once all pairs have annotations
                try:
                    feature1id = str(features_hash[nodeA][0])
                except KeyError:
                    print "ERROR: key error in resolving featureId for " + nodeA
                try:
                    feature2id = str(features_hash[nodeB][0])
                except:
                    print "ERROR: key error in resolving featureId for " + nodeB
                edges_hash[nodeA + "_" + nodeB] = validEdgeId
                validEdgeId += 1
                dataA = process_feature_alias(nodeA)
                label1_desc = ""
                dataB = process_feature_alias(nodeB)
                label2_desc = ""
                if (len(dataA) == 7):
                    dataA.append("")
                    nodeA = nodeA + ":"
                if (len(dataB) == 7):
                    dataB.append("")
                    nodeB = nodeB + ":"
                correlation_str = tokens[2]
                try:
                    correlation = float(correlation_str)
                except ValueError:
                    # Align correlation value to NaN
                    cnan += 1
                    correlation = float('nan')
                    correlation_str = ''
                numna = tokens[3]
                pv_str = tokens[4]
                bonf = tokens[5]
                pv_bonf_str = tokens[6]
                numnaf1 = tokens[7]
                pvf1_str = tokens[8]
                numnaf2 = tokens[9]
                pvf2_str = tokens[10]
                try:
                    pv = str(pvlambda(float(pv_str)))
                    pv_bonf = str(pvlambda(float(pv_bonf_str)))
                    pvf1 = str(pvlambda(float(pvf1_str)))
                    pvf2 = str(pvlambda(float(pvf2_str)))
                except ValueError:
                    # Error in the pairwise script; ignore these associations for now
                    continue
                if (float(pv) > max_pv):
                    max_pv = float(pv)
                if (float(pv_bonf) > max_pv_corr):
                    max_pv_corr = float(pv_bonf)
                rho = str(db_util.sign(correlation) * abs(float(pv)))
                link_distance = 500000000
                if (len(tokens) >= 12):
                    link_distance = int(tokens[11])
                else:
                    if (len(dataA) >= 5 and len(dataB) >= 5 and db_util.is_numeric(dataA[4]) >= 1 and db_util.is_numeric(dataB[4]) >= 1 and dataA[3] == dataB[3]):
                        link_distance = abs(int(dataB[4]) - int(dataA[4]))
                edges_out_re.write(feature1id + "\t" + feature2id + "\t" + nodeA + "\t" + "\t".join(dataA) + "\t" + nodeB + "\t" + "\t".join(dataB) + "\t" + correlation_str + "\t" + numna + "\t" + pv + "\t" + bonf + "\t" + pv_bonf + "\t" + numnaf1 + "\t" + pvf1 + "\t" + numnaf2 + "\t" + pvf2 + "\t" + rho + "\t" + str(link_distance) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\n")
                if (do_pubcrawl == "yes"):
                    # Call Andrea's code
                    getPairwiseInfo.processLine(line, edges_out_pc)
                    pcc += 1
            else:
                print "duplicated edge:" + nodeA + "_" + nodeB
                dupeEdges += 1
        else:
            print "invalid edge nodeA and nodeB not in features:" + nodeA + "_" + nodeB
            invalidEdges += 1
    print "Report: Valid Edges %i Duped %i cNAN %i \nunMapped %i Saved to %s \nTotal %i max_pvalue %f max_pvalue_corr %f" % (validEdgeId - 1, dupeEdges, cnan, unMapped, unmappedPath, totalEdges, max_pv, max_pv_corr)
    edges_meta_json.write('{"max_logpv":%f}' % (max_pv))
    edges_file.close()
    edges_out_re.close()
    edges_out_pc.close()
    edges_meta_json.close()
    unmappedout.close()
    efshout.write("#!/bin/bash\n")
    efshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" % (myhost, myport, myuser, mypw, mydb))
    efshout.write("load data local infile '" + edges_out_re.name + "' replace INTO TABLE " + edge_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    efshout.write("\ncommit;")
    efshout.write("\nEOFMYSQL")
    efshout.close()
    print "Begin pairwise db bulk upload " + time.ctime()
    os.system("sh " + efshout.name)
    # Create sharded association files for solr import: one delete-by-query and one CSV
    # import per core (core0..core7), preserving the original command order.
    solrshout.write("#!/bin/bash\n")
    solrshout.write("python createPWShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n")
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) + "/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) + "/update/csv?commit=true&separator=%09&overwrite=false&escape=\\ ' --data-binary @" + edges_out_re.name + "_core" + str(core) + "_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.close()
    print "Begin pairwise solr upload " + time.ctime()
    os.system("sh " + solrshout.name)
    if (do_pubcrawl == "yes"):
        print "Sending PubCrawl notification to " + contacts
        smtp.main("*****@*****.**", contacts, "Notification - New Pairwise Associations for PubCrawl", "New pairwise associations ready for PubCrawl load\n" + edges_out_pc.name + "\n\n" + str(pcc) + " Total Edges\n\n" + edges_out_re.name + " loaded into RegulomeExplorer, dataset label is " + dataset_label + " \n\n")
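# --- Hedged usage sketch (added for illustration; not part of the original pipeline) ---
# process_pairwise_edges() takes a tab-delimited pairwise file plus a callable `pvlambda`
# that is applied to every raw p-value; a -log10 transform is one plausible choice, matching
# the '{"max_logpv": ...}' metadata written above. The dataset label, file paths, contact
# address and the `config` object below are placeholders, not values from the original code.
def _example_run_pairwise(config):
    import math
    neg_log10 = lambda p: -math.log10(p) if p > 0 else 0.0
    process_pairwise_edges(
        dataset_label='demo_dataset',
        matrixfile='demo_matrix.tsv',            # accepted but not read by this function
        pairwised_file='demo_pairwise.tsv',      # 11+ tab-separated columns, parsed in the loop above
        pvlambda=neg_log10,
        config=config,                           # config object understood by db_util.getDB*()
        results_path='./results/',               # directory must already exist
        do_pubcrawl='no',
        contacts='analyst@example.org',
        keep_unmapped=0,
        featureInterestingFile='interesting_features.tsv')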
def Email(adjuntos):
    # Send the attachments ("adjuntos") via the smtp helper and show a confirmation dialog.
    smtp.main(adjuntos)
    gui.msgbox('Mensaje Enviado')  # "Message sent"
# Assumes module-level imports from the original training script: time, numpy as np, torch,
# torch.nn as nn, torch.backends.cudnn as cudnn, DataLoader, StepLR, SummaryWriter, Path,
# cfg, smtp, plus the project helpers get_args, my_model, FaceDataset_ceface, train, validate.
def main(mydict):
    print("Training start time:")
    start_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print(start_time)
    # Extra command-line arguments for the script
    args = get_args()
    # Parameters passed in through main()
    my_data_dir = mydict["data_dir"]
    my_tensorboard = mydict["tensorboard"]
    my_checkpoint = mydict["checkpoint"]
    my_ifSE = mydict["ifSE"]
    my_l1loss = mydict["l1loss"]
    if my_l1loss:
        l1loss = 0.1  # 0.1  # l1loss = my_l1value
    else:
        l1loss = 0.0
    if args.opts:
        cfg.merge_from_list(args.opts)
    cfg.freeze()
    start_epoch = 0
    # checkpoint_dir = Path(args.checkpoint)
    checkpoint_dir = Path(my_checkpoint)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)  # create model_dir
    print("=> creating model_dir '{}'".format("se_resnext50_32x4d"))
    # model = get_model(model_name="se_resnext50_32x4d")
    model = my_model(my_ifSE)
    if cfg.TRAIN.OPT == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=cfg.TRAIN.LR,
                                    momentum=cfg.TRAIN.MOMENTUM,
                                    weight_decay=cfg.TRAIN.WEIGHT_DECAY)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=cfg.TRAIN.LR)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    # Optionally resume from a checkpoint
    resume_path = args.resume
    if resume_path:
        print(Path(resume_path).is_file())
        if Path(resume_path).is_file():
            print("=> loading checkpoint '{}'".format(resume_path))
            checkpoint = torch.load(resume_path, map_location="cpu")
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(resume_path, checkpoint['epoch']))
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        else:
            print("=> no checkpoint found at '{}'".format(resume_path))
    if args.multi_gpu:
        model = nn.DataParallel(model)
    if device == "cuda":
        cudnn.benchmark = True
    # Loss criterion
    criterion = nn.CrossEntropyLoss().to(device)
    train_dataset = FaceDataset_ceface(my_data_dir, "train", img_size=cfg.MODEL.IMG_SIZE,
                                       augment=True, age_stddev=cfg.TRAIN.AGE_STDDEV)
    train_loader = DataLoader(train_dataset, batch_size=cfg.BATCH_SIZE, shuffle=True,
                              num_workers=cfg.TRAIN.WORKERS, drop_last=True)
    val_dataset = FaceDataset_ceface(my_data_dir, "valid", img_size=cfg.MODEL.IMG_SIZE, augment=False)
    val_loader = DataLoader(val_dataset, batch_size=cfg.BATCH_SIZE, shuffle=False,
                            num_workers=cfg.TRAIN.WORKERS, drop_last=False)
    scheduler = StepLR(optimizer, step_size=cfg.TRAIN.LR_DECAY_STEP,
                       gamma=cfg.TRAIN.LR_DECAY_RATE, last_epoch=start_epoch - 1)
    best_val_mae = 10000.0
    train_writer = None
    val_mae_list = []
    train_loss_list = []
    val_loss_list = []
    if my_tensorboard is not None:
        opts_prefix = "_".join(args.opts)
        train_writer = SummaryWriter(log_dir=my_tensorboard + "/" + opts_prefix + "_train")
        val_writer = SummaryWriter(log_dir=my_tensorboard + "/" + opts_prefix + "_val")
    for epoch in range(start_epoch, 80):  # cfg.TRAIN.EPOCHS
        # train
        train_loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, device, l1loss)
        train_loss_list.append(train_loss)
        # validate
        val_loss, val_acc, val_mae = validate(val_loader, model, criterion, epoch, device, l1loss)
        val_mae_list.append(val_mae)
        val_loss_list.append(val_loss)
        if my_tensorboard is not None:
            train_writer.add_scalar("loss", train_loss, epoch)
            train_writer.add_scalar("acc", train_acc, epoch)
            val_writer.add_scalar("loss", val_loss, epoch)
            val_writer.add_scalar("acc", val_acc, epoch)
            val_writer.add_scalar("mae", val_mae, epoch)
        # Note: the "or val_mae > 0" condition makes this branch run every epoch,
        # so best_val_mae tracks the latest epoch; checkpoints are still gated on val_mae < 4.0.
        if val_mae < best_val_mae or val_mae > 0:
            print(f"=> [epoch {epoch:03d}] best val mae was improved from {best_val_mae:.3f} to {val_mae:.3f}")
            best_val_mae = val_mae
            # checkpoint
            if val_mae < 4.0:
                model_state_dict = model.module.state_dict() if args.multi_gpu else model.state_dict()
                torch.save(
                    {
                        'epoch': epoch + 1,
                        'arch': cfg.MODEL.ARCH,
                        'state_dict': model_state_dict,
                        'optimizer_state_dict': optimizer.state_dict()
                    },
                    str(checkpoint_dir.joinpath("epoch{:03d}_{:.5f}_{:.4f}.pth".format(epoch, val_loss, val_mae))))
        else:
            print(f"=> [epoch {epoch:03d}] best val mae was not improved from {best_val_mae:.3f} ({val_mae:.3f})")
        # adjust learning rate
        scheduler.step()
    print("=> training finished")
    print(f"additional opts: {args.opts}")
    print(f"best val mae: {best_val_mae:.3f}")
    print("Training end time:")
    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print(end_time)
    print("Training time: " + smtp.date_gap(start_time, end_time))
    # Send a summary email
    smtp.main(
        dict_={
            "total epochs: ": cfg.TRAIN.EPOCHS,
            "training time: ": smtp.date_gap(start_time, end_time),
            "lowest val_mae: ": best_val_mae,
            "mean val_mae: ": np.array(val_mae_list).mean(),
            "val_mae_list: ": val_mae_list,
            "train_loss_list: ": train_loss_list,
            "val_loss_list: ": val_loss_list,
            "MODEL.IMG_SIZE: ": cfg.MODEL.IMG_SIZE,
            "BATCH_SIZE: ": cfg.BATCH_SIZE,
            "LOSS.l1: ": l1loss,
            "TRAIN.LR: ": cfg.TRAIN.LR,
            "TRAIN.LR_DECAY_STEP: ": cfg.TRAIN.LR_DECAY_STEP,
            "TRAIN.LR_DECAY_RATE:": cfg.TRAIN.LR_DECAY_RATE,
            "TRAIN.OPT: ": cfg.TRAIN.OPT,
            "MODEL.ARCH:": cfg.MODEL.ARCH
        })
    return best_val_mae
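# --- Added note (hedged): expected argument shape for main() ---
# main() is driven by a plain dict rather than argparse flags; the driver code below calls it as
#     main({"data_dir": ..., "tensorboard": ..., "checkpoint": ..., "ifSE": <bool>, "l1loss": <bool>})
# where "tensorboard" may be None to disable SummaryWriter logging and "l1loss": True enables the
# fixed 0.1 L1 weight set at the top of main(). data_dir, tf_log and ckpt are defined elsewhere
# in the original script and are not shown here.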
"l1loss": False }) time.sleep(180) main({ "data_dir": data_dir, "tensorboard": tf_log, "checkpoint": ckpt, "ifSE": True, "l1loss": False }) time.sleep(180) main({ "data_dir": data_dir, "tensorboard": tf_log, "checkpoint": ckpt, "ifSE": True, "l1loss": True }) time.sleep(180) main({ "data_dir": data_dir, "tensorboard": tf_log, "checkpoint": ckpt, "ifSE": True, "l1loss": True }) ########################################################################################################### end_time = smtp.print_time("全部训练结束!!!") print(smtp.date_gap(start_time, end_time)) smtp.main(dict_={"ceface全部训练耗时: ": smtp.date_gap(start_time, end_time)})
if __name__ == '__main__':
    # fgnet: train the 82 leave-one-out groups
    start_time = smtp.print_time("All training started!!!")
    fgnet_root = cfg.dataset.fgnet_leave1out
    best_val_mae_arr = []
    for i in range(1, 83):
        tmp = str(i) if i > 9 else "0" + str(i)
        data_dir = Path(fgnet_root).joinpath(tmp)
        best_val_mae_arr.append(main(str(data_dir)))
    print(f"fgnet all train finished and best_val_mae_arr is:{best_val_mae_arr}")
    end_time = smtp.print_time("All training finished!!!")
    print(smtp.date_gap(start_time, end_time))
    smtp.main(
        dict_={
            "fgnet total training time: ": smtp.date_gap(start_time, end_time),
            "best_val_mae_arr": best_val_mae_arr
        })
    time.sleep(600)  # sleep 10 min

    # fgnet_align: train the 82 leave-one-out groups
    start_time = smtp.print_time("All training started!!!")
    fgnet_align_root = cfg.dataset.fgnet_align_leave1out
    best_val_mae_arr = []
    for i in range(1, 83):
        tmp = str(i) if i > 9 else "0" + str(i)
        data_dir = Path(fgnet_align_root).joinpath(tmp)
        best_val_mae_arr.append(main(str(data_dir)))
    print(f"fgnet_align all train finished and best_val_mae_arr is:{best_val_mae_arr}")
# Assumes module-level imports available in the original module: os, sys, time, db_util,
# parse_features_rfex, getRFACEInfo, smtp.
def process_associations_rfex(dataset_label, matrixfile, associationsfile, config, annotations, collapse_direction, reverse_direction, results_path, pv_lambda, do_pubcrawl, contacts, keep_unmapped, featureInterestingFile):
    mydb = db_util.getDBSchema(config)
    myuser = db_util.getDBUser(config)
    mypw = db_util.getDBPassword(config)
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    mysolr = db_util.getSolrPath(config)
    if (not os.path.isfile(associationsfile)):
        print associationsfile + " does not exist; unrecoverable ERROR"
        sys.exit(-1)
    associations_table = mydb + ".mv_" + dataset_label + "_feature_networks"
    print "Begin processing associations %s Applying processing_pubcrawl %s" % (time.ctime(), do_pubcrawl)
    fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)
    # Note: edges_out_re and tsvout below open the same path; edges_out_re is only used for its .name.
    edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv', 'w')
    associations_in = open(associationsfile, 'r')
    annotation_hash, ftype = parse_features_rfex.process_feature_annotations(annotations)
    fshout = open(results_path + 'load_sql_associations_' + dataset_label + '.sh', 'w')
    solrshout = open(results_path + 'load_solr_assocations_' + dataset_label + '.sh', 'w')
    unmappedPath = results_path + 'edges_out_' + dataset_label + '_rface_unmapped.tsv'
    unmappedout = open(unmappedPath, 'w')
    features_file = open(results_path + dataset_label + '_features_out.tsv', 'r')
    features_hash = {}
    for fl in features_file.readlines():
        ftk = fl.strip().split("\t")
        features_hash[ftk[1]] = ftk
    features_file.close()
    aliasid_file = open(results_path + dataset_label + '_features_alias_id.tsv', 'r')
    aliasid_hash = {}
    for fl in aliasid_file.readlines():
        ftk = fl.strip().split("\t")
        aliasid_hash[ftk[0]] = ftk
    aliasid_file.close()
    tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv', 'w')
    pubcrawl_tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_pc.tsv', 'w')
    lc = 0
    edgeCount = 0
    pcc = 0
    unMapped = 0
    pvalueCutCount = 0
    impCut = 0
    lines = associations_in.readlines()
    associations_in.close()
    associations_dic = {}
    for line in lines:
        lc = lc + 1
        columns = line.strip().split('\t')
        if (len(columns) < 6):  # patientct is read from columns[5] below
            print "Missing required tokens in associations lineIndex %i lineValue %s" % (lc, line)
            continue
        f1alias = columns[0]
        # afm_ids will be used for directionality collapsing, if needed
        f1afm_id = columns[0]
        f2afm_id = columns[1]
        if (len(f1alias.split(":")) < 3):
            annotated_feature = annotation_hash.get(f1alias)
            if (annotated_feature == None):
                print "ERROR: Target feature %s is not in afm/annotation %i" % (f1alias, len(annotation_hash))
                continue
            f1alias = annotated_feature.replace("\t", ":")
        f2alias = columns[1]
        if (len(f2alias.split(":")) < 3):
            annotated_feature = annotation_hash.get(f2alias)
            if (annotated_feature == None):
                print "ERROR: Predictor feature %s is not in afm/annotation" % (f2alias)
                continue
            f2alias = annotated_feature.replace("\t", ":")
        try:
            f1genescore = fIntHash[f1alias]
        except KeyError:
            f1genescore = 0
        try:
            f2genescore = fIntHash[f2alias]
        except KeyError:
            f2genescore = 0
        f1data = f1alias.split(':')
        f2data = f2alias.split(':')
        if len(f1data) > 4:
            f1data[3] = f1data[3][3:]
        if len(f2data) > 4:
            f2data[3] = f2data[3][3:]
        if (len(f1data) <= 7 and (f1data[1] == 'CLIN' or f1data[1] == 'SAMP')):
            f1alias = ":".join(f1data[0:3]) + ":::::"
            f1data = f1alias.split(':')
        elif (len(f1data) == 7):
            f1data.append("")
        if (len(f2data) <= 7 and (f2data[1] == 'CLIN' or f2data[1] == 'SAMP')):
            f2alias = ":".join(f2data[0:3]) + ":::::"
            f2data = f2alias.split(':')
        elif (len(f2data) == 7):
            f2data.append("")
        f1aliasOmic = f1alias
        f2aliasOmic = f2alias
        # for annotations
        try:
            f1id = features_hash[f1alias][0]
        except KeyError:
            try:
                f1id = aliasid_hash[f1alias][1]
                f1aliasOmic = aliasid_hash[f1alias][2]
                f1data = f1aliasOmic.split(':')
                f1data[3] = f1data[3][3:]
            except KeyError:
                print "Skipping Key error with alias1 " + f1alias
                continue
        try:
            f2id = features_hash[f2alias][0]  #f2alias.split(":")[-1]
        except KeyError:
            try:
                f2id = aliasid_hash[f2alias][1]
                f2aliasOmic = aliasid_hash[f2alias][2]
                f2data = f2aliasOmic.split(':')
                f2data[3] = f2data[3][3:]
            except KeyError:
                print "Skipping Key error with alias2 " + f2alias
                continue
        pvalue = float(columns[2])
        pvalue = str(pv_lambda(pvalue))
        importance = columns[3]
        correlation = columns[4]
        patientct = columns[5]
        if (db_util.isUnmappedAssociation(f1alias, f2alias) and keep_unmapped == 0):
            unmappedout.write(f1alias + "\t" + f2alias + "\n")
            unMapped += 1
            continue
        rhoscore = ""
        link_distance = -1
        if (len(f1data) >= 5 and len(f2data) >= 5 and db_util.is_numeric(f1data[4]) >= 1 and db_util.is_numeric(f2data[4]) >= 1 and f1data[3] == f2data[3]):
            link_distance = abs(int(f2data[4]) - int(f1data[4]))
        if (collapse_direction == 0):
            associations_dic[f1afm_id + "_" + f2afm_id] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
        else:
            # Check whether (f1 -> f2 or f2 -> f1) already exists; if yes, keep the more important one.
            # If not, store the pair.
            if ((associations_dic.get(f1afm_id + "_" + f2afm_id) == None) and (associations_dic.get(f2afm_id + "_" + f1afm_id) == None)):
                associations_dic[f1afm_id + "_" + f2afm_id] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
            else:
                existingLink = associations_dic.get(f1afm_id + "_" + f2afm_id)
                ekey = f1afm_id + "_" + f2afm_id
                if (existingLink == None):
                    existingLink = associations_dic.get(f2afm_id + "_" + f1afm_id)
                    ekey = f2afm_id + "_" + f1afm_id
                prevImportance = existingLink.split("\t")[3]
                if (float(importance) > float(prevImportance)):
                    associations_dic[ekey] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
        if (reverse_direction == 1):
            associations_dic[f2afm_id + "_" + f1afm_id] = f2aliasOmic + "\t" + f1aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + str(f2genescore) + "\t" + str(f1genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
            edgeCount = edgeCount + 1
        edgeCount = edgeCount + 1
        if (do_pubcrawl == "yes"):
            getRFACEInfo.processLine(line, pubcrawl_tsvout)
            pcc += 1
    for ei in associations_dic:
        tsvout.write(associations_dic[ei])
    fshout.write("#!/bin/bash\n")
    fshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" % (myhost, myport, myuser, mypw, mydb))
    fshout.write("load data local infile '" + tsvout.name + "' replace INTO TABLE " + associations_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';")
    fshout.write("\nEOFMYSQL\n")
    tsvout.close()
    unmappedout.close()
    pubcrawl_tsvout.close()
    fshout.close()
    print "\nReport: ValidEdges %i ImportanceCutoff %i edges filtered %i \nunMapped Edges %i Saved to %s" % (len(associations_dic), impCut, pvalueCutCount, unMapped, unmappedPath)
    print "Begin RF-ACE db bulk upload %s os.system sh %s" % (time.ctime(), fshout.name)
    os.system("sh " + fshout.name)
    # Create sharded association files for solr import: one delete-by-query and one CSV
    # import per core (core0..core7), preserving the original command order.
    solrshout.write("#!/bin/bash\n")
    solrshout.write("python createRFShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n")
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) + "/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
    for core in range(8):
        solrshout.write("curl '" + mysolr + "/core" + str(core) + "/update/csv?commit=true&separator=%09&overwrite=false&escape=\\ ' --data-binary @" + edges_out_re.name + "_core" + str(core) + "_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.close()
    print "Begin rface solr upload " + time.ctime()
    os.system("sh " + solrshout.name)
    if (do_pubcrawl == 'yes'):
        smtp.main("*****@*****.**", contacts, "Notification - New RFAce " + dataset_label + " Associations for PubCrawl", "New RFAce associations ready for PubCrawl load\n" + pubcrawl_tsvout.name + "\n" + str(pcc) + " Total Edges\n" + tsvout.name + " loaded into RegulomeExplorer, dataset label is " + dataset_label + "\n\n")
    print "Done processing associations %s" % (time.ctime())
    associations_dic = None
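# --- Hedged usage sketch (added for illustration; names and paths are placeholders) ---
# process_associations_rfex() consumes an RF-ACE association file whose first six columns are
# target, predictor, p-value, importance, correlation and patient count, as parsed above.
# collapse_direction=1 keeps only the more important of A->B / B->A, and reverse_direction=1
# additionally stores the mirrored edge. A -log10 p-value transform is one plausible pv_lambda.
def _example_run_rface(config):
    import math
    neg_log10 = lambda p: -math.log10(p) if p > 0 else 0.0
    process_associations_rfex(
        dataset_label='demo_dataset',
        matrixfile='demo_matrix.tsv',
        associationsfile='demo_rface_associations.tsv',
        config=config,                              # same config object used by db_util.getDB*()
        annotations='demo_annotations.tsv',         # passed to parse_features_rfex.process_feature_annotations
        collapse_direction=1,
        reverse_direction=0,
        results_path='./results/',                  # directory must already exist
        pv_lambda=neg_log10,
        do_pubcrawl='no',
        contacts='analyst@example.org',
        keep_unmapped=0,
        featureInterestingFile='interesting_features.tsv')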