Esempio n. 1
0
def process_pairwise_edges(dataset_label, matrixfile, pairwised_file, pvlambda, config, results_path, do_pubcrawl, contacts, keep_unmapped, featureInterestingFile):
	"""
	Include edges where nodes are in original set, direction does not matter so do not populate edge if A->B if B->A are in hash
	Expected tab delimited columns are nodeA nodeB pvalue correlation numNonNA	
	"""
	edges_hash = {}
	max_pv = -1000.0
	max_pv_corr = -1000.0
	mydb = db_util.getDBSchema(config) #config.get("mysql_jdbc_configs", "db")
	myuser = db_util.getDBUser(config) #config.get("mysql_jdbc_configs", "username")
	mypw = db_util.getDBPassword(config) #config.get("mysql_jdbc_configs", "password")
	myhost = db_util.getDBHost(config)
	myport = db_util.getDBPort(config)
	mysolr = db_util.getSolrPath(config)
	edges_file = open(pairwised_file)
	fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)
	edge_table = mydb + ".mv_" + dataset_label + "_feature_networks" 
	efshout = open(results_path + 'load_edges_' + dataset_label + '.sh','w')
	solrshout = open(results_path + 'load_solr_' + dataset_label + '.sh','w')
	edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_pw_re.tsv','w')
	edges_out_pc = open(results_path + 'edges_out_' + dataset_label + '_pw_pc.tsv','w')
	edges_meta_json = open(results_path + 'edges_out_' + dataset_label + '_meta.json','w')
	unmappedPath = results_path + 'edges_out_' + dataset_label + '_pw_unmapped.tsv'
	unmappedout = open(unmappedPath,'w')
	features_file = open(results_path + dataset_label + '_features_out.tsv','r')
	features_hash = {}
	for fl in features_file.readlines():
		ftk = fl.strip().split("\t")
		features_hash[ftk[1]] = ftk
		features_file.close()

	validEdgeId = 1
	invalidEdges = 0
	dupeEdges = 0
	totalEdges = 0
	cnan = 0
	pcc = 0
	unMapped = 0
	for line in edges_file:
		totalEdges += 1 
		line = line.strip()
		tokens = line.split('\t')
		if (len(tokens) < 11):
			if (validEdgeId == 1):
				print "Skipping header/line 1 for insufficient token reasons"
				continue
			print "ERROR: requires 11 tokens, found:" + str(len(tokens)) + " Skipping line\n" + line
			continue
		nodeA = tokens[0]
		nodeB = tokens[1]

		try:
			f1genescore = fIntHash[nodeA]
		except KeyError:
			f1genescore = 0
		try:
			f2genescore = fIntHash[nodeB]
		except KeyError:
			f2genescore = 0

		if (db_util.isUnmappedAssociation(nodeA, nodeB) and keep_unmapped == 0):
			unmappedout.write(nodeA + "\t" + nodeB + "\n")
			unMapped += 1
			continue
		#nodeA = nodeA.replace('|', '_')
		#nodeB = nodeB.replace('|', '_')
		try:
			features_hash[nodeA]
		except KeyError:
			print "key error in resolving featureId for " + nodeA + " skipping edge."
			continue
		try:
			features_hash[nodeB]
		except KeyError:
			print "key error in resolving featureId for " + nodeB + " skipping edge."
			continue

		if (features_hash[nodeA] and features_hash[nodeB]):
			if (not edges_hash.get(nodeA + "_" + nodeB) and not edges_hash.get(nodeA + "_" + nodeB)):
				feature1id = ""#str(features_hash[nodeA]) 
				feature2id = ""#str(features_hash[nodeB])
				#This will need to be improve once all pairs has annotations 
				try:
					feature1id = str(features_hash[nodeA][0])
				except KeyError:
					print "ERROR: key error in resolving featureId for " + nodeA
				try:
					feature2id = str(features_hash[nodeB][0])
				except:
					print "ERROR: key error in resolving featureId for " + nodeB

				edges_hash[nodeA + "_" + nodeB] = validEdgeId
				validEdgeId += 1
				dataA = process_feature_alias(nodeA)
				label1_desc = ""
				dataB = process_feature_alias(nodeB)
				label2_desc = ""
				if (len(dataA) == 7):
					dataA.append("")
					nodeA = nodeA + ":"
				if (len(dataB) == 7):
					dataB.append("")
					nodeB = nodeB + ":"
				correlation_str = tokens[2]
				try:
					correlation = float(correlation_str)
				except ValueError:
					#Align correlation value to NaN
					cnan += 1
					correlation = float('nan')
					correlation_str = ''
				numna = tokens[3]
				pv_str = tokens[4]
				bonf = tokens[5]
				pv_bonf_str = tokens[6]
				numnaf1 = tokens[7]
				pvf1_str = tokens[8]
				numnaf2 = tokens[9]
				pvf2_str = tokens[10]
				try:
					pv = str(pvlambda(float(pv_str)))
					pv_bonf = str(pvlambda(float(pv_bonf_str)))
					pvf1 = str(pvlambda(float(pvf1_str)))
					pvf2 = str(pvlambda(float(pvf2_str)))
				except ValueError:
					#error in pairwise script, ignore these associations for now
					continue;

				if (float(pv) > max_pv):
					max_pv = float(pv)
				
				if (float(pv_bonf) > max_pv_corr):
					max_pv_corr = float(pv_bonf)

				rho = str(db_util.sign(correlation)*abs(float(pv)))
				
				link_distance = 500000000
				if ( len(tokens) >= 12 ):
  					link_distance = int(tokens[11])
				else:
					if (len(dataA) >=5 and len(dataB)>=5 and db_util.is_numeric(dataA[4]) >= 1 and db_util.is_numeric(dataB[4]) >= 1 and dataA[3] == dataB[3]):
						link_distance = abs(int(dataB[4]) - int(dataA[4]))
				edges_out_re.write(feature1id + "\t" + feature2id + "\t" + nodeA + "\t" + "\t".join(dataA) + "\t" + nodeB + "\t" + "\t".join(dataB) + "\t" + correlation_str + "\t" + numna + "\t" + pv + "\t" + bonf + "\t" + pv_bonf + "\t" + numnaf1 + "\t" + pvf1 + "\t" + numnaf2 + "\t" + pvf2 + "\t" + rho + "\t" + str(link_distance) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\n")
				if (do_pubcrawl == "yes"):
					#call andrea code
					getPairwiseInfo.processLine(line, edges_out_pc)
					pcc += 1
			else:
				print "duplicated edge:" + nodeA + "_" + nodeB
				dupeEdges += 1
		else:
			print "invalid edge nodeA and nodeB not in features:" + nodeA + "_" + nodeB
			invalidEdges += 1
	print "Report: Valid Edges %i Duped %i cNAN %i \nunMapped %i Saved to %s \nTotal %i max_pvalue %f max_pvalue_corr %f" %(validEdgeId-1, dupeEdges, cnan, unMapped,unmappedPath, totalEdges, max_pv, max_pv_corr)	
	edges_meta_json.write('{"max_logpv":%f}' %(max_pv))
	edges_file.close()
	edges_out_re.close()
	edges_out_pc.close()
	edges_meta_json.close()	
	unmappedout.close()
	efshout.write("#!/bin/bash\n")
	efshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" %(myhost, myport, myuser, mypw, mydb))
	efshout.write("load data local infile '" + edges_out_re.name + "' replace INTO TABLE " + edge_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
	efshout.write("\ncommit;")
	efshout.write("\nEOFMYSQL")
	efshout.close()
	print "Begin pairwise db bulk upload " + time.ctime() 
	os.system("sh " + efshout.name)
	#create sharded association files for solr import
	solrshout.write("#!/bin/bash\n");
	solrshout.write("python createPWShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n") 
	solrshout.write("curl '" + mysolr + "/core0/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core1/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core2/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core3/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core4/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core5/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core6/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
 	solrshout.write("curl '" + mysolr + "/core7/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core0/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core0_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core1/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core1_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core2/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core2_final.tsv  -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core3/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core3_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core4/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core4_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core5/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core5_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core6/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core6_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core7/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core7_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.close()
	print "Begin pairwise solr upload " + time.ctime()
	os.system("sh " + solrshout.name)
	if (do_pubcrawl == "yes"):
		print "senting Pubcrawl notification to " + contacts
		smtp.main("*****@*****.**", contacts, "Notification - New Pairwise Associations for PubCrawl", "New pairwise associations ready for PubCrawl load\n" + edges_out_pc.name + "\n\n" + str(pcc) + " Total Edges\n\n" + edges_out_re.name + " loaded into RegulomeExplorer, dataset label is " + dataset_label + " \n\n")
Esempio n. 2
0
def Email(adjuntos):
    smtp.main(adjuntos)
    gui.msgbox('Mensaje Enviado')
def main(mydict):
    print("开始训练时间:")
    start_time = time.strftime('%Y-%m-%d %H:%M:%S',
                               time.localtime(time.time()))
    print(start_time)
    # py脚本额外参数
    args = get_args()
    # main函数传入参数
    my_data_dir = mydict["data_dir"]
    my_tensorboard = mydict["tensorboard"]
    my_checkpoint = mydict["checkpoint"]
    my_ifSE = mydict["ifSE"]
    my_l1loss = mydict["l1loss"]
    if my_l1loss:
        l1loss = 0.1  # 0.1
        # l1loss = my_l1value
    else:
        l1loss = 0.0

    if args.opts:
        cfg.merge_from_list(args.opts)

    cfg.freeze()
    start_epoch = 0

    # checkpoint_dir = Path(args.checkpoint)
    checkpoint_dir = Path(my_checkpoint)
    checkpoint_dir.mkdir(parents=True, exist_ok=True)

    # create model_dir
    print("=> creating model_dir '{}'".format("se_resnext50_32x4d"))
    # model = get_model(model_name="se_resnext50_32x4d")
    model = my_model(my_ifSE)

    if cfg.TRAIN.OPT == "sgd":
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=cfg.TRAIN.LR,
                                    momentum=cfg.TRAIN.MOMENTUM,
                                    weight_decay=cfg.TRAIN.WEIGHT_DECAY)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=cfg.TRAIN.LR)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # optionally resume from a checkpoint
    resume_path = args.resume
    if resume_path:
        print(Path(resume_path).is_file())
        if Path(resume_path).is_file():
            print("=> loading checkpoint '{}'".format(resume_path))
            checkpoint = torch.load(resume_path, map_location="cpu")
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                resume_path, checkpoint['epoch']))
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        else:
            print("=> no checkpoint found at '{}'".format(resume_path))

    if args.multi_gpu:
        model = nn.DataParallel(model)

    if device == "cuda":
        cudnn.benchmark = True

    # 损失计算准则
    criterion = nn.CrossEntropyLoss().to(device)
    train_dataset = FaceDataset_ceface(my_data_dir,
                                       "train",
                                       img_size=cfg.MODEL.IMG_SIZE,
                                       augment=True,
                                       age_stddev=cfg.TRAIN.AGE_STDDEV)
    train_loader = DataLoader(train_dataset,
                              batch_size=cfg.BATCH_SIZE,
                              shuffle=True,
                              num_workers=cfg.TRAIN.WORKERS,
                              drop_last=True)

    val_dataset = FaceDataset_ceface(my_data_dir,
                                     "valid",
                                     img_size=cfg.MODEL.IMG_SIZE,
                                     augment=False)
    val_loader = DataLoader(val_dataset,
                            batch_size=cfg.BATCH_SIZE,
                            shuffle=False,
                            num_workers=cfg.TRAIN.WORKERS,
                            drop_last=False)

    scheduler = StepLR(optimizer,
                       step_size=cfg.TRAIN.LR_DECAY_STEP,
                       gamma=cfg.TRAIN.LR_DECAY_RATE,
                       last_epoch=start_epoch - 1)
    best_val_mae = 10000.0
    train_writer = None
    val_mae_list = []
    train_loss_list = []
    val_loss_list = []

    if my_tensorboard is not None:
        opts_prefix = "_".join(args.opts)
        train_writer = SummaryWriter(log_dir=my_tensorboard + "/" +
                                     opts_prefix + "_train")
        val_writer = SummaryWriter(log_dir=my_tensorboard + "/" + opts_prefix +
                                   "_val")

    for epoch in range(start_epoch, 80):  # cfg.TRAIN.EPOCHS):
        # train
        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, epoch, device, l1loss)
        train_loss_list.append(train_loss)

        # validate
        val_loss, val_acc, val_mae = validate(val_loader, model, criterion,
                                              epoch, device, l1loss)
        val_mae_list.append(val_mae)
        val_loss_list.append(val_loss)

        if my_tensorboard is not None:
            train_writer.add_scalar("loss", train_loss, epoch)
            train_writer.add_scalar("acc", train_acc, epoch)
            val_writer.add_scalar("loss", val_loss, epoch)
            val_writer.add_scalar("acc", val_acc, epoch)
            val_writer.add_scalar("mae", val_mae, epoch)

        if val_mae < best_val_mae or val_mae > 0:
            print(
                f"=> [epoch {epoch:03d}] best val mae was improved from {best_val_mae:.3f} to {val_mae:.3f}"
            )
            best_val_mae = val_mae
            # checkpoint
            if val_mae < 4.0:
                model_state_dict = model.module.state_dict(
                ) if args.multi_gpu else model.state_dict()
                torch.save(
                    {
                        'epoch': epoch + 1,
                        'arch': cfg.MODEL.ARCH,
                        'state_dict': model_state_dict,
                        'optimizer_state_dict': optimizer.state_dict()
                    },
                    str(
                        checkpoint_dir.joinpath(
                            "epoch{:03d}_{:.5f}_{:.4f}.pth".format(
                                epoch, val_loss, val_mae))))
        else:
            print(
                f"=> [epoch {epoch:03d}] best val mae was not improved from {best_val_mae:.3f} ({val_mae:.3f})"
            )

        # adjust learning rate
        scheduler.step()

    print("=> training finished")
    print(f"additional opts: {args.opts}")
    print(f"best val mae: {best_val_mae:.3f}")
    print("结束训练时间:")
    end_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    print(end_time)
    print("训练耗时: " + smtp.date_gap(start_time, end_time))
    # 发邮件
    smtp.main(
        dict_={
            "共训练epochs: ": cfg.TRAIN.EPOCHS,
            "训练耗时: ": smtp.date_gap(start_time, end_time),
            "最低val_mae: ": best_val_mae,
            "平均val_mae: ": np.array(val_mae_list).mean(),
            "vale_mae_list: ": val_mae_list,
            "train_loss_list: ": train_loss_list,
            "val_loss_list: ": val_loss_list,
            "MODEL.IMG_SIZE: ": cfg.MODEL.IMG_SIZE,
            "BATCH_SIZE: ": cfg.BATCH_SIZE,
            "LOSS.l1: ": l1loss,
            "TRAIN.LR: ": cfg.TRAIN.LR,
            "TRAIN.LR_DECAY_STEP: ": cfg.TRAIN.LR_DECAY_STEP,
            "TRAIN.LR_DECAY_RATE:": cfg.TRAIN.LR_DECAY_RATE,
            "TRAIN.OPT: ": cfg.TRAIN.OPT,
            "MODEL.ARCH:": cfg.MODEL.ARCH
        })
    return best_val_mae
        "l1loss": False
    })
    time.sleep(180)
    main({
        "data_dir": data_dir,
        "tensorboard": tf_log,
        "checkpoint": ckpt,
        "ifSE": True,
        "l1loss": False
    })
    time.sleep(180)
    main({
        "data_dir": data_dir,
        "tensorboard": tf_log,
        "checkpoint": ckpt,
        "ifSE": True,
        "l1loss": True
    })
    time.sleep(180)
    main({
        "data_dir": data_dir,
        "tensorboard": tf_log,
        "checkpoint": ckpt,
        "ifSE": True,
        "l1loss": True
    })
    ###########################################################################################################
    end_time = smtp.print_time("全部训练结束!!!")
    print(smtp.date_gap(start_time, end_time))
    smtp.main(dict_={"ceface全部训练耗时: ": smtp.date_gap(start_time, end_time)})
Esempio n. 5
0
def process_pairwise_edges(dataset_label, matrixfile, pairwised_file, pvlambda,
                           config, results_path, do_pubcrawl, contacts,
                           keep_unmapped, featureInterestingFile):
    """
	Include edges where nodes are in original set, direction does not matter so do not populate edge if A->B if B->A are in hash
	Expected tab delimited columns are nodeA nodeB pvalue correlation numNonNA	
	"""
    edges_hash = {}
    max_pv = -1000.0
    max_pv_corr = -1000.0
    mydb = db_util.getDBSchema(config)  #config.get("mysql_jdbc_configs", "db")
    myuser = db_util.getDBUser(
        config)  #config.get("mysql_jdbc_configs", "username")
    mypw = db_util.getDBPassword(
        config)  #config.get("mysql_jdbc_configs", "password")
    myhost = db_util.getDBHost(config)
    myport = db_util.getDBPort(config)
    mysolr = db_util.getSolrPath(config)
    edges_file = open(pairwised_file)
    fIntHash = parse_features_rfex.get_feature_interest_hash(
        featureInterestingFile)
    edge_table = mydb + ".mv_" + dataset_label + "_feature_networks"
    efshout = open(results_path + 'load_edges_' + dataset_label + '.sh', 'w')
    solrshout = open(results_path + 'load_solr_' + dataset_label + '.sh', 'w')
    edges_out_re = open(
        results_path + 'edges_out_' + dataset_label + '_pw_re.tsv', 'w')
    edges_out_pc = open(
        results_path + 'edges_out_' + dataset_label + '_pw_pc.tsv', 'w')
    edges_meta_json = open(
        results_path + 'edges_out_' + dataset_label + '_meta.json', 'w')
    unmappedPath = results_path + 'edges_out_' + dataset_label + '_pw_unmapped.tsv'
    unmappedout = open(unmappedPath, 'w')
    features_file = open(results_path + dataset_label + '_features_out.tsv',
                         'r')
    features_hash = {}
    for fl in features_file.readlines():
        ftk = fl.strip().split("\t")
        features_hash[ftk[1]] = ftk
        features_file.close()

    validEdgeId = 1
    invalidEdges = 0
    dupeEdges = 0
    totalEdges = 0
    cnan = 0
    pcc = 0
    unMapped = 0
    for line in edges_file:
        totalEdges += 1
        line = line.strip()
        tokens = line.split('\t')
        if (len(tokens) < 11):
            if (validEdgeId == 1):
                print "Skipping header/line 1 for insufficient token reasons"
                continue
            print "ERROR: requires 11 tokens, found:" + str(
                len(tokens)) + " Skipping line\n" + line
            continue
        nodeA = tokens[0]
        nodeB = tokens[1]

        try:
            f1genescore = fIntHash[nodeA]
        except KeyError:
            f1genescore = 0
        try:
            f2genescore = fIntHash[nodeB]
        except KeyError:
            f2genescore = 0

        if (db_util.isUnmappedAssociation(nodeA, nodeB)
                and keep_unmapped == 0):
            unmappedout.write(nodeA + "\t" + nodeB + "\n")
            unMapped += 1
            continue
        #nodeA = nodeA.replace('|', '_')
        #nodeB = nodeB.replace('|', '_')
        try:
            features_hash[nodeA]
        except KeyError:
            print "key error in resolving featureId for " + nodeA + " skipping edge."
            continue
        try:
            features_hash[nodeB]
        except KeyError:
            print "key error in resolving featureId for " + nodeB + " skipping edge."
            continue

        if (features_hash[nodeA] and features_hash[nodeB]):
            if (not edges_hash.get(nodeA + "_" + nodeB)
                    and not edges_hash.get(nodeA + "_" + nodeB)):
                feature1id = ""  #str(features_hash[nodeA])
                feature2id = ""  #str(features_hash[nodeB])
                #This will need to be improve once all pairs has annotations
                try:
                    feature1id = str(features_hash[nodeA][0])
                except KeyError:
                    print "ERROR: key error in resolving featureId for " + nodeA
                try:
                    feature2id = str(features_hash[nodeB][0])
                except:
                    print "ERROR: key error in resolving featureId for " + nodeB

                edges_hash[nodeA + "_" + nodeB] = validEdgeId
                validEdgeId += 1
                dataA = process_feature_alias(nodeA)
                label1_desc = ""
                dataB = process_feature_alias(nodeB)
                label2_desc = ""
                if (len(dataA) == 7):
                    dataA.append("")
                    nodeA = nodeA + ":"
                if (len(dataB) == 7):
                    dataB.append("")
                    nodeB = nodeB + ":"
                correlation_str = tokens[2]
                try:
                    correlation = float(correlation_str)
                except ValueError:
                    #Align correlation value to NaN
                    cnan += 1
                    correlation = float('nan')
                    correlation_str = ''
                numna = tokens[3]
                pv_str = tokens[4]
                bonf = tokens[5]
                pv_bonf_str = tokens[6]
                numnaf1 = tokens[7]
                pvf1_str = tokens[8]
                numnaf2 = tokens[9]
                pvf2_str = tokens[10]
                try:
                    pv = str(pvlambda(float(pv_str)))
                    pv_bonf = str(pvlambda(float(pv_bonf_str)))
                    pvf1 = str(pvlambda(float(pvf1_str)))
                    pvf2 = str(pvlambda(float(pvf2_str)))
                except ValueError:
                    #error in pairwise script, ignore these associations for now
                    continue

                if (float(pv) > max_pv):
                    max_pv = float(pv)

                if (float(pv_bonf) > max_pv_corr):
                    max_pv_corr = float(pv_bonf)

                rho = str(db_util.sign(correlation) * abs(float(pv)))

                link_distance = 500000000
                if (len(tokens) >= 12):
                    link_distance = int(tokens[11])
                else:
                    if (len(dataA) >= 5 and len(dataB) >= 5
                            and db_util.is_numeric(dataA[4]) >= 1
                            and db_util.is_numeric(dataB[4]) >= 1
                            and dataA[3] == dataB[3]):
                        link_distance = abs(int(dataB[4]) - int(dataA[4]))
                edges_out_re.write(feature1id + "\t" + feature2id + "\t" +
                                   nodeA + "\t" + "\t".join(dataA) + "\t" +
                                   nodeB + "\t" + "\t".join(dataB) + "\t" +
                                   correlation_str + "\t" + numna + "\t" + pv +
                                   "\t" + bonf + "\t" + pv_bonf + "\t" +
                                   numnaf1 + "\t" + pvf1 + "\t" + numnaf2 +
                                   "\t" + pvf2 + "\t" + rho + "\t" +
                                   str(link_distance) + "\t" +
                                   str(f1genescore) + "\t" + str(f2genescore) +
                                   "\n")
                if (do_pubcrawl == "yes"):
                    #call andrea code
                    getPairwiseInfo.processLine(line, edges_out_pc)
                    pcc += 1
            else:
                print "duplicated edge:" + nodeA + "_" + nodeB
                dupeEdges += 1
        else:
            print "invalid edge nodeA and nodeB not in features:" + nodeA + "_" + nodeB
            invalidEdges += 1
    print "Report: Valid Edges %i Duped %i cNAN %i \nunMapped %i Saved to %s \nTotal %i max_pvalue %f max_pvalue_corr %f" % (
        validEdgeId - 1, dupeEdges, cnan, unMapped, unmappedPath, totalEdges,
        max_pv, max_pv_corr)
    edges_meta_json.write('{"max_logpv":%f}' % (max_pv))
    edges_file.close()
    edges_out_re.close()
    edges_out_pc.close()
    edges_meta_json.close()
    unmappedout.close()
    efshout.write("#!/bin/bash\n")
    efshout.write(
        "mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n"
        % (myhost, myport, myuser, mypw, mydb))
    efshout.write("load data local infile '" + edges_out_re.name +
                  "' replace INTO TABLE " + edge_table +
                  " fields terminated by '\\t' LINES TERMINATED BY '\\n';\n")
    efshout.write("\ncommit;")
    efshout.write("\nEOFMYSQL")
    efshout.close()
    print "Begin pairwise db bulk upload " + time.ctime()
    os.system("sh " + efshout.name)
    #create sharded association files for solr import
    solrshout.write("#!/bin/bash\n")
    solrshout.write("python createPWShardedDataset.py " + edges_out_re.name +
                    " " + dataset_label + "\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core0/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core1/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core2/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core3/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core4/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core5/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core6/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core7/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\""
        + dataset_label + "\"</query></delete>'\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core0/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core0_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core1/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core1_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core2/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core2_final.tsv  -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core3/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core3_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core4/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core4_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core5/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core5_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core6/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core6_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.write(
        "curl '" + mysolr +
        "/core7/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @"
        + edges_out_re.name +
        "_core7_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
    solrshout.close()
    print "Begin pairwise solr upload " + time.ctime()
    os.system("sh " + solrshout.name)
    if (do_pubcrawl == "yes"):
        print "senting Pubcrawl notification to " + contacts
        smtp.main(
            "*****@*****.**", contacts,
            "Notification - New Pairwise Associations for PubCrawl",
            "New pairwise associations ready for PubCrawl load\n" +
            edges_out_pc.name + "\n\n" + str(pcc) + " Total Edges\n\n" +
            edges_out_re.name +
            " loaded into RegulomeExplorer, dataset label is " +
            dataset_label + " \n\n")
Esempio n. 6
0
if __name__ == '__main__':
    # fgnet train 82 group
    start_time = smtp.print_time("全部开始训练!!!")
    fgnet_root = cfg.dataset.fgnet_leave1out
    best_val_mae_arr = []
    for i in range(1, 83):
        tmp = str(i) if i > 9 else "0" + str(i)
        data_dir = Path(fgnet_root).joinpath(tmp)
        best_val_mae_arr.append(main(str(data_dir)))
    print(
        f"fgnet all train finished and best_val_mae_arr is:{best_val_mae_arr}")
    end_time = smtp.print_time("全部训练结束!!!")
    print(smtp.date_gap(start_time, end_time))
    smtp.main(
        dict_={
            "fgnet全部训练耗时: ": smtp.date_gap(start_time, end_time),
            "best_val_mae_arr": best_val_mae_arr
        })

    time.sleep(600)  # sleep 10 min

    # fgnet_align train 82 group
    start_time = smtp.print_time("全部开始训练!!!")
    fgnet_align_root = cfg.dataset.fgnet_align_leave1out
    best_val_mae_arr = []
    for i in range(1, 83):
        tmp = str(i) if i > 9 else "0" + str(i)
        data_dir = Path(fgnet_align_root).joinpath(tmp)
        best_val_mae_arr.append(main(str(data_dir)))
    print(
        f"fgnet_align all train finished and best_val_mae_arr is:{best_val_mae_arr}"
Esempio n. 7
0
def process_associations_rfex(dataset_label, matrixfile, associationsfile, config, annotations, collapse_direction, reverse_direction, results_path, pv_lambda, do_pubcrawl, contacts, keep_unmapped, featureInterestingFile):
	mydb = db_util.getDBSchema(config) 
	myuser = db_util.getDBUser(config) 
	mypw = db_util.getDBPassword(config) 
	myhost = db_util.getDBHost(config) 
	myport = db_util.getDBPort(config)
	mysolr = db_util.getSolrPath(config)
	if (not os.path.isfile(associationsfile)):
		print associationsfile + " does not exist; unrecoverable ERROR"
		sys.exit(-1)
	associations_table = mydb + ".mv_" + dataset_label + "_feature_networks"
	print "Begin processing associations %s Applying processing_pubcrawl %s" %(time.ctime(), do_pubcrawl)
	fIntHash = parse_features_rfex.get_feature_interest_hash(featureInterestingFile)

	edges_out_re = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv','w')
	associations_in = open(associationsfile,'r')
	annotation_hash, ftype = parse_features_rfex.process_feature_annotations(annotations)
	fshout = open(results_path + 'load_sql_associations_' + dataset_label + '.sh','w')
	solrshout = open(results_path + 'load_solr_assocations_' + dataset_label + '.sh','w')
	unmappedPath = results_path  + 'edges_out_' + dataset_label + '_rface_unmapped.tsv'
	unmappedout = open(unmappedPath,'w')
	features_file = open(results_path + dataset_label + '_features_out.tsv','r')
	features_hash = {}
	for fl in features_file.readlines():
		ftk = fl.strip().split("\t")
		features_hash[ftk[1]] = ftk
	features_file.close()
		
	aliasid_file = open(results_path + dataset_label + '_features_alias_id.tsv','r')
	aliasid_hash = {}
	for fl in aliasid_file.readlines():
		ftk = fl.strip().split("\t")
		aliasid_hash[ftk[0]] = ftk
	aliasid_file.close()	
	
	tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_re.tsv','w')
	pubcrawl_tsvout = open(results_path + 'edges_out_' + dataset_label + '_rface_pc.tsv','w')
	lc = 0
	edgeCount = 0
	pcc = 0
	unMapped = 0
	pvalueCutCount = 0
	impCut = 0
	lines = associations_in.readlines()
	associations_in.close()
	associations_dic = {}
	for line in lines:
		lc = lc + 1
		columns = line.strip().split('\t')
		if (len(columns) < 5):
			print "Missing required tokens in associations lineIndex %i lineValue %s" %(lc, line)
			continue
		f1alias = columns[0]
		#afm_ids will be used for directionality collapsing, if needed
		f1afm_id = columns[0]
		f2afm_id = columns[1]
		if (len(f1alias.split(":")) < 3):
			annotated_feature = annotation_hash.get(f1alias)
			if (annotated_feature == None):
				print "ERROR: Target feature %s is not in afm/annotation %i" %(f1alias, len(annotation_hash))
				continue
			f1alias = annotated_feature.replace("\t", ":")
		f2alias = columns[1]
		if (len(f2alias.split(":")) < 3):
			annotated_feature = annotation_hash.get(f2alias)
			if (annotated_feature == None):
				print "ERROR: Predictor feature %s is not in afm/annotation" %(f2alias)
				continue
			f2alias = annotated_feature.replace("\t", ":")
		try:
			f1genescore = fIntHash[f1alias]
		except KeyError:
			f1genescore = 0
		try:
			f2genescore = fIntHash[f2alias]
		except KeyError:
			f2genescore = 0
		
		f1data = f1alias.split(':')
		f2data = f2alias.split(':')

		if len(f1data) > 4:
			f1data[3] = f1data[3][3:]
		if len(f2data) > 4:
			f2data[3] = f2data[3][3:]
		
		if (len(f1data) <= 7 and (f1data[1] == 'CLIN' or f1data[1] == 'SAMP')):
			f1alias = ":".join(f1data[0:3]) + ":::::"
			f1data = f1alias.split(':')
		elif (len(f1data) == 7):
			f1data.append("")
		if (len(f2data) <= 7 and (f2data[1] == 'CLIN' or f2data[1] == 'SAMP')):
			f2alias = ":".join(f2data[0:3]) + ":::::"
			f2data = f2alias.split(':')
		elif (len(f2data) == 7):
			f2data.append("") 
		f1aliasOmic = f1alias
		f2aliasOmic = f2alias
		#for annotations
		try:    
			f1id = features_hash[f1alias][0]
		except KeyError:
			try:
				f1id = aliasid_hash[f1alias][1]
				f1aliasOmic = aliasid_hash[f1alias][2]
				f1data = f1aliasOmic.split(':')
				f1data[3] = f1data[3][3:]
			except KeyError:
				print "Skipping Key error with alias1 " + f1alias
                                continue
			
		try:
			f2id = features_hash[f2alias][0]#f2alias.split(":")[-1]
		except KeyError:
			try:
				f2id = aliasid_hash[f2alias][1]
				f2aliasOmic = aliasid_hash[f2alias][2]
				f2data = f2aliasOmic.split(':')
				f2data[3] = f2data[3][3:]	
			except KeyError:
				print "Skipping Key error with alias2 " + f2alias
				continue		

		pvalue = float(columns[2])
		pvalue = str(pv_lambda(pvalue))
		
		importance = columns[3]
		correlation = columns[4]
		patientct = columns[5]
		if (db_util.isUnmappedAssociation(f1alias, f2alias) and keep_unmapped == 0):
			unmappedout.write(f1alias + "\t" + f2alias + "\n")
			unMapped += 1
			continue	
		rhoscore = ""
		link_distance = -1
		if (len(f1data) >=5 and len(f2data)>=5 and db_util.is_numeric(f1data[4]) >= 1 and db_util.is_numeric(f2data[4]) >= 1 and f1data[3] == f2data[3]):
			link_distance = abs(int(f2data[4]) - int(f1data[4]))
		if (collapse_direction == 0):
			associations_dic[f1afm_id + "_" + f2afm_id] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
		else:
			#check whether (f1 -> f2 or f2 -> f1) exists, if yes, take the more important
			#if not, store pair
			if ((associations_dic.get(f1afm_id + "_" + f2afm_id) == None) and (associations_dic.get(f2afm_id + "_" + f1afm_id) == None)):
				associations_dic[f1afm_id + "_" + f2afm_id] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
			else:
				existingLink = associations_dic.get(f1afm_id + "_" + f2afm_id)
				ekey = f1afm_id + "_" + f2afm_id
				if (existingLink == None):
					existingLink = associations_dic.get(f2afm_id + "_" + f1afm_id) 
					ekey = f2afm_id + "_" + f1afm_id
				prevImportance = existingLink.split("\t")[3]
				if (float(importance) > float(prevImportance)):
					associations_dic[ekey] = f1aliasOmic + "\t" + f2aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + str(f1genescore) + "\t" + str(f2genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"					 			 
		if (reverse_direction == 1):
			associations_dic[f2afm_id + "_" + f1afm_id] = f2aliasOmic + "\t" + f1aliasOmic + "\t" + pvalue + "\t" + importance + "\t" + correlation + "\t" + patientct + "\t" + f2id + "\t" + "\t".join(f2data) + "\t" + f1id + "\t" + "\t".join(f1data) + "\t" + str(f2genescore) + "\t" + str(f1genescore) + "\t" + rhoscore + "\t" + str(link_distance) + "\n"
			edgeCount = edgeCount + 1
		edgeCount = edgeCount + 1
		if (do_pubcrawl == "yes"):
			getRFACEInfo.processLine(line, pubcrawl_tsvout)
			pcc += 1
	for ei in associations_dic:
		tsvout.write(associations_dic[ei])
	fshout.write("#!/bin/bash\n")
	fshout.write("mysql -h %s --port %s --user=%s --password=%s --database=%s<<EOFMYSQL\n" %(myhost, myport, myuser, mypw, mydb))
	fshout.write("load data local infile '" + tsvout.name + "' replace INTO TABLE " + associations_table + " fields terminated by '\\t' LINES TERMINATED BY '\\n';")
	fshout.write("\nEOFMYSQL\n")
	tsvout.close()
	unmappedout.close()
	pubcrawl_tsvout.close()
	fshout.close()
	print "\nReport: ValidEdges %i ImportanceCutoff %i edges filtered %i \nunMapped Edges %i Saved to %s" %(len(associations_dic), impCut, pvalueCutCount, unMapped, unmappedPath)
	print "Begin RF-ACE db bulk upload %s os.system sh %s" %(time.ctime(), fshout.name)
	os.system("sh " + fshout.name)
	solrshout.write("#!/bin/bash\n")
	solrshout.write("python createRFShardedDataset.py " + edges_out_re.name + " " + dataset_label + "\n")
	solrshout.write("curl '" + mysolr + "/core0/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core1/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core2/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core3/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core4/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core5/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core6/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core7/update/?commit=true' -H 'Content-type:text/xml' --data-binary '<delete><query>dataset:\"" + dataset_label + "\"</query></delete>'\n")
	solrshout.write("curl '" + mysolr + "/core0/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core0_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core1/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core1_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core2/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core2_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core3/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core3_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core4/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core4_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core5/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core5_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core6/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core6_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.write("curl '" + mysolr + "/core7/update/csv?commit=true&separator=%09&overwrite=false&escape=\ ' --data-binary @" + edges_out_re.name + "_core7_final.tsv -H 'Content-type:text/plain;charset=utf-8' &\n")
	solrshout.close()
	print "Begin rface solr upload " + time.ctime()
	os.system("sh " + solrshout.name)

	if (do_pubcrawl == 'yes'):
		smtp.main("*****@*****.**", contacts, "Notification - New RFAce " + dataset_label + " Associations for PubCrawl", "New RFAce associations ready for PubCrawl load\n" + pubcrawl_tsvout.name + "\n" + str(pcc) + " Total Edges\n" + tsvout.name + " loaded into RegulomeExplorer, dataset label is " + dataset_label + "\n\n")
	print "Done processing associations %s" %(time.ctime())
	associations_dic = None