def addDataset(label, feature_matrix, associations, method, source, description, comments, configfile, results_path, ds_date, disease, contact):
    """Insert or replace the admin-table row that describes a dataset.

    Builds a ``replace into tcga.regulome_explorer_dataset`` statement from
    the arguments and runs it through ``db_util.executeInsert``.

    Args:
        label: dataset label; also used to derive a description and to
            locate the per-dataset JSON files under ``results_path``.
        feature_matrix: feature matrix file name, recorded in ``input_files``.
        associations: associations file name, recorded in ``input_files``.
        method: stored in the ``method`` column.
        source: stored in the ``source`` column.
        description: stored as-is unless empty, in which case it is derived
            from tumor-type keywords found in ``label``.
        comments: stored as-is unless empty, in which case the input-files
            string is used.
        configfile: path handed to ``db_util.getConfig``.
        results_path: prefix concatenated directly with the JSON file names,
            so it must already end with a path separator.
        ds_date: stored in the ``dataset_date`` column.
        disease: stored in the ``disease`` column.
        contact: stored in the ``contact`` column.

    Raises:
        Exception: any non-``ValueError`` failure while extracting
            ``max_logpv`` from the meta JSON is reported and re-raised.
    """
    print("Adding " + source + " dataset to admin table with config " + configfile + " for label " + label)

    if description == "":
        # not general, revisit this to enter all TCGA known cancers
        tumor_keywords = (
            (("brca", "BRCA"), "Breast"),
            (("ov", "OV"), "Ovarian"),
            (("gbm", "GBM"), "Glioblastoma"),
            (("coadread", "COAD", "coad", "crc", "CRC"), "ColoRectal"),
            (("cesc", "CESC"), "Cervical"),
            (("hnsc", "HNSC"), "HeadNeck"),
            (("kirc", "KIRC", "kirp", "KIRP"), "Kidney"),
            (("luad", "LUAD", "lusc", "LUSC"), "Lung"),
            (("stad", "STAD"), "Stomach"),
        )
        # Every matching tumor type is appended, in table order.
        for keywords, tumor_name in tumor_keywords:
            if any(keyword in label for keyword in keywords):
                description = description + tumor_name
        # "nomask" labels are left alone; any other "mask" label is tagged.
        if "nomask" not in label and "mask" in label:
            description = description + " filtered"

    inputfiles = "{matrix:" + feature_matrix + ",associations:" + associations + "}"
    if comments == "":
        comments = inputfiles

    config = db_util.getConfig(configfile)

    max_logpv = -1.0
    meta_path = results_path + 'edges_out_' + label + '_meta.json'
    if os.path.exists(meta_path):
        # The context manager closes the handle even when parsing re-raises.
        with open(meta_path, 'r') as meta_json_file:
            metaline = meta_json_file.read()
        if len(metaline) > 1:
            try:
                max_logpv = json.loads(metaline)["max_logpv"]
            except ValueError:
                max_logpv = -1  # okay that the max_logpv is not set
            except Exception:
                print("Unexpected error: %s" % (sys.exc_info()[0],))
                raise

    summary_json = ""
    summary_path = results_path + "feature_summary_" + label + ".json"
    if os.path.exists(summary_path):
        with open(summary_path, "r") as summary_file:
            summary_json = summary_file.read().strip()

    # NOTE(review): values are interpolated straight into the statement, so a
    # single quote in any of them (summary_json is raw file content) breaks
    # the SQL. Switch to a parameterized insert if db_util offers one.
    insertSql = "replace into tcga.regulome_explorer_dataset (label,method,source,contact,comments,dataset_date,description,max_logged_pvalue, input_files, default_display,disease,summary_json) values ('%s', '%s', '%s', '%s', '%s', '%s', '%s', %f, '%s', '%i', '%s', '%s');" % (
        label, method, source, contact, comments, ds_date, description,
        max_logpv, inputfiles, 1, disease, summary_json)
    print("updating regulome_explorer_dataset\n" + insertSql)
    db_util.executeInsert(config, insertSql)
def loadConfig(env):
    """Return the ``db_util`` configuration for the named environment.

    Args:
        env: ``"internal"`` (sandbox config), ``""`` (default config) or
            ``"gdac"`` (GDAC config).

    Returns:
        Whatever ``db_util.getConfig`` returns for the selected config file.

    Exits:
        Prints a message and calls ``sys.exit(-1)`` for any other value.
    """
    config_files = {
        "internal": "../config/rfex_sql_sandbox.config",
        "": "../config/rfex_sql.config",
        "gdac": "../config/rfex_sql_gdac.config",
    }
    configFile = config_files.get(env)
    if configFile is None:
        # %-formatting keeps the report working for non-string values such
        # as None, which string concatenation would reject with TypeError.
        print("The env selected is invalid %s" % (env,))
        sys.exit(-1)
    return db_util.getConfig(configFile)
def loadConfig(env):
    """Load the database configuration that belongs to an environment name.

    Args:
        env: one of ``"internal"``, ``""`` or ``"gdac"``; each selects a
            config file under ``../config``.

    Returns:
        The result of ``db_util.getConfig`` for the selected file.

    Exits:
        Any other value is reported on stdout and ends the process through
        ``sys.exit(-1)``.
    """
    known_envs = {
        "internal": "../config/rfex_sql_sandbox.config",
        "": "../config/rfex_sql.config",
        "gdac": "../config/rfex_sql_gdac.config",
    }
    configFile = known_envs.get(env)
    if configFile is None:
        # Format instead of concatenating so a non-string env (e.g. None)
        # still produces the message rather than a TypeError.
        print("The env selected is invalid %s" % (env,))
        sys.exit(-1)
    config = db_util.getConfig(configFile)
    return config
def main(dataset_label, feature_matrix, associations, pvalueRepresentation, configfile, resultsPath, doPubcrawl, contacts, keep_unmapped, featureInterestingFile):
    """Load the configuration and process the pairwise edges of a dataset.

    ``pvalueRepresentation`` picks the p-value transform handed to
    ``process_pairwise_edges``: ``"negative"``, ``"negative_log10"`` and
    ``"absolute"`` select the ``db_util`` attribute of the same name; any
    other value falls back to ``db_util.reflective``. ``keep_unmapped`` is
    converted with ``int()`` before being passed on; all remaining arguments
    are forwarded unchanged.
    """
    print("\n in parse_pairwise : dataset_label = <%s> \n" % dataset_label)
    config = db_util.getConfig(configfile)
    #results_path = db_util.getResultsPath(config)
    #if (not os.path.exists(results_path + "/" + dataset_label)):
    #    os.mkdir(results_path + "/" + dataset_label)
    print("Done with processing features, processing pairwise edges %s " % (time.ctime()))

    pvlambda = db_util.reflective
    # Each recognised representation shares its name with the db_util
    # transform that implements it.
    if pvalueRepresentation in ("negative", "negative_log10", "absolute"):
        pvlambda = getattr(db_util, pvalueRepresentation)

    unmapped_flag = int(keep_unmapped)
    process_pairwise_edges(dataset_label, feature_matrix, associations, pvlambda, config, resultsPath, doPubcrawl, contacts, unmapped_flag, featureInterestingFile)
    print("Done with processing pairwise edges %s " % (time.ctime()))
def updateFromTemplate(label, template, configfile, resultsPath):
    """Instantiate a SQL schema template for ``label`` and execute it.

    The output file is written to ``<resultsPath>/sql_processing``. Its name
    is the template's name with ``template`` replaced by ``label`` and the
    first ``sql`` replaced by ``sql_processing``, reduced to the part after
    the last ``/``. The file starts with a ``use <schema>;`` line and then
    holds every template line with the module-level ``placeHolder`` replaced
    by ``label``. The finished file is passed to ``executeSchema``.

    Args:
        label: dataset label substituted into the file name and contents.
        template: path of the SQL template file.
        configfile: path handed to ``db_util.getConfig``.
        resultsPath: directory under which ``sql_processing`` is created.

    Raises:
        IOError/OSError: if the template cannot be read, or the output
            directory or file cannot be created.
    """
    sql_processing_dir = resultsPath + "/sql_processing"
    with open(template) as template_file:
        schema_out_name = template_file.name.replace('template', label)
        schema_out_name = schema_out_name.replace('sql', "sql_processing", 1)
        if not os.path.exists(sql_processing_dir):
            # Direct os calls instead of shelling out to mkdir/chmod: paths
            # with spaces or shell metacharacters work, and failures raise
            # instead of being ignored.
            os.makedirs(sql_processing_dir)
            os.chmod(sql_processing_dir, 0o777)
        schema_out_name = sql_processing_dir + "/" + schema_out_name.split("/")[-1]
        with open(schema_out_name, 'w') as schema_file:
            config = db_util.getConfig(configfile)
            schema_file.write("use %s;\n" % (db_util.getDBSchema(config)))
            for line in template_file:
                schema_file.write(line.replace(placeHolder, label))
    # Both files are closed before the schema file is executed.
    executeSchema(schema_out_name, config)
    print("Done creating schema file from template %s" % time.ctime())
def main(dataset_label, feature_matrix, associations, pvalueRepresentation, configfile, resultsPath, doPubcrawl, contacts, keep_unmapped, featureInterestingFile):
    """Entry point for pairwise-edge processing of one dataset.

    Reads the configuration from ``configfile``, chooses the p-value
    transform and delegates to ``process_pairwise_edges``. The transform is
    ``db_util.reflective`` unless ``pvalueRepresentation`` is ``"negative"``,
    ``"negative_log10"`` or ``"absolute"``, in which case the ``db_util``
    attribute of that name is used. ``keep_unmapped`` is passed on as an int.
    """
    print("\n in parse_pairwise : dataset_label = <%s> \n" % dataset_label)
    config = db_util.getConfig(configfile)
    #results_path = db_util.getResultsPath(config)
    #if (not os.path.exists(results_path + "/" + dataset_label)):
    #    os.mkdir(results_path + "/" + dataset_label)
    started_at = time.ctime()
    print("Done with processing features, processing pairwise edges %s " % (started_at))

    # Start from the default transform and override it only for the
    # representations that name another db_util transform.
    pvlambda = db_util.reflective
    named_transforms = ("negative", "negative_log10", "absolute")
    if pvalueRepresentation in named_transforms:
        pvlambda = getattr(db_util, pvalueRepresentation)

    process_pairwise_edges(
        dataset_label,
        feature_matrix,
        associations,
        pvlambda,
        config,
        resultsPath,
        doPubcrawl,
        contacts,
        int(keep_unmapped),
        featureInterestingFile,
    )
    print("Done with processing pairwise edges %s " % (time.ctime()))
def getFeatures():
    """Return the module-level ``features_hash`` object."""
    return features_hash


def getFeatureId(featureStr):
    """Return ``features_hash.get(featureStr)`` for the given feature string."""
    return features_hash.get(featureStr)


if __name__ == "__main__":
    # Command line: data_matrix dataset_label configFile annotations
    # quantileFeatures resultsPath, plus a feature-interesting file that is
    # read only when exactly seven arguments are given.
    # The misspelled module-level "global datast_label" statement was a no-op
    # and has been dropped.
    print("Parsing features kicked off %s" % time.ctime())
    if len(sys.argv) < 7:
        print('Usage is py2.6 parse_features_rfex.py data_matrix.tsv dataset_label configFile annotations quantileFeatures resultsPath')
        sys.exit(1)
    dataset_label = sys.argv[2]
    print("\nin parse_features_rfex : dataset_label = <%s>\n" % dataset_label)
    configfile = sys.argv[3]
    config = db_util.getConfig(configfile)
    annotations = sys.argv[4]
    quantileFeatures = sys.argv[5]
    resultsPath = sys.argv[6]
    featureInterestingFile = ""
    if len(sys.argv) == 8:
        featureInterestingFile = sys.argv[7]
    process_feature_matrix(dataset_label, sys.argv[1], 1, config, annotations, quantileFeatures, resultsPath, featureInterestingFile)
    print("Done with processing feature relating loads %s " % (time.ctime()))
def addDataset(
    label,
    feature_matrix,
    associations,
    method,
    source,
    description,
    comments,
    configfile,
    results_path,
    ds_date,
    disease,
    contact,
):
    """Insert or replace the admin-table row that describes a dataset.

    Assembles a ``replace into tcga.regulome_explorer_dataset`` statement
    and executes it with ``db_util.executeInsert``.

    Args:
        label: dataset label; drives the derived description and the names
            of the JSON files looked up under ``results_path``.
        feature_matrix: feature matrix file name, recorded in ``input_files``.
        associations: associations file name, recorded in ``input_files``.
        method: value for the ``method`` column.
        source: value for the ``source`` column.
        description: used verbatim unless empty; an empty value is replaced
            by the tumor types whose keywords occur in ``label``.
        comments: used verbatim unless empty; an empty value is replaced by
            the input-files string.
        configfile: path handed to ``db_util.getConfig``.
        results_path: prefix joined to the JSON file names by plain
            concatenation, so it must already end with a path separator.
        ds_date: value for the ``dataset_date`` column.
        disease: value for the ``disease`` column.
        contact: value for the ``contact`` column.

    Raises:
        Exception: any non-``ValueError`` failure while extracting
            ``max_logpv`` from the meta JSON is reported and re-raised.
    """
    print("Adding " + source + " dataset to admin table with config " + configfile + " for label " + label)

    if description == "":
        # not general, revisit this to enter all TCGA known cancers
        tumor_keywords = (
            (("brca", "BRCA"), "Breast"),
            (("ov", "OV"), "Ovarian"),
            (("gbm", "GBM"), "Glioblastoma"),
            (("coadread", "COAD", "coad", "crc", "CRC"), "ColoRectal"),
            (("cesc", "CESC"), "Cervical"),
            (("hnsc", "HNSC"), "HeadNeck"),
            (("kirc", "KIRC", "kirp", "KIRP"), "Kidney"),
            (("luad", "LUAD", "lusc", "LUSC"), "Lung"),
            (("stad", "STAD"), "Stomach"),
        )
        # Every matching tumor type is appended, in table order.
        for keywords, tumor_name in tumor_keywords:
            if any(keyword in label for keyword in keywords):
                description = description + tumor_name
        # "nomask" labels are left alone; any other "mask" label is tagged.
        if "nomask" not in label and "mask" in label:
            description = description + " filtered"

    inputfiles = "{matrix:" + feature_matrix + ",associations:" + associations + "}"
    if comments == "":
        comments = inputfiles

    config = db_util.getConfig(configfile)

    max_logpv = -1.0
    meta_path = results_path + "edges_out_" + label + "_meta.json"
    if os.path.exists(meta_path):
        # The context manager closes the handle even when parsing re-raises.
        with open(meta_path, "r") as meta_json_file:
            metaline = meta_json_file.read()
        if len(metaline) > 1:
            try:
                max_logpv = json.loads(metaline)["max_logpv"]
            except ValueError:
                max_logpv = -1  # okay that the max_logpv is not set
            except Exception:
                print("Unexpected error: %s" % (sys.exc_info()[0],))
                raise

    summary_json = ""
    summary_path = results_path + "feature_summary_" + label + ".json"
    if os.path.exists(summary_path):
        with open(summary_path, "r") as summary_file:
            summary_json = summary_file.read().strip()

    # NOTE(review): values are interpolated straight into the statement, so a
    # single quote in any of them (summary_json is raw file content) breaks
    # the SQL. Switch to a parameterized insert if db_util offers one.
    insertSql = (
        "replace into tcga.regulome_explorer_dataset (label,method,source,contact,comments,dataset_date,description,max_logged_pvalue, input_files, default_display,disease,summary_json) values ('%s', '%s', '%s', '%s', '%s', '%s', '%s', %f, '%s', '%i', '%s', '%s');"
        % (
            label,
            method,
            source,
            contact,
            comments,
            ds_date,
            description,
            max_logpv,
            inputfiles,
            1,
            disease,
            summary_json,
        )
    )
    print("updating regulome_explorer_dataset\n" + insertSql)
    db_util.executeInsert(config, insertSql)
# NOTE(review): the check below is the tail of a definition whose header lies
# outside this view; its logic is unchanged. Confirm its enclosing scope.
if not os.access(jarPath, os.R_OK):
    print("Could not open JAR file at %s, check setting 'batch_path' in config. EXIT" % (jarPath))
    sys.exit(-1)


if __name__ == "__main__":
    # Expect exactly one argument: the batch import config file.
    # Compare by value; "is not" tests object identity, which only happens
    # to work for small ints on CPython.
    if len(sys.argv) != 2:
        print("Usage is py2.6 neo4j_csv.py batch_import.config")
        sys.exit(-1)

    config_file = sys.argv[1]
    if not os.access(config_file, os.R_OK):
        print("Could not open config file. EXIT")
        sys.exit(-1)

    config = db_util.getConfig(config_file)
    checkImportProgram(config)

    # Create the working directories on first use.
    if not os.path.exists(getBatchTSVPath(config)):
        print("TSV directory does not exist, creating.")
        os.makedirs(getBatchTSVPath(config))
    if not os.path.exists(getMysqlDumps(config)):
        print("Dump directory does not exist, creating")
        os.makedirs(getMysqlDumps(config))

    print("Data import started at %s" % (str(datetime.datetime.now())))
    importer = DatasetImporter(config)
    # Run the import once under timeit; the setup string re-enables the
    # garbage collector, which timeit turns off while timing.
    t = timeit.Timer(importer.start, "gc.enable()")
    importTime = t.timeit(1)