def construct_trn(path):
    """Construct the transcriptional regulatory network (TRN) for one
    RegulonDB version located at ``path`` and pickle it as ``trn.pkl``.

    Parameters
    ----------
    path : str
        Directory containing the pickled objects ``conformations.pkl``,
        ``transcription_units.pkl``, and ``interactions.pkl``; its basename
        is used as the version identifier.
    """
    LOGGER.info("{0:*^78s}".format("Construct TRN"))
    version = os.path.basename(path)
    if not version:
        # path ended with a separator; use the parent directory name
        version = os.path.basename(os.path.dirname(path))
    LOGGER.info("{0:*^78s}".format(version))
    # load objects so that they are in memory (conformations register
    # themselves in the class-level lookup used by the assertions below)
    pyorganism.read_pickle(os.path.join(path, "conformations.pkl"))
    t_units = pyorganism.read_pickle(os.path.join(path,
            "transcription_units.pkl"))
    interactions = pyorganism.read_pickle(os.path.join(path,
            "interactions.pkl"))
    assert all(pyreg.Conformation.has_key(triple[0], version)
            for triple in interactions),\
            "unknown conformation in regulatory interactions"
    assert all(pyreg.Promoter.has_key(triple[1], version)
            for triple in interactions),\
            "unknown promoter in regulatory interactions"
    # conformation-promoter directed regulatory network
    cpn = nx.MultiDiGraph()
    for (u, v, inter) in interactions:
        first = pyreg.Conformation[u, version]
        second = pyreg.Promoter[v, version]
        cpn.add_edge(first, second, key=inter)
    # project the CPN down to a TF-gene TRN
    trn = pyreg.TRN()
    node_converter = NodeConverter(t_units)
    for (u, v, k, d) in cpn.edges_iter(keys=True, data=True):
        first = u.t_factor
        for nbr in node_converter(v.unique_id):
            trn.add_edge(first, nbr, key=k, **d.copy())
    LOGGER.info("%d Nodes", len(trn))
    LOGGER.info("%d Links", trn.size())
    # sort explicitly by size: networkx does not guarantee that
    # connected_components yields the largest component first
    components = sorted(nx.connected_components(trn.to_undirected()),
            key=len, reverse=True)
    LOGGER.info("%d Components", len(components))
    if components:  # guard against an empty network (IndexError before)
        LOGGER.info("Largest Component: %d", len(components[0]))
        if len(components) > 1:
            LOGGER.info("Second Largest Component: %d", len(components[1]))
    pyorganism.write_pickle(trn, os.path.join(path, "trn.pkl"))
def compile_feature2gene(objects_path):
    """Build a map from microarray feature names to gene objects and pickle
    it as ``feature2gene.pkl`` in ``objects_path``.

    Matching proceeds in stages: exact name match, synonym match, then fuzzy
    matching (by name and by synonym) at the configured threshold, followed
    by manual fix-ups.

    Parameters
    ----------
    objects_path : str
        Directory containing ``genes.pkl``; its basename is used as the
        version identifier and stored in the module-global ``VERSION``.
    """
    LOGGER.info("{0:*^78s}".format("Compile Gene Feature Map"))
    version = os.path.basename(objects_path)
    if not version:
        version = os.path.basename(os.path.dirname(objects_path))
    global VERSION
    VERSION = version
    LOGGER.info("{0:*^78s}".format(version))
    names = compile_names(CONFIG["data_paths"], CONFIG["data_load"])
    LOGGER.info("Loading genes")
    genes = pyorganism.read_pickle(os.path.join(objects_path, "genes.pkl"))
    LOGGER.info("Finding gene names")
    # match case-insensitively by lower-casing all gene names
    finder = pyorganism.FindObject(genes,
            targets=[gene.name.lower() for gene in genes])
    (feature2gene, gap) = compile_feature_map(genes, names, finder)
    synonyms = synonym_finder(genes)
    LOGGER.info("Finding gene names by synonyms")
    gap = extend_feature_map(feature2gene, gap, synonyms)
    LOGGER.info("Missing %d gene names", len(gap))
    LOGGER.info("Fuzzy search of gene names (threshold %d%%)",
            CONFIG["threshold"])
    # NOTE(review): looks like a leftover debugging statement — it
    # permanently lowers the logger level mid-run; confirm and remove.
    LOGGER.setLevel(logging.DEBUG)
    gap = fuzzy_extension(feature2gene, gap, finder, CONFIG["threshold"])
    LOGGER.info("Fuzzy search of gene names by synonyms (threshold %d%%)",
            CONFIG["threshold"])
    gap = fuzzy_extension(feature2gene, gap, synonyms, CONFIG["threshold"])
    LOGGER.info("Manual update")
    manual_name_updates(feature2gene)
    num = sum(1 for gene in feature2gene.itervalues() if gene)
    total = len(feature2gene)
    if total > 0:  # avoid ZeroDivisionError on an empty map
        LOGGER.info("Final map contains %d names and %d genes (%3.2f%%)",
                total, num, 100.0 * num / total)
    else:
        LOGGER.info("Final map is empty")
    pyorganism.write_pickle(feature2gene,
            os.path.join(objects_path, "feature2gene.pkl"))
def update_map(map_path, frame_path, update_from, version=""): LOGGER.info("{0:*^78s}".format("Update Gene IDs")) map_name = os.path.splitext(os.path.basename(map_path))[0] LOGGER.info("{0:*^78s}".format(map_name)) base_path = os.path.dirname(map_path) if not version: version = os.path.basename(base_path) LOGGER.info("{0:*^78s}".format(version)) # load into memory LOGGER.info("Loading genes") pyorganism.read_pickle(os.path.join(base_path, "genes.pkl")) LOGGER.info("Reading data frame") hdf_key = "/%s" % (map_name,) mapping = pandas.read_hdf(frame_path, hdf_key) # frame_info(mapping) id2gene = dict() for row in mapping[[version, update_from]].itertuples(): if not row[1]: id2gene[row[0]] = pyreg.Gene.get(row[2], None, version) elif row[1] != row[2]: id2gene[row[0]] = pyreg.Gene.get(row[2], None, version) else: id2gene[row[0]] = pyreg.Gene.get(row[1], None, version) pyorganism.write_pickle(id2gene, os.path.join(base_path, "corrected_" + map_name + ".pkl"))
def compile_name2gene(objects_path):
    """Build two gene lookup maps — by name and by blattner (b-)number —
    and pickle them as ``name2gene.pkl`` and ``blattner2gene.pkl``.

    Each map is built by exact matching, then synonym matching, then fuzzy
    matching at the configured threshold; the two maps are finally checked
    against each other with ``verify_union``.

    Parameters
    ----------
    objects_path : str
        Directory containing ``genes.pkl``; its basename is used as the
        version identifier and stored in the module-global ``VERSION``.
    """
    LOGGER.info("{0:*^78s}".format("Compile Gene Name Map"))
    full_data = compile_names(CONFIG["data_paths"], CONFIG["data_load"])
    version = os.path.basename(objects_path)
    if not version:
        version = os.path.basename(os.path.dirname(objects_path))
    global VERSION
    VERSION = version
    LOGGER.info("{0:*^78s}".format(version))
    LOGGER.info("Loading genes")
    genes = pyorganism.read_pickle(os.path.join(objects_path, "genes.pkl"))
    LOGGER.info("Finding gene names")
    names = set(full_data["name"].unique())
    if numpy.nan in names:  # missing entries show up as NaN
        names.remove(numpy.nan)
    names = sorted(names)
    finder = pyorganism.FindObject(genes, "name")
    (name2gene, name_gap) = compile_feature_map(genes, names, finder)
    synonyms = synonym_finder(genes)
    LOGGER.info("Finding gene names by synonyms")
    name_gap = extend_feature_map(name2gene, name_gap, synonyms)
    LOGGER.info("Missing %d gene names", len(name_gap))
    LOGGER.info("Fuzzy search of gene names (threshold %d%%)",
            CONFIG["threshold"])
    name_gap = fuzzy_extension(name2gene, name_gap, finder,
            CONFIG["threshold"])
    manual_name_updates(name2gene)
    num = sum(1 for gene in name2gene.itervalues() if gene)
    LOGGER.info("Final map contains %d names and %d genes (%3.2f%%)",
            len(name2gene), num, 100.0 * num / len(name2gene))
    LOGGER.info("Finding gene blattner numbers")
    bnumbers = set(full_data["blattner"].unique())
    if numpy.nan in bnumbers:
        bnumbers.remove(numpy.nan)
    bnumbers = sorted(bnumbers)
    gene_finder = pyorganism.FindObject(genes, "bnumber")
    (blattner2gene, blattner_gap) = compile_feature_map(genes, bnumbers,
            gene_finder)
    LOGGER.info("Finding gene blattner numbers by synonyms")
    blattner_gap = extend_feature_map(blattner2gene, blattner_gap, synonyms)
    LOGGER.info("Missing %d gene blattner numbers", len(blattner_gap))
    LOGGER.info("Fuzzy search of gene blattner numbers (threshold %d%%)",
            CONFIG["threshold"])
    # BUG FIX: this fuzzy pass previously used ``finder`` (gene *names*);
    # blattner numbers must be matched against ``gene_finder`` (bnumbers).
    blattner_gap = fuzzy_extension(blattner2gene, blattner_gap, gene_finder,
            CONFIG["threshold"])
    num = sum(1 for gene in blattner2gene.itervalues() if gene)
    LOGGER.info("Final map contains %d blattner numbers and %d genes (%3.2f%%)",
            len(blattner2gene), num, 100.0 * num / len(blattner2gene))
    verify_union(name2gene, blattner2gene, full_data)
    pyorganism.write_pickle(name2gene,
            os.path.join(objects_path, "name2gene.pkl"))
    pyorganism.write_pickle(blattner2gene,
            os.path.join(objects_path, "blattner2gene.pkl"))
def main_random(args):
    """Generate randomised null-model networks (edge rewiring and/or edge
    switching) for every TRN found under ``args.in_path`` and pickle the
    resulting network lists under ``args.out_path/<version>/``.

    Parameters
    ----------
    args : argparse.Namespace
        Expects ``in_path``, ``glob``, ``out_path``, ``nproc``, ``rnd_num``,
        ``prob``, ``flip_num``, ``run_rewire``, ``run_switch``.
    """
    locations = sorted(glob(os.path.join(args.in_path, args.glob)))
    locations = [os.path.abspath(loc) for loc in locations]
    pool = multiprocessing.Pool(args.nproc)
    bar = ProgressBar(maxval=args.rnd_num, widgets=[Timer(), " ",
            SimpleProgress(), " ", Percentage(), " ", Bar(), " ", ETA()])
    rewire_name = "grn_rewired_{0:.1f}.pkl".format(args.prob)
    switch_name = "grn_switched_{0:d}.pkl".format(args.flip_num)
    try:
        for path in locations:
            filename = os.path.join(path, "trn.pkl")
            if not os.path.exists(filename):
                continue  # skip versions without a compiled TRN
            ver = version_from_path(path)
            base_path = os.path.join(args.out_path, ver)
            if not os.path.isdir(base_path):
                os.makedirs(base_path)
            LOGGER.info(ver)
            trn = pyorg.read_pickle(filename)
            # we consider null-models on the projected level since that is
            # the level on which we evaluate topological quantities
            net = pyreg.to_simple(trn.to_grn())
            if args.run_rewire:
                LOGGER.info("Rewiring with probability %.1f", args.prob)
                tasks = [(net, args.prob)] * args.rnd_num
                res_it = pool.imap_unordered(rewire, tasks)
                rands = list()
                bar.start()
                for rnd in res_it:
                    rands.append(rnd)
                    bar += 1
                bar.finish()
                pyorg.write_pickle(rands,
                        os.path.join(base_path, rewire_name))
            if args.run_switch:
                LOGGER.info("Switch-randomizing each edge %d times",
                        args.flip_num)
                tasks = [(net, args.flip_num)] * args.rnd_num
                res_it = pool.imap_unordered(switch, tasks)
                success = list()
                rands = list()
                bar.start()
                for (rnd, rate) in res_it:
                    rands.append(rnd)
                    success.append(rate)
                    bar += 1
                bar.finish()
                pyorg.write_pickle(rands,
                        os.path.join(base_path, switch_name))
                LOGGER.info("mean flip success rate: %.3G +/- %.3G",
                        np.mean(success), np.std(success))
    finally:
        # BUG FIX: previously the pool was only closed on the success path
        # and never joined; always reap the worker processes.
        pool.close()
        pool.join()
def rewiring(lb_view, versions, args):
    """For each version, gather ``args.rnd_num`` rewired networks from the
    cluster view and pickle them as ``trn_rewired_<prob>.pkl`` in that
    version's output directory.
    """
    progress = ProgressBar(maxval=args.rnd_num, widgets=[Timer(), " ",
            SimpleProgress(), " ", Percentage(), " ", Bar(), " ", ETA()])
    out_name = "trn_rewired_{0:.1f}.pkl".format(args.prob)
    for ver in versions:
        LOGGER.info(ver)
        results = lb_view.map(rewire, [ver] * args.rnd_num, block=False,
                ordered=False)
        progress.start()
        collected = list()
        for network in results:
            collected.append(network)
            progress += 1
        progress.finish()
        pyorg.write_pickle(collected,
                os.path.join(args.out_path, ver, out_name))
        # drop cached engine results and local references so memory is
        # reclaimed before processing the next version
        lb_view.purge_results("all")
        del collected[:]
def randomisation(lb_view, versions, args):
    """For each version, gather ``args.rnd_num`` switch-randomised null
    models from the cluster view, log the mean edge-flip success rate, and
    pickle the networks as ``trn_random.pkl`` in that version's directory.
    """
    progress = ProgressBar(maxval=args.rnd_num, widgets=[Timer(), " ",
            SimpleProgress(), " ", Percentage(), " ", Bar(), " ", ETA()])
    for ver in versions:
        LOGGER.info(ver)
        results = lb_view.map(null_model, [ver] * args.rnd_num, block=False,
                ordered=False)
        progress.start()
        networks = list()
        rates = list()
        for (random_net, rate) in results:
            networks.append(random_net)
            rates.append(rate)
            progress += 1
        progress.finish()
        # free cached results on the engines before the next version
        lb_view.purge_results("all")
        LOGGER.info("mean flip success rate: %.3G +/- %.3G",
                np.mean(rates), np.std(rates))
        pyorg.write_pickle(networks,
                os.path.join(args.out_path, ver, "trn_random.pkl"))
        del networks[:]
def construct_gpn(path, window_sizes):
    """Construct gene proximity networks (GPNs) for one RegulonDB version,
    one per window size, and pickle each as ``gpn_<window>.pkl``.

    Parameters
    ----------
    path : str
        Directory containing ``genes.pkl``; its basename is the version.
    window_sizes : iterable of int
        Proximity window sizes to generate networks for.
    """
    LOGGER.info("{0:*^78s}".format("Construct GPN"))
    version = os.path.basename(path)
    if not version:
        # path ended with a separator; use the parent directory name
        version = os.path.basename(os.path.dirname(path))
    LOGGER.info("{0:*^78s}".format(version))
    genes = pyorganism.read_pickle(os.path.join(path, "genes.pkl"))
    gpn_gen = pyreg.GPNGenerator()
    gpn_gen.parse_genes(genes)
    for window in window_sizes:
        LOGGER.info("Window Size: %d", window)
        gpn = gpn_gen.generate_gpn(window)
        LOGGER.info("%d Nodes", len(gpn))
        LOGGER.info("%d Links", gpn.size())
        # sort explicitly by size: networkx does not guarantee that
        # connected_components yields the largest component first
        components = sorted(nx.connected_components(gpn), key=len,
                reverse=True)
        LOGGER.info("%d Components", len(components))
        if components:  # guard against an empty graph (IndexError before)
            LOGGER.info("Largest Component: %d", len(components[0]))
            if len(components) > 1:
                LOGGER.info("Second Largest Component: %d",
                        len(components[1]))
        pyorganism.write_pickle(gpn, os.path.join(path,
                "gpn_%d.pkl" % window))
def main(argv):
    """Compile all RegulonDB content for one version into pickled objects.

    ``argv[0]`` is the RegulonDB source directory, ``argv[1]`` the output
    directory for pickles, and the optional ``argv[2]`` an explicit version
    string (otherwise derived from ``argv[0]``).  Each object class follows
    a load-or-compile pattern: an existing pickle is read, otherwise the
    data is compiled from source; everything is re-pickled at the end
    because linking steps may have updated object attributes in between.

    Exits with status 1 if the source directory does not exist.
    """
    LOGGER.info("{0:*^78s}".format("Compile RegulonDB Content"))
    if not os.path.exists(argv[0]):
        sys.exit(1)
    if not os.path.exists(argv[1]):
        os.makedirs(argv[1])
    version = argv[2] if len(argv) == 3 else os.path.basename(argv[0])
    if not version:
        # argv[0] ended with a separator; use the parent directory name
        version = os.path.basename(os.path.dirname(argv[0]))
    LOGGER.info("{0:*^78s}".format(version))
    global VERSION
    VERSION = version
    # numeric tuple form, e.g. "7.2" -> (7, 2), for ordered comparisons
    version = tuple([int(num) for num in version.split(".")])
    # compile genes and gene products
    gene_file = os.path.join(argv[1], "genes.pkl")
    if os.path.exists(gene_file):
        genes = pyorganism.read_pickle(gene_file)
    else:
        genes = compile_genes(argv[0])
    product_file = os.path.join(argv[1], "products.pkl")
    if os.path.exists(product_file):
        products = pyorganism.read_pickle(product_file)
    else:
        products = compile_products(argv[0])
    # attach products to their genes (mutates the objects loaded above)
    regdb.link_gene_product(os.path.join(argv[0], "gene_product_link.xml"),
            VERSION)
    tf_file = os.path.join(argv[1], "transcription_factors.pkl")
    if os.path.exists(tf_file):
        t_factors = pyorganism.read_pickle(tf_file)
    else:
        t_factors = compile_transcription_factors(argv[0], version)
    num = sum(1 for gene in genes if isinstance(gene.regulatory_product,
            pyreg.TranscriptionFactor))
    norm = float(len(genes))
    LOGGER.info("Found {0:d} genes that code for transcription factors({1:.2%})".format(num,
            num / norm))
    # sigma factors only exist as separate records from RegulonDB 7.2 on
    if version >= (7, 2):
        sigma_file = os.path.join(argv[1], "sigma_factors.pkl")
        if os.path.exists(sigma_file):
            sigma_factors = pyorganism.read_pickle(sigma_file)
        else:
            sigma_factors = compile_sigma_factors(argv[0])
    conf_file = os.path.join(argv[1], "conformations.pkl")
    if os.path.exists(conf_file):
        conformations = pyorganism.read_pickle(conf_file)
    else:
        conformations = compile_conformations(argv[0])
    prom_file = os.path.join(argv[1], "promoters.pkl")
    if os.path.exists(prom_file):
        promoters = pyorganism.read_pickle(prom_file)
    else:
        promoters = compile_promoters(argv[0])
    op_file = os.path.join(argv[1], "operons.pkl")
    if os.path.exists(op_file):
        operons = pyorganism.read_pickle(op_file)
    else:
        operons = compile_operons(argv[0])
    # cross-link operons with promoters and genes (mutates objects)
    regdb.update_operons(operons, promoters, genes)
    tu_file = os.path.join(argv[1], "transcription_units.pkl")
    if os.path.exists(tu_file):
        t_units = pyorganism.read_pickle(tu_file)
    else:
        t_units = compile_transcription_units(argv[0])
    inter_file = os.path.join(argv[1], "interactions.pkl")
    if os.path.exists(inter_file):
        interactions = pyorganism.read_pickle(inter_file)
    else:
        interactions = compile_regulation(argv[0])
    # write-out all pickles again since object attributes may have been updated
    pyorganism.write_pickle(genes, gene_file)
    pyorganism.write_pickle(products, product_file)
    pyorganism.write_pickle(t_factors, tf_file)
    if version >= (7, 2):
        pyorganism.write_pickle(sigma_factors, sigma_file)
    pyorganism.write_pickle(conformations, conf_file)
    pyorganism.write_pickle(promoters, prom_file)
    pyorganism.write_pickle(operons, op_file)
    pyorganism.write_pickle(t_units, tu_file)
    pyorganism.write_pickle(interactions, inter_file)