# Example 1
def construct_trn(path):
    """Build the transcriptional regulatory network (TRN) from pickled
    RegulonDB objects in *path* and write it back as ``trn.pkl``.

    Parameters
    ----------
    path: str
        Directory containing ``conformations.pkl``,
        ``transcription_units.pkl`` and ``interactions.pkl``; the version
        string is derived from the directory name.
    """
    LOGGER.info("{0:*^78s}".format("Construct TRN"))
    version = os.path.basename(path)
    if not version:
        # path ended with a separator; fall back to the parent directory name
        version = os.path.basename(os.path.dirname(path))
    LOGGER.info("{0:*^78s}".format(version))
    # load objects so that they are in memory (registers them by version)
    pyorganism.read_pickle(os.path.join(path, "conformations.pkl"))
    t_units = pyorganism.read_pickle(os.path.join(path, "transcription_units.pkl"))
    interactions = pyorganism.read_pickle(os.path.join(path, "interactions.pkl"))
    assert all(pyreg.Conformation.has_key(triple[0], version) for triple in interactions),\
            "unknown conformation in regulatory interactions"
    assert all(pyreg.Promoter.has_key(triple[1], version) for triple in interactions),\
            "unknown promoter in regulatory interactions"
    # conformation-promoter directed regulatory network
    cpn = nx.MultiDiGraph()
    for (u, v, inter) in interactions:
        first = pyreg.Conformation[u, version]
        second = pyreg.Promoter[v, version]
        cpn.add_edge(first, second, key=inter)
    # TRN from CPN: each conformation is replaced by its transcription factor
    # and each promoter by the genes of its transcription units
    trn = pyreg.TRN()
    node_converter = NodeConverter(t_units)
    for (u, v, k, d) in cpn.edges_iter(keys=True, data=True):
        first = u.t_factor
        for nbr in node_converter(v.unique_id):
            trn.add_edge(first, nbr, key=k, **d.copy())
    LOGGER.info("%d Nodes", len(trn))
    LOGGER.info("%d Links", trn.size())
    # BUG FIX: nx.connected_components does not guarantee ordering by size,
    # so components[0] was not necessarily the largest; sort explicitly.
    components = sorted(nx.connected_components(trn.to_undirected()),
            key=len, reverse=True)
    LOGGER.info("%d Components", len(components))
    LOGGER.info("Largest Component: %d", len(components[0]))
    if len(components) > 1:
        LOGGER.info("Second Largest Component: %d", len(components[1]))
    pyorganism.write_pickle(trn, os.path.join(path, "trn.pkl"))
def compile_feature2gene(objects_path):
    """Map feature names to gene objects and pickle the map.

    Builds ``feature2gene.pkl`` in *objects_path* by matching names against
    the loaded genes: exact (lower-cased) lookup, synonym lookup, fuzzy
    matching at ``CONFIG["threshold"]`` percent, then manual fixes.

    Parameters
    ----------
    objects_path: str
        Directory containing ``genes.pkl``; also receives the output pickle
        and determines the version (sets the module-global ``VERSION``).
    """
    LOGGER.info("{0:*^78s}".format("Compile Gene Feature Map"))
    version = os.path.basename(objects_path)
    if not version:
        # path ended with a separator; fall back to the parent directory name
        version = os.path.basename(os.path.dirname(objects_path))
    global VERSION
    VERSION = version
    LOGGER.info("{0:*^78s}".format(version))
    names = compile_names(CONFIG["data_paths"], CONFIG["data_load"])
    LOGGER.info("Loading genes")
    genes = pyorganism.read_pickle(os.path.join(objects_path, "genes.pkl"))
    LOGGER.info("Finding gene names")
    finder = pyorganism.FindObject(genes,
            targets=[gene.name.lower() for gene in genes])
    (feature2gene, gap) = compile_feature_map(genes, names, finder)
    synonyms = synonym_finder(genes)
    LOGGER.info("Finding gene names by synonyms")
    gap = extend_feature_map(feature2gene, gap, synonyms)
    LOGGER.info("Missing %d gene names", len(gap))
    LOGGER.info("Fuzzy search of gene names (threshold %d%%)", CONFIG["threshold"])
    # removed a leftover debug statement that forced the logger to DEBUG level
    gap = fuzzy_extension(feature2gene, gap, finder, CONFIG["threshold"])
    LOGGER.info("Fuzzy search of gene names by synonyms (threshold %d%%)", CONFIG["threshold"])
    gap = fuzzy_extension(feature2gene, gap, synonyms, CONFIG["threshold"])
    LOGGER.info("Manual update")
    manual_name_updates(feature2gene)
    # .values() works on both Python 2 and 3 (was .itervalues(), py2-only)
    num = sum(1 for gene in feature2gene.values() if gene)
    LOGGER.info("Final map contains %d names and %d genes (%3.2f%%)", len(feature2gene),
            num, 100.0 * num / len(feature2gene))
    pyorganism.write_pickle(feature2gene, os.path.join(objects_path, "feature2gene.pkl"))
# Example 3
def update_map(map_path, frame_path, update_from, version=""):
    """Update gene identifiers in a pickled ID map from an HDF data frame.

    Writes ``corrected_<map_name>.pkl`` next to *map_path*, mapping each
    frame index entry to the gene found under the *update_from* identifier.

    Parameters
    ----------
    map_path: str
        Path to the original map pickle; its base name selects the HDF key
        and names the output file.
    frame_path: str
        Path to an HDF store holding the mapping data frame.
    update_from: str
        Column holding the identifiers to map to.
    version: str, optional
        RegulonDB version; derived from the directory name when empty.
    """
    LOGGER.info("{0:*^78s}".format("Update Gene IDs"))
    map_name = os.path.splitext(os.path.basename(map_path))[0]
    LOGGER.info("{0:*^78s}".format(map_name))
    base_path = os.path.dirname(map_path)
    if not version:
        version = os.path.basename(base_path)
    LOGGER.info("{0:*^78s}".format(version))
    # load into memory so that pyreg.Gene lookups by identifier succeed
    LOGGER.info("Loading genes")
    pyorganism.read_pickle(os.path.join(base_path, "genes.pkl"))
    LOGGER.info("Reading data frame")
    hdf_key = "/%s" % (map_name,)
    mapping = pandas.read_hdf(frame_path, hdf_key)
    id2gene = dict()
    # The original three-way branch (missing old ID / differing IDs / equal
    # IDs) performed the identical lookup in every case — when the IDs are
    # equal the two columns are interchangeable — so use update_from directly.
    for row in mapping[[version, update_from]].itertuples():
        id2gene[row[0]] = pyreg.Gene.get(row[2], None, version)
    pyorganism.write_pickle(id2gene, os.path.join(base_path, "corrected_" +
        map_name + ".pkl"))
# Example 4
def compile_name2gene(objects_path):
    """Build maps from gene names and blattner numbers to gene objects.

    Writes ``name2gene.pkl`` and ``blattner2gene.pkl`` into *objects_path*.
    Matching proceeds in stages: exact attribute lookup, synonym lookup,
    fuzzy matching at ``CONFIG["threshold"]`` percent, then manual fixes.

    Parameters
    ----------
    objects_path: str
        Directory containing ``genes.pkl``; also receives the output pickles
        and determines the version (sets the module-global ``VERSION``).
    """
    LOGGER.info("{0:*^78s}".format("Compile Gene Name Map"))
    full_data = compile_names(CONFIG["data_paths"], CONFIG["data_load"])
    version = os.path.basename(objects_path)
    if not version:
        # path ended with a separator; fall back to the parent directory name
        version = os.path.basename(os.path.dirname(objects_path))
    global VERSION
    VERSION = version
    LOGGER.info("{0:*^78s}".format(version))
    LOGGER.info("Loading genes")
    genes = pyorganism.read_pickle(os.path.join(objects_path, "genes.pkl"))
    LOGGER.info("Finding gene names")
    names = set(full_data["name"].unique())
    # NaN marks missing names in the data frame and must not be searched for
    if numpy.nan in names:
        names.remove(numpy.nan)
    names = sorted(names)
    finder = pyorganism.FindObject(genes, "name")
    (name2gene, name_gap) = compile_feature_map(genes, names, finder)
    synonyms = synonym_finder(genes)
    LOGGER.info("Finding gene names by synonyms")
    name_gap = extend_feature_map(name2gene, name_gap, synonyms)
    LOGGER.info("Missing %d gene names", len(name_gap))
    LOGGER.info("Fuzzy search of gene names (threshold %d%%)", CONFIG["threshold"])
    name_gap = fuzzy_extension(name2gene, name_gap, finder, CONFIG["threshold"])
    manual_name_updates(name2gene)
    # .values() works on both Python 2 and 3 (was .itervalues(), py2-only)
    num = sum(1 for gene in name2gene.values() if gene)
    LOGGER.info("Final map contains %d names and %d genes (%3.2f%%)", len(name2gene),
            num, 100.0 * num / len(name2gene))
    LOGGER.info("Finding gene blattner numbers")
    bnumbers = set(full_data["blattner"].unique())
    if numpy.nan in bnumbers:
        bnumbers.remove(numpy.nan)
    bnumbers = sorted(bnumbers)
    gene_finder = pyorganism.FindObject(genes, "bnumber")
    (blattner2gene, blattner_gap) = compile_feature_map(genes, bnumbers, gene_finder)
    LOGGER.info("Finding gene blattner numbers by synonyms")
    blattner_gap = extend_feature_map(blattner2gene, blattner_gap, synonyms)
    LOGGER.info("Missing %d gene blattner numbers", len(blattner_gap))
    LOGGER.info("Fuzzy search of gene blattner numbers (threshold %d%%)", CONFIG["threshold"])
    # BUG FIX: this fuzzy search previously used the name-based ``finder``;
    # blattner numbers must be matched with the bnumber-based ``gene_finder``.
    blattner_gap = fuzzy_extension(blattner2gene, blattner_gap, gene_finder, CONFIG["threshold"])
    num = sum(1 for gene in blattner2gene.values() if gene)
    LOGGER.info("Final map contains %d blattner numbers and %d genes (%3.2f%%)",
            len(blattner2gene), num, 100.0 * num / len(blattner2gene))
    verify_union(name2gene, blattner2gene, full_data)
    pyorganism.write_pickle(name2gene, os.path.join(objects_path, "name2gene.pkl"))
    pyorganism.write_pickle(blattner2gene, os.path.join(objects_path, "blattner2gene.pkl"))
def main_random(args):
    """Generate randomized null models of the TRN for each matching location.

    For every directory matching ``args.glob`` under ``args.in_path`` that
    contains a ``trn.pkl``, optionally produce rewired and/or edge-switched
    random networks in parallel and pickle the lists under ``args.out_path``.

    Parameters
    ----------
    args: namespace with ``in_path``, ``glob``, ``out_path``, ``nproc``,
        ``rnd_num`` (samples per network), ``prob`` (rewiring probability),
        ``flip_num`` (switches per edge) and the boolean flags
        ``run_rewire`` / ``run_switch``.
    """
    locations = sorted(glob(os.path.join(args.in_path, args.glob)))
    locations = [os.path.abspath(loc) for loc in locations]
    pool = multiprocessing.Pool(args.nproc)
    bar = ProgressBar(maxval=args.rnd_num, widgets=[Timer(), " ",
        SimpleProgress(), " ", Percentage(), " ", Bar(), " ", ETA()])
    rewire_name = "grn_rewired_{0:.1f}.pkl".format(args.prob)
    switch_name = "grn_switched_{0:d}.pkl".format(args.flip_num)
    for path in locations:
        filename = os.path.join(path, "trn.pkl")
        if not os.path.exists(filename):
            continue
        ver = version_from_path(path)
        base_path = os.path.join(args.out_path, ver)
        if not os.path.isdir(base_path):
            os.makedirs(base_path)
        LOGGER.info(ver)
        trn = pyorg.read_pickle(filename)
        # we consider null-models on the projected level since that is the level
        # on which we evaluate topological quantities
        net = pyreg.to_simple(trn.to_grn())
        if args.run_rewire:
            LOGGER.info("Rewiring with probability %.1f", args.prob)
            tasks = [(net, args.prob)] * args.rnd_num
            res_it = pool.imap_unordered(rewire, tasks)
            rands = list()
            bar.start()
            for rnd in res_it:
                rands.append(rnd)
                bar += 1
            bar.finish()
            pyorg.write_pickle(rands, os.path.join(base_path, rewire_name))
        if args.run_switch:
            LOGGER.info("Switch-randomizing each edge %d times", args.flip_num)
            tasks = [(net, args.flip_num)] * args.rnd_num
            res_it = pool.imap_unordered(switch, tasks)
            success = list()
            rands = list()
            bar.start()
            for (rnd, rate) in res_it:
                rands.append(rnd)
                success.append(rate)
                bar += 1
            bar.finish()
            pyorg.write_pickle(rands, os.path.join(base_path, switch_name))
            LOGGER.info("mean flip success rate: %.3G +/- %.3G", np.mean(success),
                    np.std(success))
    pool.close()
    # BUG FIX: wait for the worker processes to terminate; close() alone
    # leaves them unjoined when the function returns.
    pool.join()
def rewiring(lb_view, versions, args):
    """Generate rewired null-model networks per version on parallel workers.

    For each version, submits ``args.rnd_num`` ``rewire`` jobs to the
    load-balanced view and pickles the collected results to
    ``<out_path>/<version>/trn_rewired_<prob>.pkl``.

    Parameters
    ----------
    lb_view: load-balanced parallel view (IPython.parallel-style) used to
        distribute the ``rewire`` calls
    versions: iterable of version directory names under ``args.out_path``
    args: namespace providing ``rnd_num`` (samples per version), ``prob``
        (rewiring probability, only used in the output filename) and
        ``out_path``
    """
    bar = ProgressBar(maxval=args.rnd_num, widgets=[Timer(), " ",
        SimpleProgress(), " ", Percentage(), " ", Bar(), " ", ETA()])
    rands = list()
    for ver in versions:
        LOGGER.info(ver)
        # non-blocking submission; results are consumed as they arrive
        res_it = lb_view.map(rewire, [ver] * args.rnd_num, block=False, ordered=False)
        bar.start()
        for rng in res_it:
            rands.append(rng)
            bar += 1
        bar.finish()
        pyorg.write_pickle(rands, os.path.join(args.out_path, ver,
                "trn_rewired_{0:.1f}.pkl".format(args.prob)))
        # drop result references held by the hub before the next round
        lb_view.purge_results("all")
        del rands[:] # activate garbage collection in loop
def randomisation(lb_view, versions, args):
    """Generate switch-randomized null models per version on parallel workers.

    For each version, submits ``args.rnd_num`` ``null_model`` jobs, logs the
    mean edge-flip success rate and pickles the random networks to
    ``<out_path>/<version>/trn_random.pkl``.

    Parameters
    ----------
    lb_view: load-balanced parallel view (IPython.parallel-style) used to
        distribute the ``null_model`` calls; each call returns a
        ``(network, flip_rate)`` pair
    versions: iterable of version directory names under ``args.out_path``
    args: namespace providing ``rnd_num`` (samples per version) and
        ``out_path``
    """
    bar = ProgressBar(maxval=args.rnd_num, widgets=[Timer(), " ",
        SimpleProgress(), " ", Percentage(), " ", Bar(), " ", ETA()])
    for ver in versions:
        LOGGER.info(ver)
        # non-blocking submission; results are consumed as they arrive
        res_it = lb_view.map(null_model, [ver] * args.rnd_num, block=False, ordered=False)
        bar.start()
        rands = list()
        success = list()
        for (rnd_net, flip_rate) in res_it:
            rands.append(rnd_net)
            success.append(flip_rate)
            bar +=1
        bar.finish()
        # drop result references held by the hub before the next round
        lb_view.purge_results("all")
        LOGGER.info("mean flip success rate: %.3G +/- %.3G", np.mean(success),
                np.std(success))
        pyorg.write_pickle(rands, os.path.join(args.out_path, ver,
                "trn_random.pkl"))
        del rands[:] # activate garbage collection in loop
# Example 8
def construct_gpn(path, window_sizes):
    """Construct gene proximity networks (GPN) for several window sizes.

    Loads the pickled genes from *path* and, for each window size, generates
    a GPN, logs its basic statistics and writes it to ``gpn_<window>.pkl``.

    Parameters
    ----------
    path: str
        Directory containing ``genes.pkl``; also receives the output pickles.
    window_sizes: iterable of int
        Proximity window sizes to generate one network each for.
    """
    LOGGER.info("{0:*^78s}".format("Construct GPN"))
    version = os.path.basename(path)
    if not version:
        # path ended with a separator; fall back to the parent directory name
        version = os.path.basename(os.path.dirname(path))
    LOGGER.info("{0:*^78s}".format(version))
    genes = pyorganism.read_pickle(os.path.join(path, "genes.pkl"))
    gpn_gen = pyreg.GPNGenerator()
    gpn_gen.parse_genes(genes)
    for window in window_sizes:
        LOGGER.info("Window Size: %d",window)
        gpn = gpn_gen.generate_gpn(window)
        LOGGER.info("%d Nodes", len(gpn))
        LOGGER.info("%d Links", gpn.size())
        # BUG FIX: nx.connected_components does not guarantee ordering by
        # size, so components[0] was not necessarily the largest; sort.
        components = sorted(nx.connected_components(gpn), key=len, reverse=True)
        LOGGER.info("%d Components", len(components))
        LOGGER.info("Largest Component: %d", len(components[0]))
        if len(components) > 1:
            LOGGER.info("Second Largest Component: %d", len(components[1]))
        pyorganism.write_pickle(gpn, os.path.join(path, "gpn_%d.pkl" % window))
def main(argv):
    """Compile RegulonDB content into pickled Python objects.

    Parameters
    ----------
    argv: sequence of str
        ``[input_dir, output_dir]`` or ``[input_dir, output_dir, version]``.
        When no version is given it is derived from the input directory name.
        Exits with status 1 when the input directory does not exist; the
        output directory is created when missing.

    Each object class is loaded from an existing pickle in the output
    directory when present, otherwise compiled from the input files; all
    pickles are rewritten at the end because compiling later classes may
    update attributes of earlier ones.
    """
    LOGGER.info("{0:*^78s}".format("Compile RegulonDB Content"))
    if not os.path.exists(argv[0]):
        sys.exit(1)
    if not os.path.exists(argv[1]):
        os.makedirs(argv[1])
    version = argv[2] if len(argv) == 3 else os.path.basename(argv[0])
    if not version:
        # input path ended with a separator; use the parent directory name
        version = os.path.basename(os.path.dirname(argv[0]))
    LOGGER.info("{0:*^78s}".format(version))
    global VERSION
    VERSION = version
    # numeric tuple, e.g. "7.2" -> (7, 2), for the release comparisons below
    version = tuple([int(num) for num in version.split(".")])
    # compile genes and gene products
    gene_file = os.path.join(argv[1], "genes.pkl")
    if os.path.exists(gene_file):
        genes = pyorganism.read_pickle(gene_file)
    else:
        genes = compile_genes(argv[0])
    product_file = os.path.join(argv[1], "products.pkl")
    if os.path.exists(product_file):
        products = pyorganism.read_pickle(product_file)
    else:
        products = compile_products(argv[0])
        # linking only happens when products were freshly compiled
        regdb.link_gene_product(os.path.join(argv[0], "gene_product_link.xml"), VERSION)
    tf_file = os.path.join(argv[1], "transcription_factors.pkl")
    if os.path.exists(tf_file):
        t_factors = pyorganism.read_pickle(tf_file)
    else:
        t_factors = compile_transcription_factors(argv[0], version)
        num = sum(1 for gene in genes if isinstance(gene.regulatory_product, pyreg.TranscriptionFactor))
        norm = float(len(genes))
        LOGGER.info("Found {0:d} genes that code for transcription factors({1:.2%})".format(num, num / norm))
    # sigma factor files only exist in RegulonDB releases 7.2 and later
    if version >= (7, 2):
        sigma_file = os.path.join(argv[1], "sigma_factors.pkl")
        if os.path.exists(sigma_file):
            sigma_factors = pyorganism.read_pickle(sigma_file)
        else:
            sigma_factors = compile_sigma_factors(argv[0])
    conf_file = os.path.join(argv[1], "conformations.pkl")
    if os.path.exists(conf_file):
        conformations = pyorganism.read_pickle(conf_file)
    else:
        conformations = compile_conformations(argv[0])
    prom_file = os.path.join(argv[1], "promoters.pkl")
    if os.path.exists(prom_file):
        promoters = pyorganism.read_pickle(prom_file)
    else:
        promoters = compile_promoters(argv[0])
    op_file = os.path.join(argv[1], "operons.pkl")
    if os.path.exists(op_file):
        operons = pyorganism.read_pickle(op_file)
    else:
        operons = compile_operons(argv[0])
        # cross-link freshly compiled operons with promoters and genes
        regdb.update_operons(operons, promoters, genes)
    tu_file = os.path.join(argv[1], "transcription_units.pkl")
    if os.path.exists(tu_file):
        t_units = pyorganism.read_pickle(tu_file)
    else:
        t_units = compile_transcription_units(argv[0])
    inter_file = os.path.join(argv[1], "interactions.pkl")
    if os.path.exists(inter_file):
        interactions = pyorganism.read_pickle(inter_file)
    else:
        interactions = compile_regulation(argv[0])
    # write-out all pickles again since object attributes may have been updated
    pyorganism.write_pickle(genes, gene_file)
    pyorganism.write_pickle(products, product_file)
    pyorganism.write_pickle(t_factors, tf_file)
    if version >= (7, 2):
        pyorganism.write_pickle(sigma_factors, sigma_file)
    pyorganism.write_pickle(conformations, conf_file)
    pyorganism.write_pickle(promoters, prom_file)
    pyorganism.write_pickle(operons, op_file)
    pyorganism.write_pickle(t_units, tu_file)
    pyorganism.write_pickle(interactions, inter_file)