Example 1
def update_map(map_path, frame_path, update_from, version=""):
    LOGGER.info("{0:*^78s}".format("Update Gene IDs"))
    map_name = os.path.splitext(os.path.basename(map_path))[0]
    LOGGER.info("{0:*^78s}".format(map_name))
    base_path = os.path.dirname(map_path)
    if not version:
        version = os.path.basename(base_path)
    LOGGER.info("{0:*^78s}".format(version))
    # load into memory
    LOGGER.info("Loading genes")
    pyorganism.read_pickle(os.path.join(base_path, "genes.pkl"))
    LOGGER.info("Reading data frame")
    hdf_key = "/%s" % (map_name,)
    mapping = pandas.read_hdf(frame_path, hdf_key)
#    frame_info(mapping)
    id2gene = dict()
    # each row is (index, <version column>, <update_from column>)
    for row in mapping[[version, update_from]].itertuples():
        if not row[1]:
            # no identifier recorded for this version: take the source ID
            id2gene[row[0]] = pyreg.Gene.get(row[2], None, version)
        elif row[1] != row[2]:
            # identifiers disagree: prefer the ID we update from
            id2gene[row[0]] = pyreg.Gene.get(row[2], None, version)
        else:
            id2gene[row[0]] = pyreg.Gene.get(row[1], None, version)
    pyorganism.write_pickle(id2gene, os.path.join(base_path, "corrected_" +
        map_name + ".pkl"))
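A note on the positional indexing above: with a two-column selection, itertuples yields (index, first column, second column). A minimal pandas-only sketch (the frame contents here are made up) of the layout update_map relies on:

import pandas as pd

# Hypothetical stand-in for the HDF-backed mapping frame.
mapping = pd.DataFrame(
    {"5.0": ["b0001", "", "b0003"], "4.7": ["b0001", "b0002", "b9999"]},
    index=["ECK120000001", "ECK120000002", "ECK120000003"])

for row in mapping[["5.0", "4.7"]].itertuples():
    # row[0] is the frame index, row[1] the "5.0" column, row[2] the "4.7" one
    print(row[0], row[1], row[2])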
Example 2
def construct_trn(path):
    LOGGER.info("{0:*^78s}".format("Construct TRN"))
    version = os.path.basename(path)
    if not version:
        version = os.path.basename(os.path.dirname(path))
    LOGGER.info("{0:*^78s}".format(version))
    # load objects so that they are in memory
    pyorganism.read_pickle(os.path.join(path, "conformations.pkl"))
    t_units = pyorganism.read_pickle(os.path.join(path, "transcription_units.pkl"))
    interactions = pyorganism.read_pickle(os.path.join(path, "interactions.pkl"))
    assert all(pyreg.Conformation.has_key(triple[0], version) for triple in interactions),\
            "unknown conformation in regulatory interactions"
    assert all(pyreg.Promoter.has_key(triple[1], version) for triple in interactions),\
            "unknown promoter in regulatory interactions"
    # conformation-promoter directed regulatory network
    cpn = nx.MultiDiGraph()
    for (u, v, inter) in interactions:
        first = pyreg.Conformation[u, version]
        second = pyreg.Promoter[v, version]
        cpn.add_edge(first, second, key=inter)
    # TRN from CPN
    trn = pyreg.TRN()
    node_converter = NodeConverter(t_units)
    for (u, v, k, d) in cpn.edges_iter(keys=True, data=True):
        first = u.t_factor
        for nbr in node_converter(v.unique_id):
            trn.add_edge(first, nbr, key=k, **d.copy())
    LOGGER.info("%d Nodes", len(trn))
    LOGGER.info("%d Links", trn.size())
    # sort by size so that components[0] is the largest
    components = sorted(nx.connected_components(trn.to_undirected()),
            key=len, reverse=True)
    LOGGER.info("%d Components", len(components))
    LOGGER.info("Largest Component: %d", len(components[0]))
    if len(components) > 1:
        LOGGER.info("Second Largest Component: %d", len(components[1]))
    pyorganism.write_pickle(trn, os.path.join(path, "trn.pkl"))
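pyreg.TRN and NodeConverter are project-specific, but the projection step above is a generic MultiDiGraph pattern: map each target node to one or more replacement nodes and copy the edge key and data across. A plain networkx sketch with made-up node names:

import networkx as nx

cpn = nx.MultiDiGraph()
cpn.add_edge("conf_A", "prom_X", key="activation", evidence="strong")

# hypothetical converter output: promoter -> regulated genes
node_map = {"prom_X": ["gene_1", "gene_2"]}

trn = nx.MultiDiGraph()
for (u, v, k, d) in cpn.edges(keys=True, data=True):
    for nbr in node_map.get(v, []):
        # keep the interaction type as the edge key and copy its attributes
        trn.add_edge(u, nbr, key=k, **d.copy())
print(list(trn.edges(keys=True)))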
Example 3
def store_joint_ids(map_path, frame_path, version=""):
    LOGGER.info("{0:*^78s}".format("Store Gene IDs"))
    map_name = os.path.splitext(os.path.basename(map_path))[0]
    LOGGER.info("{0:*^78s}".format(map_name))
    base_path = os.path.dirname(map_path)
    if not version:
        version = os.path.basename(base_path)
    LOGGER.info("{0:*^78s}".format(version))
    # load into memory
    LOGGER.info("Loading genes")
    pyorganism.read_pickle(os.path.join(base_path, "genes.pkl"))
    LOGGER.info("Loading feature map")
    id2gene = pyorganism.read_pickle(map_path)
    LOGGER.info("Creating data frame")
    df = create_dataframe(id2gene, version)
    hdf_key = "/%s" % (map_name,)
    LOGGER.info("Assembling joint frame")
    if os.path.exists(frame_path):
        try:
            mapping = pandas.read_hdf(frame_path, hdf_key)
            mapping[version] = pandas.Series(df[version], index=mapping.index)
        except KeyError:
            mapping = df
    else:
        mapping = df
    frame_info(mapping)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", pandas.io.pytables.PerformanceWarning)
        mapping.to_hdf(frame_path, hdf_key, format="table")
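The read-extend-write cycle around the HDF store is plain pandas; a minimal sketch, assuming any frame, key, and store path (the PerformanceWarning import location matches the older pandas used here):

import warnings

import pandas as pd
from pandas.io.pytables import PerformanceWarning

df = pd.DataFrame({"4.7": ["b0001", "b0002"]}, index=["g1", "g2"])
store_path = "example_frame.h5"  # hypothetical path

try:
    mapping = pd.read_hdf(store_path, "/genes")
    mapping["4.7"] = pd.Series(df["4.7"], index=mapping.index)
except (IOError, KeyError):
    mapping = df  # no store or no such key yet: start from the new frame

with warnings.catch_warnings():
    warnings.simplefilter("ignore", PerformanceWarning)
    mapping.to_hdf(store_path, key="/genes", format="table")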
Example 4
def null_stats(base_dir, task):
    glbls = globals()
    prob = glbls["prob"]
    choose = glbls["choose"]
    logger = glbls["LOGGER"]
    ver = os.path.basename(base_dir)
    if not ver:
        ver = os.path.basename(os.path.dirname(base_dir))
    if task == "rewired":
        desc = "rewired {0:.1f}".format(prob)
        filename = "trn_rewired_{0:.1f}.pkl".format(prob)
    elif task == "switch":
        desc = "switch"
        filename = "trn_random.pkl"
    else:
        # guard against unbound 'desc' and 'filename' further down
        raise ValueError("unknown task '{0}'".format(task))
    try:
        nets = pyorg.read_pickle(os.path.join(base_dir, filename))
    except (OSError, IOError, EOFError):
        (err, msg, trace) = sys.exc_info()
        logger.error("Version: '%s' Task: '%s'", ver, task)
        logger.error(str(msg))
        return err_stats(ver, desc)
    chosen = random.sample(nets, min(choose, len(nets)))
    logger.info("%d/%d random networks", len(chosen), len(nets))
    nets = [trn2grn(net) for net in chosen]
    return pd.concat([stats(net, ver, desc) for net in nets], ignore_index=True)
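null_stats reads prob, choose, and LOGGER from globals() instead of taking parameters, which is the usual shape of a multiprocessing worker whose configuration is planted by a pool initializer. A minimal sketch of that pairing (all names here are illustrative, not from the original):

import multiprocessing

def _init_worker(prob_value, choose_value):
    # plant module-level globals that the worker reads back
    globals()["prob"] = prob_value
    globals()["choose"] = choose_value

def worker(task):
    glbls = globals()
    return (task, glbls["prob"], glbls["choose"])

if __name__ == "__main__":
    pool = multiprocessing.Pool(2, initializer=_init_worker, initargs=(0.1, 5))
    print(pool.map(worker, ["a", "b"]))
    pool.close()
    pool.join()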
Example 5
def compile_feature2gene(objects_path):
    LOGGER.info("{0:*^78s}".format("Compile Gene Feature Map"))
    version = os.path.basename(objects_path)
    if not version:
        version = os.path.basename(os.path.dirname(objects_path))
    global VERSION
    VERSION = version
    LOGGER.info("{0:*^78s}".format(version))
    names = compile_names(CONFIG["data_paths"], CONFIG["data_load"])
    LOGGER.info("Loading genes")
    genes = pyorganism.read_pickle(os.path.join(objects_path, "genes.pkl"))
    LOGGER.info("Finding gene names")
    finder = pyorganism.FindObject(genes,
            targets=[gene.name.lower() for gene in genes])
    (feature2gene, gap) = compile_feature_map(genes, names, finder)
    synonyms = synonym_finder(genes)
    LOGGER.info("Finding gene names by synonyms")
    gap = extend_feature_map(feature2gene, gap, synonyms)
    LOGGER.info("Missing %d gene names", len(gap))
    LOGGER.info("Fuzzy search of gene names (threshold %d%%)", CONFIG["threshold"])
    LOGGER.setLevel(logging.DEBUG)
    gap = fuzzy_extension(feature2gene, gap, finder, CONFIG["threshold"])
    LOGGER.info("Fuzzy search of gene names by synonyms (threshold %d%%)", CONFIG["threshold"])
    gap = fuzzy_extension(feature2gene, gap, synonyms, CONFIG["threshold"])
    LOGGER.info("Manual update")
    manual_name_updates(feature2gene)
    num = sum(1 for gene in feature2gene.itervalues() if gene)
    LOGGER.info("Final map contains %d names and %d genes (%3.2f%%)", len(feature2gene),
            num, 100.0 * num / len(feature2gene))
    pyorganism.write_pickle(feature2gene, os.path.join(objects_path, "feature2gene.pkl"))
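fuzzy_extension is project code, but the underlying idea of accepting only matches above a percent threshold can be sketched with the standard library's difflib:

import difflib

def best_fuzzy_match(name, candidates, threshold):
    # return the closest candidate scoring above threshold percent, else None
    matches = difflib.get_close_matches(name, candidates, n=1,
            cutoff=threshold / 100.0)
    return matches[0] if matches else None

print(best_fuzzy_match("thrA", ["thra", "lacZ", "recA"], 70))  # -> 'thra'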
Example 6
def load_data(locations):
    tr_nets = dict()
    versions = list()
    drop = list()
    for path in locations:
        ver = os.path.basename(path)
        if not ver:
            ver = os.path.basename(os.path.dirname(path))
        try:
            pyorg.read_pickle(os.path.join(path, "genes.pkl"))
            pyorg.read_pickle(os.path.join(path, "transcription_factors.pkl"))
            trn = pyorg.read_pickle(os.path.join(path, "trn.pkl"))
            tr_nets[ver] = trn
            versions.append(ver)
        except IOError:
            drop.append(path)
            continue
    for path in drop:
        locations.remove(path)
    return (tr_nets, versions)
Example 7
def main(in_path, out_path, version=""):
    if not version:
        version = os.path.basename(in_path)
    if not version:
        version = os.path.basename(os.path.dirname(in_path))
    LOGGER.info("{0:*^78s}".format(version))
    LOGGER.info("Loading genes")
    genes = pyorganism.read_pickle(os.path.join(in_path, "genes.pkl"))
    LOGGER.info("Loading TRN")
    trn = pyorganism.read_pickle(os.path.join(in_path, "trn.pkl"))
    LOGGER.info("Loading TFs")
    t_factors = pyorganism.read_pickle(os.path.join(in_path,
            "transcription_factors.pkl"))
    LOGGER.info("Loading GPN")
    gpn = pyorganism.read_pickle(os.path.join(in_path, "gpn_5000.pkl"))
    (stats, dists) = gpn_stats(genes, gpn, version)
    store_results(os.path.join(out_path, "gpn_5000_statistics.csv"), stats)
    store_results(os.path.join(out_path, "gpn_5000_distributions.csv"), dists)
    (stats, dists) = trn_stats(genes, trn, t_factors, version)
    store_results(os.path.join(out_path, "trn_statistics.csv"), stats)
    store_results(os.path.join(out_path, "trn_distributions.csv"), dists)
Example 8
def compile_name2gene(objects_path):
    LOGGER.info("{0:*^78s}".format("Compile Gene Name Map"))
    full_data = compile_names(CONFIG["data_paths"], CONFIG["data_load"])
    version = os.path.basename(objects_path)
    if not version:
        version = os.path.basename(os.path.dirname(objects_path))
    global VERSION
    VERSION = version
    LOGGER.info("{0:*^78s}".format(version))
    LOGGER.info("Loading genes")
    genes = pyorganism.read_pickle(os.path.join(objects_path, "genes.pkl"))
    LOGGER.info("Finding gene names")
    names = set(full_data["name"].unique())
    if numpy.nan in names:
        names.remove(numpy.nan)
    names = sorted(names)
    finder = pyorganism.FindObject(genes, "name")
    (name2gene, name_gap) = compile_feature_map(genes, names, finder)
    synonyms = synonym_finder(genes)
    LOGGER.info("Finding gene names by synonyms")
    name_gap = extend_feature_map(name2gene, name_gap, synonyms)
    LOGGER.info("Missing %d gene names", len(name_gap))
    LOGGER.info("Fuzzy search of gene names (threshold %d%%)", CONFIG["threshold"])
    name_gap = fuzzy_extension(name2gene, name_gap, finder, CONFIG["threshold"])
#    LOGGER.info("Fuzzy search of gene names by synonyms (threshold %d%%)", CONFIG["threshold"])
#    name_gap = fuzzy_extension(name2gene, name_gap, synonyms, CONFIG["threshold"])
    manual_name_updates(name2gene)
    num = sum(1 for gene in name2gene.itervalues() if gene)
    LOGGER.info("Final map contains %d names and %d genes (%3.2f%%)", len(name2gene),
            num, 100.0 * num / len(name2gene))
    LOGGER.info("Finding gene blattner numbers")
    bnumbers = set(full_data["blattner"].unique())
    if numpy.nan in bnumbers:
        bnumbers.remove(numpy.nan)
    bnumbers = sorted(bnumbers)
    gene_finder = pyorganism.FindObject(genes, "bnumber")
    (blattner2gene, blattner_gap) = compile_feature_map(genes, bnumbers, gene_finder)
    LOGGER.info("Finding gene blattner numbers by synonyms")
    blattner_gap = extend_feature_map(blattner2gene, blattner_gap, synonyms)
    LOGGER.info("Missing %d gene blattner numbers", len(blattner_gap))
    LOGGER.info("Fuzzy search of gene blattner numbers (threshold %d%%)", CONFIG["threshold"])
    # use the bnumber-based finder here, not the name-based one
    blattner_gap = fuzzy_extension(blattner2gene, blattner_gap, gene_finder, CONFIG["threshold"])
#    LOGGER.info("Fuzzy search of gene blattner numbers by synonyms (threshold %d%%)", CONFIG["threshold"])
#    blattner_gap = fuzzy_extension(blattner2gene, blattner_gap, synonyms, CONFIG["threshold"])
    num = sum(1 for gene in blattner2gene.itervalues() if gene)
    LOGGER.info("Final map contains %d blattner numbers and %d genes (%3.2f%%)",
            len(blattner2gene), num, 100.0 * num / len(blattner2gene))
    verify_union(name2gene, blattner2gene, full_data)
    pyorganism.write_pickle(name2gene, os.path.join(objects_path, "name2gene.pkl"))
    pyorganism.write_pickle(blattner2gene, os.path.join(objects_path, "blattner2gene.pkl"))
Example 9
def main_random(args):
    locations = sorted(glob(os.path.join(args.in_path, args.glob)))
    locations = [os.path.abspath(loc) for loc in locations]
    pool = multiprocessing.Pool(args.nproc)
    bar = ProgressBar(maxval=args.rnd_num, widgets=[Timer(), " ",
        SimpleProgress(), " ", Percentage(), " ", Bar(), " ", ETA()])
    rewire_name = "grn_rewired_{0:.1f}.pkl".format(args.prob)
    switch_name = "grn_switched_{0:d}.pkl".format(args.flip_num)
    for path in locations:
        filename = os.path.join(path, "trn.pkl")
        if not os.path.exists(filename):
            continue
        ver = version_from_path(path)
        base_path = os.path.join(args.out_path, ver)
        if not os.path.isdir(base_path):
            os.makedirs(base_path)
        LOGGER.info(ver)
        trn = pyorg.read_pickle(filename)
        # we consider null-models on the projected level since that is the level
        # on which we evaluate topological quantities
        net = pyreg.to_simple(trn.to_grn())
        if args.run_rewire:
            LOGGER.info("Rewiring with probability %.1f", args.prob)
            tasks = [(net, args.prob)] * args.rnd_num
            res_it = pool.imap_unordered(rewire, tasks)
            rands = list()
            bar.start()
            for rnd in res_it:
                rands.append(rnd)
                bar += 1
            bar.finish()
            pyorg.write_pickle(rands, os.path.join(base_path, rewire_name))
        if args.run_switch:
            LOGGER.info("Switch-randomizing each edge %d times", args.flip_num)
            tasks = [(net, args.flip_num)] * args.rnd_num
            res_it = pool.imap_unordered(switch, tasks)
            success = list()
            rands = list()
            bar.start()
            for (rnd, rate) in res_it:
                rands.append(rnd)
                success.append(rate)
                bar += 1
            bar.finish()
            pyorg.write_pickle(rands, os.path.join(base_path, switch_name))
            LOGGER.info("mean flip success rate: %.3G +/- %.3G", np.mean(success),
                    np.std(success))
    pool.close()
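The rewire and switch helpers are project code, but the "switch" null model itself (repeated degree-preserving edge swaps) is standard; networkx ships it as double_edge_swap. A toy sketch showing that degrees survive the randomization:

import networkx as nx

net = nx.gnm_random_graph(50, 120, seed=1)
before = dict(net.degree())
# each accepted swap exchanges the endpoints of two edges,
# so every node keeps its degree
nx.double_edge_swap(net, nswap=10 * net.number_of_edges(), max_tries=10 ** 5)
assert before == dict(net.degree())
print(net.number_of_edges())  # still 120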
Example 10
def construct_gpn(path, window_sizes):
    LOGGER.info("{0:*^78s}".format("Construct GPN"))
    version = os.path.basename(path)
    if not version:
        version = os.path.basename(os.path.dirname(path))
    LOGGER.info("{0:*^78s}".format(version))
    genes = pyorganism.read_pickle(os.path.join(path, "genes.pkl"))
    gpn_gen = pyreg.GPNGenerator()
    gpn_gen.parse_genes(genes)
    for window in window_sizes:
        LOGGER.info("Window Size: %d",window)
        gpn = gpn_gen.generate_gpn(window)
        LOGGER.info("%d Nodes", len(gpn))
        LOGGER.info("%d Links", gpn.size())
        # sort by size so that components[0] is the largest
        components = sorted(nx.connected_components(gpn), key=len, reverse=True)
        LOGGER.info("%d Components", len(components))
        LOGGER.info("Largest Component: %d", len(components[0]))
        if len(components) > 1:
            LOGGER.info("Second Largest Component: %d", len(components[1]))
        pyorganism.write_pickle(gpn, os.path.join(path, "gpn_%d.pkl" % window))
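GPNGenerator is project code; one plausible reading of the proximity network it builds is "link genes whose positions lie within the window", which a few lines of plain networkx can sketch (positions are made up):

import itertools

import networkx as nx

positions = {"thrA": 336, "thrB": 2801, "thrC": 3734, "yaaX": 5234}
window = 2000

gpn = nx.Graph()
gpn.add_nodes_from(positions)
for (a, b) in itertools.combinations(positions, 2):
    if abs(positions[a] - positions[b]) <= window:
        gpn.add_edge(a, b)
print(gpn.number_of_edges())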
Example 11
def main_continuous(args):
    glbls = globals()
    engine = create_engine(args.engine)
    pymodels.Base.metadata.bind = engine
    pymodels.Session.configure(bind=engine)
    session = pymodels.Session()
    tasks = session.query(pymodels.Job).\
            options(joinedload("analysis"), joinedload("control"),
            joinedload("experiment")).filter(~pymodels.Job.complete).all()
    if len(tasks) == 0:
        LOGGER.warning("Nothing to do")
        return
    analysis_configs = {job.analysis for job in tasks}
    control_configs = {job.control for job in tasks}
    experiments = {job.experiment for job in tasks}
    preparations = {job.preparation for job in tasks}
    sampling = {job.sampling for job in tasks}
    projections = {job.projection for job in tasks}
    LOGGER.debug("%d analysis configurations", len(analysis_configs))
    LOGGER.debug("%d control configurations", len(control_configs))
    LOGGER.debug("%d experiments", len(experiments))
    LOGGER.debug("%d setup cases", len(preparations))
    LOGGER.debug("%d sampling methods", len(sampling))
    LOGGER.debug("%d network projections", len(projections))
    num_prep = len(analysis_configs) * len(control_configs) * len(experiments)\
            * len(preparations) * len(sampling) * len(projections)
    LOGGER.debug("%d total configurations", num_prep)
    LOGGER.info("Preparing Data")
    task_args = dict()
    bar = ProgressBar(maxval=num_prep, widgets=[Timer(), " ",
            SimpleProgress(), " ", Percentage(), " ", Bar(), " ",
            ETA()]).start()
    for anal in analysis_configs:
        LOGGER.debug(" %s:", anal.version)
        feature2node = pyorg.read_pickle(os.path.join(anal.objects, anal.map))
        for cntrl in control_configs:
            LOGGER.debug("  %s", cntrl.type)
            net = pyorg.read_pickle(os.path.join(anal.objects, cntrl.network))
            tu_net = pyreg.to_transcription_unit_based(net)
            op_net = pyreg.to_operon_based(net)
            for exp in experiments:
                LOGGER.debug("   %s", exp.strain)
                for prep in preparations:
                    LOGGER.debug("    %s", prep)
                    series = glbls[prep](session, exp)
                    for sampl in sampling:
                        LOGGER.debug("     %s", sampl)
                        for prj in projections:
                            LOGGER.debug("      %s", prj)
                            control = pyreg.ContinuousControl()
                            if prj == "tu":
                                control.setup(tu_net, series, feature2node, sampl)
                            elif prj == "operon":
                                control.setup(op_net, series, feature2node, sampl)
                            else:
                                control.setup(net, series, feature2node, sampl)
                            if cntrl.type == "analog":
                                control.from_gpn()
                            elif cntrl.type == "digital":
                                control.from_trn()
                            else:
                                raise ValueError("unknown control type '{}'".format(cntrl.type))
                            task_args[(anal.id, cntrl.id, exp.id, prep, sampl,
                                prj)] = (control, series.columns)
                            bar += 1
    bar.finish()
    LOGGER.info("Running Jobs")
    tasks = [task_args[(job.analysis.id, job.control.id, job.experiment.id,
        job.preparation, job.sampling, job.projection)] + (job.measure,
        job.random_num, job.delay, job.id) for job in tasks]
    pool = multiprocessing.Pool(args.nproc)
    result_it = pool.imap_unordered(continuous_exec, tasks)
    bar = ProgressBar(maxval=len(tasks), widgets=[Timer(), " ",
            SimpleProgress(), " ", Percentage(), " ", Bar(), " ",
            ETA()]).start()
    for (job_id, z_scores, cntrl_scores, samples, points) in result_it:
        results = list()
        try:
            job = session.query(pymodels.Job).filter_by(id=job_id).one()
            for (i, name) in enumerate(points):
                res = pymodels.Result(control=cntrl_scores[i], ctc=z_scores[i],
                        point=name, job=job)
                session.add(res)
                results.append(res)
            job.complete = True
            session.commit()
        except Exception:
            session.rollback()
            bar += 1
            continue
        if job.selection > 0:
            try:
                for (i, res) in enumerate(results):
                    # use a more low-level insert for speed
                    session.execute(pymodels.RandomSample.__table__.insert(),
                            [{"control": val, "result_id": res.id}\
                            for val in np.random.choice(samples[i], job.selection,
                            replace=False)])
                session.commit()
            except Exception:
                session.rollback()
                bar += 1
                continue
        bar += 1
    bar.finish()
    session.close()
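The "# use a more low-level insert for speed" comment above refers to SQLAlchemy's executemany path: handing Table.insert() a list of parameter dicts skips per-object ORM bookkeeping. A self-contained sketch against an in-memory SQLite database (table and values are made up):

import random

from sqlalchemy import Column, Float, Integer, MetaData, Table, create_engine

engine = create_engine("sqlite://")
metadata = MetaData()
samples = Table("random_sample", metadata,
        Column("id", Integer, primary_key=True),
        Column("control", Float),
        Column("result_id", Integer))
metadata.create_all(engine)

# one executemany round-trip instead of one ORM object per row
with engine.begin() as conn:
    conn.execute(samples.insert(),
            [{"control": random.random(), "result_id": 1} for _ in range(1000)])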
Example 12
def main(argv):
    LOGGER.info("{0:*^78s}".format("Compile RegulonDB Content"))
    if not os.path.exists(argv[0]):
        sys.exit(1)
    if not os.path.exists(argv[1]):
        os.makedirs(argv[1])
    version = argv[2] if len(argv) == 3 else os.path.basename(argv[0])
    if not version:
        version = os.path.basename(os.path.dirname(argv[0]))
    LOGGER.info("{0:*^78s}".format(version))
    global VERSION
    VERSION = version
    version = tuple([int(num) for num in version.split(".")])
    # compile genes and gene products
    gene_file = os.path.join(argv[1], "genes.pkl")
    if os.path.exists(gene_file):
        genes = pyorganism.read_pickle(gene_file)
    else:
        genes = compile_genes(argv[0])
    product_file = os.path.join(argv[1], "products.pkl")
    if os.path.exists(product_file):
        products = pyorganism.read_pickle(product_file)
    else:
        products = compile_products(argv[0])
        regdb.link_gene_product(os.path.join(argv[0], "gene_product_link.xml"), VERSION)
    tf_file = os.path.join(argv[1], "transcription_factors.pkl")
    if os.path.exists(tf_file):
        t_factors = pyorganism.read_pickle(tf_file)
    else:
        t_factors = compile_transcription_factors(argv[0], version)
        num = sum(1 for gene in genes if isinstance(gene.regulatory_product, pyreg.TranscriptionFactor))
        norm = float(len(genes))
        LOGGER.info("Found {0:d} genes that code for transcription factors({1:.2%})".format(num, num / norm))
    if version >= (7, 2):
        sigma_file = os.path.join(argv[1], "sigma_factors.pkl")
        if os.path.exists(sigma_file):
            sigma_factors = pyorganism.read_pickle(sigma_file)
        else:
            sigma_factors = compile_sigma_factors(argv[0])
    conf_file = os.path.join(argv[1], "conformations.pkl")
    if os.path.exists(conf_file):
        conformations = pyorganism.read_pickle(conf_file)
    else:
        conformations = compile_conformations(argv[0])
    prom_file = os.path.join(argv[1], "promoters.pkl")
    if os.path.exists(prom_file):
        promoters = pyorganism.read_pickle(prom_file)
    else:
        promoters = compile_promoters(argv[0])
    op_file = os.path.join(argv[1], "operons.pkl")
    if os.path.exists(op_file):
        operons = pyorganism.read_pickle(op_file)
    else:
        operons = compile_operons(argv[0])
        regdb.update_operons(operons, promoters, genes)
    tu_file = os.path.join(argv[1], "transcription_units.pkl")
    if os.path.exists(tu_file):
        t_units = pyorganism.read_pickle(tu_file)
    else:
        t_units = compile_transcription_units(argv[0])
    inter_file = os.path.join(argv[1], "interactions.pkl")
    if os.path.exists(inter_file):
        interactions = pyorganism.read_pickle(inter_file)
    else:
        interactions = compile_regulation(argv[0])
    # write-out all pickles again since object attributes may have been updated
    pyorganism.write_pickle(genes, gene_file)
    pyorganism.write_pickle(products, product_file)
    pyorganism.write_pickle(t_factors, tf_file)
    if version >= (7, 2):
        pyorganism.write_pickle(sigma_factors, sigma_file)
    pyorganism.write_pickle(conformations, conf_file)
    pyorganism.write_pickle(promoters, prom_file)
    pyorganism.write_pickle(operons, op_file)
    pyorganism.write_pickle(t_units, tu_file)
    pyorganism.write_pickle(interactions, inter_file)
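The body above repeats one caching idiom eight times: load a pickle if it exists, otherwise compile from the source directory and write the pickle back at the end. A hypothetical helper capturing the pattern (load_or_compile is not part of the original code; pyorganism.read_pickle is assumed available as above):

import os

import pyorganism

def load_or_compile(pickle_path, compile_func, *args):
    # return cached objects if the pickle exists, otherwise build them fresh
    if os.path.exists(pickle_path):
        return pyorganism.read_pickle(pickle_path)
    return compile_func(*args)

# usage, mirroring the genes block above:
# genes = load_or_compile(gene_file, compile_genes, argv[0])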
Example 13
def main_analysis(args):
    locations = sorted(glob(os.path.join(args.in_path, args.glob)))
    locations = [os.path.abspath(loc) for loc in locations]
    out_name = os.path.join(args.out_path, args.file)
    if not os.path.isdir(args.out_path):
        os.makedirs(args.out_path)
    if os.path.exists(out_name):
        result = pd.read_csv(out_name, sep=str(";"), dtype={"version": str},
                encoding=args.encoding)
    else:
        result = pd.DataFrame(columns=["version", "num_components",
            "largest_component", "feed_forward", "feedback", "cycles",
            "regulated_in_deg", "regulating_out_deg", "null_model"])
    pool = multiprocessing.Pool(args.nproc)
    rewire_name = "grn_rewired_{0:.1f}.pkl".format(args.prob)
    rewire_descr = "rewired {0:.1f}".format(args.prob)
    switch_name = "grn_switched_{0:d}.pkl".format(args.flip_num)
    switch_descr = "switch {0:d}".format(args.flip_num)
    for path in locations:
        ver = version_from_path(path)
        LOGGER.info(ver)
        if args.run_rewire:
            filename = os.path.join(path, rewire_name)
            if os.path.exists(filename):
                LOGGER.info("Analyzing rewired networks (probability %.1f)",
                        args.prob)
                nets = pyorg.read_pickle(filename)
                if args.choose is not None:
                    chosen_num = min(args.choose, len(nets))
                    LOGGER.info("Using %d/%d random networks", chosen_num,
                            len(nets))
                    nets = random.sample(nets, chosen_num)
                tasks = [(net, ver, rewire_descr) for net in nets]
                res_it = pool.imap_unordered(stats, tasks)
                frames = list()
                bar = ProgressBar(maxval=len(tasks), widgets=[Timer(), " ",
                        SimpleProgress(), " ", Percentage(), " ", Bar(), " ",
                        ETA()]).start()
                for df in res_it:
                    frames.append(df)
                    bar += 1
                bar.finish()
                result = result.append(pd.concat(frames, ignore_index=True),
                        ignore_index=True)
                result.to_csv(out_name, header=True, index=False,
                        sep=str(";"), encoding=args.encoding)
        if args.run_switch:
            filename = os.path.join(path, switch_name)
            if os.path.exists(filename):
                LOGGER.info("Analyzing switched networks (flip number %d)",
                        args.flip_num)
                nets = pyorg.read_pickle(filename)
                if args.choose is not None:
                    chosen_num = min(args.choose, len(nets))
                    LOGGER.info("Using %d/%d random networks", chosen_num, len(nets))
                    nets = random.sample(nets, chosen_num)
                tasks = [(net, ver, switch_descr) for net in nets]
                res_it = pool.imap_unordered(stats, tasks)
                frames = list()
                bar = ProgressBar(maxval=len(tasks), widgets=[Timer(), " ",
                        SimpleProgress(), " ", Percentage(), " ", Bar(), " ",
                        ETA()]).start()
                for df in res_it:
                    frames.append(df)
                    bar += 1
                bar.finish()
                result = result.append(pd.concat(frames, ignore_index=True),
                        ignore_index=True)
                result.to_csv(out_name, header=True, index=False,
                        sep=str(";"), encoding=args.encoding)
    pool.close()
    pool.join()
Example 14
def main(remote_client, args):
    config = json.load(codecs.open(args.config, encoding=args.encoding, mode="rb"))
    if config["continuous"]:
        load_func = load_continuous
        table_key = "/Continuous"
        job_gen = continuous_jobs
        worker = continuous_worker
        result = continuous_result
    else:
        load_func = load_discrete
        table_key = "/Discrete"
        job_gen = discrete_jobs
        worker = discrete_worker
        result = discrete_result
    organism = pyorg.Organism(name=config["organism"])
    load_func(organism, config)
    LOGGER.info("Load data")
    glob_vars = globals()
    data = config["data"]
    network = config["network"]
    analysis = config["analysis"]
    namespace = dict()
    namespace["genes"] = dict()
    namespace["networks"] = dict()
    namespace["prepared"] = dict()
    for version in config["versions"]:
        LOGGER.info("{0:*^78s}".format(version))
        namespace["genes"][version] = pyorg.read_pickle(os.path.join(
                data["base"], version, data["gene_path"]))
        id2gene = pyorg.read_pickle(os.path.join(data["base"], version,
                data["mapping_path"]))
        namespace["networks"][version] = dict()
        for (cntrl_type, net_file, projections) in izip(analysis["control_types"],
                network["paths"], network["projections"]):
            net = pyorg.read_pickle(os.path.join(data["base"], version, net_file))
            namespace["networks"][version][cntrl_type] = dict()
            for basis in projections:
                if basis == "gene":
                    namespace["networks"][version][cntrl_type][basis] = net
                elif basis == "tu":
                    namespace["networks"][version][cntrl_type][basis] =\
                            pyreg.to_transcription_unit_based(net)
                elif basis == "operon":
                    namespace["networks"][version][cntrl_type][basis] =\
                            pyreg.to_operon_based(net)
        namespace["prepared"][version] = dict()
        for (cntrl_type, experiments, setups) in izip(analysis["control_types"],
                 analysis["experimental_sets"], analysis["experimental_setups"]):
            LOGGER.info("{0:*^78s}".format(cntrl_type))
            namespace["prepared"][version][cntrl_type] = dict()
            for (exp_name, exp_setup) in izip(experiments, setups):
                LOGGER.info("{0:*^78s}".format(exp_name))
                df = organism.activity[exp_name]
                setup_func = glob_vars[exp_setup]
                namespace["prepared"][version][cntrl_type][exp_name] =\
                        setup_func(cntrl_type, df, id2gene)
                if any(method.startswith("delayed") for method in chain(analysis["control"], *analysis["ctc"])):
                    namespace["prepared"][version][cntrl_type][exp_name]["delayed"] = dict()
                    for delta in analysis["delays"]:
                        delayed_continuous(namespace["prepared"][version][cntrl_type][exp_name],
                                delta)
                if any(ms_name.endswith("comparison") for ms_name in chain(*analysis["measures"])):
                    rate_continuous(namespace["prepared"][version][cntrl_type][exp_name])
    LOGGER.debug("\n".join(print_dict(namespace)))
    db = shelve.open(config["shelve"], protocol=pickle.HIGHEST_PROTOCOL)
    for (key, value) in namespace.iteritems():
        db[key] = value
    db.close()
    # general parallel setup using IPython.parallel
    LOGGER.info("Remote imports")
    d_view = remote_client.direct_view()
    d_view.execute("import numpy as np; "\
            "import shelve; import pickle;"\
            "import pyorganism as pyorg; import pyorganism.regulation as pyreg;"\
            "import logging; from IPython.config import Application;"\
            "LOGGER = Application.instance().log;"\
            "LOGGER.setLevel(logging.{level});".format(level=args.log_level),
            block=True)
    LOGGER.info("Transfer data")
#    d_view.push(namespace, block=True)
    d_view.execute("db = shelve.open('{shelve}', protocol=pickle.HIGHEST_PROTOCOL);"\
                    "globals().update(db);db.close()".format(shelve=config["shelve"]), block=True)
    LOGGER.info("Generate job descriptions")
    jobs = job_gen(organism, config, namespace)
    l_view = remote_client.load_balanced_view()
    bar = ProgressBar(maxval=len(jobs), widgets=[Timer(), " ", Percentage(),
            " ", Bar(), " ", ETA()]).start()
    result_mngr = ResultManager(config["output"], table_key)
    results_it = l_view.map(worker, jobs, ordered=False, block=False)
    for (spec, res_cntrl, res_ctc, samples) in results_it:
        LOGGER.debug(res_cntrl)
        LOGGER.debug(res_ctc)
        result(result_mngr, spec, res_cntrl, res_ctc, samples)
        bar += 1
    result_mngr.finalize()
    bar.finish()
    LOGGER.info("parallel speed-up was %.3g",
            results_it.serial_time / results_it.wall_time)
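The shelve handoff used above (dump the namespace locally, reopen the file on each engine, splat it into globals()) is plain standard library; a minimal local sketch with a made-up namespace:

import pickle
import shelve

namespace = {"genes": {"4.7": ["b0001"]}, "networks": dict()}

db = shelve.open("example_namespace.db", protocol=pickle.HIGHEST_PROTOCOL)
for (key, value) in namespace.items():
    db[key] = value
db.close()

# on the receiving side the same file is loaded into the global scope
db = shelve.open("example_namespace.db")
globals().update(db)
db.close()
print(genes)  # defined via globals().update above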