Example no. 1
0
def run(args):
    """Build parquet genotype files from a genotype folder, then process
    every expression phenotype file found under the phenotype folder.

    Expects on ``args``: snp_annotation_file, input_genotype_folder,
    input_genotype_file_pattern, input_phenotype_folder,
    input_phenotype_expression_pattern, output_prefix.
    """
    start = timer()
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Loading SNP annotation")
    # Map varID -> rsid, dropping entries with a missing rsid.
    snp_key = KeyedDataSource.load_data(
        args.snp_annotation_file, "varID", "rsid_dbSNP150",
        should_skip=KeyedDataSource.skip_na)

    logging.info("Loading Genotype")
    genotype, individual_ids = ModelTraining.load_genotype_folder(
        args.input_genotype_folder, args.input_genotype_file_pattern, snp_key)

    logging.info("Saving Genotype")
    Parquet.save_variants(args.output_prefix + ".variants.parquet",
                          genotype, individual_ids)
    Parquet.save_metadata(args.output_prefix + ".variants_metadata.parquet",
                          genotype)

    logging.info("Processing Expression Phenotype")
    expression_logic = Utilities.file_logic(
        args.input_phenotype_folder, args.input_phenotype_expression_pattern)
    for entry in expression_logic.itertuples():
        logging.info("Phenotype: %s", entry.name)
        process_phenotype(entry.path, entry.name, args.output_prefix)

    logging.info("Finished in %s", str(timer() - start))
def get_gene_variant_list(model_db_folder, pattern):
    """Collect (gene, varID, rsid) rows from every prediction-model sqlite db
    under `model_db_folder` whose name matches `pattern`.

    Returns a pandas DataFrame with columns ["gene", "varID", "rsid"],
    de-duplicated across all dbs.
    """
    from contextlib import closing

    logic = Utilities.file_logic(model_db_folder, pattern)
    g = []
    for i, l in enumerate(logic.itertuples()):
        logging.log(9, "Opening db %d/%d: %s ", i + 1, logic.shape[0], l.name)
        # sqlite3's `with connection:` only commits/rolls back the transaction;
        # it does NOT close the connection. closing() releases the file handle
        # per db, so iterating many dbs does not leak descriptors.
        with closing(sqlite3.connect(l.path)) as connection:
            w = pandas.read_sql("select * from weights;",
                                connection)[["gene", "varID", "rsid"]]
            g.append(w)
    g = pandas.concat(g).drop_duplicates()
    return g
def _parse_dap_file(name, path, models, model_variants, model_summary,
                    variant_pip, clusters, cluster_correlations):
    """Parse one DAP-G output file at `path` and append tab-separated rows
    (tagged with gene `name`) to each of the six open gzip output streams."""
    # Correlation pairs already emitted for this file; pairs are symmetric,
    # so (a, b) and (b, a) are written at most once between them.
    written = set()
    with open(path) as dap:
        p, pse, lognc, log10nc = None, None, None, None
        for line in dap:
            s = model_re.search(line)
            if s:
                models.write(parse_model_line(name, s).encode())
                vl = parse_model_line_for_variant(name, s)
                if vl:
                    for vl_ in vl:
                        model_variants.write(vl_.encode())
                continue

            s = model_expected_size_re.search(line)
            if s:
                # Remember expected size; the summary row is only written
                # later, when the log-normalizing-constant line is seen.
                p, pse = parse_expected_size(s)
                continue

            s = lognc_re.search(line)
            if s:
                lognc, log10nc = parse_log_10_nc(s)
                model_summary.write("{}\t{}\t{}\t{}\t{}\n".format(
                    name, p, pse, lognc, log10nc).encode())
                continue

            s = variant_re.search(line)
            if s:
                # `variant_id` renamed from `id` to avoid shadowing the builtin.
                rank, variant_id, pip, log10_abvf, cluster_id = \
                    parse_variant_line(s)
                variant_pip.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    name, rank, variant_id, pip, log10_abvf,
                    cluster_id).encode())
                continue

            s = cluster_re.search(line)
            if s:
                cluster_id, n_snps, pip, r2 = parse_cluster_line(s)
                clusters.write("{}\t{}\t{}\t{}\t{}\n".format(
                    name, cluster_id, n_snps, pip, r2).encode())

                _id1 = int(cluster_id)
                # Correlation entries are whitespace-separated, one per
                # cluster, 1-based by position.
                comps = s.group("correlation").strip().split()
                for _id2 in range(1, len(comps) + 1):
                    if (_id1, _id2) in written or (_id2, _id1) in written:
                        continue
                    cluster_correlations.write("{}\t{}\t{}\t{}\n".format(
                        name, _id1, _id2, comps[_id2 - 1]).encode())
                    written.add((_id1, _id2))


def run(args):
    """Convert DAP-G text output files (found under `args.input_folder` by
    `args.input_pattern`) into six gzipped tab-separated summary files named
    from `args.output_prefix`.

    ExitStack replaces the original seven-level `with` pyramid; files are
    still closed LIFO, exactly as nested `with` blocks would.
    """
    from contextlib import ExitStack

    logging.info("Processing...")
    Utilities.ensure_requisite_folders(args.output_prefix)

    spec = Utilities.file_logic(args.input_folder, args.input_pattern)

    with ExitStack() as stack:

        def _open(suffix, header):
            # Open one gzip output and write its header immediately,
            # matching the original open-then-write order.
            f = stack.enter_context(
                gzip.open(args.output_prefix + suffix, mode="w"))
            f.write(header.encode())
            return f

        models = _open(".models.txt.gz", "gene\tmodel\tn\tpp\tps\n")
        model_variants = _open(".models_variants.txt.gz",
                               "gene\tmodel\tvariant\n")
        model_summary = _open(".model_summary.txt.gz",
                              "gene\tpes\tpes_se\tlog_nc\tlog10_nc\n")
        variant_pip = _open(
            ".variants_pip.txt.gz",
            "gene\trank\tvariant_id\tpip\tlog10_abf\tcluster_id\n")
        clusters = _open(".clusters.txt.gz",
                         "gene\tcluster\tn_snps\tpip\taverage_r2\n")
        cluster_correlations = _open(".cluster_correlations.txt.gz",
                                     "gene\tid1\tid2\tvalue\n")

        for t in spec.itertuples():
            logging.log(9, "Processing %s", t.name)
            _parse_dap_file(t.name, t.path, models, model_variants,
                            model_summary, variant_pip, clusters,
                            cluster_correlations)

    logging.info("Finished")