Example #1
def run(args):
    if os.path.exists(args.output):
        logging.info("Output exists. Nope")
        return

    filters = {x[0]: x[1:] for x in args.filter}

    maf_filter = float(filters["MAF"][0]) if "MAF" in filters else None
    logging.info("Loading GTEX variant map")
    gtex_snp_key = GTExMisc.load_gtex_variant_to_rsid(args.annotation[0])

    logging.info("Processing genotype")
    m = []
    for mean, metadata, ids in ModelTraining.dosage_generator(
            args.genotype,
            gtex_snp_key,
            dosage_conversion=ModelTraining._mean,
            do_none=True):
        if maf_filter:
            f = mean / 2 if mean < 1 else 1 - mean / 2
            if f < maf_filter:
                continue
        m.append(metadata)

    m = Utilities.to_dataframe(m, [x[1] for x in Genotype.MetadataTFE.order])
    if "TOP_CHR_POS_BY_FREQ" in filters:
        logging.info("Simplifying multi-allelic variants")
        m = Genotype._monoallelic_by_frequency(m)

    logging.info("Saving...")
    Utilities.save_dataframe(m, args.output)
    logging.info("Finished")
Example #2
def run(args):
    if os.path.exists(args.output_file):
        logging.info("Output %s exists. Nope", args.output_file)
        return

    results_order = []
    results = {}
    logging.info("Streaming file for groups")
    for i, line in Utilities.iterate_file(args.input_file):
        if i == 0:
            continue

        comps = line.strip().split()
        key = comps[0]
        if key not in results:
            results_order.append(key)
            results[key] = 0
            logging.log(9, "Key: %s", str(key))
        results[key] += 1

    r = []
    logging.info("Producing output")
    for key in results_order:
        r.append((key, results[key]))
    r = pandas.DataFrame(r, columns=["key","count"])

    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output_file)
    Utilities.save_dataframe(r, args.output_file)

    logging.info("Finished.")
Example #3
    def sink(self, cov, ids, region):
        logging.log(9, "Serializing covariance")
        _region = "{}_{}_{}_{}".format(region.name, region.chr, region.start,
                                       region.stop)
        if args.text_output:
            if args.dapg_output:
                raise RuntimeError("Not supported for this option")
            else:
                cov = matrices._flatten_matrix_data([(_region, ids, cov)])
                self.of.sink(cov)
        elif args.text_output_folder:
            if args.dapg_output:
                f = os.path.join(args.text_output_folder, _region) + ".txt.gz"
                with gzip.open(f, "w") as o:
                    for i in range(0, cov.shape[0]):
                        l = "\t".join(["{:0.4f}".format(x)
                                       for x in cov[i]]) + "\n"
                        o.write(l.encode())
                id_path = os.path.join(args.text_output_folder,
                                       _region) + ".id.txt.gz"
                with gzip.open(id_path, "w") as o:
                    l = "\n".join(ids).encode()
                    o.write(l)

            else:
                cov = matrices._flatten_matrix_data_2(ids, cov)
                cov = pandas.DataFrame(cov)[["id1", "id2", "value"]]
                f = os.path.join(args.text_output_folder, _region) + ".txt.gz"
                Utilities.save_dataframe(cov, f)
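
matrices._flatten_matrix_data is not shown here; judging from its use above, it presumably converts a labeled covariance matrix into long-format (label, id1, id2, value) rows. A hedged reconstruction over the upper triangle:

# Hypothetical flattener; the real matrices module may order rows differently.
def flatten_matrix_data(labeled_matrices):
    rows = []
    for label, ids, matrix in labeled_matrices:
        for i in range(len(ids)):
            for j in range(i, len(ids)):  # upper triangle, diagonal included
                rows.append((label, ids[i], ids[j], matrix[i][j]))
    return rows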
Example #4
def process_original_gwas(args, imputed):
    logging.info("Processing GWAS file %s", args.gwas_file)
    g = pandas.read_table(args.gwas_file)
    g = g.assign(current_build="hg38",
                 imputation_status="original")[COLUMN_ORDER]
    # Remember the palindromic snps are to be excluded from the input GWAS;
    logging.info("Read %d variants", g.shape[0])

    if not args.keep_all_observed:
        if args.keep_criteria == "GTEX_VARIANT_ID":
            g = g.loc[~g.panel_variant_id.isin(imputed.panel_variant_id)]
        elif args.keep_criteria == "CHR_POS":
            g = g.assign(k=gwas_k(g))
            imputed = imputed.assign(k=gwas_k(imputed))
            g = g.loc[~g.k.isin({x for x in imputed.k})]
            g.drop("k", axis=1, inplace=True)
            imputed.drop("k", axis=1, inplace=True)
        else:
            raise RuntimeError("Unsupported keep option")
        logging.info("Kept %d variants as observed", g.shape[0])

    g = pandas.concat([g, imputed])[COLUMN_ORDER]
    logging.info("%d variants", g.shape[0])

    logging.info("Filling median")
    g = Genomics.fill_column_to_median(g, "sample_size", numpy.int32)

    logging.info("Sorting by chromosome-position")
    g = Genomics.sort(g)

    logging.info("Saving")
    Utilities.save_dataframe(g, args.output)

    return g[["panel_variant_id"]]
Example #5
def run(args):
    Coloc.initialize(args.coloc_script)
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return
    start = timer()

    logging.info("Loading gwas")
    gwas = Coloc.read_gwas(args.gwas, args.gwas_sample_size, args.gwas_mode)

    streamer = Coloc.eqtl_streamer(args.eqtl, gwas)

    results = []
    logging.info("Beggining process")
    MAX_N = args.MAX_N
    for i, d in enumerate(streamer):
        gene = d.gene_id.values[0]
        logging.log(9, "Processing gene %s", gene)
        eqtl = Coloc.get_eqtl(d, args.eqtl_sample_size, args.eqtl_mode)
        r = Coloc.coloc_on_gwas_eqtl(gene, gwas, eqtl, args.gwas_mode,
                                     args.eqtl_mode, args.p1, args.p2,
                                     args.p12)
        results.append(r)
        if MAX_N and i > MAX_N:
            logging.info("Early exit")
            break

    logging.info("Saving")
    results = Coloc.results_to_dataframe(results)
    Utilities.ensure_requisite_folders(args.output)
    Utilities.save_dataframe(results, args.output)
    end = timer()
    logging.info("Finished COLOC in %s seconds" % (str(end - start)))
Example #6
def run(args):
    start = timer()

    if os.path.exists(args.output_folder):
        logging.info("Output folder exists. Nope.")
        return

    if os.path.exists(args.intermediate_folder):
        logging.info("Intermediate folder exists. Nope.")
        return

    stats = []

    context = DAPUtilities.context_from_args(args)
    available_genes = context.get_available_genes()

    for i,gene in enumerate(available_genes):
        if args.MAX_M and i==args.MAX_M:
            break
        _start = timer()
        logging.log(8, "Processing %i/%i:%s", i+1, len(available_genes), gene)
        _stats = RunDAP.run_dap(context, gene)
        _end = timer()
        logging.log(7, "Elapsed: %s", str(_end - _start))
        stats.append(_stats)

    end = timer()
    logging.info("Ran DAP in %s seconds" % (str(end - start)))

    Utilities.ensure_requisite_folders(args.output_folder)
    stats_ = args.stats_name if args.stats_name else "stats.txt"
    stats_path = os.path.join(args.output_folder, stats_)
    stats = RunDAP.data_frame_from_stats(stats).fillna("NA")
    Utilities.save_dataframe(stats, stats_path)
Example #7
def run(args):
    if os.path.exists(args.output):
        logging.info("%s exists. Nope.", args.output)
        return

    logging.info("Loading regions")
    regions = pandas.read_table(
        args.region_file).rename(columns={"chr": "chromosome"})
    regions.dropna(inplace=True)
    regions.start = regions.start.astype(int)
    regions.stop = regions.stop.astype(int)

    logging.info("Loading gwas")
    gwas = pandas.read_table(
        args.gwas_file,
        usecols=["panel_variant_id", "chromosome", "position", "zscore"])
    gwas.dropna(inplace=True)

    logging.info("Processing")
    sliced = []
    for i, region in enumerate(regions.itertuples()):
        logging.log(8, "Processing region %d", i + 1)
        if numpy.isnan(region.start) or numpy.isnan(region.stop) or \
                (not isinstance(region.chromosome, str) and numpy.isnan(region.chromosome)):
            logging.log(8, "skipping incomplete region")
            continue
        slice = gwas[(gwas.chromosome == region.chromosome)
                     & (gwas.position >= region.start) &
                     (gwas.position < region.stop)]
        slice = slice.sort_values(by="position")
        if slice.shape[0] == 0:
            continue
        slice = slice.assign(region="region-{}-{}-{}".format(
            region.chromosome, region.start, region.stop),
                             r=i)

        slice = slice[["panel_variant_id", "region", "r", "zscore"]]
        sliced.append(slice)

    sliced = pandas.concat(sliced).sort_values(by="r")
    if args.output_format == "dapg":
        sliced.region = sliced.r.apply(lambda x: "region{}".format(x))
        sliced = sliced.drop(["r"], axis=1)
        Utilities.save_dataframe(sliced, args.output, header=False)
    elif args.output_format == "gtex_eqtl":
        sliced = sliced.assign(gene_id=sliced.region,
                               variant_id=sliced.panel_variant_id,
                               tss_distance=numpy.nan,
                               ma_samples=numpy.nan,
                               ma_count=numpy.nan,
                               maf=numpy.nan,
                               pval_nominal=numpy.nan,
                               slope=sliced.zscore,
                               slope_se=1)
        sliced = sliced[[
            "gene_id", "variant_id", "tss_distance", "ma_samples", "ma_count",
            "maf", "pval_nominal", "slope", "slope_se"
        ]]
        Utilities.save_dataframe(sliced, args.output, header=True)
    logging.info("Finished slicing gwas")
Example #8
def run(args):

    logging.info("Loading models")
    weights, extra = Models.read_model(args.input)

    Utilities.save_dataframe(weights, args.output_prefix + "_weights.txt.gz")
    Utilities.save_dataframe(extra, args.output_prefix + "_extra.txt.gz")
    logging.info("Done")
Example #9
def save_study(study, selected_snps, simulated_gencode, prefix, _save):
    Utilities.ensure_requisite_folders(prefix)
    _save(study)

    selected_snps_ = prefix + ".selected_snps.txt.gz"
    Utilities.write_iterable_to_file(selected_snps, selected_snps_)

    gencode_path = os.path.join(os.path.split(prefix)[0], "gene_annotation.txt.gz")
    Utilities.save_dataframe(simulated_gencode, gencode_path)
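
Utilities.ensure_requisite_folders, called throughout these examples, presumably just creates the missing parent directories of an output path. A sketch:

# Hypothetical helper: make sure the parent folder of a target path exists.
import os

def ensure_requisite_folders(path):
    folder = os.path.dirname(path)
    if folder and not os.path.exists(folder):
        os.makedirs(folder)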
Example #10
def run(args):
    if os.path.exists(args.cs_output) or os.path.exists(args.var_output):
        logging.info("Output exists. Nope.")
        return

    study, variants_whitelist = get_study(args.parquet_genotype_folder, args.parquet_genotype_pattern, args.parquet_genotype_metadata)

    #_skip = lambda x: x not in variants_whitelist
    columns = ["maf", "pval_nominal", "slope", "slope_se"]
    eqtl_streamer = DataFrameStreamer.data_frame_streamer(args.eqtl, sanitize=True, to_numeric=columns, sentinel_column="gene_id")

    individuals = None if not args.restrict_to_individuals else TextFileTools.load_list(args.restrict_to_individuals)

    genes = None if not args.restrict_to_genes else set(TextFileTools.load_list(args.restrict_to_genes))

    cs_results = []
    var_results = []
    logging.info("Beggining process")
    MAX_N=args.MAX_N
    n=args.sample_size
    for i, d in enumerate(eqtl_streamer):
        if MAX_N and i > MAX_N:
            logging.info("Early exit")
            break
        gene = d.gene_id.values[0]
        if genes is not None and gene.split('.')[0] not in genes:
            logging.log(9, "Skipping gene: %s", gene)
            continue
        logging.log(9, "Processing gene %i:%s", i+1, gene)
        d = d.loc[(~d.slope_se.isnull()) & (d.slope != 0) & (~d.slope.isnull())]
        try:
            res_, d_ = _do_susie(d, study, variants_whitelist, n, individuals, args.mode)
            cs, vars = _process_result(res_, d_, gene)
        except Exception as e:
            logging.log(9, "Error while doing susie:\n%s", traceback.format_exc())
            cs = _void_cs("susie_error").assign(gene_id=gene, pp_sum=None)
            vars = _void_var().assign(gene_id=[gene], var_id=[None])

        cs_results.append(cs)
        #if vars.shape[1]>0:
        var_results.append(vars)

    if len(cs_results) > 0:
        logging.info("Saving")
        cs_results = pandas.concat(cs_results)[["gene_id", "cs", "cs_avg_r2", "cs_log10bf", "cs_min_r2", "var_id", "pp_sum", "status"]]
        Utilities.ensure_requisite_folders(args.cs_output)
        Utilities.save_dataframe(cs_results, args.cs_output)
    else:
        logging.info('No results')

    if len(var_results) > 0:
        var_results = pandas.concat(var_results)[["gene_id", "var_id",  "cs", "variable_prob"]]
        Utilities.ensure_requisite_folders(args.var_output)
        Utilities.save_dataframe(var_results, args.var_output)
    logging.info("Ran susie")
Example #11
def save_expression(intermediate_folder, gene, d_, features_data_):
    y_folder = _y_folder(intermediate_folder, gene)
    os.makedirs(y_folder)
    for k, v in d_.items():
        if gene not in v:
            logging.log(8, "%s not present in %s", gene, k)
            continue
        p = os.path.join(y_folder, k) + ".txt"
        v = v.merge(features_data_[["individual", "id"]],
                    on="individual")[["id", gene]]
        Utilities.save_dataframe(v, p, header=False)
Example #12
def run(args):
    logging.info("Loading model summaries")
    extra = _read_2(args.input_prefix, "_summary.txt.gz")
    extra = extra[extra["n.snps.in.model"] > 0]
    if "rho_avg" in extra:
        extra = extra[(extra["pred.perf.pval"] < 0.05) & (extra.rho_avg > 0.1)]
    else:
        extra = extra[(extra["pred.perf.pval"] < 0.05)]
        extra = extra.assign(rho_avg=None)
    if not "pred.perf.qval" in extra:
        extra["pred.perf.qval"] = None

    if "nested_cv_converged" in extra:
        extra.nested_cv_converged = extra.nested_cv_converged.astype(
            numpy.int32)

    logging.info("Loading weights")
    weights = _read_2(args.input_prefix, "_weights.txt.gz")
    weights = weights[weights.gene.isin(extra.gene)]

    if args.output_prefix:
        logging.info("Saving dbs and covariance")
        db = args.output_prefix + ".db"
        logging.info("Saving db")
        Models.create_model_db(db, extra, weights)

        logging.info("Processing covariances")
        genes = {x for x in extra.gene}

        path_ = os.path.split(args.input_prefix)
        r = re.compile(path_[1] + "_covariance.txt.gz")
        files = sorted([x for x in os.listdir(path_[0]) if r.search(x)])
        files = [os.path.join(path_[0], x) for x in files]
        cov = args.output_prefix + ".txt.gz"
        with gzip.open(cov, "w") as cov_:
            cov_.write("GENE RSID1 RSID2 VALUE\n".encode())
            for nf, f in enumerate(files):
                logging.log(9, "file %i/%i: %s", nf, len(files), f)
                with gzip.open(f) as f_:
                    f_.readline()
                    for l in f_:
                        gene = l.decode().strip().split()[0]
                        if gene not in genes:
                            continue
                        cov_.write(l)

    if args.output_prefix_text:
        logging.info("Saving text output")
        Utilities.save_dataframe(weights,
                                 args.output_prefix_text + "_t_weights.txt")
        Utilities.save_dataframe(extra,
                                 args.output_prefix_text + "_t_extra.txt")
    logging.info("Done")
Example #13
def save_x(intermediate_folder, gene, features_, features_data_):
    Utilities.save_dataframe(features_data_.drop("individual", axis=1),
                             _x_path(intermediate_folder, gene),
                             header=False,
                             sep=" ")

    Utilities.save_dataframe(
        features_[["id", "allele_0", "allele_1"]].rename(columns={
            "id": "SNP",
            "allele_0": "REF.0.",
            "allele_1": "ALT.1."
        }), _info_path(intermediate_folder, gene))
Example #14
def run(args):
    start = timer()

    if os.path.exists(args.output_folder):
        logging.info("Output folder exists. Nope.")
        return

    if os.path.exists(args.intermediate_folder):
        logging.info("Intermediate folder exists. Nope.")
        return

    os.makedirs(args.intermediate_folder)
    os.makedirs(args.output_folder)

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(
            args.parquet_genotype_metadata).to_pandas()
    else:
        features_metadata = pq.ParquetFile(
            args.parquet_genotype_metadata).read_row_group(args.chromosome -
                                                           1).to_pandas()

    logging.info("Opening features")
    features = pq.ParquetFile(args.parquet_genotype)

    logging.info("Opening summary stats")
    summary_stats = load_summary_stats(args.summary_stats)
    summary_stats = summary_stats[summary_stats.variant_id.isin(
        features_metadata.id)]
    regions = summary_stats[["region_id"]].drop_duplicates()

    if args.sub_batches is not None and args.sub_batch is not None:
        regions = PandasHelpers.sub_batch(regions, args.sub_batches,
                                          args.sub_batch)

    stats = []
    for i, region in enumerate(regions.itertuples()):
        logging.log(9, "Region %i/%i:%s", i, regions.shape[0],
                    region.region_id)
        _stats = run_dapg(region, features, features_metadata, summary_stats,
                          args.intermediate_folder, args.output_folder,
                          args.options, args.dap_command,
                          not args.keep_intermediate_folder)
        stats.append(_stats)

    stats_path = os.path.join(args.output_folder, "stats.txt")
    stats = RunDAP.data_frame_from_stats(stats).fillna("NA")
    Utilities.save_dataframe(stats, stats_path)

    end = timer()
    logging.info("Ran DAP in %s seconds" % (str(end - start)))
Example #15
def run(args):
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return

    Utilities.ensure_requisite_folders(args.output)
    logging.info("Acquiring files")
    logic = Utilities.file_logic_2(args.input_folder, args.input_pattern,
                                   args.name_subfield, args.input_filter)

    trait_map = None
    if args.trait_map:
        logging.info("Loading file mapping")
        trait_map = get_trait_map(args.trait_map)

    gene_id_map, gene_name_map = None, None
    if args.gene_annotation:
        logging.info("Loading gene annotation")
        gene_id_map, gene_name_map = get_gene_map(args.gene_annotation)

    logging.info("Processing files")
    r = []

    for f in logic.itertuples():
        logging.info("Processing %s", f.file)
        names = get_header_names(args.header_names)
        if args.separator == ",":
            d = pandas.read_csv(f.path,
                                header='infer' if not names else None,
                                names=names)
        elif args.separator is None:
            d = pandas.read_table(f.path,
                                  header='infer' if not names else None,
                                  names=names,
                                  sep=r"\s+")
        else:
            raise RuntimeError("Unsupported separator")

        if args.specific_post_processing == "FAST_ENLOC":
            d = fast_enloc_postprocessing(d, gene_id_map, gene_name_map)
        elif args.specific_post_processing:
            raise RuntimeError("Unsupported postprocessing option")

        # Fall back to the file's own trait field when no trait map was given.
        d = d.assign(trait=trait_map[f.trait] if trait_map else f.trait, tissue=f.tissue)
        r.append(d)

    r = pandas.concat(r)
    logging.info("Saving")
    Utilities.save_dataframe(r, args.output)

    logging.info("Finished processing.")
Example #16
def run(args):
    if os.path.exists(args.output):
        logging.info("output path %s exists. Nope.", args.output)
        return

    start = timer()
    logging.info("Parsing input GWAS")
    d = GWAS.load_gwas(args.gwas_file,
                       args.output_column_map,
                       force_special_handling=args.force_special_handling,
                       skip_until_header=args.skip_until_header,
                       separator=args.separator,
                       handle_empty_columns=args.handle_empty_columns,
                       input_pvalue_fix=args.input_pvalue_fix,
                       enforce_numeric_columns=args.enforce_numeric_columns)
    logging.info("loaded %d variants", d.shape[0])

    d = pre_process_gwas(args, d)

    if args.fill_from_snp_info:
        d = fill_coords(args, d)

    if args.chromosome_format:
        d = d.assign(chromosome=Genomics.to_int(d.chromosome))
        d = d.assign(chromosome=["chr{}".format(x) for x in d.chromosome])

    if args.liftover:
        d = liftover(args, d)

    if args.snp_reference_metadata:
        d = fill_from_metadata(args,
                               d,
                               extra_col_dict=load_extra_col_key_value_pairs(
                                   args.meta_extra_col))

    if args.output_order:
        order = args.output_order
        for c in order:
            if c not in d:
                d = d.assign(**{c: numpy.nan})
        d = d[order]

    d = clean_up(d)

    logging.info("Saving...")
    Utilities.save_dataframe(d, args.output, fill_na=True)
    end = timer()
    logging.info("Finished converting GWAS in %s seconds", str(end - start))
Example #17
def process_imputed(args):
    r = re.compile(args.pattern)
    files = sorted([x for x in os.listdir(args.folder) if r.search(x)])
    count = 0
    keys = set()
    for i, file in enumerate(files):
        logging.info("Processing imputed %s", file)
        p = os.path.join(args.folder, file)
        g = pandas.read_table(p)
        if g.shape[0] == 0:
            logging.info("Empty set of results for %s", p)
            continue
        count += g.shape[0]

        #Fast dropping of observed values
        #g = g.merge(observed_ids, on="panel_variant_id", how="left", copy=False, indicator=True)
        #g = g[g._merge == "left_only"]

        g.drop(["n", "n_indep", "most_extreme_z"], axis=1, inplace=True)
        g.rename(columns={"effect_allele_frequency": "frequency",
                          "status": "imputation_status"},
                 inplace=True)
        g = g.assign(pvalue=2 * stats.norm.sf(numpy.abs(g.zscore)),
                     effect_size=numpy.nan,
                     standard_error=numpy.nan,
                     sample_size=numpy.nan,
                     current_build="hg38")
        g = g[COLUMN_ORDER]
        Utilities.save_dataframe(g,
                                 args.output,
                                 mode="a" if i > 0 else "w",
                                 header=i == 0)
        if not args.keep_all_observed:
            if args.keep_criteria == "GTEX_VARIANT_ID":
                keys.update(g.panel_variant_id.values)
            elif args.keep_criteria == "CHR_POS":
                chr_pos = g.apply(
                    lambda x: "{}_{}".format(x.chromosome, int(x.position)),
                    axis=1)
                keys.update(chr_pos)
            else:
                raise RuntimeError("Unsupported keep option")

    logging.info("Processed %d imputed variants", count)
    return keys
Example #18
def run(args):
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return

    if args.output_column_map:
        selected = [x[0] for x in args.output_column_map]
    else:
        selected = [
            Gencode.GFTF.K_GENE_ID, Gencode.GFTF.K_GENE_NAME,
            Gencode.GFTF.K_GENE_TYPE
        ]

    logging.info("Loading Gencode")
    gencode = Gencode.load(
        args.gencode_file,
        feature_type_whitelist=set(args.feature_type_whitelist),
        gene_type_white_list=set(args.gene_type_whitelist),
        transcript_type_whitelist=set(args.transcript_type_whitelist),
        selected_key_value_pairs=selected)
    #gencode = _reformat(gencode)
    logging.info("Converting format")
    if args.output_column_map:
        gencode = gencode.rename(
            columns={x[0]: x[1]
                     for x in args.output_column_map})
        if "gene_version" in gencode and "gene_id" in gencode:
            gencode["gene_id"] = gencode.gene_id + "." + gencode.gene_version
            keep = [
                "chromosome", "start_location", "end_location", "feature_type",
                "strand"
            ] + [
                x[1] for x in args.output_column_map
                if x[1] not in {"gene_version"}
            ]
            gencode = gencode[keep]
        else:
            gencode = gencode[[
                "chromosome", "start_location", "end_location", "feature_type",
                "strand"
            ] + [x[1] for x in args.output_column_map]]
    logging.info("Saving")
    Utilities.save_dataframe(gencode, args.output)
    logging.info("Finished")
Example #19
def run(args):
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return

    start = timer()
    logging.info("Beginning process")
    if args.by_region_file:
        results = run_by_region(args)
    else:
        results = run_by_variant(args)

    Utilities.ensure_requisite_folders(args.output)
    Utilities.save_dataframe(results, args.output)

    end = timer()
    logging.info("Finished in %s seconds", str(end - start))
Example #20
def run(args):
    logging.info("Starting process")

    vf = pq.ParquetFile(args.parquet_genotype_metadata)
    m = None
    last_chromosome = None

    r = []
    for i, line in Utilities.iterate_file(args.regions):
        if i == 0:
            continue
        comps = line.strip().split()
        count, m, last_chromosome = count_variants(comps[0], comps[1],
                                                   comps[2], vf, m,
                                                   last_chromosome, args)
        r.append((comps[0], comps[1], comps[2], count))

    r = Utilities.to_dataframe(r, ["chromosome", "start", "end", "count"])
    Utilities.save_dataframe(r, args.output)
    logging.info("Finished process")
Example #21
def run(args):
    r_ = pandas.read_csv if ".csv" in args.input else pandas.read_table
    sep = "," if ".csv" in args.output else "\t"

    logging.info("Loading gene table")
    g = KeyedDataSource.load_data(args.gene_table, "gene_id", "gene_name")

    logging.info("Loading input")
    i = r_(args.input)

    gene_name = []
    for t in i.itertuples():
        gene_name.append(g[t.gene])
    i["gene_name"] = gene_name

    logging.info("saving")
    Utilities.save_dataframe(i, args.output, sep=sep)

    logging.info("Done")
Example #22
def process_original_gwas(args, imputed_keys):
    logging.info("Processing GWAS file %s", args.gwas_file)
    g = pandas.read_table(args.gwas_file)
    #Remember the palindromic snps are to be excluded from the input GWAS;
    logging.info("Read %d variants", g.shape[0])
    if not args.keep_all_observed:
        if args.keep_criteria == "GTEX_VARIANT_ID":
            g = g.loc[~g.panel_variant_id.isin(imputed_keys)]
        elif args.keep_criteria == "CHR_POS":
            g["k"] = g.apply(_gwas_k, axis=1)
            g = g.loc[~g.k.isin(imputed_keys)]
            g.drop("k", axis=1, inplace=True)
        else:
            raise RuntimeError("Unsupported keep option")
        logging.info("Kept %d variants as observed", g.shape[0])
    g = g.assign(current_build="hg38",
                 imputation_status="original")[COLUMN_ORDER]
    Utilities.save_dataframe(g, args.output, mode="a", header=False)

    return g[["panel_variant_id"]]
Example #23
def run(args):
    start = timer()
    if os.path.exists(args.intermediate_folder):
        logging.info("Intermediate folder already exists. Nope.")
        return
    else:
        os.makedirs(args.intermediate_folder)

    if not (args.output_stats or args.output_weights or args.output_covariance
            or args.output_hyperparameters):
        logging.info("Specify at least one output")
        return

    if args.output_weights:
        Utilities.ensure_requisite_folders(args.output_weights)
    if args.output_stats: Utilities.ensure_requisite_folders(args.output_stats)
    if args.output_covariance:
        Utilities.ensure_requisite_folders(args.output_covariance)
    if args.output_hyperparameters:
        Utilities.ensure_requisite_folders(args.output_hyperparameters)

    weights = []
    covariance = []
    stats = []
    hyperparameters = []
    context = GEMMAUtilities.context_from_args(args)
    n = len(context.get_available_genes())
    for i, gene in enumerate(context.get_available_genes()):
        start_ = timer()
        logging.log(8, "Processing %d/%d:%s", i + 1, n, gene)
        weights_, covariance_, hyperparameters_, stats_ = RunGEMMA.run_gemma(
            context, gene)

        end_ = timer()
        logging.log(7, "Elapsed: %s", str(end_ - start_))
        if args.output_weights: weights.append(weights_)
        if args.output_covariance: covariance.append(covariance_)
        if args.output_hyperparameters:
            hyperparameters.append(hyperparameters_)
        if args.output_stats: stats.append(stats_)

    if args.output_weights:
        weights = pandas.concat(weights)
        Utilities.save_dataframe(weights, args.output_weights)

    if args.output_stats:
        stats = RunGEMMA.dataframe_from_stats(stats).fillna("NA")
        Utilities.save_dataframe(stats, args.output_stats)

    if args.output_covariance:
        covariance = RunGEMMA.dataframe_from_covariance_data(
            covariance).fillna("NA")
        Utilities.save_dataframe(covariance, args.output_covariance)

    if args.output_hyperparameters:
        hyperparameters = RunGEMMA.dataframe_from_hyperparameters(
            hyperparameters).fillna("NA")
        Utilities.save_dataframe(hyperparameters, args.output_hyperparameters)

    shutil.rmtree(args.intermediate_folder)

    end = timer()
    logging.info("Ran BSLMM in %s seconds" % (str(end - start)))
Example #24
def run(args):
    d = duplicated_entries(args.input_folder)
    Utilities.ensure_requisite_folders(args.output)
    Utilities.save_dataframe(d, args.output, sep=",", quoting=csv.QUOTE_NONE)
Example #25
def run(args):
    wp = args.output_prefix + "_weights.txt.gz"
    if os.path.exists(wp):
        logging.info("Weights output exists already, delete it or move it")
        return

    sp = args.output_prefix + "_summary.txt.gz"
    if os.path.exists(sp):
        logging.info("Summary output exists already, delete it or move it")
        return

    cp = args.output_prefix + "_covariance.txt.gz"
    if os.path.exists(cp):
        logging.info("covariance output exists already, delete it or move it")
        return

    r = args.output_prefix + "_run.txt.gz"
    if os.path.exists(r):
        logging.info("run output exists already, delete it or move it")
        return

    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    data = pq.ParquetFile(args.data)
    available_data = {x for x in data.metadata.schema.names}

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities.load_gene_annotation(
        args.data_annotation, args.chromosome, args.sub_batches,
        args.sub_batch)
    data_annotation = data_annotation[data_annotation.gene_id.isin(
        available_data)]
    if args.gene_whitelist:
        logging.info("Applying gene whitelist")
        data_annotation = data_annotation[data_annotation.gene_id.isin(
            set(args.gene_whitelist))]
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(
            args.features_annotation).read_row_group(args.chromosome -
                                                     1).to_pandas()

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(
            features_metadata, data_annotation, args.window)

    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(
            whitelist)]

    if args.features_weights:
        logging.info("Loading weights")
        x_weights = get_weights(args.features_weights,
                                set(features_metadata.id))
        logging.info(
            "Filtering features metadata to those available in weights")
        features_metadata = features_metadata[features_metadata.id.isin(
            x_weights.id)]
        logging.info("Kept %d entries", features_metadata.shape[0])
    else:
        x_weights = None

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    s = numpy.random.randint(1e8)
    set_seed(s)
    if args.run_tag:
        d = pandas.DataFrame({
            "run": [args.run_tag],
            "cv_seed": [s]
        })[["run", "cv_seed"]]
        Utilities.save_dataframe(d, r)

    WEIGHTS_FIELDS = [
        "gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"
    ]
    SUMMARY_FIELDS = [
        "gene", "genename", "gene_type", "alpha", "n_snps_in_window",
        "n.snps.in.model", "test_R2_avg", "test_R2_sd", "cv_R2_avg",
        "cv_R2_sd", "in_sample_R2", "nested_cv_fisher_pval",
        "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore",
        "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"
    ]

    train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols

    with gzip.open(wp, "w") as w:
        w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode())
        with gzip.open(sp, "w") as s:
            s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode())
            with gzip.open(cp, "w") as c:
                c.write("GENE RSID1 RSID2 VALUE\n".encode())
                for i, data_annotation_ in enumerate(
                        data_annotation.itertuples()):
                    if args.MAX_M and i >= args.MAX_M:
                        logging.info("Early abort")
                        break
                    logging.log(9, "processing %i/%i:%s", i + 1,
                                data_annotation.shape[0],
                                data_annotation_.gene_id)
                    if args.repeat:
                        for j in range(0, args.repeat):
                            logging.log(9, "%i-th reiteration", j)
                            process(w, s, c, data, data_annotation_, features,
                                    features_metadata, x_weights,
                                    SUMMARY_FIELDS, train, j,
                                    args.nested_cv_folds)
                    else:
                        process(w,
                                s,
                                c,
                                data,
                                data_annotation_,
                                features,
                                features_metadata,
                                x_weights,
                                SUMMARY_FIELDS,
                                train,
                                nested_folds=args.nested_cv_folds)

    logging.info("Finished")
Example #26
def _dump(p, d, cov):
    import gzip
    Utilities.save_dataframe(d, p + "_d.txt.gz")
    with gzip.open(p + "_m.txt.gz", "w") as f:
        for i in cov:
            f.write("{}\n".format("\t".join(map(str, i))).encode())
Example #27
def run(args):
    wp = args.output_prefix + "_weights.txt.gz"
    if os.path.exists(wp):
        logging.info("Weights output exists already, delete it or move it")
        return

    sp = args.output_prefix + "_summary.txt.gz"
    if os.path.exists(sp):
        logging.info("Summary output exists already, delete it or move it")
        return

    cp = args.output_prefix + "_covariance.txt.gz"
    if os.path.exists(cp):
        logging.info("covariance output exists already, delete it or move it")
        return

    r = args.output_prefix + "_run.txt.gz"
    if os.path.exists(r):
        logging.info("run output exists already, delete it or move it")
        return

    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    data = pq.ParquetFile(args.data)
    available_data = {x for x in data.metadata.schema.names}

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities.load_gene_annotation(args.data_annotation, args.chromosome, args.sub_batches, args.sub_batch, args.simplify_data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(available_data)]
    if args.gene_whitelist:
        logging.info("Applying gene whitelist")
        data_annotation = data_annotation[data_annotation.gene_id.isin(set(args.gene_whitelist))]
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(args.features_annotation).read_row_group(args.chromosome-1).to_pandas()

    if args.output_rsids:
        if not args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
            logging.warning("Several variants map to a same rsid (hint: multiple INDELS?).\n"
                            "Can't proceed. Consider the using the --keep_highest_frequency_rsid flag, or models will be ill defined.")
            return

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(features_metadata, data_annotation, args.window)
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_call_filter:
        logging.info("Filtering variants by average call rate")
        features_metadata = features_metadata[features_metadata.avg_call > args.variant_call_filter]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_r2_filter:
        logging.info("Filtering variants by imputation R2")
        features_metadata = features_metadata[features_metadata.r2 > args.variant_r2_filter]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.variant_variance_filter:
        logging.info("Filtering variants by (dosage/2)'s variance")
        features_metadata = features_metadata[features_metadata["std"]/2 > numpy.sqrt(args.variant_variance_filter)]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.discard_palindromic_snps:
        logging.info("Discarding palindromic snps")
        features_metadata = Genomics.discard_gtex_palindromic_variants(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])

    if args.rsid_whitelist:
        logging.info("Filtering features annotation for whitelist")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(whitelist)]
        logging.info("Kept %d", features_metadata.shape[0])

    if args.only_rsids:
        logging.info("discarding non-rsids")
        features_metadata = StudyUtilities.trim_variant_metadata_to_rsids_only(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])

        if args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
            logging.info("Keeping only the highest frequency entry for every rsid")
            k = features_metadata[["rsid", "allele_1_frequency", "id"]]
            k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"] = 1 - k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"]
            k = k.sort_values(by=["rsid", "allele_1_frequency"], ascending=False)
            k = k.groupby("rsid").first().reset_index()
            features_metadata = features_metadata[features_metadata.id.isin(k.id)]
            logging.info("Kept %d", features_metadata.shape[0])
        else:
            logging.info("rsids are unique, no need to restrict to highest frequency entry")

    if args.features_weights:
        logging.info("Loading weights")
        x_weights = get_weights(args.features_weights, {x for x in features_metadata.id})
        logging.info("Filtering features metadata to those available in weights")
        features_metadata = features_metadata[features_metadata.id.isin(x_weights.id)]
        logging.info("Kept %d entries", features_metadata.shape[0])
    else:
        x_weights = None

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    s = numpy.random.randint(1e8)
    set_seed(s)
    if args.run_tag:
        d = pandas.DataFrame({"run":[args.run_tag], "cv_seed":[s]})[["run", "cv_seed"]]
        Utilities.save_dataframe(d, r)

    WEIGHTS_FIELDS=["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]
    SUMMARY_FIELDS=["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model",
                    "test_R2_avg", "test_R2_sd", "cv_R2_avg", "cv_R2_sd", "in_sample_R2", "nested_cv_fisher_pval",
                    "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]

    train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols

    available_individuals = check_missing(args, data, features)

    with gzip.open(wp, "w") as w:
        w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode())
        with gzip.open(sp, "w") as s:
            s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode())
            with gzip.open(cp, "w") as c:
                c.write("GENE RSID1 RSID2 VALUE\n".encode())
                for i, data_annotation_ in enumerate(data_annotation.itertuples()):
                    if args.MAX_M and i >= args.MAX_M:
                        logging.info("Early abort")
                        break
                    logging.log(9, "processing %i/%i:%s", i + 1, data_annotation.shape[0], data_annotation_.gene_id)

                    if args.repeat:
                        for j in range(0, args.repeat):
                            logging.log(9, "%i-th reiteration", j)
                            process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, j, nested_folds=args.nested_cv_folds, use_individuals=available_individuals)
                    else:
                        process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, nested_folds=args.nested_cv_folds, use_individuals=available_individuals)

    logging.info("Finished")
Example #28
def run(args):
    Utilities.ensure_requisite_folders(args.output)

    logging.info("Loading db snp file")
    #db_snp_mapping = load_dbsnp_mapping(args.dbsnp_file)

    logging.info("processing")
    files = sorted(args.info_files, key=chr_key)
    r = []
    for p in files:
        with gzip.open(p) as f:
            logging.info("%s", p)
            for i, l in enumerate(f):
                if i == 0:
                    continue
                # if i > 20000:
                #     break
                comps = l.decode().strip().split()
                variant = comps[0]

                variant_comps = variant.split(":")
                chr = "chr" + variant_comps[0]
                pos = variant_comps[1]
                ref = variant_comps[2]
                alt = variant_comps[3]
                if "CN" in ref or "CN" in alt:
                    continue
                freq = comps[3]

                variant_id = "{}_{}_{}_{}_b37".format(chr, pos, ref, alt)
                r.append((chr, pos, variant_id, ref, alt, freq))

    r = pandas.DataFrame(data=r,
                         columns=[
                             "chromosome", "position", "id", "allele_0",
                             "allele_1", "allele_1_frequency"
                         ])

    # Build chromosome_position -> variant ids, used to subset the dbSNP file.
    variant_key = {}
    for t in r.itertuples():
        k = "{}_{}".format(t.chromosome, t.position)
        variant_key.setdefault(k, []).append(t.id)

    logging.info("looking for rsids in ucsc dbsnp file")
    dbsnp_mapping = load_dbsnp_mapping(args.dbsnp_file, variant_key)

    r["rsid"] = [dbsnp_mapping.get(v, "NA") for v in r.id]
    logging.info("Saving")
    Utilities.save_dataframe(r, args.output)

    logging.info("Done")
Example #29
def run(args):
    Utilities.maybe_create_folder(args.intermediate_folder)
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    p_ = re.compile(args.data_name_pattern)
    f = [x for x in sorted(os.listdir(args.data_folder)) if p_.search(x)]
    tissue_names = [p_.search(x).group(1) for x in f]
    data = []
    for i, tissue_name in enumerate(tissue_names):
        logging.info("Loading %s", tissue_name)
        data.append((tissue_name,
                     pq.ParquetFile(os.path.join(args.data_folder, f[i]))))
    data = collections.OrderedDict(data)
    available_data = {
        x
        for p in data.values() for x in p.metadata.schema.names
    }

    logging.info("Preparing output")
    WEIGHTS_FIELDS = [
        "gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"
    ]
    SUMMARY_FIELDS = [
        "gene", "genename", "gene_type", "alpha", "n_snps_in_window",
        "n.snps.in.model", "rho_avg", "pred.perf.R2", "pred.perf.pval"
    ]

    Utilities.ensure_requisite_folders(args.output_prefix)

    if args.skip_regression:
        weights, summaries, covariances = None, None, None
    else:
        weights, summaries, covariances = setup_output(args.output_prefix,
                                                       tissue_names,
                                                       WEIGHTS_FIELDS,
                                                       SUMMARY_FIELDS)

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities._load_gene_annotation(
        args.data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(
        available_data)]
    if args.chromosome or (args.sub_batches and args.sub_batch):
        data_annotation = StudyUtilities._filter_gene_annotation(
            data_annotation, args.chromosome, args.sub_batches, args.sub_batch)
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(
            args.features_annotation).read_row_group(args.chromosome -
                                                     1).to_pandas()

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(
            features_metadata, data_annotation, args.window)

    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(
            whitelist)]

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    seed = numpy.random.randint(1e8)

    if args.run_tag:
        d = pandas.DataFrame({
            "run": [args.run_tag],
            "cv_seed": [seed]
        })[["run", "cv_seed"]]
        for t in tissue_names:
            Utilities.save_dataframe(
                d, "{}_{}_runs.txt.gz".format(args.output_prefix, t))

    failed_run = False
    try:
        for i, data_annotation_ in enumerate(data_annotation.itertuples()):
            logging.log(9, "processing %i/%i:%s", i + 1,
                        data_annotation.shape[0], data_annotation_.gene_id)
            logging.log(8, "loading data")
            d_ = {}
            for k, v in data.items():
                d_[k] = Parquet._read(v, [data_annotation_.gene_id],
                                      to_pandas=True)
            features_ = Genomics.entries_for_gene_annotation(
                data_annotation_, args.window, features_metadata)

            if features_.shape[0] == 0:
                logging.log(9, "No features available")
                continue

            features_data_ = Parquet._read(features,
                                           [x for x in features_.id.values],
                                           to_pandas=True)
            features_data_["id"] = range(1, features_data_.shape[0] + 1)
            features_data_ = features_data_[["individual", "id"] +
                                            [x for x in features_.id.values]]

            logging.log(8, "training")
            prepare_ctimp(args.script_path, seed, args.intermediate_folder,
                          data_annotation_, features_, features_data_, d_)
            del features_data_
            del d_
            if args.skip_regression:
                continue

            subprocess.call([
                "bash",
                _execution_script(args.intermediate_folder,
                                  data_annotation_.gene_id)
            ])

            w = pandas.read_table(_weights(args.intermediate_folder,
                                           data_annotation_.gene_id),
                                  sep=r"\s+")
            s = pandas.read_table(_summary(args.intermediate_folder,
                                           data_annotation_.gene_id),
                                  sep=r"\s+")

            for e_, entry in enumerate(s.itertuples()):
                entry_weights = w[["SNP", "REF.0.", "ALT.1.",
                                   entry.tissue]].rename(
                                       columns={
                                           "SNP": "varID",
                                           "REF.0.": "ref_allele",
                                           "ALT.1.": "eff_allele",
                                           entry.tissue: "weight"
                                       })
                entry_weights = entry_weights[entry_weights.weight != 0]
                entry_weights = entry_weights.assign(
                    gene=data_annotation_.gene_id)
                entry_weights = entry_weights.merge(features_,
                                                    left_on="varID",
                                                    right_on="id",
                                                    how="left")
                entry_weights = entry_weights[WEIGHTS_FIELDS]
                if args.output_rsids:
                    entry_weights.loc[entry_weights.rsid == "NA",
                                      "rsid"] = entry_weights.loc[
                                          entry_weights.rsid == "NA", "varID"]
                weights[entry.tissue].write(
                    entry_weights.to_csv(sep="\t",
                                         index=False,
                                         header=False,
                                         na_rep="NA").encode())

                entry_summary = s[s.tissue == entry.tissue].rename(
                    columns={
                        "zscore_pval": "pred.perf.pval",
                        "rho_avg_squared": "pred.perf.R2"
                    })
                entry_summary = entry_summary.assign(
                    gene=data_annotation_.gene_id,
                    alpha=0.5,
                    genename=data_annotation_.gene_name,
                    gene_type=data_annotation_.gene_type,
                    n_snps_in_window=features_.shape[0])
                entry_summary["n.snps.in.model"] = entry_weights.shape[0]
                # must repeat strings because of a weird pandas indexing issue
                entry_summary = entry_summary.drop(
                    ["R2", "n", "tissue"], axis=1)[[
                        "gene", "genename", "gene_type", "alpha",
                        "n_snps_in_window", "n.snps.in.model", "rho_avg",
                        "pred.perf.R2", "pred.perf.pval"
                    ]]
                summaries[entry.tissue].write(
                    entry_summary.to_csv(sep="\t",
                                         index=False,
                                         header=False,
                                         na_rep="NA").encode())

                features_data_ = Parquet._read(
                    features, [x for x in entry_weights.varID.values],
                    to_pandas=True)
                var_ids = [x for x in entry_weights.varID.values]
                cov = numpy.cov([features_data_[k] for k in var_ids], ddof=1)
                ids = [x for x in entry_weights.rsid.values
                       ] if args.output_rsids else var_ids
                cov = matrices._flatten_matrix_data([(data_annotation_.gene_id,
                                                      ids, cov)])
                for cov_ in cov:
                    l = "{} {} {} {}\n".format(cov_[0], cov_[1], cov_[2],
                                               cov_[3]).encode()
                    covariances[entry.tissue].write(l)

            if not args.keep_intermediate_folder:
                logging.info("Cleaning up")
                shutil.rmtree(
                    _intermediate_folder(args.intermediate_folder,
                                         data_annotation_.gene_id))

            if args.MAX_M and i >= args.MAX_M:
                logging.info("Early abort")
                break

    except Exception:
        logging.info("Exception running model training:\n%s",
                     traceback.format_exc())
        failed_run = True
    finally:
        pass
        # if not args.keep_intermediate_folder:
        #     shutil.rmtree(args.intermediate_folder)

    if not args.skip_regression:
        set_down(weights, summaries, covariances, tissue_names, failed_run)

    logging.info("Finished")