def clean_up(d):
    """Normalise the ``sample_size`` column and sort the frame when possible.

    Every ``sample_size`` entry that is NaN becomes the string ``"NA"``;
    every other entry is converted with ``int``. When the frame has both a
    ``chromosome`` and a ``position`` column it is passed through
    ``Genomics.sort`` before being returned.

    Args:
        d: data frame with a numeric ``sample_size`` column.

    Returns:
        A new data frame; the input frame is not modified.
    """
    sizes = []
    for value in d.sample_size:
        if math.isnan(value):
            sizes.append("NA")
        else:
            sizes.append(int(value))
    d = d.assign(sample_size=sizes)

    column_names = d.columns.values
    sortable = "chromosome" in column_names and "position" in column_names
    if sortable:
        d = Genomics.sort(d)
    return d
Esempio n. 2
0
def process_original_gwas(args, imputed):
    """Merge the observed GWAS variants with the imputed ones and save them.

    The GWAS table at ``args.gwas_file`` is read, tagged with
    ``current_build="hg38"`` and ``imputation_status="original"`` and reduced
    to ``COLUMN_ORDER``. Unless ``args.keep_all_observed`` is set, observed
    variants that also appear in ``imputed`` are dropped; the match is made on
    ``panel_variant_id`` (``keep_criteria == "GTEX_VARIANT_ID"``) or on the
    key produced by ``gwas_k`` (``keep_criteria == "CHR_POS"``). The remaining
    observed rows are concatenated with ``imputed``, ``sample_size`` is passed
    through ``Genomics.fill_column_to_median``, the result is sorted with
    ``Genomics.sort`` and written to ``args.output``.

    Args:
        args: parsed options; reads ``gwas_file``, ``keep_all_observed``,
            ``keep_criteria`` and ``output``.
        imputed: data frame of imputed variants containing at least the
            ``COLUMN_ORDER`` columns.

    Returns:
        A single-column data frame with the ``panel_variant_id`` of every
        saved variant.

    Raises:
        RuntimeError: if ``args.keep_criteria`` is not a supported value.
    """
    logging.info("Processing GWAS file %s", args.gwas_file)
    g = pandas.read_table(args.gwas_file)
    g = g.assign(current_build="hg38",
                 imputation_status="original")[COLUMN_ORDER]
    # Remember the palindromic snps are to be excluded from the input GWAS;
    logging.info("Read %d variants", g.shape[0])

    if not args.keep_all_observed:
        if args.keep_criteria == "GTEX_VARIANT_ID":
            g = g.loc[~g.panel_variant_id.isin(imputed.panel_variant_id)]
        elif args.keep_criteria == "CHR_POS":
            # Build the keys on throwaway frames instead of adding a helper
            # column and dropping it in place afterwards: an in-place drop on
            # a `.loc` slice is what triggers pandas' SettingWithCopyWarning.
            observed_keys = g.assign(k=gwas_k(g)).k
            imputed_keys = set(imputed.assign(k=gwas_k(imputed)).k)
            g = g.loc[~observed_keys.isin(imputed_keys)]
        else:
            raise RuntimeError("Unsupported keep option")
        logging.info("Kept %d variants as observed", g.shape[0])

    g = pandas.concat([g, imputed])[COLUMN_ORDER]
    logging.info("%d variants", g.shape[0])

    logging.info("Filling median")
    g = Genomics.fill_column_to_median(g, "sample_size", numpy.int32)

    logging.info("Sorting by chromosome-position")
    g = Genomics.sort(g)

    logging.info("Saving")
    Utilities.save_dataframe(g, args.output)

    return g[["panel_variant_id"]]
Esempio n. 3
0
def _load_group_weights(intron_ids, connection):
    """Collect the unique (varID, rsid) pairs of the weights for some introns.

    Args:
        intron_ids: iterable of identifiers looked up in the ``gene`` column
            of the ``weights`` table.
        connection: open sqlite3 connection to the weights database.

    Returns:
        A data frame with columns ``varID`` and ``rsid`` (duplicates removed,
        in the order the introns were given), or ``None`` when none of the
        identifiers has any weight.
    """
    frames = []
    for intron_id in intron_ids:
        # Bound parameter instead of string formatting, so an identifier
        # containing a quote cannot break (or alter) the query.
        found = pandas.read_sql("select * from weights where gene = ?;",
                                connection, params=(str(intron_id),))
        if found.shape[0] > 0:
            frames.append(found)
    if not frames:
        return None
    return pandas.concat(frames)[["varID", "rsid"]].drop_duplicates()


def run(args):
    """Write the per-gene genotype covariance of grouped model variants.

    For every gene in the group file (``args.group``) the variants of all its
    introns are looked up in the ``weights`` table of
    ``args.model_db_group_values``, their dosages are read from the parquet
    genotype of the gene's chromosome, and the covariance matrix is appended
    to the gzip file ``args.output`` as ``GENE RSID1 RSID2 VALUE`` lines.

    Nothing is done when ``args.output`` already exists. Genes on chromosomes
    rejected by the module-level pattern ``n_``, genes without weights and
    genes without any available genotype are skipped.

    Args:
        args: parsed options; reads ``output``, ``group``,
            ``model_db_group_key``, ``model_db_group_values``, ``individuals``
            and ``output_rsids`` (plus whatever ``get_file_map`` needs).

    Returns:
        None.
    """
    # Function-scope import: sqlite3's own context manager only commits or
    # rolls back, it does not close the connection; ``closing`` does.
    from contextlib import closing

    if os.path.exists(args.output):
        logging.info("Output already exists, either delete it or move it")
        return

    logging.info("Loading group")
    groups = pandas.read_table(args.group)
    intron_parts = groups.gtex_intron_id.str.split(":")
    groups = groups.assign(chromosome=intron_parts.str.get(0),
                           position=intron_parts.str.get(1))
    groups = Genomics.sort(groups)

    logging.info("Getting parquet genotypes")
    file_map = get_file_map(args)

    logging.info("Getting genes")
    with closing(sqlite3.connect(args.model_db_group_key)) as connection:
        # Pay heed to the order. This avoids arbitrariness in sqlite3 loading of results.
        # NOTE(review): `extra` is not used anywhere below; the query is kept
        # so that a missing or malformed model database still fails here.
        # Confirm whether it can be removed.
        extra = pandas.read_sql("SELECT * FROM EXTRA order by gene", connection)
        extra = extra[extra["n.snps.in.model"] > 0]

    individuals = TextFileTools.load_list(args.individuals) if args.individuals else None

    logging.info("Processing")
    Utilities.ensure_requisite_folders(args.output)

    # NOTE(review): `position` is taken from the intron id, so a gene whose
    # introns start at different positions has several rows here and is
    # processed once per row. Confirm whether de-duplicating on gene_id alone
    # was intended.
    genes_ = groups[["chromosome", "position", "gene_id"]].drop_duplicates()
    n_genes = len(genes_)
    with gzip.open(args.output, "w") as f:
        f.write("GENE RSID1 RSID2 VALUE\n".encode())
        with closing(sqlite3.connect(args.model_db_group_values)) as db_group_values:
            for i, t_ in enumerate(genes_.itertuples()):
                g_ = t_.gene_id
                chr_ = t_.chromosome.split("chr")[1]
                logging.log(8, "Proccessing %i/%i:%s", i + 1, n_genes, g_)

                if not n_.search(chr_):
                    logging.log(9, "Unsupported chromosome: %s", chr_)
                    continue
                dosage = file_map[int(chr_)]

                group = groups[groups.gene_id == g_]
                w = _load_group_weights(group.intron_id, db_group_values)
                if w is None:
                    logging.log(8, "No data, skipping")
                    continue

                if individuals:
                    d = Parquet._read(dosage, columns=w.varID.values, specific_individuals=individuals)
                    del d["individual"]
                else:
                    d = Parquet._read(dosage, columns=w.varID.values, skip_individuals=True)

                var_ids = list(d.keys())
                if not var_ids:
                    if len(w.varID.values) == 1:
                        logging.log(9, "workaround for single missing genotype at %s", g_)
                        d = {w.varID.values[0]: [0, 1]}
                        # Refresh the id list, otherwise the placeholder
                        # genotype above is never used.
                        var_ids = list(d.keys())
                    else:
                        logging.log(9, "No genotype available for %s, skipping", g_)
                        # Was a bare `next`, which is a no-op expression and
                        # let execution fall through to numpy.cov([]).
                        continue

                if args.output_rsids:
                    ids = list(pandas.DataFrame({"varID": var_ids}).merge(w[["varID", "rsid"]], on="varID").rsid.values)
                else:
                    ids = var_ids

                c = numpy.cov([d[x] for x in var_ids])
                c = matrices._flatten_matrix_data([(g_, ids, c)])
                for entry in c:
                    line = "{} {} {} {}\n".format(entry[0], entry[1], entry[2], entry[3])
                    f.write(line.encode())
    logging.info("Finished building covariance.")