Ejemplo n.º 1
0
def run(args):
    """Count how often each first-column value occurs in the input file.

    Reads ``args.input_file`` through ``Utilities.iterate_file``, skips the
    entry with index 0 (presumably a header line -- confirm against the
    input format), and tallies the first whitespace-separated token of every
    remaining line. The tallies are written to ``args.output_file`` as a
    two-column table (``key``, ``count``), with keys in order of first
    appearance.

    Nothing is done if ``args.output_file`` already exists.
    """
    if os.path.exists(args.output_file):
        logging.info("Output %s exists. Nope", args.output_file)
        return

    first_seen = []  # keys in the order they were first encountered
    counts = {}
    logging.info("Streaming file for groups")
    for line_number, text in Utilities.iterate_file(args.input_file):
        if line_number == 0:
            continue

        group = text.strip().split()[0]
        if group in counts:
            counts[group] += 1
            continue

        # First occurrence: remember its position and start the tally.
        first_seen.append(group)
        counts[group] = 1
        logging.log(9, "Key: %s", str(group))

    logging.info("Producing output")
    rows = [(group, counts[group]) for group in first_seen]
    table = pandas.DataFrame(rows, columns=["key", "count"])

    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output_file)
    Utilities.save_dataframe(table, args.output_file)

    logging.info("Finished.")
Ejemplo n.º 2
0
def _get_metadata(path, index):
    """Collect variant metadata rows whose position is present in ``index``.

    Each line after the one with index 0 is split on whitespace and read as:
    column 0 chromosome (prefixed with ``"chr"``), column 1 position (int),
    column 2 variant id, column 3 non-effect allele, column 4 effect allele,
    column 5 frequency (float).

    ``index`` must support ``chromosome in index`` and
    ``position in index[chromosome]``.

    Returns a list of
    ``(variant, chromosome, position, non_effect, effect, frequency)`` tuples
    for the rows that pass the ``index`` lookup.
    """
    selected = []
    for line_number, text in Utilities.iterate_file(path):
        if line_number == 0:
            continue

        fields = text.strip().split()
        # Every field is parsed before filtering, so a malformed row raises
        # even when it would not have been selected.
        chromosome = "chr" + fields[0]
        position = int(fields[1])
        variant, non_effect, effect = fields[2], fields[3], fields[4]
        frequency = float(fields[5])

        if chromosome not in index or position not in index[chromosome]:
            continue
        selected.append(
            (variant, chromosome, position, non_effect, effect, frequency))
    return selected
def _get_metadata(path, index, extra_cols=None):
    """Collect variant metadata rows whose position is present in ``index``.

    Each line after the one with index 0 is split on whitespace and read as:
    column 0 chromosome (prefixed with ``"chr"``), column 1 position (int),
    column 2 variant id, column 3 non-effect allele, column 4 effect allele,
    column 5 frequency (float, or ``numpy.nan`` when the field is ``"NA"``).

    Args:
        path: File to read through ``Utilities.iterate_file``.
        index: Container supporting ``chromosome in index`` and
            ``position in index[chromosome]``.
        extra_cols: Optional iterable of integer column positions whose raw
            string values are appended to each returned row. ``None`` (the
            default) means no extra columns.

    Returns:
        A list of ``[variant, chromosome, position, non_effect, effect,
        frequency, *extras]`` lists for the rows that pass the ``index``
        lookup.
    """
    # The default of None used to be iterated directly, raising TypeError on
    # the first data row; normalise it once, outside the loop.
    extra_cols = () if extra_cols is None else extra_cols

    m = []
    for i, line in Utilities.iterate_file(path):
        if i == 0:
            continue
        comps = line.strip().split()
        chromosome = "chr" + comps[0]
        pos = int(comps[1])
        variant = comps[2]
        non_effect = comps[3]
        effect = comps[4]
        frequency = float(comps[5]) if comps[5] != "NA" else numpy.nan
        extra_entries = [comps[ecol] for ecol in extra_cols]
        if chromosome in index and pos in index[chromosome]:
            m.append([variant, chromosome, pos, non_effect, effect, frequency] +
                     extra_entries)
    return m
Ejemplo n.º 4
0
def input_generator(input_file, samples):
    """Yield reformatted lines for the selected samples, one per intron.

    The entry with index 0 is treated as the header: the positions of the
    columns whose label is in ``samples`` are recorded, and a header line
    labelled ``"NAME"`` is yielded.

    For every later line, column 3 is split on ``":"`` and turned into an
    identifier ``intron_<chrom>_<field1>_<field2>``, where ``<chrom>`` is the
    text following ``"chr"`` in the first field. Lines whose identifier was
    already emitted are skipped.

    Each yielded value is whatever ``_to_line`` returns for the line's
    fields, the selected column positions and the identifier.
    """
    seen = set()
    for line_number, text in Utilities.iterate_file(input_file):
        fields = text.strip().split()
        if line_number == 0:
            _index = [col for col, label in enumerate(fields)
                      if label in samples]
            yield _to_line(fields, _index, "NAME")
            continue

        parts = fields[3].split(":")
        chromosome = parts[0].split("chr")[1]
        intron = "_".join(("intron", chromosome, parts[1], parts[2]))
        # Not used further; the lookup is kept so a name with fewer than five
        # ":"-separated fields still fails here.
        gene = parts[4]

        if intron not in seen:
            seen.add(intron)
            yield _to_line(fields, _index, intron)
Ejemplo n.º 5
0
def run(args):
    """Count variants for every region listed in ``args.regions``.

    Opens ``args.parquet_genotype_metadata`` as a parquet file, then walks
    the regions file (skipping the entry with index 0, presumably a header).
    The first three whitespace-separated fields of each line are passed to
    ``count_variants`` as strings. The second and third values returned by
    ``count_variants`` are fed back into the next call (looks like a cache
    of the metadata for the most recent chromosome -- confirm against
    ``count_variants``).

    The result, one ``(chromosome, start, end, count)`` row per region, is
    saved to ``args.output``.
    """
    logging.info("Starting process")

    metadata_file = pq.ParquetFile(args.parquet_genotype_metadata)
    # State threaded through successive count_variants calls.
    carried = None
    previous_chromosome = None

    rows = []
    for line_number, text in Utilities.iterate_file(args.regions):
        if line_number == 0:
            continue
        fields = text.strip().split()
        chromosome, start, end = fields[0], fields[1], fields[2]
        count, carried, previous_chromosome = count_variants(
            chromosome, start, end, metadata_file, carried,
            previous_chromosome, args)
        rows.append((chromosome, start, end, count))

    table = Utilities.to_dataframe(
        rows, ["chromosome", "start", "end", "count"])
    Utilities.save_dataframe(table, args.output)
    logging.info("Finished process")
Ejemplo n.º 6
0
def run(args):
    """Concatenate the files in ``args.folder`` matching ``args.pattern``.

    File names are kept when the compiled ``args.pattern`` regex is found in
    them. If ``args.sort_groups`` is truthy they are ordered by
    ``_key(name, regex, args.sort_groups)``; otherwise the ``os.listdir``
    order is used.

    The result is written gzip-compressed to ``args.output``. Handling of
    each file's first line (index 0):
      * with ``args.headerless`` falsy, it is written only for the first
        file and dropped for all later files;
      * with ``args.headerless`` truthy, it is written for every file.
    All other lines are always copied.
    """
    matcher = re.compile(args.pattern)
    names = [name for name in os.listdir(args.folder) if matcher.search(name)]
    if args.sort_groups:
        names.sort(key=lambda name: _key(name, matcher, args.sort_groups))

    emit_first_line = True
    Utilities.ensure_requisite_folders(args.output)

    logging.info("Starting concatenation")
    with gzip.open(args.output, "w") as out:
        for name in names:
            path = os.path.join(args.folder, name)
            logging.log(9, "Opening %s", path)
            for line_number, text in Utilities.iterate_file(path):
                if line_number != 0:
                    out.write(text.encode())
                    continue

                # First line of this file.
                if not emit_first_line:
                    continue
                out.write(text.encode())
                if not args.headerless:
                    # Header written once; suppress it for later files.
                    emit_first_line = False

    logging.info("Finished")