def run(args):
    """Count occurrences of the first whitespace-separated column of the input file.

    Skips the header row, tallies each key in first-seen order, and writes a
    two-column dataframe (key, count) to ``args.output_file``. Does nothing if
    the output already exists.
    """
    if os.path.exists(args.output_file):
        logging.info("Output %s exists. Nope", args.output_file)
        return

    logging.info("Streaming file for groups")
    # A plain dict preserves insertion order (Python 3.7+), so the separate
    # ordered-key list the original kept is unnecessary.
    counts = {}
    for i, line in Utilities.iterate_file(args.input_file):
        if i == 0:
            continue  # skip header row
        key = line.strip().split()[0]
        logging.log(9, "Key: %s", str(key))
        counts[key] = counts.get(key, 0) + 1

    logging.info("Producing output")
    r = pandas.DataFrame(list(counts.items()), columns=["key", "count"])

    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output_file)
    Utilities.save_dataframe(r, args.output_file)
    logging.info("Finished.")
def _get_metadata(path, index):
    """Parse a metadata file and keep rows whose position is present in *index*.

    Returns a list of (variant, chr, pos, non_effect, effect, frequency)
    tuples; the header row (line 0) is skipped. *index* is assumed to map
    "chr"-prefixed chromosome names to collections of positions — TODO confirm.
    """
    records = []
    for line_number, raw in Utilities.iterate_file(path):
        if line_number == 0:
            continue  # header
        fields = raw.strip().split()
        chrom = "chr" + fields[0]
        position = int(fields[1])
        # Convert unconditionally (as the original did) so a malformed
        # frequency still raises even for rows that are filtered out.
        frequency = float(fields[5])
        if chrom in index and position in index[chrom]:
            records.append((fields[2], chrom, position, fields[3], fields[4], frequency))
    return records
def _get_metadata(path, index, extra_cols=None):
    """Parse a metadata file and keep rows whose position is present in *index*.

    Returns a list of [variant, chr, pos, non_effect, effect, frequency, *extras]
    lists; the header row (line 0) is skipped. A frequency of "NA" becomes
    ``numpy.nan``.

    Bug fix: the original iterated ``extra_cols`` unconditionally, so the
    default ``extra_cols=None`` raised ``TypeError``. ``None`` now means
    "no extra columns".

    :param extra_cols: optional iterable of column indices to append to each
        record (default: none).
    """
    extra_cols = [] if extra_cols is None else extra_cols
    m = []
    for i, line in Utilities.iterate_file(path):
        if i == 0:
            continue  # header
        comps = line.strip().split()
        chromosome = "chr" + comps[0]
        pos = int(comps[1])
        variant = comps[2]
        non_effect = comps[3]
        effect = comps[4]
        frequency = float(comps[5]) if comps[5] != "NA" else numpy.nan
        extra_entries = [comps[ecol] for ecol in extra_cols]
        if chromosome in index and pos in index[chromosome]:
            m.append([variant, chromosome, pos, non_effect, effect, frequency] + extra_entries)
    return m
def input_generator(input_file, samples):
    """Yield deduplicated, renamed lines from *input_file* via ``_to_line``.

    The header row (line 0) determines which columns match *samples*; that
    column index is reused for every subsequent row. Column 3 is a
    colon-separated intron identifier that is rewritten to
    ``intron_<chr>_<start>_<end>``; duplicate introns are emitted only once.
    """
    seen = set()
    for line_number, raw in Utilities.iterate_file(input_file):
        fields = raw.strip().split()
        if line_number == 0:
            # Columns whose header is one of the requested samples.
            _index = [j for j, column in enumerate(fields) if column in samples]
            yield _to_line(fields, _index, "NAME")
            continue
        parts = fields[3].split(":")
        intron_id = "_".join(["intron", parts[0].split("chr")[1], parts[1], parts[2]])
        gene = parts[4]  # parsed but unused; kept so short ids still raise IndexError
        if intron_id in seen:
            continue
        seen.add(intron_id)
        yield _to_line(fields, _index, intron_id)
def run(args):
    """Count variants falling in each region of ``args.regions``.

    Streams the regions file (chromosome, start, end per row; header skipped),
    delegates counting to ``count_variants`` — which threads metadata and the
    last-seen chromosome through successive calls — and saves a dataframe with
    columns (chromosome, start, end, count) to ``args.output``.
    """
    logging.info("Starting process")
    parquet_file = pq.ParquetFile(args.parquet_genotype_metadata)

    metadata = None
    current_chromosome = None
    rows = []
    for line_number, raw in Utilities.iterate_file(args.regions):
        if line_number == 0:
            continue  # header
        fields = raw.strip().split()
        count, metadata, current_chromosome = count_variants(
            fields[0], fields[1], fields[2], parquet_file, metadata, current_chromosome, args)
        rows.append((fields[0], fields[1], fields[2], count))

    frame = Utilities.to_dataframe(rows, ["chromosome", "start", "end", "count"])
    Utilities.save_dataframe(frame, args.output)
    logging.info("Finished process")
def run(args):
    """Concatenate all files in ``args.folder`` matching ``args.pattern`` into one gzip.

    Files are optionally sorted via ``_key`` and ``args.sort_groups``. The first
    line of each file is treated as a header: with ``args.headerless`` set,
    every file's first line is written; otherwise only the first file's header
    is kept and subsequent headers are dropped.
    """
    pattern = re.compile(args.pattern)
    files = [name for name in os.listdir(args.folder) if pattern.search(name)]
    if args.sort_groups:
        files.sort(key=lambda name: _key(name, pattern, args.sort_groups))

    write_header = True
    Utilities.ensure_requisite_folders(args.output)
    logging.info("Starting concatenation")
    with gzip.open(args.output, "w") as out:
        for name in files:
            path = os.path.join(args.folder, name)
            logging.log(9, "Opening %s", path)
            for line_number, line in Utilities.iterate_file(path):
                if line_number == 0:
                    if write_header:
                        out.write(line.encode())
                        # Headerless mode keeps emitting each file's first line.
                        if not args.headerless:
                            write_header = False
                    continue
                out.write(line.encode())
    logging.info("Finished")