def main():

    args = argparser()

    l_fam_sampleIDs = parse_fam(args.bfile + '.fam')
    l_vcf_sampleIDs = shorten_sampleIDs(l_fam_sampleIDs)
    if args.keep:
        l_keep_sampleIDs = parse_keep(args.keep, l_fam_sampleIDs)
        l_keep_index = index_keep(l_keep_sampleIDs, l_fam_sampleIDs)
    else:
        l_keep_index = list(range(len(l_fam_sampleIDs)))
    assert len(l_vcf_sampleIDs) == len(l_fam_sampleIDs)

    ## Get count of samples.
    n_samples = len(l_fam_sampleIDs)

    ## Get count of SNPs. Only needed to keep track of completion percent.
    with open(args.bfile + '.bim', 'r') as bim:
        n_SNPs = len(bim.readlines())

    d_fai = read_fai(args.ref + '.fai')

    ## gzip.open(args.vcf, 'wt') as vcf, \
    with open(args.bfile + '.bed', 'rb') as bed, \
            BgzfWriter(args.vcf, 'wb') as vcf, \
            open(args.bfile + '.bim', 'r') as bim, \
            open(args.ref, 'r') as ref:
        convert(
            args, bed, vcf, bim, ref, d_fai,
            l_vcf_sampleIDs, l_keep_index, n_samples, n_SNPs)

    return


import itertools

from Bio.bgzf import BgzfWriter


def write_split_vcf(vcf, prefix: str):
    """Write VCF data split by chromosome

    Parameters
    ----------
    vcf
        Iterator giving lines of VCF data
    prefix : str
        Output prefix for split VCF files
    """
    header = []
    for key, group in itertools.groupby(vcf, key=lambda l: l[:2]):
        if key in {'##', '#C'}:
            # Header lines ('##...' and '#CHROM...') are kept and prepended to
            # every per-chromosome output file.
            header.extend(list(group))
        else:
            with BgzfWriter(f'{prefix}.chr{key.rstrip()}.vcf.gz', 'wb') as f:
                f.write('\n'.join(itertools.chain(header, tuple(group), ('', ))).encode())
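
# Usage sketch (not from the original module): write_split_vcf joins the lines
# with '\n' itself, so trailing newlines are stripped before passing them in.
# The input path 'cohort.vcf.gz' and the output prefix 'cohort' are hypothetical.
import gzip

with gzip.open('cohort.vcf.gz', 'rt') as handle:
    write_split_vcf((line.rstrip('\n') for line in handle), 'cohort')
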
import gzip
import os
import warnings
from typing import Optional, TextIO

from Bio.bgzf import BgzfWriter


def outfile_handler(filepath: str, compression: Optional[str] = None) -> TextIO:
    """
    Return a file handle in write mode, using the appropriate handle type
    depending on the compression mode.

    Valid compression modes:
        compression = None | "None" | "gzip" | "gz" | "bgzip" | "bgz"

    If compression is None or any other unrecognized non-string input,
    open the file normally.
    """
    if os.path.isfile(filepath):
        warnings.warn("Overwriting the existing file: %s" % filepath)

    if compression is None:
        return open(filepath, mode="wt")
    elif isinstance(compression, str):
        if compression.lower() in ["gzip", "gz"]:
            return gzip.open(filepath, mode="wt")
        elif compression.lower() in ["bgzip", "bgz"]:
            return BgzfWriter(filepath)
        elif compression.lower() == "none":
            return open(filepath, mode="wt")
        else:
            raise Exception("`compression = %s` invalid." % str(compression))
    else:
        # Non-string, non-None input: fall back to an uncompressed handle,
        # as documented above, rather than silently returning None.
        return open(filepath, mode="wt")
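
# Usage sketch (hypothetical path): the plain and gzip handles are text-mode,
# and BgzfWriter accepts str writes as well (it encodes them internally), so
# the caller can treat all three compression modes uniformly.
with outfile_handler('variants.vcf.bgz', compression='bgzip') as out:
    out.write('##fileformat=VCFv4.2\n')
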
def main(argv):
    args = parse_argv(argv[1:])

    # set e-mail for identification to NCBI
    Entrez.email = args.email

    # repo directory
    makedirs(args.repo.parent, exist_ok=True)
    # repo database
    path_db = args.repo.with_suffix(".db")
    # repo log
    path_log = args.repo.with_suffix(".log")

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S",
        handlers=(logging.FileHandler(path_log), logging.StreamHandler()),
    )
    logging.info(argv)

    # metadata
    accs, fdat, mdat = set(), {}, ""
    db, rettype, baseterm = args.db, args.rettype, args.term

    if path_db.exists():
        with sqlite3.connect(path_db) as conn:
            # 0 -> key
            accs = {row[0] for row in conn.execute("SELECT key FROM offset_data")}
            # file_number -> name
            fdat = odict(conn.execute("SELECT * FROM file_data"))
            # key -> value
            meta = odict(conn.execute("SELECT * FROM meta_data"))
            # override args if the index database has metadata
            # mdat is the previous query execution start time
            db = meta.get("db", db)
            mdat = meta.get("mdat", mdat)
            rettype = meta.get("format", rettype)
            baseterm = meta.get("term", baseterm)

    # remote - local accessions
    term = baseterm + (f" AND {mdat}:{MAX_MDAT}[MDAT]" if mdat else "")
    logging.info(term)
    now = datetime.now().strftime("%Y/%m/%d")
    remote_accs = set(chain.from_iterable(esearch_accs(db, term, args.retmax)))
    accs = list(remote_accs - accs)
    logging.info(f"count = {len(accs)}")

    paths = []
    width = len(str(len(accs)))
    for i, j in enumerate(range(0, len(accs), args.retmax), start=1):
        # fetch
        k = min(len(accs), j + args.retmax)
        csv = ",".join(accs[j:j + args.retmax])
        with Entrez.efetch(db, id=csv, rettype=rettype, retmode="text") as handle:
            path = args.repo.parent / f"{args.repo.name}-{i}.{rettype}.bgz.tmp"
            # compress
            with BgzfWriter(path) as stream:
                print(handle.read(), file=stream)
            paths.append(path)
        logging.info(f"{j:0{width}} - {k:0{width}} {k / len(accs) * 100:06.2f}%")

    # truthy indicates new accessions
    if paths:
        # combine previous files with new ones
        paths = [args.repo.parent / ele for ele in fdat.values()] + paths
        # rename with zero-fill
        width = len(str(len(paths)))
        paths = {
            ele: ele.with_name(f"{args.repo.name}-{idx:0{width}}.{rettype}.bgz")
            for idx, ele in enumerate(paths, start=1)
        }
        for key, val in paths.items():
            if key != val:
                logging.info(f"{key} -> {val}")
                key.rename(val)
        try:
            path_tmp = path_db.with_suffix(".tmp")
            path_tmp.exists() and path_tmp.unlink()
            print("index...")
            SeqIO.index_db(str(path_tmp), list(map(str, paths.values())), rettype)
            # update metadata
            with sqlite3.connect(path_tmp) as conn:
                conn.execute(
                    "INSERT INTO meta_data VALUES ('db', ?), ('term', ?), ('mdat', ?)",
                    (db, baseterm, now),
                )
            path_tmp.rename(path_db)
        except Exception as e:
            logging.error(e)
            # revert original path names
            for key, val in paths.items():
                logging.info(f"{val} -> {key}")
                val.exists() and val.rename(key)

    return 0
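
# Minimal sketch (assumed file names and accession) of the indexing step used
# above: Bio.SeqIO.index_db builds a single SQLite index over several
# BGZF-compressed flat files and returns a dict-like object for random access
# by record id, without decompressing whole files.
from Bio import SeqIO

index = SeqIO.index_db("repo.db", ["repo-1.gb.bgz", "repo-2.gb.bgz"], "gb")
record = index["NC_000913.3"]  # hypothetical GenBank accession
print(record.description)
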
def bgzip_compress_fasta(filename):
    from Bio.bgzf import BgzfWriter

    with BgzfWriter(filename=filename + '.gz') as compressed, open(filename, 'r') as fasta:
        for line in fasta:
            compressed.write(line)
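
# Usage sketch (hypothetical file name): produces 'example.fasta.gz' in BGZF
# format, which standard gzip tools can still decompress while the block
# structure keeps it seekable for indexing tools.
bgzip_compress_fasta('example.fasta')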