Example #1
def main():

    args = argparser()

    l_fam_sampleIDs = parse_fam(args.bfile + '.fam')
    l_vcf_sampleIDs = shorten_sampleIDs(l_fam_sampleIDs)

    if args.keep:
        l_keep_sampleIDs = parse_keep(args.keep, l_fam_sampleIDs)
        l_keep_index = index_keep(l_keep_sampleIDs, l_fam_sampleIDs)
    else:
        l_keep_index = list(range(len(l_fam_sampleIDs)))

    assert len(l_vcf_sampleIDs) == len(l_fam_sampleIDs)

    ## Get count of samples.
    n_samples = len(l_fam_sampleIDs)
    ## Get count of SNPs. Only needed to keep track of completion percent.
    with open(args.bfile + '.bim', 'r') as bim:
        n_SNPs = len(bim.readlines())

    d_fai = read_fai(args.ref + '.fai')

    ## Alternative: plain gzip output (not tabix-indexable):
    ##     gzip.open(args.vcf, 'wt') as vcf, \
    with open(args.bfile + '.bed', 'rb') as bed, \
         BgzfWriter(args.vcf, 'wb') as vcf, \
         open(args.bfile + '.bim', 'r') as bim, \
         open(args.ref, 'r') as ref:
        convert(
            args, bed, vcf, bim, ref, d_fai,
            l_vcf_sampleIDs, l_keep_index, n_samples, n_SNPs)

    return
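The commented-out gzip.open line above is a leftover alternative: plain gzip also yields a valid .vcf.gz, but only BGZF-compressed output can be block-indexed and queried with tabix. A minimal sketch of the BGZF variant on its own (the output name is hypothetical; BgzfWriter encodes str input as latin-1):

from Bio.bgzf import BgzfWriter

with BgzfWriter('out.vcf.gz', 'wb') as vcf:
    vcf.write('##fileformat=VCFv4.2\n')
    vcf.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n')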
Example #2
def write_split_vcf(vcf, prefix: str):
    """Write VCF data split by chromosome
    
    Parameters
    ----------
    vcf
        Iterator giving lines of VCF data
    prefix : str
        Output prefix for split VCF files
    """

    header = []
    # Group on each line's first two characters: '##' and '#C' mark
    # header lines; for data lines this is the start of the CHROM field.
    for key, group in itertools.groupby(vcf, key=lambda l: l[:2]):
        if key in {'##', '#C'}:
            header.extend(list(group))
        else:
            with BgzfWriter(f'{prefix}.chr{key.rstrip()}.vcf.gz', 'wb') as f:
                f.write('\n'.join(itertools.chain(header, tuple(group),
                                                  ('', ))).encode())
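write_split_vcf expects lines without trailing newlines (each group is joined with '\n'), and because itertools.groupby only merges consecutive items, the input VCF must be sorted by chromosome. Note also that the two-character key only distinguishes chromosome names that differ in their first two characters. A hypothetical invocation ('input.vcf' and the 'split' prefix are illustrative):

import itertools
from Bio.bgzf import BgzfWriter

with open('input.vcf') as handle:
    write_split_vcf((line.rstrip('\n') for line in handle), 'split')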
Example #3
def outfile_handler(filepath: str,
                    compression: Optional[str] = None) -> TextIO:
    """
    Return a file handle in write mode, using the appropriate
    opener for the requested compression mode.
    Valid compression modes:
        compression = None | "none" | "gzip" | "gz" | "bgzip" | "bgz"
    If compression is None or "none", open the file uncompressed.
    """
    if os.path.isfile(filepath):
        warnings.warn("Overwriting the existing file: %s" % filepath)

    if compression is None:
        return open(filepath, mode="wt")
    elif isinstance(compression, str):
        if compression.lower() in ("gzip", "gz"):
            return gzip.open(filepath, mode="wt")
        elif compression.lower() in ("bgzip", "bgz"):
            return BgzfWriter(filepath)
        elif compression.lower() == "none":
            return open(filepath, mode="wt")
        # An unrecognized string previously fell through and returned
        # None; raise instead so the error surfaces at the call site.
        raise ValueError("`compression = %s` invalid." % compression)
    else:
        raise TypeError("`compression = %s` invalid." % str(compression))
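A quick sketch exercising the three modes (file names hypothetical). BgzfWriter is not strictly a TextIO, but its write() accepts str, so all three handles can be used interchangeably here:

for name, mode in (('plain.txt', None), ('small.txt.gz', 'gzip'), ('big.txt.bgz', 'bgzip')):
    with outfile_handler(name, mode) as out:
        out.write('hello world\n')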
Example #4
def main(argv):
    args = parse_argv(argv[1:])

    # set e-mail for identification to NCBI
    Entrez.email = args.email

    # repo directory
    makedirs(args.repo.parent, exist_ok=True)
    # repo database
    path_db = args.repo.with_suffix(".db")
    # repo log
    path_log = args.repo.with_suffix(".log")
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S",
        handlers=(logging.FileHandler(path_log), logging.StreamHandler()),
    )
    logging.info(argv)

    # metadata
    accs, fdat, mdat = set(), {}, ""
    db, rettype, baseterm = args.db, args.rettype, args.term
    if path_db.exists():
        with sqlite3.connect(path_db) as conn:
            # 0 -> key
            accs = {
                row[0]
                for row in conn.execute("SELECT key FROM offset_data")
            }
            # file_number -> name
            fdat = odict(conn.execute("SELECT * FROM file_data"))
            # key -> value
            meta = odict(conn.execute("SELECT * FROM meta_data"))
            # override args if the index database has metadata
            # mdat is the previous query execution start time
            db = meta.get("db", db)
            mdat = meta.get("mdat", mdat)
            rettype = meta.get("format", rettype)
            baseterm = meta.get("term", baseterm)

    # remote - local accessions
    term = baseterm + (f" AND {mdat}:{MAX_MDAT}[MDAT]" if mdat else "")
    logging.info(term)
    now = datetime.now().strftime("%Y/%m/%d")
    remote_accs = set(chain.from_iterable(esearch_accs(db, term, args.retmax)))
    accs = list(remote_accs - accs)
    logging.info(f"count = {len(accs)}")

    paths = []
    width = len(str(len(accs)))
    for i, j in enumerate(range(0, len(accs), args.retmax), start=1):
        # fetch
        k = min(len(accs), j + args.retmax)
        csv = ",".join(accs[j:j + args.retmax])
        with Entrez.efetch(db, id=csv, rettype=rettype,
                           retmode="text") as handle:
            path = args.repo.parent / f"{args.repo.name}-{i}.{rettype}.bgz.tmp"
            # compress
            with BgzfWriter(path) as stream:
                print(handle.read(), file=stream)
        paths.append(path)
        logging.info(
            f"{j:0{width}} - {k:0{width}} {k / len(accs) * 100:06.2f}%")

    # truthy indicates new accessions
    if paths:
        # combine previous files with new ones
        paths = [args.repo.parent / ele for ele in fdat.values()] + paths
        # rename with zero-fill
        width = len(str(len(paths)))
        paths = {
            ele:
            ele.with_name(f"{args.repo.name}-{idx:0{width}}.{rettype}.bgz")
            for idx, ele in enumerate(paths, start=1)
        }
        for key, val in paths.items():
            if key != val:
                logging.info(f"{key} -> {val}")
                key.rename(val)
        try:
            path_tmp = path_db.with_suffix(".tmp")
            if path_tmp.exists():
                path_tmp.unlink()
            logging.info("index...")
            SeqIO.index_db(str(path_tmp), list(map(str, paths.values())),
                           rettype)
            # update metadata
            with sqlite3.connect(path_tmp) as conn:
                conn.execute(
                    "INSERT INTO meta_data VALUES ('db', ?), ('term', ?), ('mdat', ?)",
                    (db, baseterm, now),
                )
            path_tmp.rename(path_db)
        except Exception as e:
            logging.error(e)
            # revert original path names
            for key, val in paths.items():
                logging.info(f"{val} -> {key}")
                if val.exists():
                    val.rename(key)

    return 0
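The point of writing each batch as .bgz and indexing it with SeqIO.index_db is that the index supports random access into the BGZF files without decompressing them. Reopening an existing index only needs the database path; a hypothetical lookup against the repo built above (the accession is illustrative):

from Bio import SeqIO

records = SeqIO.index_db('repo.db')  # filenames and format are stored in the db
print(records['AB000001'].description)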
Example #5
def bgzip_compress_fasta(filename):
    from Bio.bgzf import BgzfWriter
    # BgzfWriter encodes str data (latin-1), so text lines can be
    # written directly to the compressed stream.
    with BgzfWriter(filename=filename + '.gz') as compressed, \
         open(filename, 'r') as fasta:
        for line in fasta:
            compressed.write(line)
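Since BGZF is a gzip-compatible container, the file written above can be read back with the standard gzip module (or Bio.bgzf). A minimal round-trip check (path hypothetical):

import gzip

with gzip.open('example.fasta.gz', 'rt') as handle:
    assert handle.readline().startswith('>')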