Example #1
    def test_append_mode(self):
        with bgzf.open(self.temp_file, "wb") as h:
            h.write(b">hello\n")
            h.write(b"aaaaaaaaaaaaaaaaaa\n")
            h.flush()
            previous_offsets = bgzf.split_virtual_offset(h.tell())
            # Just flushed, so new block
            self.assertEqual(previous_offsets[1], 0)
        with bgzf.open(self.temp_file, "ab") as h:
            append_position = h.tell()
            self.assertEqual(
                (previous_offsets[0] + 28, 0),
                bgzf.split_virtual_offset(append_position),
            )
            h.write(b">there\n")
            self.assertEqual(
                (previous_offsets[0] + 28, 7), bgzf.split_virtual_offset(h.tell())
            )
            h.write(b"cccccccccccccccccc\n")
        with bgzf.open(self.temp_file, "rb") as h:
            self.assertEqual(
                list(h),
                [
                    b">hello\n",
                    b"aaaaaaaaaaaaaaaaaa\n",
                    b">there\n",
                    b"cccccccccccccccccc\n",
                ],
            )
            h.seek(append_position)
            self.assertEqual(list(h), [b">there\n", b"cccccccccccccccccc\n"])
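The assertions above lean on two facts worth spelling out: a BGZF virtual offset packs the compressed start of a block into its high 48 bits and the within-block (decompressed) offset into its low 16 bits, and closing the first writer appends the standard 28-byte empty BGZF block used as an EOF marker, which is why appending starts a new block at previous_offsets[0] + 28. A minimal sketch of the offset arithmetic, assuming Biopython's Bio.bgzf helpers:

# Minimal sketch of BGZF virtual offset arithmetic (assumes Biopython's
# Bio.bgzf; the block start and within-block values below are made up).
from Bio import bgzf

# High 48 bits: compressed file offset of the block start.
# Low 16 bits: offset inside the decompressed block.
assert bgzf.make_virtual_offset(10000, 7) == (10000 << 16) | 7
assert bgzf.split_virtual_offset((10000 << 16) | 7) == (10000, 7)

# A fresh file starts at virtual offset zero (block at byte 0, first byte).
assert bgzf.make_virtual_offset(0, 0) == 0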
Example #2
    def test_write_tell(self):
        """Check offset works during BGZF writing."""
        temp_file = self.temp_file

        h = bgzf.open(temp_file, "w")  # Text mode!
        # When opening new file, offset should be 0
        self.assertEqual(h.tell(), 0)

        h.write("X" * 100000)
        offset = h.tell()
        self.assertNotEqual(offset, 100000)  # Should be a virtual offset!

        # After writing the same data twice, the size of the first and second
        # writes should also be equal in terms of offsets
        # (the flush ensures two identical blocks are written)
        h.flush()
        offset1 = h.tell()
        # Note 'offset' and 'offset1' are effectively the same, but not equal
        # due to the flush - 'offset' is at the end of the first BGZF block,
        # while 'offset1' is at the start of the second BGZF block. In terms
        # of the decompressed data, they point to the same location!
        self.assertNotEqual(offset, offset1)  # New block started
        h.write("Magic" + "Y" * 100000)
        h.flush()
        offset2 = h.tell()
        h.write("Magic" + "Y" * 100000)
        h.flush()
        offset3 = h.tell()
        self.assertEqual(
            (offset3 << 16) - (offset2 << 16), (offset2 << 16) - (offset1 << 16)
        )

        # Flushing should change the offset
        h.flush()
        self.assertNotEqual(offset3, h.tell())

        h.close()

        h = bgzf.open(temp_file, "r")  # Text mode!

        h.seek(offset)  # i.e. End of first BGZF block
        self.assertEqual(offset1, h.tell())  # Note *not* seek offset
        # Now at start of second BGZF block
        self.assertEqual(h.read(5), "Magic")

        h.seek(offset2)
        self.assertEqual(offset2, h.tell())
        self.assertEqual(h.read(5), "Magic")

        # Now go back in the file,
        h.seek(offset1)
        self.assertEqual(offset1, h.tell())
        self.assertEqual(h.read(5), "Magic")

        h.close()
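The "Note *not* seek offset" comment is the subtle part: 'offset' (end of the first block) and 'offset1' (start of the second block) are two different virtual offsets for the same decompressed position, and on reading back, tell() reports the canonical form at the start of the next block. A small illustration of that equivalence, assuming Bio.bgzf and made-up block sizes:

# Illustration only: 'offset' vs 'offset1' in the test above. The block
# sizes here are hypothetical; the point is that the end of one block and
# the start of the next encode the same decompressed position.
from Bio import bgzf

block1_uncompressed_len = 65280   # assumed amount of data in block 1
block2_compressed_start = 17000   # assumed compressed size of block 1

end_of_block1 = bgzf.make_virtual_offset(0, block1_uncompressed_len)
start_of_block2 = bgzf.make_virtual_offset(block2_compressed_start, 0)

assert end_of_block1 != start_of_block2  # different numbers...
# ...but a BgzfReader seeked to either returns the same next byte, and its
# tell() reports the start-of-next-block form, as the test above asserts.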
Example #3
    def test_write_tell(self):
        """Check offset works during BGZF writing"""
        temp_file = self.temp_file

        h = bgzf.open(temp_file, "w")  # Text mode!
        # When opening new file, offset should be 0
        self.assertEqual(h.tell(), 0)

        h.write("X" * 100000)
        offset = h.tell()
        self.assertNotEqual(offset, 100000)  # Should be a virtual offset!

        # After writing the same data twice, the size of the first and second
        # writes should also be equal in terms of offsets
        # (the flush ensures two identical blocks are written)
        h.flush()
        offset1 = h.tell()
        # Note 'offset' and 'offset1' are effectively the same, but not equal
        # due to the flush - 'offset' is at the end of the first BGZF block,
        # while 'offset1' is at the start of the second BGZF block. In terms
        # of the decompressed data, they point to the same location!
        self.assertNotEqual(offset, offset1)  # New block started
        h.write("Magic" + "Y" * 100000)
        h.flush()
        offset2 = h.tell()
        h.write("Magic" + "Y" * 100000)
        h.flush()
        offset3 = h.tell()
        self.assertEqual(((offset3 << 16) - (offset2 << 16)),
                         ((offset2 << 16) - (offset1 << 16)))

        # Flushing should change the offset
        h.flush()
        self.assertNotEqual(offset3, h.tell())

        h.close()

        h = bgzf.open(temp_file, "r")  # Text mode!

        h.seek(offset)  # i.e. End of first BGZF block
        self.assertEqual(offset1, h.tell())  # Note *not* seek offset
        # Now at start of second BGZF block
        self.assertEqual(h.read(5), "Magic")

        h.seek(offset2)
        self.assertEqual(offset2, h.tell())
        self.assertEqual(h.read(5), "Magic")

        # Now go back in the file,
        h.seek(offset1)
        self.assertEqual(offset1, h.tell())
        self.assertEqual(h.read(5), "Magic")

        h.close()
Example #4
    def test_write_tell(self):
        """Check offset works during BGZF writing"""
        temp_file = self.temp_file

        h = bgzf.open(temp_file, "w") #Text mode!
        h.write("X" * 100000)
        offset = h.tell()
        self.assertNotEqual(offset, 100000) #Should be a virtual offset!
        h.write("Magic" + "Y" * 100000)
        h.close()

        h = bgzf.open(temp_file, "r") #Text mode!
        h.seek(offset)
        self.assertEqual(h.read(5), "Magic")
        h.close()
Example #5
def fasta_parser(fastas, cur, verbose):
    """Fasta iterator returning i, seqid as base64 and sequence str.
    Index sequences using a proprietary parser.
    Handle bgzip compressed files.
    """
    if verbose:
        sys.stderr.write("[%s] Hashing and indexing sequences...\n" %
                         datetime.ctime(datetime.now()))
    #parse fasta
    i = 0
    seqlen = 0
    cmd = "INSERT INTO offset_data VALUES (?, ?, ?, ?)"
    for fi, fn in enumerate(fastas):
        #add file to db
        cur.execute("INSERT INTO file_data VALUES (?, ?)", (fi, fn))
        #get handle and start byte
        if fn.endswith('.gz'):
            handle = bgzf.open(fn)
        else:
            handle = open(fn)
        #parse entries
        for i, (seq, offset, elen) in enumerate(get_seq_offset_length(handle),
                                                i + 1):
            #if i>10**6: break
            seqlen += len(seq)
            cur.execute(cmd, (i, fi, offset, elen))
            yield i, seq
    if verbose:
        sys.stderr.write(" %s letters in" % seqlen)
    #fill metadata
    cur.executemany("INSERT INTO meta_data VALUES (?, ?)", \
                    (('count', i), ('format', 'fasta'), ('dblength', seqlen)))
    #and commit changes
    cur.connection.commit()
Example #6
def check_gvcf(vcf):
    #Reads the file and looks for GVCF hints
    gvcf = False
    name = vcf
    out_str = "\n"
    count = 0
    thresh = 10000
    tmp = open(vcf, 'r')
    magic_number = tmp.read(2)
    tmp.close()
    with open(vcf) if magic_number != '\x1f\x8b' else bgzf.open(vcf) as rf:
        for line in rf:
            count += 1
            if '<NON_REF>' in line:
                gvcf = True
                break
            if count >= thresh:
                break
    if gvcf:
        name = vcf + '.vcf'
        out_str = ("""
                        java -jar $GATK -T GenotypeGVCFs -R $REF --dbsnp $DBSNP --variant %s --out %s.vcf
                        """ % (vcf, vcf))

    return (name, out_str)
Example #7
def sqlite2seq(cur, db, protids):
    """Return target fastas for protids from sqlite3."""
    #open target files
    cur.execute("SELECT name FROM file_data")
    files = {}  # {name: open(os.path.join(os.path.dirname(db), name)) for name, in cur.fetchall()}
    for name, in cur.fetchall():
        # if db is in another directory
        if os.path.isfile(name):
            fpath = name
        else:
            fpath = os.path.join(os.path.dirname(db), name)
        # open fasta file
        if name.endswith('.gz'):
            files[name] = bgzf.open(fpath)
        else:
            files[name] = open(fpath)

    #get targets
    cmd = """SELECT f.name, offset, length FROM offset_data o JOIN file_data f
    ON o.file_number=f.file_number WHERE key IN (%s)""" % ",".join(
        str(p) for p in protids)
    cur.execute(cmd)
    targets = []
    for name, offset, length in sorted(cur.fetchall()):
        try:
            files[name].seek(offset)
            targets.append(files[name].read(length))
        except:
            #bgzip sometimes doesn't work at first seek
            sys.stderr.write(
                "[Warning] Cannot fetch sequence for %s at %s + %s bytes\n" %
                (name, offset, length))
    return "".join(targets)
Example #8
def do_something(data, out):
    o = bgzf.open(out, "w")
    with gzip.open(data) as f:
        while True:
            line = f.readline()
            l = line.decode()
            if not l:
                break
            if l[0] == "#":
                print(l.strip(), file=o)
                if "FORMAT" in l:
                    print(
                        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                        file=o)
                continue
            lt = l.split()
            # lt[4] can hold more than one allele: drop '<*>'; it can be the reference (. or <*>) or an alternative (one letter, with or without <*>)
            if lt[4] == "<*>": lt[4] = "."
            elif ",<*>" in lt[4]: lt[4] = lt[4].replace(",<*>", "")
            elif "," in lt[4]: continue  # We remove the tri-allelic positions
            # Now that lt[4] is correct we use it for the genotype part
            lt[8] = "GT"
            if lt[4] == ".": lt[9] = "0/0"
            else: lt[9] = "1/1"
            line = "\t".join(lt)
            print(line, file=o)
    o.close()
Example #9
def store_random_entries(outbase, infiles, n, verbose):
    """Store randomly selected entries from the indexed files in per-file gzipped outputs."""
    if verbose:
        sys.stderr.write("Preparing files...\n")
    files = []
    cursors = []
    outfiles = []
    #open target files/cursors
    for fi, f in enumerate(infiles, 1):
        #get & store cursor
        cur = sqlite3.connect(f.name + '.idx').cursor()
        cursors.append(cur)
        #get files
        cur.execute("SELECT name FROM file_data")
        files.append({})
        #open outfiles
        outfiles.append(gzip.open(outbase + ".%s.fq.gz" % fi, "w"))
        for name, in cur.fetchall():
            #fpath = os.path.join(os.path.dirname(db), name)
            if name.endswith('.gz'):
                files[-1][name] = bgzf.open(name)
            else:
                files[-1][name] = open(name)

    #preload offset data for other files
    if verbose:
        sys.stderr.write("Loading offset_data...\n")
    cmd1 = """SELECT f.name, offset, length FROM offset_data o JOIN file_data f
    ON o.file_number=f.file_number"""
    offset_data = [
        [],
    ]
    for cur in cursors[1:]:
        cur.execute(cmd1)
        offset_data.append(cur.fetchall())
    #get randomly sorted first file
    if verbose:
        sys.stderr.write("Selecting random entries...\n")
    cmd0 = """SELECT key, f.name, offset, length FROM offset_data o JOIN file_data f
    ON o.file_number=f.file_number ORDER BY RANDOM()"""
    if n > 0:
        cmd0 += " LIMIT %s" % n
    cursors[0].execute(cmd0)
    #combine randomised and preloaded data
    if verbose:
        sys.stderr.write("Reporting...\n")
    for i, (key, name, offset, length) in enumerate(cursors[0].fetchall(), 1):
        if verbose and i % 10000 == 1:
            sys.stderr.write(" %s     \r" % i)
        #store first file random sequence
        outfiles[0].write(get_seq(files[0][name], offset, length))
        #store sequence from the remaining files
        key = int(key)
        for fi in range(1, len(outfiles)):
            name, offset, length = offset_data[fi][key]
            outfiles[fi].write(get_seq(files[fi][name], offset, length))
    #close
    for out in outfiles:
        out.close()
Example #10
def store_random_entries(outbase, infiles, n, verbose):
    """Store randomly selected entries from the indexed files in per-file gzipped outputs."""
    if verbose:
        sys.stderr.write("Preparing files...\n")
    files = []
    cursors = []
    outfiles = []
    #open target files/cursors
    for fi, f in enumerate(infiles, 1):
        #get & store cursor
        cur = sqlite3.connect(f.name+'.idx').cursor()
        cursors.append(cur)
        #get files
        cur.execute("SELECT name FROM file_data")
        files.append({})
        #open outfiles
        outfiles.append(gzip.open(outbase+".%s.fq.gz"%fi, "w"))
        for name, in cur.fetchall():
            #fpath = os.path.join(os.path.dirname(db), name)
            if name.endswith('.gz'):
                files[-1][name] = bgzf.open(name)
            else:
                files[-1][name] = open(name)

    #preload offset data for other files
    if verbose:
        sys.stderr.write("Loading offset_data...\n")                
    cmd1 = """SELECT f.name, offset, length FROM offset_data o JOIN file_data f
    ON o.file_number=f.file_number"""
    offset_data = [[], ]
    for cur in cursors[1:]:
        cur.execute(cmd1)
        offset_data.append(cur.fetchall())
    #get randomly sorted first file
    if verbose:
        sys.stderr.write("Selecting random entries...\n")
    cmd0 = """SELECT key, f.name, offset, length FROM offset_data o JOIN file_data f
    ON o.file_number=f.file_number ORDER BY RANDOM()"""
    if n>0:
        cmd0 += " LIMIT %s"%n
    cursors[0].execute(cmd0)
    #combine randomised and preloaded data
    if verbose:
        sys.stderr.write("Reporting...\n")    
    for i, (key, name, offset, length) in enumerate(cursors[0].fetchall(), 1):
        if verbose and i%10000==1:
            sys.stderr.write(" %s     \r"%i)
        #store first file random sequence
        outfiles[0].write(get_seq(files[0][name], offset, length))
        #store sequence from the remaining files
        key = int(key)
        for fi in range(1, len(outfiles)):
            name, offset, length = offset_data[fi][key]
            outfiles[fi].write(get_seq(files[fi][name], offset, length))
    #close
    for out in outfiles:
        out.close()
Example #11
def open_file(filename, mode='r'):
    if 'w' in mode and filename.endswith('.gz'):
        with bgzf.open(filename, mode) as f:
            yield f
    else:
        if 't' not in mode:
            mode += 't'
        _open = gzip.open if filename.endswith('.gz') else open
        with _open(filename, mode, encoding='utf-8') as f:
            yield f
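Because open_file yields its handle from inside a with block, it is presumably decorated with contextlib.contextmanager in the original module (the decorator is not visible in this snippet). A self-contained sketch of that pattern, with a hypothetical helper name and file path:

# Hedged sketch: the same pattern as open_file above, trimmed and decorated
# so it can be used in a with-statement. Helper name and paths are hypothetical.
import contextlib
import gzip

from Bio import bgzf


@contextlib.contextmanager
def open_maybe_compressed(filename, mode="r"):
    if "w" in mode and filename.endswith(".gz"):
        # Write through BGZF so the output stays block-compressed and seekable.
        with bgzf.open(filename, mode) as handle:
            yield handle
    else:
        if "t" not in mode:
            mode += "t"
        opener = gzip.open if filename.endswith(".gz") else open
        with opener(filename, mode, encoding="utf-8") as handle:
            yield handle


# Write a BGZF-compressed file, then read it back (gzip can read BGZF).
with open_maybe_compressed("example.vcf.gz", "w") as out:
    out.write("#CHROM\tPOS\n")
with open_maybe_compressed("example.vcf.gz") as handle:
    print(handle.read())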
Example #12
    def test_double_flush(self):
        with bgzf.open(self.temp_file, "wb") as h:
            h.write(b">hello\n")
            h.write(b"aaaaaaaaaaaaaaaaaa\n")
            h.flush()
            pos = h.tell()
            h.flush()
            self.assertGreater(h.tell(), pos)  # sanity check
            h.write(b">there\n")
            h.write(b"cccccccccccccccccc\n")
        with bgzf.open(self.temp_file, "rb") as h:
            self.assertEqual(
                list(h),
                [
                    b">hello\n",
                    b"aaaaaaaaaaaaaaaaaa\n",
                    b">there\n",
                    b"cccccccccccccccccc\n",
                ],
            )
Example #13
    def test_many_blocks_in_single_read(self):
        n = 1000

        with bgzf.open(self.temp_file, "wb") as h:
            # create a file with a lot of small blocks
            for i in range(n):
                h.write(b"\x01\x02\x03\x04")
                h.flush()
            h.write(b"\nABCD")

        with bgzf.open(self.temp_file, "rb") as h:
            data = h.read(4 * n)
            self.assertEqual(len(data), 4 * n)
            self.assertEqual(data[:4], b"\x01\x02\x03\x04")
            self.assertEqual(data[-4:], b"\x01\x02\x03\x04")

            h.seek(0)
            data = h.readline()
            self.assertEqual(len(data), 4 * n + 1)
            self.assertEqual(data[:4], b"\x01\x02\x03\x04")
            self.assertEqual(data[-5:], b"\x01\x02\x03\x04\n")
Example #14
    def test_many_blocks_in_single_read(self):
        n = 1000

        h = bgzf.open(self.temp_file, 'wb')
        # create a file with a lot of small blocks
        for i in range(n):
            h.write(b'\x01\x02\x03\x04')
            h.flush()
        h.write(b'\nABCD')
        h.close()

        h = bgzf.open(self.temp_file, 'rb')
        data = h.read(4 * n)
        self.assertEqual(len(data), 4 * n)
        self.assertEqual(data[:4], b'\x01\x02\x03\x04')
        self.assertEqual(data[-4:], b'\x01\x02\x03\x04')

        h.seek(0)
        data = h.readline()
        self.assertEqual(len(data), 4 * n + 1)
        self.assertEqual(data[:4], b'\x01\x02\x03\x04')
        self.assertEqual(data[-5:], b'\x01\x02\x03\x04\n')
Example #16
def create_multi_fastq(fasta_files, output_file):
    print(
        "Creating the multifastq file with all the simulated taxa ...",
        file=sys.stderr,
    )

    # TODO this is super slow, see https://sites.google.com/site/tfsidc/linux-tricks/processing-a-large-number-of-files
    with bgzf.open(output_file, "wt") as fout:
        for fasta_file in fasta_files:
            print("Adding reads from ", fasta_file, file=sys.stderr)
            with gzip.open(fasta_file, "rb") as fin:
                shutil.copyfileobj(fin, fout)

    print("Multi fastq file created", file=sys.stderr)
Example #17
def zopen(fname, *args, **kwargs):
    if os.path.isfile(fname):
        f = open(fname, *args, **kwargs)
        token = f.read(3)
        f.seek(0)
        if token == b'\x1f\x8b\x08':
            return gzip.GzipFile(fileobj=f)
        elif token == b'\x42\x5a\x68':
            return bz2.BZ2File(f)
        else:
            return f
    else:
        if fname.endswith('.gz'):
            return bgzf.open(fname, *args, **kwargs)
        elif fname.endswith('.bz2'):
            return bz2.open(fname, *args, **kwargs)
        else:
            return open(fname, *args, **kwargs)
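zopen sniffs magic bytes for paths that already exist (so a mislabelled file is still opened correctly) and falls back to extension-based choices for new output paths, where ".gz" means BGZF rather than plain gzip so the result stays seekable. A brief hedged usage sketch; the file names are hypothetical:

# Hedged usage sketch for zopen; paths are hypothetical.
with zopen("reads.fastq.gz", "rb") as handle:   # existing file, sniffed by magic bytes
    first_line = handle.readline()

out = zopen("subset.fastq.gz", "wb")            # new path ending in .gz -> bgzf.open
out.write(first_line)
out.close()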
Example #18
def parse_raw_swiss(filename, filter_fn=None):
    """
    Given a raw SwissProt format file containing many sequences, return an iterator of
    raw sequence strings.

    The optional filter_fn argument is a function which takes in a raw
    SwissProt format entry and returns a boolean.  If True, the string is returned
    in the iterator, if False it is not.
    """
    handle = bgzf.open(filename)
    while True:
        res = _get_record(handle)
        if not res:
            break

        if filter_fn and filter_fn(res):
            yield res
        elif not filter_fn:
            yield res
Example #19
    def test_BgzfBlocks_TypeError(self):
        """Check we get the expected TypeError from BgzfBlocks."""
        for mode in ("r", "rb"):
            decompressed = bgzf.open("GenBank/cor6_6.gb.bgz", mode)
            with self.assertRaises(TypeError):
                list(bgzf.BgzfBlocks(decompressed))
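The TypeError is expected because bgzf.BgzfBlocks wants the raw, still-compressed binary handle so it can walk the gzip block headers, not a decompressing reader returned by bgzf.open. A short sketch of the intended usage on the same test file:

# Sketch of the intended BgzfBlocks usage: pass the raw binary handle.
from Bio import bgzf

with open("GenBank/cor6_6.gb.bgz", "rb") as raw:
    for start, block_length, data_start, data_length in bgzf.BgzfBlocks(raw):
        print(start, block_length, data_start, data_length)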
Example #20
    def test_append_mode(self):
        with self.assertRaises(NotImplementedError):
            bgzf.open(self.temp_file, "ab")
Example #21
    def __init__(self,
                 out_prefix,
                 paired=False,
                 bam_header=None,
                 vcf_header=None,
                 no_fastq=False,
                 fasta_instead=False):

        self.fasta_instead = fasta_instead
        # TODO Eliminate paired end as an option for fastas. Plan is to create a write fasta method.
        if self.fasta_instead:
            fq1 = pathlib.Path(out_prefix + '.fasta.gz')
            fq2 = None
        else:
            fq1 = pathlib.Path(out_prefix + '_read1.fq.gz')
            fq2 = pathlib.Path(out_prefix + '_read2.fq.gz')
        bam = pathlib.Path(out_prefix + '_golden.bam')
        vcf = pathlib.Path(out_prefix + '_golden.vcf.gz')

        # TODO Make a fasta-specific method
        self.no_fastq = no_fastq
        if not self.no_fastq:
            self.fq1_file = bgzf.open(fq1, 'w')

            self.fq2_file = None
            if paired:
                self.fq2_file = bgzf.open(fq2, 'w')

        # VCF OUTPUT
        self.vcf_file = None
        if vcf_header is not None:
            self.vcf_file = bgzf.open(vcf, 'wb')

            # WRITE VCF HEADER
            self.vcf_file.write('##fileformat=VCFv4.1\n'.encode('utf-8'))
            reference = '##reference=' + vcf_header[0] + '\n'
            self.vcf_file.write(reference.encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=VMX,Number=1,Type=String,Description="SNP is Missense in these Read Frames">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=VNX,Number=1,Type=String,Description="SNP is Nonsense in these Read Frames">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=VFX,Number=1,Type=String,Description="Indel Causes Frameshift">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##INFO=<ID=WP,Number=A,Type=Integer,Description="NEAT-GenReads ploidy indicator">\n'
                .encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=DEL,Description="Deletion">\n'.encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=DUP,Description="Duplication">\n'.encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=INS,Description="Insertion of novel sequence">\n'.
                encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=INV,Description="Inversion">\n'.encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=CNV,Description="Copy number variable region">\n'.
                encode('utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=TRANS,Description="Translocation">\n'.encode(
                    'utf-8'))
            self.vcf_file.write(
                '##ALT=<ID=INV-TRANS,Description="Inverted translocation">\n'.
                encode('utf-8'))
            # TODO add sample to vcf output
            self.vcf_file.write(
                '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'.encode(
                    'utf-8'))

        # BAM OUTPUT
        self.bam_file = None
        if bam_header is not None:
            self.bam_file = bgzf.BgzfWriter(
                bam, 'w', compresslevel=BAM_COMPRESSION_LEVEL)

            # WRITE BAM HEADER
            self.bam_file.write("BAM\1")
            header = '@HD\tVN:1.5\tSO:coordinate\n'
            for n in bam_header[0]:
                header += '@SQ\tSN:' + n[0] + '\tLN:' + str(n[3]) + '\n'
            header += '@RG\tID:NEAT\tSM:NEAT\tLB:NEAT\tPL:NEAT\n'
            header_bytes = len(header)
            num_refs = len(bam_header[0])
            self.bam_file.write(pack('<i', header_bytes))
            self.bam_file.write(header)
            self.bam_file.write(pack('<i', num_refs))

            for n in bam_header[0]:
                l_name = len(n[0]) + 1
                self.bam_file.write(pack('<i', l_name))
                self.bam_file.write(n[0] + '\0')
                self.bam_file.write(pack('<i', n[3]))

        # buffers for more efficient writing
        self.fq1_buffer = []
        self.fq2_buffer = []
        self.bam_buffer = []
Example #22
    def split_by_tags(self, infiles=None, inpath=None, outpath=None, out_filename=None):
        ''' Split the file into separate files based on MID tags '''
        
        c = self.c
        
        if outpath is None:
            outpath = c.tag_splitby_sample_outpath
        
        if out_filename is None:
            out_filename = c.experiment_name
             
        # Setup Record Cycler        
        if infiles is None:
            infiles = self.next_input_files
        if inpath is None:
            inpath = self.next_input_path
        
        RecCycler = Cycler(infiles=infiles, filepattern=False, data_inpath=inpath)
         
        print ('\nSplitting {0} file(s) based on MID tags'
               '').format(RecCycler.numfiles)
        
        outfiles_dict = {}
        
        first_run = 1
        
        # Running through all records in all passed files 
        for recordgen in RecCycler.seqfilegen:
            
            # Set / reset Counter
            tag_counter = Counter()
            
            dbtags = self.get_data4file(RecCycler.curfilename, fields=['MIDtag', 'description'])
            # tags is returned as a list of tuples for each record            
            MID_length = len(dbtags[0][0])
            # as tuple of descriptions then tuple of MIDtags
            tups = zip(*dbtags)
            # Check using MIDtags as keys would be unique
            assert len(set(tups[1])) == len(tups[1]), 'Duplicate MIDtags returned for file {0}'.format(RecCycler.curfilename) 

            # Convert to dictionary  {'MIDtag': 'description'} 
            dbtags = dict(dbtags)
            
            # Open Files for Writing for each tag  
            for tag, desc in dbtags.iteritems():
                
                fname = '-'.join([out_filename, tag, desc]) + '.bgzf'
                fnamevar = 'f_' + desc

                # Check that files don't already exist
                if first_run:
                    # If file already exists, overwrite it.
                    if os.path.isfile(os.path.join(outpath, fname)):
                        f = open(os.path.join(outpath, fname), 'w')
                        f.close()
                    
                vars()[fnamevar] = bgzf.open(os.path.join(outpath, fname), 'a')
                outfiles_dict[fnamevar] = fname
    
            first_run = 0
            
            for rec in recordgen:
        
                recMIDtag = rec.seq[:MID_length].tostring()
                
                if recMIDtag not in dbtags:
                    raise Exception('MID tag not found in database for file {0}'.format(RecCycler.curfilename))
                else:                   
                    fnamevar = 'f_' + dbtags[recMIDtag]           
                    SeqIO.write(rec, vars()[fnamevar], 'fastq');
                    tag_counter[recMIDtag] += 1

            # Flush and Close Files for each tag  
            for tag, desc in dbtags.iteritems():

                fnamevar = 'f_' + desc
                vars()[fnamevar].flush()
                vars()[fnamevar].close()
                
                # Update datafiles in database
                filename = outfiles_dict[fnamevar]
                self.db.add_datafile(filename, [desc], datafile_type='1sample')

            print 'Finished Splitting MIDtags for input file: {0}'.format(RecCycler.curfilename)
            
            # Update counts
            for tag, desc in dbtags.iteritems():
                  
                row = self.db.select('''read_count FROM samples WHERE description=? ''', (desc,))
                current_value = row[0]['read_count']
                
                if current_value is None:
                    current_value = 0
                    
                self.db.update('''samples SET read_count=? WHERE description=?''',
                                ( current_value + tag_counter[tag], desc))

        # Store file names 
        for outfile in outfiles_dict.itervalues():

            # Find sample description            
            fname = os.path.split(outfile)[1]
            if fname.endswith('.bgzf'):
                fname = fname[:-5]
            fname_parts = fname.split('-') 
            desc = fname_parts[-1]
            
            self.db.update('''samples SET read_file=? WHERE description=?''',
                                (outfile, desc))
            
        # Outputs return / update next inputs
        self.next_input_path = outpath
        self.next_input_files = outfiles_dict.values()
        
        return (outfiles_dict.values(), outpath)
Example #23
def entrez_download_sequence(accession, output_file, force=False, mtdna=False):
    """
    Fetch the Entrez fasta record for a nuccore accession.
    """

    # query the assembly database to see if there is an FTP url we can use
    ftp_url = entrez_assembly_ftp(accession, force) if not mtdna else ""

    try:
        if ftp_url:
            download_entrez_ftp(ftp_url, output_file)
            return

    except urllib.error.URLError:
        pass

    try:
        # fetch the fasta record from nuccore
        r = entrez_request("efetch.fcgi", {"db": "nuccore", "id": accession, "rettype": "fasta", "retmode": "text"})

        # the fasta may be empty if this is a "master record" containing multiple other records (NZ_APLR00000000.1)
        if len(r.text.strip()) > 1:
            with bgzf.open(output_file, "w") as fout:
                print(r.text, file=fout)

            return

    except requests.exceptions.HTTPError:
        pass

    try:
        # get the full GenBank XML record
        r = entrez_request("efetch.fcgi", {"db": "nuccore", "id": accession, "rettype": "gb", "retmode": "xml"})

    except requests.exceptions.HTTPError:
        # check for a replacement accession (there may be a newer version if this a WGS project)
        updated_accession = entrez_find_replacement_accession(accession)

        # download the updated accession instead
        entrez_download_sequence(updated_accession, output_file, force)

        return

    # parse the XML result
    etree = ElementTree.XML(r.text)

    # get the first and last accession codes for this master record
    first = etree.find(".//GBAltSeqItem_first-accn")
    last = etree.find(".//GBAltSeqItem_last-accn")

    if first is None or last is None:
        print_error(
            f"Could not download the fasta file for {accession}. Please consider using the `--exclude-accessions` "
            f"flag to remove accession '{accession}' from this query."
        )

    # get all the related accession codes
    accessions = entrez_range_accessions(accession, first.text, last.text)

    try:
        with bgzf.open(output_file, "w") as fout:
            # fetch all the accessions in batches
            for id_list in chunker(accessions, ENTREZ_MAX_UID):
                r = entrez_request(
                    "efetch.fcgi",
                    {"db": "nuccore", "id": id_list, "rettype": "fasta", "retmode": "text"},
                )

                # write the fasta data to our bgzip file
                print(r.text, file=fout)

    except requests.exceptions.HTTPError:
        print_error(
            f"Could not download the accession range '{first.text}-{last.text}' for master record '{accession}'. "
            f"Please consider using the `--exclude-accessions` flag to remove accession '{accession}' from this query."
        )
Example #24
File: upa_util.py Project: mjobin/UPA
def mergeref(refvcf, othervcf, diploid, mergefoundonly, annotate):
    """ Merges a reference (sample) VCF with an external dataset VCF

    :param refvcf: VCF file mapped to reference given by ref argument on input, normally the samples.
    :param othervcf: VCF file of external dataset.
    :param diploid: Are samples diploid? Are friends electric.
    :param mergefoundonly: Merged file will contain sites found in both file only.
    :param annotate: Annotate the ID column of the merged file from the external dataset (othervcf).
    :param verbose: Verbose output to log.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: Name of merged VCF file.
    """
    #First read in the reference (normally, the sample) VCF, and create a line dictionary based on position

    mergevcf = refvcf[:-7]
    mergevcf += "-MERGED.vcf.gz"
    # do a bgzf read if it is zipped
    if refvcf[-3:] == ".gz":
        refun = refvcf[:-3]
        with bgzf.open(refvcf, 'rb') as f_in, open(refun, 'w') as f_out:
            shutil.copyfileobj(f_in, f_out)
        refvcf = refun

    if othervcf[-3:] == ".gz":
        otherun = othervcf[:-3]
        with bgzf.open(othervcf, 'rb') as f_in, open(otherun, 'w') as f_out:
            shutil.copyfileobj(f_in, f_out)
        othervcf = otherun

    print "\nReading " + refvcf + "..."
    reffile = open(refvcf, 'r')
    ref_data = []
    for file_line in reffile:
        if len(file_line.rstrip()) > 0:  # Strip blank lines
            ref_data.append(file_line.rstrip())
    refheaderline = ""
    refheaderlist = []
    refdict = {}
    foundheader = False
    # bar = progressbar.ProgressBar()
    # for i in bar(range(len(ref_data))):
    for i in range(len(ref_data)):
        file_line = ref_data[i]
        cols = file_line.split()
        # print cols
        if foundheader:  # from here on, it's data
            # print cols[0]+"-"+cols[1] + " " + str(i)
            refdict[cols[0] + "-" + cols[1]] = i
        else:  ##just add to header repository
            if cols[0] == '#CHROM':
                refheaderline = file_line
                refhdrcols = cols
                print " number of total columns in ref " + str(len(refhdrcols))
                foundheader = True
            elif "##fileformat" not in file_line:
                refheaderlist.append(file_line)
    reffile.close()

    foundheader = False
    #Next, read in
    print "\nReading " + othervcf + "..."
    otherfile = open(othervcf, 'r')
    other_data = []
    for file_line in otherfile:
        if len(file_line.rstrip()) > 0:  # Strip blank lines
            other_data.append(file_line.rstrip())
    otherheaderline = ""
    otherheaderlist = []
    otherdict = {}
    foundheader = False
    othersamplenames = []
    bar = progressbar.ProgressBar()
    for i in bar(range(len(other_data))):
        file_line = other_data[i]
        cols = file_line.split('\t')
        if foundheader:  # from here on, it's data
            otherdict[cols[0] + "-" + cols[1]] = i
        else:  ##just add to header repository
            if cols[0] == '#CHROM':
                otherheaderline = file_line
                othersamplenames = cols[9:]
                print " number of sample columns in other " + str(
                    len(othersamplenames))
                foundheader = True
            elif "##fileformat" not in file_line:
                otherheaderlist.append(file_line)
    otherfile.close()

    oslen = len(othersamplenames)

    print "Writing to " + mergevcf
    mergeout = gzip.open(mergevcf, 'wb')

    #Merged header
    mergeout.write("##fileformat=VCFv4.2\n")
    mergeout.write("##UPA merged file headers for " + refvcf + "\n")
    for refhdrline in refheaderlist:
        mergeout.write(refhdrline)
        mergeout.write("\n")
    mergeout.write("##UPA merged file headers for " + othervcf + "\n")
    for otherhdrline in otherheaderlist:
        mergeout.write(otherhdrline)
        mergeout.write("\n")
    mergeout.write("##UPA merged " + refvcf + " and " + othervcf +
                   " with REF alleles set to those of " + refvcf +
                   " and all-missing sites ignored.\n")

    outhdr = refhdrcols
    for osn in othersamplenames:
        outhdr.append(osn)
    outhdrlen = len(outhdr)
    print "Header has " + str(outhdrlen) + " columns."
    hdrline = '\t'.join(outhdr)
    mergeout.write(hdrline)
    mergeout.write("\n")

    print "Merging...."
    bar = progressbar.ProgressBar()
    for key, lnum in bar(sorted(refdict.items(), key=refkeysort)):
        # for key, lnum in sorted(refdict.items(), key=refkeysort):
        foundother = False
        refline = linecache.getline(
            refvcf, lnum +
            1).strip()  # Add one because linecache lines start on 1 not 0
        # print key + " " + str(lnum+1) + " " + refline
        refcols = refline.split('\t')
        if key in otherdict:
            foundother = True
            otnum = otherdict[key]
            otherline = linecache.getline(othervcf, otnum + 1).strip()

            complist = []

            othertm = {}
            # print otherline
            othercols = otherline.split()

            # print "\n"
            #
            # print key + " " + str(lnum + 1) + " " + refcols[1] +  " Otherdict " + othercols[1]

            trueref = refcols[3]
            complist.append(trueref)
            truealts = refcols[4].split(",")
            for alt in truealts:
                complist.append(alt)

            # print "True REF " + trueref
            otherref = othercols[3]
            otheralts = othercols[4].split(",")

            if otherref in complist:
                pass
            else:
                complist.append(otherref)

            for k in range(len(otheralts)):
                if otheralts[k] in complist:
                    pass
                else:
                    complist.append(otheralts[k])

            # print complist

            otherrefloc = complist.index(otherref)
            othertm[0] = otherrefloc
            for k in range(len(otheralts)):
                othertm[k + 1] = complist.index(otheralts[k])

            altlist = complist
            altlist.remove(trueref)

            # print "TM "
            # print othertm

            siteline = []
            for l in range(len(refcols)):
                if l == 4:
                    siteline.append(','.join(altlist))
                elif l == 2:
                    if annotate:
                        siteline.append(othercols[l])
                    else:
                        siteline.append(refcols[l])
                else:
                    siteline.append(refcols[l])

            #
            # print "final siteline"

            #construct
            for othersite in othercols[9:]:
                othersites = re.split("[/|]+", othersite)

                # print othersites
                olen = len(othersites)
                # print olen
                if olen > 1 and not diploid:
                    print "ERROR: not diploid but more than one site at " + key
                    exit(1)
                oconstruct = ""
                for i in xrange(olen):
                    osite = othersites[i]
                    if osite == ".":
                        oconstruct += "."
                        # print osite + " becomes ."
                    else:
                        # print osite + " becomes " + str(othertm[int(osite)])
                        oconstruct += str(othertm[int(osite)])
                    if i < olen - 1:
                        oconstruct += "/"  # FIXME this always outputs the unphased marker

                siteline.append(oconstruct)
        else:
            # print key + " " + str(lnum+1) + " no match"
            if mergefoundonly:
                siteline = ""
            else:
                refline = linecache.getline(refvcf, lnum + 1).strip()
                refcols = refline.split('\t')
                siteline = refcols
                for nom in range(oslen):
                    if diploid:
                        siteline.append(
                            "./."
                        )  # FIXME this always outputs the unphased marker
                    else:
                        siteline.append(".")

        ##Now check if its all missing or empty
        allmissing = True
        for i in xrange(9, len(siteline)):
            site = siteline[i]
            if site != "./." and site != "." and site != ".|.":
                allmissing = False
        if allmissing:
            # print "At " + key + " all sites missing, skipping."
            pass
        else:
            siteout = '\t'.join([str(x) for x in siteline])
            # print siteout
            siteout += "\n"
            if mergefoundonly:
                if foundother:
                    if len(siteline) != len(outhdr):
                        print "ERROR: Line in merged VCF has " + str(
                            len(siteline)) + " but header line has " + str(
                                len(outhdr))
                    mergeout.write(siteout)
            else:
                if len(siteline) != len(outhdr):
                    print "ERROR: Line in merged VCF has " + str(
                        len(siteline)) + " but header line has " + str(
                            len(outhdr))
                mergeout.write(siteout)
    mergeout.close()
    return mergevcf
Example #25
def read_vcf(filepath):
    """
    Read a VCF.
    :param filepath: str;
    :return: dict;
    """

    vcf = {
        'meta_information': {
            'INFO': {},
            'FILTER': {},
            'FORMAT': {},
            'reference': {},
        },
        'header': [],
        'samples': [],
        'data': None,
    }

    # Open VCF
    try:
        f = open(filepath)
        f.readline()
        f.seek(0)
        bgzipped = False
    except UnicodeDecodeError:
        f = bgzf.open(filepath)
        bgzipped = True

    for row in f:

        if bgzipped:
            row = row.decode()
        row = row.strip()

        if row.startswith('##'):  # Meta-information
            # Remove '##' prefix
            row = row[2:]

            # Find the 1st '='
            ei = row.find('=')

            # Get field name and field line
            fn, fl = row[:ei], row[ei + 1:]

            if fl.startswith('<') and fl.endswith('>'):
                # Strip '<' and '>'
                fl = fl[1:-1]

                # Split field line
                fl_split = split_ignoring_inside_quotes(fl, ',')

                # Get ID
                id_ = fl_split[0].split('=')[1]

                # Parse field line
                fd_v = {}
                for s in fl_split[1:]:
                    ei = s.find('=')
                    k, v = s[:ei], s[ei + 1:]
                    fd_v[k] = remove_nested_quotes(v)

                # Save
                if fn in vcf['meta_information']:
                    if id_ in vcf['meta_information'][fn]:
                        raise ValueError('Duplicated ID {}.'.format(id_))
                    else:
                        vcf['meta_information'][fn][id_] = fd_v
                else:
                    vcf['meta_information'][fn] = {id_: fd_v}
            else:
                print('Didn\'t parse: {}.'.format(fl))

        elif row.startswith('#CHROM'):  # Header
            # Remove '#' prefix
            row = row[1:]

            # Get header line number
            vcf['header'] = row.split('\t')
            vcf['samples'] = vcf['header'][9:]
        else:
            break

    # Close VCF
    f.close()

    # Read data
    vcf['data'] = read_csv(filepath,
                           sep='\t',
                           comment='#',
                           header=None,
                           names=vcf['header'])

    return vcf
Example #26
    def split_by_subgroups(self, subgroups=None, infiles=None, inpath=None, outpath=None, 
                           out_filename=None ):
        ''' Split the file into separate files based on sample subgroups '''
        
        if subgroups is None:
            # Dictionary of regular expressions to match sample description
            subgroups = { 'zebra'  : '.*zebra.*',
                         'gazelle' : '.*gazelle.*'}
        
        # Compile regexes
        for k,v in subgroups.iteritems():
            subgroups[k] = re.compile(v)
        
        c = self.c
        
        if outpath is None:
            outpath = c.tag_splitby_subgroup_outpath
        if out_filename is None:
            out_filename = c.experiment_name
        
        if not os.path.exists(outpath):
            os.makedirs(outpath)
        
        # Setup Record Cycler        
        if infiles is None:
            infiles = self.next_input_files
        if inpath is None:
            inpath = self.next_input_path
        
        RecCycler = Cycler(infiles=infiles, filepattern=False, data_inpath=inpath)
         
        print ('\nSplitting {0} file(s) into zebras and gazelles'
               '').format(RecCycler.numfiles)
        
        outfiles_dict = {}
        
        first_run = 1
        
        for recordgen in RecCycler.seqfilegen:
            
            # Set / reset Counter
            tag_counter = Counter()
            
            dbtags = self.get_data4file(RecCycler.curfilename, fields=['MIDtag', 'description'])
            # tags is returned as a list of tuples for each record            
            MID_length = len(dbtags[0][0])
            # Convert to dictionary  {'MIDtag' : 'description' }
            dbtags = dict(dbtags)
            
            # Open Files for Writing for each subgroup  
            for group in subgroups.iterkeys():
                
                fname = '-'.join([out_filename, group]) + '.bgzf'
                fnamevar = 'f_' + group

                # Check that files don't already exist
                if first_run:
                    # If file already exists, overwrite it.
                    if os.path.isfile(os.path.join(outpath, fname)):
                        f = open(os.path.join(outpath, fname), 'w')
                        f.close()
                    
                vars()[fnamevar] = bgzf.open(os.path.join(outpath, fname), 'a')
                outfiles_dict[fnamevar] = fname
    
            first_run = 0
            
            for rec in recordgen:
        
                recMIDtag = rec.seq[:MID_length].tostring()
                
                if recMIDtag not in dbtags:
                    raise Exception('MID tag not found in database for file {0}'.format(RecCycler.curfilename))
                else:
                    # Get description
                    desc = dbtags[recMIDtag]
                    # Write to appropriate file if it matches the regex
                    for group in subgroups.iterkeys():
                        if subgroups[group].match(desc):
                            
                            fnamevar = 'f_' + group                
                            SeqIO.write(rec, vars()[fnamevar], 'fastq');
                            tag_counter[recMIDtag] += 1
                            
            # Flush and Close Files for each tag  
            for group in subgroups.iterkeys():

                fnamevar = 'f_' + group
                vars()[fnamevar].flush()
                vars()[fnamevar].close()
                
                # Update datafiles in database
                filename = outfiles_dict[fnamevar]
                
                desc_list = filter(subgroups[group].match, dbtags.values())
                
                self.db.add_datafile(filename, desc_list, datafile_type='group')

            print 'Finished Splitting reads for input file: {0}'.format(RecCycler.curfilename)
            
        # Outputs return / update next inputs
        self.next_input_path = outpath
        self.next_input_files = outfiles_dict.values()
        
        return (outfiles_dict.values(), outpath)
Example #27
File: upa.py Project: mjobin/UPA
        logfile.write("\n")

    if mito or ychr:
        diploid = False

    bcname = ""
    samplevcffile = ""
    bampreprocess = True

    print "\nChecking for input files..."
    if vcf_file:
        bampreprocess = False
        bcbase = os.path.basename(vcf_file)
        if bcbase[-7:] == ".vcf.gz":
            bcname = bcbase[:-7]
            with bgzf.open(bcbase, 'rb') as f_in, open(bcname + ".vcf",
                                                       'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        elif bcbase[-4:] == ".vcf":
            bcname = bcbase[:-4]
        else:
            print "ERROR: Must supply a .vcf or .gz file if using -vcf_file."
            exit(1)

    elif bcfile != "" and bamlist == "":
        bcbase = os.path.basename(bcfile)
        bcname, fileext = os.path.splitext(bcbase)
        bcin = open(bcfile, 'r')
        for bcline in bcin:
            bccols = bcline.split()
            binfile = wd + "/" + bccols[1] + "/BWA_" + refname + "/" + bccols[
Example #28
def genocaller(flist, bedfile, bcname, indent, ref, regionrestrict, threads, verbose, cmdfile, logfile):
    """ Calls genotypes using Krishna Veeramah's GenoCaller_indent

    :param flist: File list.
    :param bedfile: UCSC-style BED file.
    :param bcname: Base name of input file.
    :param indent: Indent depth to each end of read.
    :param ref: Reference genome.
    :param regionrestrict: Area of genome to limit calling.
    :param threads: Number of multiprocessing threads to use.
    :param verbose: Verbose output to log.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: Name of merged sample VCF.
    """
    print "\nGenoCaller..."
    samplevcfnames = []
    for i in range(len(flist)):
        sample = flist[i]
        gccmd = "GenoCaller_indent.py " + sample + ".bam " + bedfile + " " + ref + " " + indent
        upa_util.bash_command(gccmd, verbose, cmdfile, logfile)

        #Must compress to allow bcftools to merge

        with open(sample + "." + bedfile + ".indent" + str(indent) + ".vcf", 'r') as f_in, bgzf.open(sample + "." + bedfile + ".indent" + str(indent) + ".vcf.gz", 'wb') as f_out:
        # with open(sample + "." + bedfile + ".indent" + str(indent) + ".vcf", 'r') as f_in, gzip.open(sample + "." + bedfile + ".indent" + str(indent) + ".vcf.gz", 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        samplevcfname = sample + "." + bedfile + ".indent" + str(indent) + ".vcf.gz"
        # sampleemitallname = sample + "." + bedfile + ".indent" + str(indent) + ".emit_all.vcf.gz"

        if os.path.isfile(samplevcfname):
            upa_util.vcf_name_strip(samplevcfname)
            upa_util.bash_command("bcftools index --threads " + threads + " " + samplevcfname, verbose, cmdfile, logfile)
            samplevcfnames.append(samplevcfname)
        else:
            print "ERROR: Cannot find " + samplevcfname

    #Merge the resulting VCFs together using bcftools
    bcfmergecmd = "bcftools merge --threads " + threads + " -Oz -o " + bcname + "-samples.vcf.gz "
    if regionrestrict:
        bcfmergecmd = bcfmergecmd + " -r " + regionrestrict
    for samplevcfname in samplevcfnames:
        bcfmergecmd = bcfmergecmd + samplevcfname + " "
    upa_util.bash_command(bcfmergecmd, verbose, cmdfile, logfile)
    return bcname + "-samples.vcf"
Example #30
File: fasta.py Project: acorg/dark-matter
    def addFile(self, filename):
        """
        Add a new FASTA file of sequences.

        @param filename: A C{str} file name, with the file in FASTA format.
            This file must (obviously) exist at indexing time. When __getitem__
            is used to access sequences, it is possible to provide a
            C{fastaDirectory} argument to our C{__init__} to indicate the
            directory containing the original FASTA files, in which case the
            basename of the file here provided in C{filename} is used to find
            the file in the given directory. This allows the construction of a
            sqlite database from the shell in one directory and its use
            programmatically from another directory.
        @raise ValueError: If a file with this name has already been added or
            if the file contains a sequence whose id has already been seen.
        @return: The C{int} number of sequences added from the file.
        """
        endswith = filename.lower().endswith
        if endswith('.bgz') or endswith('.gz'):
            useBgzf = True
        elif endswith('.bz2'):
            raise ValueError(
                'Compressed FASTA is only supported in BGZF format. Use '
                'bgzip to compress your FASTA.')
        else:
            useBgzf = False

        fileNumber = self._addFilename(filename)
        connection = self._connection
        count = 0
        try:
            with connection:
                if useBgzf:
                    try:
                        fp = bgzf.open(filename, 'rb')
                    except ValueError as e:
                        if str(e).find('BGZF') > -1:
                            raise ValueError(
                                'Compressed FASTA is only supported in BGZF '
                                'format. Use the samtools bgzip utility '
                                '(instead of gzip) to compress your FASTA.')
                        else:
                            raise
                    else:
                        try:
                            for line in fp:
                                if line[0] == '>':
                                    count += 1
                                    id_ = line[1:].rstrip(' \t\n\r')
                                    connection.execute(
                                        'INSERT INTO sequences(id, '
                                        'fileNumber, offset) VALUES (?, ?, ?)',
                                        (id_, fileNumber, fp.tell()))
                        finally:
                            fp.close()
                else:
                    with open(filename) as fp:
                        offset = 0
                        for line in fp:
                            offset += len(line)
                            if line[0] == '>':
                                count += 1
                                id_ = line[1:].rstrip(' \t\n\r')
                                connection.execute(
                                    'INSERT INTO sequences(id, fileNumber, '
                                    'offset) VALUES (?, ?, ?)',
                                    (id_, fileNumber, offset))
        except sqlite3.IntegrityError as e:
            if str(e).find('UNIQUE constraint failed') > -1:
                original = self._find(id_)
                if original is None:
                    # The id must have appeared twice in the current file,
                    # because we could not look it up in the database
                    # (i.e., it was INSERTed but not committed).
                    raise ValueError(
                        "FASTA sequence id '%s' found twice in file '%s'." %
                        (id_, filename))
                else:
                    origFilename, _ = original
                    raise ValueError(
                        "FASTA sequence id '%s', found in file '%s', was "
                        "previously added from file '%s'." %
                        (id_, filename, origFilename))
            else:
                raise
        else:
            return count
Example #31
    f = open("Data/popAFs4_%s.csv" % chromosome, "w")
    f.write(sep.join(meta + pops) + "\n")
    omniChr = frq2[frq2['CHR'] == chromosome].sort_values(
        3).reset_index().set_index(3)
    compatibilityFails = 0

    freqStats = {}
    ## GNOMAD annotation of our AFs
    # First starting with lines in Gnomad vcf, as this is a vcf flatfile,
    # our genotype array is put into a dataframe

    gme22 = gme[gme['chrom'] == chromosome].set_index('pos')
    gme22 = gme22[~gme22.index.duplicated(
        keep='first')]  ## removing duplicate entries (wrt. index)
    drops = []
    for line in bgzf.open("%s/gnomad.genomes.r2.0.2.sites.chr%s.vcf.gz" %
                          (gnomadDir, chromosome1)):
        line = line.decode()  ## dealing with binary stuff
        if line.startswith("#"): continue
        gnomadLine = GnomADLine(line)
        if gnomadLine.indel: continue
        whichAlt = 0  ## normally just consider first Alternative allele
        uae = False
        fs = None
        if gnomadLine.pos in omniChr.index:
            drops.append(gnomadLine.pos)
            fs = FreqStats(omniChr.loc[gnomadLine.pos])
            if fs.compatibilityCheck(gnomadLine):
                uae = True
                whichAlt = fs.whichAlt  ## a bit hackish: whichAlt is wrt gnomad here

        gnomadLine.info2popfreqs(whichAlt)