def test_append_mode(self):
    with bgzf.open(self.temp_file, "wb") as h:
        h.write(b">hello\n")
        h.write(b"aaaaaaaaaaaaaaaaaa\n")
        h.flush()
        previous_offsets = bgzf.split_virtual_offset(h.tell())
        # Just flushed, so new block
        self.assertEqual(previous_offsets[1], 0)
    with bgzf.open(self.temp_file, "ab") as h:
        append_position = h.tell()
        # Closing the file wrote the empty 28-byte BGZF block (the EOF
        # marker), so appending resumes 28 bytes after the last data block.
        self.assertEqual(
            (previous_offsets[0] + 28, 0),
            bgzf.split_virtual_offset(append_position),
        )
        h.write(b">there\n")
        self.assertEqual(
            (previous_offsets[0] + 28, 7), bgzf.split_virtual_offset(h.tell())
        )
        h.write(b"cccccccccccccccccc\n")
    with bgzf.open(self.temp_file, "rb") as h:
        self.assertEqual(
            list(h),
            [
                b">hello\n",
                b"aaaaaaaaaaaaaaaaaa\n",
                b">there\n",
                b"cccccccccccccccccc\n",
            ],
        )
        h.seek(append_position)
        self.assertEqual(list(h), [b">there\n", b"cccccccccccccccccc\n"])
def test_write_tell(self):
    """Check offset works during BGZF writing."""
    temp_file = self.temp_file

    h = bgzf.open(temp_file, "w")  # Text mode!
    # When opening a new file, the offset should be 0
    self.assertEqual(h.tell(), 0)
    h.write("X" * 100000)
    offset = h.tell()
    self.assertNotEqual(offset, 100000)  # Should be a virtual offset!

    # After writing the same data two times, the size of the first and the
    # second write should be equal also in terms of offsets
    # (this is because the flush ensures two identical blocks are written)
    h.flush()
    offset1 = h.tell()
    # Note 'offset' and 'offset1' are effectively the same, but not equal
    # due to the flush - 'offset' is at the end of the first BGZF block,
    # while 'offset1' is at the start of the second BGZF block. In terms
    # of the decompressed data, they point to the same location!
    self.assertNotEqual(offset, offset1)  # New block started
    h.write("Magic" + "Y" * 100000)
    h.flush()
    offset2 = h.tell()
    h.write("Magic" + "Y" * 100000)
    h.flush()
    offset3 = h.tell()
    self.assertEqual(
        (offset3 << 16) - (offset2 << 16), (offset2 << 16) - (offset1 << 16)
    )

    # Flushing should change the offset
    h.flush()
    self.assertNotEqual(offset3, h.tell())
    h.close()

    h = bgzf.open(temp_file, "r")  # Text mode!
    h.seek(offset)  # i.e. End of first BGZF block
    self.assertEqual(offset1, h.tell())  # Note *not* seek offset
    # Now at start of second BGZF block
    self.assertEqual(h.read(5), "Magic")
    h.seek(offset2)
    self.assertEqual(offset2, h.tell())
    self.assertEqual(h.read(5), "Magic")
    # Now go back in the file,
    h.seek(offset1)
    self.assertEqual(offset1, h.tell())
    self.assertEqual(h.read(5), "Magic")
    h.close()
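# The "virtual offsets" asserted on above pack two numbers into a single
# 64-bit integer: the start of a BGZF block within the compressed file
# (upper bits) and the position within that block's decompressed data
# (lower 16 bits). A minimal round-trip sketch using the Bio.bgzf helpers;
# the example numbers are arbitrary:
from Bio import bgzf

voffset = bgzf.make_virtual_offset(98765, 1234)  # block start, within-block offset
assert voffset == (98765 << 16) | 1234
assert bgzf.split_virtual_offset(voffset) == (98765, 1234)
# Since the within-block part lives in the low 16 bits, voffset >> 16
# recovers the block start; this is why the tests above can compare
# shifted offsets to reason about block sizes.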
def test_write_tell(self):
    """Check offset works during BGZF writing."""
    temp_file = self.temp_file

    h = bgzf.open(temp_file, "w")  # Text mode!
    h.write("X" * 100000)
    offset = h.tell()
    self.assertNotEqual(offset, 100000)  # Should be a virtual offset!
    h.write("Magic" + "Y" * 100000)
    h.close()

    h = bgzf.open(temp_file, "r")  # Text mode!
    h.seek(offset)
    self.assertEqual(h.read(5), "Magic")
    h.close()
def fasta_parser(fastas, cur, verbose):
    """Fasta iterator returning i, seqid as base64 and sequence str.

    Index sequences using a proprietary parser.
    Handle bgzip compressed files.
    """
    if verbose:
        sys.stderr.write("[%s] Hashing and indexing sequences...\n"
                         % datetime.ctime(datetime.now()))
    # parse fasta
    i = 0
    seqlen = 0
    cmd = "INSERT INTO offset_data VALUES (?, ?, ?, ?)"
    for fi, fn in enumerate(fastas):
        # add file to db
        cur.execute("INSERT INTO file_data VALUES (?, ?)", (fi, fn))
        # get handle and start byte
        if fn.endswith('.gz'):
            handle = bgzf.open(fn)
        else:
            handle = open(fn)
        # parse entries
        for i, (seq, offset, elen) in enumerate(get_seq_offset_length(handle), i + 1):
            # if i > 10**6: break
            seqlen += len(seq)
            cur.execute(cmd, (i, fi, offset, elen))
            yield i, seq
    if verbose:
        sys.stderr.write(" %s letters in" % seqlen)
    # fill metadata
    cur.executemany("INSERT INTO meta_data VALUES (?, ?)",
                    (('count', i), ('format', 'fasta'), ('dblength', seqlen)))
    # and commit changes
    cur.connection.commit()
def check_gvcf(vcf):
    # Reads the file and looks for GVCF hints
    gvcf = False
    name = vcf
    out_str = "\n"
    count = 0
    thresh = 10000
    # Sniff the two-byte gzip magic number to decide how to open the file
    # (this str comparison relies on Python 2 semantics, where read()
    # returns raw bytes as str).
    tmp = open(vcf, 'r')
    magic_number = tmp.read(2)
    tmp.close()
    with open(vcf) if magic_number != '\x1f\x8b' else bgzf.open(vcf) as rf:
        for line in rf:
            count += 1
            if '<NON_REF>' in line:
                gvcf = True
                break
            if count >= thresh:
                break
    if gvcf:
        name = vcf + '.vcf'
        out_str = ("""
        java -jar $GATK -T GenotypeGVCFs -R $REF --dbsnp $DBSNP --variant %s --out %s.vcf
        """ % (vcf, vcf))
    return (name, out_str)
def sqlite2seq(cur, db, protids):
    """Return target fastas for protids from sqlite3."""
    # open target files
    cur.execute("SELECT name FROM file_data")
    files = {}
    # {name: open(os.path.join(os.path.dirname(db), name)) for name, in cur.fetchall()}
    for name, in cur.fetchall():
        # if db is in another directory
        if os.path.isfile(name):
            fpath = name
        else:
            fpath = os.path.join(os.path.dirname(db), name)
        # open fasta file
        if name.endswith('.gz'):
            files[name] = bgzf.open(fpath)
        else:
            files[name] = open(fpath)
    # get targets
    cmd = """SELECT f.name, offset, length
             FROM offset_data o JOIN file_data f ON o.file_number=f.file_number
             WHERE key IN (%s)""" % ",".join(str(p) for p in protids)
    cur.execute(cmd)
    targets = []
    for name, offset, length in sorted(cur.fetchall()):
        try:
            files[name].seek(offset)
            targets.append(files[name].read(length))
        except Exception:
            # bgzip sometimes doesn't work at first seek
            sys.stderr.write("[Warning] Cannot fetch sequence for %s at %s + %s bytes\n"
                             % (name, offset, length))
    return "".join(targets)
def do_something(data, out):
    o = bgzf.open(out, "w")
    with gzip.open(data) as f:
        while True:
            line = f.readline()
            l = line.decode()
            if not l:
                break
            if l[0] == "#":
                print(l.strip(), file=o)
                if "FORMAT" in l:
                    print("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
                          file=o)
                continue
            lt = l.split()
            # lt[4] can have more than one letter: drop; can be reference
            # (. or <*>) or alternative (one letter with or without <*>)
            if lt[4] == "<*>":
                lt[4] = "."
            elif ",<*>" in lt[4]:
                lt[4] = lt[4].replace(",<*>", "")
            elif "," in lt[4]:
                continue  # We remove the tri-allelic positions
            # Now that lt[4] is correct we use it for the genotype part
            lt[8] = "GT"
            if lt[4] == ".":
                lt[9] = "0/0"
            else:
                lt[9] = "1/1"
            line = "\t".join(lt)
            print(line, file=o)
    o.close()
def store_random_entries(outbase, infiles, n, verbose):
    """Store n random entries from the first input file, together with the
    matching (same-key) entries from the remaining input files."""
    if verbose:
        sys.stderr.write("Preparing files...\n")
    files = []
    cursors = []
    outfiles = []
    # open target files/cursors
    for fi, f in enumerate(infiles, 1):
        # get & store cursor
        cur = sqlite3.connect(f.name + '.idx').cursor()
        cursors.append(cur)
        # get files
        cur.execute("SELECT name FROM file_data")
        files.append({})
        # open outfiles
        outfiles.append(gzip.open(outbase + ".%s.fq.gz" % fi, "w"))
        for name, in cur.fetchall():
            # fpath = os.path.join(os.path.dirname(db), name)
            if name.endswith('.gz'):
                files[-1][name] = bgzf.open(name)
            else:
                files[-1][name] = open(name)
    # preload offset data for other files
    if verbose:
        sys.stderr.write("Loading offset_data...\n")
    cmd1 = """SELECT f.name, offset, length
              FROM offset_data o JOIN file_data f ON o.file_number=f.file_number"""
    offset_data = [[], ]
    for cur in cursors[1:]:
        cur.execute(cmd1)
        offset_data.append(cur.fetchall())
    # get randomly sorted first file
    if verbose:
        sys.stderr.write("Selecting random entries...\n")
    cmd0 = """SELECT key, f.name, offset, length
              FROM offset_data o JOIN file_data f ON o.file_number=f.file_number
              ORDER BY RANDOM()"""
    if n > 0:
        cmd0 += " LIMIT %s" % n
    cursors[0].execute(cmd0)
    # combine randomised and preloaded data
    if verbose:
        sys.stderr.write("Reporting...\n")
    for i, (key, name, offset, length) in enumerate(cursors[0].fetchall(), 1):
        if verbose and i % 10000 == 1:
            sys.stderr.write(" %s \r" % i)
        # store first file random sequence
        outfiles[0].write(get_seq(files[0][name], offset, length))
        # store sequence from the remaining files
        key = int(key)
        for fi in range(1, len(outfiles)):
            name, offset, length = offset_data[fi][key]
            outfiles[fi].write(get_seq(files[fi][name], offset, length))
    # close
    for out in outfiles:
        out.close()
def open_file(filename, mode='r'):
    # Generator intended for use as a context manager (presumably wrapped
    # with contextlib.contextmanager where it is defined): .gz output goes
    # through bgzf, while everything else goes through gzip/open in text mode.
    if 'w' in mode and filename.endswith('.gz'):
        with bgzf.open(filename, mode) as f:
            yield f
    else:
        if 't' not in mode:
            mode += 't'
        _open = gzip.open if filename.endswith('.gz') else open
        with _open(filename, mode, encoding='utf-8') as f:
            yield f
def test_double_flush(self):
    with bgzf.open(self.temp_file, "wb") as h:
        h.write(b">hello\n")
        h.write(b"aaaaaaaaaaaaaaaaaa\n")
        h.flush()
        pos = h.tell()
        h.flush()
        self.assertGreater(h.tell(), pos)  # sanity check
        h.write(b">there\n")
        h.write(b"cccccccccccccccccc\n")
    with bgzf.open(self.temp_file, "rb") as h:
        self.assertEqual(
            list(h),
            [
                b">hello\n",
                b"aaaaaaaaaaaaaaaaaa\n",
                b">there\n",
                b"cccccccccccccccccc\n",
            ],
        )
def test_many_blocks_in_single_read(self):
    n = 1000
    with bgzf.open(self.temp_file, "wb") as h:
        # create a file with a lot of small blocks
        for i in range(n):
            h.write(b"\x01\x02\x03\x04")
            h.flush()
        h.write(b"\nABCD")
    with bgzf.open(self.temp_file, "rb") as h:
        data = h.read(4 * n)
        self.assertEqual(len(data), 4 * n)
        self.assertEqual(data[:4], b"\x01\x02\x03\x04")
        self.assertEqual(data[-4:], b"\x01\x02\x03\x04")
        h.seek(0)
        data = h.readline()
        self.assertEqual(len(data), 4 * n + 1)
        self.assertEqual(data[:4], b"\x01\x02\x03\x04")
        self.assertEqual(data[-5:], b"\x01\x02\x03\x04\n")
def test_many_blocks_in_single_read(self):
    n = 1000
    h = bgzf.open(self.temp_file, 'wb')
    # create a file with a lot of small blocks
    for i in range(n):
        h.write(b'\x01\x02\x03\x04')
        h.flush()
    h.write(b'\nABCD')
    h.close()

    h = bgzf.open(self.temp_file, 'rb')
    data = h.read(4 * n)
    self.assertEqual(len(data), 4 * n)
    self.assertEqual(data[:4], b'\x01\x02\x03\x04')
    self.assertEqual(data[-4:], b'\x01\x02\x03\x04')
    h.seek(0)
    data = h.readline()
    self.assertEqual(len(data), 4 * n + 1)
    self.assertEqual(data[:4], b'\x01\x02\x03\x04')
    self.assertEqual(data[-5:], b'\x01\x02\x03\x04\n')
def create_multi_fastq(fasta_files, output_file):
    print(
        "Creating the multifastq file with all the simulated taxa ...",
        file=sys.stderr,
    )
    # TODO this is super slow, see
    # https://sites.google.com/site/tfsidc/linux-tricks/processing-a-large-number-of-files
    with bgzf.open(output_file, "wt") as fout:
        for fasta_file in fasta_files:
            print("Adding reads from ", fasta_file, file=sys.stderr)
            with gzip.open(fasta_file, "rb") as fin:
                shutil.copyfileobj(fin, fout)
    print("Multi fastq file created", file=sys.stderr)
def zopen(fname, *args, **kwargs):
    if os.path.isfile(fname):
        # Sniff the magic bytes of an existing file; note the b'...'
        # comparisons assume the file was opened in binary mode.
        f = open(fname, *args, **kwargs)
        token = f.read(3)
        f.seek(0)
        if token == b'\x1f\x8b\x08':    # gzip / BGZF
            return gzip.GzipFile(fileobj=f)
        elif token == b'\x42\x5a\x68':  # bzip2 ("BZh")
            return bz2.BZ2File(f)
        else:
            return f
    else:
        # New file: fall back to the extension
        if fname.endswith('.gz'):
            return bgzf.open(fname, *args, **kwargs)
        elif fname.endswith('.bz2'):
            return bz2.open(fname, *args, **kwargs)
        else:
            return open(fname, *args, **kwargs)
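# zopen streams existing .gz input through gzip.GzipFile; that also works
# for BGZF files, since BGZF is an ordinary gzip variant, but it gives up
# random access. When virtual-offset seeks are needed, a bgzf reader must
# be opened instead; a minimal sketch ("example.bgz" is a hypothetical file):
from Bio import bgzf

handle = bgzf.BgzfReader("example.bgz", "rb")
voffset = handle.tell()   # a virtual offset, not a raw byte position
data = handle.read(100)
handle.seek(voffset)      # random access back to the saved point
handle.close()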
def parse_raw_swiss(filename, filter_fn=None):
    """
    Given a raw SwissProt format file containing many sequences, return an
    iterator of raw sequence strings.

    The optional filter_fn argument is a function which takes a raw
    SwissProt format entry and returns a boolean. If True, the string is
    yielded by the iterator; if False, it is skipped.
    """
    handle = bgzf.open(filename)
    while True:
        res = _get_record(handle)
        if not res:
            break
        if filter_fn and filter_fn(res):
            yield res
        elif not filter_fn:
            yield res
def test_BgzfBlocks_TypeError(self):
    """Check get expected TypeError from BgzfBlocks."""
    for mode in ("r", "rb"):
        decompressed = bgzf.open("GenBank/cor6_6.gb.bgz", mode)
        with self.assertRaises(TypeError):
            list(bgzf.BgzfBlocks(decompressed))
def test_append_mode(self):
    with self.assertRaises(NotImplementedError):
        bgzf.open(self.temp_file, "ab")
def __init__(self, out_prefix, paired=False, bam_header=None, vcf_header=None,
             no_fastq=False, fasta_instead=False):
    self.fasta_instead = fasta_instead
    # TODO Eliminate paired end as an option for fastas. Plan is to create a write fasta method.
    if self.fasta_instead:
        fq1 = pathlib.Path(out_prefix + '.fasta.gz')
        fq2 = None
    else:
        fq1 = pathlib.Path(out_prefix + '_read1.fq.gz')
        fq2 = pathlib.Path(out_prefix + '_read2.fq.gz')
    bam = pathlib.Path(out_prefix + '_golden.bam')
    vcf = pathlib.Path(out_prefix + '_golden.vcf.gz')

    # TODO Make a fasta-specific method
    self.no_fastq = no_fastq
    if not self.no_fastq:
        self.fq1_file = bgzf.open(fq1, 'w')
        self.fq2_file = None
        if paired:
            self.fq2_file = bgzf.open(fq2, 'w')

    # VCF OUTPUT
    self.vcf_file = None
    if vcf_header is not None:
        self.vcf_file = bgzf.open(vcf, 'wb')

        # WRITE VCF HEADER
        self.vcf_file.write('##fileformat=VCFv4.1\n'.encode('utf-8'))
        reference = '##reference=' + vcf_header[0] + '\n'
        self.vcf_file.write(reference.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n'.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n'.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=VMX,Number=1,Type=String,Description="SNP is Missense in these Read Frames">\n'.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=VNX,Number=1,Type=String,Description="SNP is Nonsense in these Read Frames">\n'.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=VFX,Number=1,Type=String,Description="Indel Causes Frameshift">\n'.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=WP,Number=A,Type=Integer,Description="NEAT-GenReads ploidy indicator">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=DEL,Description="Deletion">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=DUP,Description="Duplication">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=INS,Description="Insertion of novel sequence">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=INV,Description="Inversion">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=CNV,Description="Copy number variable region">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=TRANS,Description="Translocation">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=INV-TRANS,Description="Inverted translocation">\n'.encode('utf-8'))
        # TODO add sample to vcf output
        self.vcf_file.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'.encode('utf-8'))

    # BAM OUTPUT
    self.bam_file = None
    if bam_header is not None:
        self.bam_file = bgzf.BgzfWriter(bam, 'w', compresslevel=BAM_COMPRESSION_LEVEL)

        # WRITE BAM HEADER
        self.bam_file.write("BAM\1")
        header = '@HD\tVN:1.5\tSO:coordinate\n'
        for n in bam_header[0]:
            header += '@SQ\tSN:' + n[0] + '\tLN:' + str(n[3]) + '\n'
        header += '@RG\tID:NEAT\tSM:NEAT\tLB:NEAT\tPL:NEAT\n'
        header_bytes = len(header)
        num_refs = len(bam_header[0])
        self.bam_file.write(pack('<i', header_bytes))
        self.bam_file.write(header)
        self.bam_file.write(pack('<i', num_refs))
        for n in bam_header[0]:
            l_name = len(n[0]) + 1
            self.bam_file.write(pack('<i', l_name))
            self.bam_file.write(n[0] + '\0')
            self.bam_file.write(pack('<i', n[3]))

    # buffers for more efficient writing
    self.fq1_buffer = []
    self.fq2_buffer = []
    self.bam_buffer = []
def split_by_tags(self, infiles=None, inpath=None, outpath=None, out_filename=None):
    ''' Split the file into separate files based on MID tags '''
    c = self.c
    if outpath is None:
        outpath = c.tag_splitby_sample_outpath
    if out_filename is None:
        out_filename = c.experiment_name

    # Setup Record Cycler
    if infiles is None:
        infiles = self.next_input_files
    if inpath is None:
        inpath = self.next_input_path
    RecCycler = Cycler(infiles=infiles, filepattern=False, data_inpath=inpath)

    print ('\nSplitting {0} file(s) based on MID tags'
           '').format(RecCycler.numfiles)

    outfiles_dict = {}
    first_run = 1

    # Running through all records in all passed files
    for recordgen in RecCycler.seqfilegen:
        # Set / reset Counter
        tag_counter = Counter()

        dbtags = self.get_data4file(RecCycler.curfilename, fields=['MIDtag', 'description'])
        # tags is returned as a list of tuples for each record
        MID_length = len(dbtags[0][0])
        # as tuple of descriptions then tuple of MIDtags
        tups = zip(*dbtags)
        # Check using MIDtags as keys would be unique
        assert len(set(tups[1])) == len(tups[1]), \
            'Duplicate MIDtags returned for file {0}'.format(RecCycler.curfilename)
        # Convert to dictionary {'MIDtag': 'description'}
        dbtags = dict(dbtags)

        # Open Files for Writing for each tag
        for tag, desc in dbtags.iteritems():
            fname = '-'.join([out_filename, tag, desc]) + '.bgzf'
            fnamevar = 'f_' + desc
            # Check that files don't already exist
            if first_run:
                # If file already exists, overwrite it.
                if os.path.isfile(os.path.join(outpath, fname)):
                    f = open(os.path.join(outpath, fname), 'w')
                    f.close()
            vars()[fnamevar] = bgzf.open(os.path.join(outpath, fname), 'a')
            outfiles_dict[fnamevar] = fname
        first_run = 0

        for rec in recordgen:
            recMIDtag = rec.seq[:MID_length].tostring()
            if recMIDtag not in dbtags:
                raise Exception('MID tag not found in database for file {0}'.format(RecCycler.curfilename))
            else:
                fnamevar = 'f_' + dbtags[recMIDtag]
                SeqIO.write(rec, vars()[fnamevar], 'fastq')
                tag_counter[recMIDtag] += 1

        # Flush and Close Files for each tag
        for tag, desc in dbtags.iteritems():
            fnamevar = 'f_' + desc
            vars()[fnamevar].flush()
            vars()[fnamevar].close()
            # Update datafiles in database
            filename = outfiles_dict[fnamevar]
            self.db.add_datafile(filename, [desc], datafile_type='1sample')

        print 'Finished Splitting MIDtags for input file: {0}'.format(RecCycler.curfilename)

        # Update counts
        for tag, desc in dbtags.iteritems():
            row = self.db.select('''read_count FROM samples WHERE description=? ''', (desc,))
            current_value = row[0]['read_count']
            if current_value is None:
                current_value = 0
            self.db.update('''samples SET read_count=? WHERE description=?''',
                           (current_value + tag_counter[tag], desc))

    # Store file names
    for outfile in outfiles_dict.itervalues():
        # Find sample description
        fname = os.path.split(outfile)[1]
        if fname.endswith('.bgzf'):
            fname = fname[:-5]
        fname_parts = fname.split('-')
        desc = fname_parts[-1]
        self.db.update('''samples SET read_file=? WHERE description=?''', (outfile, desc))

    # Outputs return / update next inputs
    self.next_input_path = outpath
    self.next_input_files = outfiles_dict.values()
    return (outfiles_dict.values(), outpath)
def entrez_download_sequence(accession, output_file, force=False, mtdna=False):
    """
    Fetch the Entrez fasta record for a nuccore accession.
    """
    # query the assembly database to see if there is an FTP url we can use
    ftp_url = entrez_assembly_ftp(accession, force) if not mtdna else ""

    try:
        if ftp_url:
            download_entrez_ftp(ftp_url, output_file)
            return
    except urllib.error.URLError:
        pass

    try:
        # fetch the fasta record from nuccore
        r = entrez_request("efetch.fcgi",
                           {"db": "nuccore", "id": accession, "rettype": "fasta", "retmode": "text"})

        # the fasta may be empty if this is a "master record" containing multiple other records (NZ_APLR00000000.1)
        if len(r.text.strip()) > 1:
            with bgzf.open(output_file, "w") as fout:
                print(r.text, file=fout)
            return
    except requests.exceptions.HTTPError:
        pass

    try:
        # get the full GenBank XML record
        r = entrez_request("efetch.fcgi",
                           {"db": "nuccore", "id": accession, "rettype": "gb", "retmode": "xml"})
    except requests.exceptions.HTTPError:
        # check for a replacement accession (there may be a newer version if this is a WGS project)
        updated_accession = entrez_find_replacement_accession(accession)

        # download the updated accession instead
        entrez_download_sequence(updated_accession, output_file, force)
        return

    # parse the XML result
    etree = ElementTree.XML(r.text)

    # get the first and last accession codes for this master record
    first = etree.find(".//GBAltSeqItem_first-accn")
    last = etree.find(".//GBAltSeqItem_last-accn")

    if first is None or last is None:
        print_error(
            f"Could not download the fasta file for {accession}. Please consider using the `--exclude-accessions` "
            f"flag to remove accession '{accession}' from this query."
        )

    # get all the related accession codes
    accessions = entrez_range_accessions(accession, first.text, last.text)

    try:
        with bgzf.open(output_file, "w") as fout:
            # fetch all the accessions in batches
            for id_list in chunker(accessions, ENTREZ_MAX_UID):
                r = entrez_request(
                    "efetch.fcgi",
                    {"db": "nuccore", "id": id_list, "rettype": "fasta", "retmode": "text"},
                )

                # write the fasta data to our bgzip file
                print(r.text, file=fout)
    except requests.exceptions.HTTPError:
        print_error(
            f"Could not download the accession range '{first.text}-{last.text}' for master record '{accession}'. "
            f"Please consider using the `--exclude-accessions` flag to remove accession '{accession}' from this query."
        )
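# 'chunker' and 'ENTREZ_MAX_UID' are defined elsewhere in this codebase and
# not shown above; a minimal sketch of the batching helper the loop assumes,
# splitting the accession list into Entrez-sized requests:
def chunker(iterable, size):
    """Yield successive chunks of at most `size` items from a sequence."""
    for i in range(0, len(iterable), size):
        yield iterable[i:i + size]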
def mergeref(refvcf, othervcf, diploid, mergefoundonly, annotate):
    """
    Merges the reference (sample) VCF with an external dataset VCF.
    :param refvcf: VCF file mapped to reference given by ref argument on input, normally the samples.
    :param othervcf: VCF file of external dataset.
    :param diploid: Are samples diploid? Are friends electric.
    :param mergefoundonly: Merged file will contain sites found in both files only.
    :param annotate: Annotate the ID column of the merged file from the external dataset (othervcf).
    :return: Name of merged VCF file.
    """
    # First read in the reference (normally, the sample) VCF, and create a
    # line dictionary based on position
    mergevcf = refvcf[:-7]
    mergevcf += "-MERGED.vcf.gz"

    # do a bgzf read if it is zipped
    if refvcf[-3:] == ".gz":
        refun = refvcf[:-3]
        with bgzf.open(refvcf, 'rb') as f_in, open(refun, 'w') as f_out:
            shutil.copyfileobj(f_in, f_out)
        refvcf = refun
    if othervcf[-3:] == ".gz":
        otherun = othervcf[:-3]
        with bgzf.open(othervcf, 'rb') as f_in, open(otherun, 'w') as f_out:
            shutil.copyfileobj(f_in, f_out)
        othervcf = otherun

    print "\nReading " + refvcf + "..."
    reffile = open(refvcf, 'r')
    ref_data = []
    for file_line in reffile:
        if len(file_line.rstrip()) > 0:  # Strip blank lines
            ref_data.append(file_line.rstrip())
    refheaderline = ""
    refheaderlist = []
    refdict = {}
    foundheader = False
    # bar = progressbar.ProgressBar()
    # for i in bar(range(len(ref_data))):
    for i in range(len(ref_data)):
        file_line = ref_data[i]
        cols = file_line.split()
        # print cols
        if foundheader:
            # from here on, its data
            # print cols[0]+"-"+cols[1] + " " + str(i)
            refdict[cols[0] + "-" + cols[1]] = i
        else:
            # just add to header repository
            if cols[0] == '#CHROM':
                refheaderline = file_line
                refhdrcols = cols
                print " number of total columns in ref " + str(len(refhdrcols))
                foundheader = True
            elif "##fileformat" not in file_line:
                refheaderlist.append(file_line)
    reffile.close()
    foundheader = False

    # Next, read in the external dataset VCF
    print "\nReading " + othervcf + "..."
    otherfile = open(othervcf, 'r')
    other_data = []
    for file_line in otherfile:
        if len(file_line.rstrip()) > 0:  # Strip blank lines
            other_data.append(file_line.rstrip())
    otherheaderline = ""
    otherheaderlist = []
    otherdict = {}
    foundheader = False
    othersamplenames = []
    bar = progressbar.ProgressBar()
    for i in bar(range(len(other_data))):
        file_line = other_data[i]
        cols = file_line.split('\t')
        if foundheader:
            # from here on, its data
            otherdict[cols[0] + "-" + cols[1]] = i
        else:
            # just add to header repository
            if cols[0] == '#CHROM':
                otherheaderline = file_line
                othersamplenames = cols[9:]
                print " number of sample columns in other " + str(len(othersamplenames))
                foundheader = True
            elif "##fileformat" not in file_line:
                otherheaderlist.append(file_line)
    otherfile.close()
    oslen = len(othersamplenames)

    print "Writing to " + mergevcf
    mergeout = gzip.open(mergevcf, 'wb')

    # Merged header
    mergeout.write("##fileformat=VCFv4.2\n")
    mergeout.write("##UPA merged file headers for " + refvcf + "\n")
    for refhdrline in refheaderlist:
        mergeout.write(refhdrline)
        mergeout.write("\n")
    mergeout.write("##UPA merged file headers for " + othervcf + "\n")
    for otherhdrline in otherheaderlist:
        mergeout.write(otherhdrline)
        mergeout.write("\n")
    mergeout.write("##UPA merged " + refvcf + " and " + othervcf +
                   " with REF alleles set to those of " + refvcf +
                   " and all-missing sites ignored.\n")
    outhdr = refhdrcols
    for osn in othersamplenames:
        outhdr.append(osn)
    outhdrlen = len(outhdr)
    print "Header has " + str(outhdrlen) + " columns."
    hdrline = '\t'.join(outhdr)
    mergeout.write(hdrline)
    mergeout.write("\n")

    print "Merging...."
    bar = progressbar.ProgressBar()
    for key, lnum in bar(sorted(refdict.items(), key=refkeysort)):
        # for key, lnum in sorted(refdict.items(), key=refkeysort):
        foundother = False
        # Add one because linecache lines start on 1 not 0
        refline = linecache.getline(refvcf, lnum + 1).strip()
        # print key + " " + str(lnum+1) + " " + refline
        refcols = refline.split('\t')
        if key in otherdict:
            foundother = True
            otnum = otherdict[key]
            otherline = linecache.getline(othervcf, otnum + 1).strip()
            complist = []
            othertm = {}
            # print otherline
            othercols = otherline.split()
            # print key + " " + str(lnum + 1) + " " + refcols[1] + " Otherdict " + othercols[1]
            trueref = refcols[3]
            complist.append(trueref)
            truealts = refcols[4].split(",")
            for alt in truealts:
                complist.append(alt)
            # print "True REF " + trueref
            otherref = othercols[3]
            otheralts = othercols[4].split(",")
            if otherref in complist:
                pass
            else:
                complist.append(otherref)
            for k in range(len(otheralts)):
                if otheralts[k] in complist:
                    pass
                else:
                    complist.append(otheralts[k])
            # print complist
            otherrefloc = complist.index(otherref)
            othertm[0] = otherrefloc
            for k in range(len(otheralts)):
                othertm[k + 1] = complist.index(otheralts[k])
            altlist = complist
            altlist.remove(trueref)
            # print "TM "
            # print othertm
            siteline = []
            for l in range(len(refcols)):
                if l == 4:
                    siteline.append(','.join(altlist))
                elif l == 2:
                    if annotate:
                        siteline.append(othercols[l])
                    else:
                        siteline.append(refcols[l])
                else:
                    siteline.append(refcols[l])
            # construct the genotype columns for the other samples
            for othersite in othercols[9:]:
                othersites = re.split("[/|]+", othersite)
                # print othersites
                olen = len(othersites)
                # print olen
                if olen > 1 and not diploid:
                    print "ERROR: not diploid but more than one site at " + key
                    exit(1)
                oconstruct = ""
                for i in xrange(olen):
                    osite = othersites[i]
                    if osite == ".":
                        oconstruct += "."
                        # print osite + " becomes ."
                    else:
                        # print osite + " becomes " + str(othertm[int(osite)])
                        oconstruct += str(othertm[int(osite)])
                    if i < olen - 1:
                        oconstruct += "/"  # FIXME this always outputs the unphased marker
                siteline.append(oconstruct)
        else:
            # print key + " " + str(lnum+1) + " no match"
            if mergefoundonly:
                siteline = ""
            else:
                refline = linecache.getline(refvcf, lnum + 1).strip()
                refcols = refline.split('\t')
                siteline = refcols
                for nom in range(oslen):
                    if diploid:
                        siteline.append("./.")  # FIXME this always outputs the unphased marker
                    else:
                        siteline.append(".")

        # Now check if it's all missing or empty
        allmissing = True
        for i in xrange(9, len(siteline)):
            site = siteline[i]
            if site != "./." and site != "." and site != ".|.":
                allmissing = False
        if allmissing:
            # print "At " + key + " all sites missing, skipping."
            pass
        else:
            siteout = '\t'.join([str(x) for x in siteline])
            # print siteout
            siteout += "\n"
            if mergefoundonly:
                if foundother:
                    if len(siteline) != len(outhdr):
                        print "ERROR: Line in merged VCF has " + str(len(siteline)) + \
                              " but header line has " + str(len(outhdr))
                    mergeout.write(siteout)
            else:
                if len(siteline) != len(outhdr):
                    print "ERROR: Line in merged VCF has " + str(len(siteline)) + \
                          " but header line has " + str(len(outhdr))
                mergeout.write(siteout)
    mergeout.close()
    return mergevcf
def read_vcf(filepath):
    """
    Read a VCF.
    :param filepath: str;
    :return: dict;
    """
    vcf = {
        'meta_information': {
            'INFO': {},
            'FILTER': {},
            'FORMAT': {},
            'reference': {},
        },
        'header': [],
        'samples': [],
        'data': None,
    }

    # Open VCF
    try:
        f = open(filepath)
        f.readline()
        f.seek(0)
        bgzipped = False
    except UnicodeDecodeError:
        f = bgzf.open(filepath)
        bgzipped = True

    for row in f:
        if bgzipped:
            row = row.decode()
        row = row.strip()

        if row.startswith('##'):  # Meta-information
            # Remove '##' prefix
            row = row[2:]

            # Find the 1st '='
            ei = row.find('=')

            # Get field name and field line
            fn, fl = row[:ei], row[ei + 1:]

            if fl.startswith('<') and fl.endswith('>'):
                # Strip '<' and '>'
                fl = fl[1:-1]

                # Split field line
                fl_split = split_ignoring_inside_quotes(fl, ',')

                # Get ID
                id_ = fl_split[0].split('=')[1]

                # Parse field line
                fd_v = {}
                for s in fl_split[1:]:
                    ei = s.find('=')
                    k, v = s[:ei], s[ei + 1:]
                    fd_v[k] = remove_nested_quotes(v)

                # Save
                if fn in vcf['meta_information']:
                    if id_ in vcf['meta_information'][fn]:
                        raise ValueError('Duplicated ID {}.'.format(id_))
                    else:
                        vcf['meta_information'][fn][id_] = fd_v
                else:
                    vcf['meta_information'][fn] = {id_: fd_v}
            else:
                print('Didn\'t parse: {}.'.format(fl))

        elif row.startswith('#CHROM'):  # Header
            # Remove '#' prefix
            row = row[1:]

            # Get header line number
            vcf['header'] = row.split('\t')
            vcf['samples'] = vcf['header'][9:]
        else:
            break

    # Close VCF
    f.close()

    # Read data
    vcf['data'] = read_csv(filepath, sep='\t', comment='#', header=None, names=vcf['header'])

    return vcf
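# read_vcf detects bgzip by attempting a text read and catching the
# UnicodeDecodeError; other snippets in this collection sniff the two-byte
# gzip magic number instead (BGZF is a gzip variant, so it shares it).
# A minimal sketch of that alternative probe:
def is_gzipped(filepath):
    """Return True if the file starts with the gzip magic bytes."""
    with open(filepath, 'rb') as probe:
        return probe.read(2) == b'\x1f\x8b'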
def split_by_subgroups(self, subgroups=None, infiles=None, inpath=None,
                       outpath=None, out_filename=None):
    ''' Split the file into separate files based on MID tags '''
    if subgroups is None:
        # Dictionary of regular expressions to match sample description
        subgroups = {'zebra': '.*zebra.*',
                     'gazelle': '.*gazelle.*'}

    # Compile regexes
    for k, v in subgroups.iteritems():
        subgroups[k] = re.compile(v)

    c = self.c
    if outpath is None:
        outpath = c.tag_splitby_subgroup_outpath
    if out_filename is None:
        out_filename = c.experiment_name
    if not os.path.exists(outpath):
        os.makedirs(outpath)

    # Setup Record Cycler
    if infiles is None:
        infiles = self.next_input_files
    if inpath is None:
        inpath = self.next_input_path
    RecCycler = Cycler(infiles=infiles, filepattern=False, data_inpath=inpath)

    print ('\nSplitting {0} file(s) into zebras and gazelles'
           '').format(RecCycler.numfiles)

    outfiles_dict = {}
    first_run = 1

    for recordgen in RecCycler.seqfilegen:
        # Set / reset Counter
        tag_counter = Counter()

        dbtags = self.get_data4file(RecCycler.curfilename, fields=['MIDtag', 'description'])
        # tags is returned as a list of tuples for each record
        MID_length = len(dbtags[0][0])
        # Convert to dictionary {'MIDtag' : 'description' }
        dbtags = dict(dbtags)

        # Open Files for Writing for each subgroup
        for group in subgroups.iterkeys():
            fname = '-'.join([out_filename, group]) + '.bgzf'
            fnamevar = 'f_' + group
            # Check that files don't already exist
            if first_run:
                # If file already exists, overwrite it.
                if os.path.isfile(os.path.join(outpath, fname)):
                    f = open(os.path.join(outpath, fname), 'w')
                    f.close()
            vars()[fnamevar] = bgzf.open(os.path.join(outpath, fname), 'a')
            outfiles_dict[fnamevar] = fname
        first_run = 0

        for rec in recordgen:
            recMIDtag = rec.seq[:MID_length].tostring()
            if recMIDtag not in dbtags:
                raise Exception('MID tag not found in database for file {0}'.format(RecCycler.curfilename))
            else:
                # Get description
                desc = dbtags[recMIDtag]
                # Write to appropriate file if it matches the regex
                for group in subgroups.iterkeys():
                    if subgroups[group].match(desc):
                        fnamevar = 'f_' + group
                        SeqIO.write(rec, vars()[fnamevar], 'fastq')
                        tag_counter[recMIDtag] += 1

        # Flush and Close Files for each tag
        for group in subgroups.iterkeys():
            fnamevar = 'f_' + group
            vars()[fnamevar].flush()
            vars()[fnamevar].close()
            # Update datafiles in database
            filename = outfiles_dict[fnamevar]
            desc_list = filter(subgroups[group].match, dbtags.values())
            self.db.add_datafile(filename, desc_list, datafile_type='group')

        print 'Finished Splitting reads for input file: {0}'.format(RecCycler.curfilename)

    # Outputs return / update next inputs
    self.next_input_path = outpath
    self.next_input_files = outfiles_dict.values()
    return (outfiles_dict.values(), outpath)
logfile.write("\n") if mito or ychr: diploid = False bcname = "" samplevcffile = "" bampreprocess = True print "\nChecking for input files..." if vcf_file: bampreprocess = False bcbase = os.path.basename(vcf_file) if bcbase[-7:] == ".vcf.gz": bcname = bcbase[:-7] with bgzf.open(bcbase, 'rb') as f_in, open(bcname + ".vcf", 'wb') as f_out: shutil.copyfileobj(f_in, f_out) elif bcbase[-4:] == ".vcf": bcname = bcbase[:-4] else: print "ERROR: Must supply a .vcf or .gz file is using -vcf_file." exit(1) elif bcfile != "" and bamlist == "": bcbase = os.path.basename(bcfile) bcname, fileext = os.path.splitext(bcbase) bcin = open(bcfile, 'r') for bcline in bcin: bccols = bcline.split() binfile = wd + "/" + bccols[1] + "/BWA_" + refname + "/" + bccols[
def genocaller(flist, bedfile, bcname, indent, ref, regionrestrict, threads,
               verbose, cmdfile, logfile):
    """
    Calls genotypes using Krishna Veeramah's GenoCaller_indent
    :param flist: File list.
    :param bedfile: UCSC-style BED file.
    :param bcname: Base name of input file.
    :param indent: Indent depth to each end of read.
    :param ref: Reference genome.
    :param regionrestrict: Area of genome to limit calling.
    :param threads: Number of multiprocessing threads to use.
    :param verbose: Verbose output to log.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: Name of merged sample VCF.
    """
    print "\nGenoCaller..."
    samplevcfnames = []
    for i in range(len(flist)):
        sample = flist[i]
        gccmd = "GenoCaller_indent.py " + sample + ".bam " + bedfile + " " + ref + " " + indent
        upa_util.bash_command(gccmd, verbose, cmdfile, logfile)

        # Must compress to allow bcftools to merge
        with open(sample + "." + bedfile + ".indent" + str(indent) + ".vcf", 'r') as f_in, \
                bgzf.open(sample + "." + bedfile + ".indent" + str(indent) + ".vcf.gz", 'wb') as f_out:
            # with open(sample + "." + bedfile + ".indent" + str(indent) + ".vcf", 'r') as f_in, \
            #         gzip.open(sample + "." + bedfile + ".indent" + str(indent) + ".vcf.gz", 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        samplevcfname = sample + "." + bedfile + ".indent" + str(indent) + ".vcf.gz"
        # sampleemitallname = sample + "." + bedfile + ".indent" + str(indent) + ".emit_all.vcf.gz"
        if os.path.isfile(samplevcfname):
            upa_util.vcf_name_strip(samplevcfname)
            upa_util.bash_command("bcftools index --threads " + threads + " " + samplevcfname,
                                  verbose, cmdfile, logfile)
            samplevcfnames.append(samplevcfname)
        else:
            print "ERROR: Cannot find " + samplevcfname

    # Merge the resulting VCFs together using bcftools
    bcfmergecmd = "bcftools merge --threads " + threads + " -Oz -o " + bcname + "-samples.vcf.gz "
    if regionrestrict:
        bcfmergecmd = bcfmergecmd + " -r " + regionrestrict
    for samplevcfname in samplevcfnames:
        bcfmergecmd = bcfmergecmd + samplevcfname + " "
    upa_util.bash_command(bcfmergecmd, verbose, cmdfile, logfile)
    return bcname + "-samples.vcf"
def addFile(self, filename):
    """
    Add a new FASTA file of sequences.

    @param filename: A C{str} file name, with the file in FASTA format.
        This file must (obviously) exist at indexing time. When
        __getitem__ is used to access sequences, it is possible to provide
        a C{fastaDirectory} argument to our C{__init__} to indicate the
        directory containing the original FASTA files, in which case the
        basename of the file here provided in C{filename} is used to find
        the file in the given directory. This allows the construction of
        a sqlite database from the shell in one directory and its use
        programmatically from another directory.
    @raise ValueError: If a file with this name has already been added or
        if the file contains a sequence whose id has already been seen.
    @return: The C{int} number of sequences added from the file.
    """
    endswith = filename.lower().endswith
    if endswith('.bgz') or endswith('.gz'):
        useBgzf = True
    elif endswith('.bz2'):
        raise ValueError(
            'Compressed FASTA is only supported in BGZF format. Use '
            'bgzip to compress your FASTA.')
    else:
        useBgzf = False

    fileNumber = self._addFilename(filename)
    connection = self._connection
    count = 0

    try:
        with connection:
            if useBgzf:
                try:
                    fp = bgzf.open(filename, 'rb')
                except ValueError as e:
                    if str(e).find('BGZF') > -1:
                        raise ValueError(
                            'Compressed FASTA is only supported in BGZF '
                            'format. Use the samtools bgzip utility '
                            '(instead of gzip) to compress your FASTA.')
                    else:
                        raise
                else:
                    try:
                        for line in fp:
                            if line[0] == '>':
                                count += 1
                                id_ = line[1:].rstrip(' \t\n\r')
                                connection.execute(
                                    'INSERT INTO sequences(id, '
                                    'fileNumber, offset) VALUES (?, ?, ?)',
                                    (id_, fileNumber, fp.tell()))
                    finally:
                        fp.close()
            else:
                with open(filename) as fp:
                    offset = 0
                    for line in fp:
                        offset += len(line)
                        if line[0] == '>':
                            count += 1
                            id_ = line[1:].rstrip(' \t\n\r')
                            connection.execute(
                                'INSERT INTO sequences(id, fileNumber, '
                                'offset) VALUES (?, ?, ?)',
                                (id_, fileNumber, offset))
    except sqlite3.IntegrityError as e:
        if str(e).find('UNIQUE constraint failed') > -1:
            original = self._find(id_)
            if original is None:
                # The id must have appeared twice in the current file,
                # because we could not look it up in the database
                # (i.e., it was INSERTed but not committed).
                raise ValueError(
                    "FASTA sequence id '%s' found twice in file '%s'."
                    % (id_, filename))
            else:
                origFilename, _ = original
                raise ValueError(
                    "FASTA sequence id '%s', found in file '%s', was "
                    "previously added from file '%s'."
                    % (id_, filename, origFilename))
        else:
            raise
    else:
        return count
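# A hedged retrieval sketch for the index built by addFile (the names here
# are illustrative, not the class's real API): because the stored offset
# points just past each '>' header line, seeking there and reading until
# the next '>' recovers the record. For .bgz files the stored value is a
# BGZF virtual offset, which seek() on a bgzf handle understands directly.
def fetchSequence(fp, offset):
    """Return the sequence whose data starts at the stored offset
    (assuming a text-mode handle)."""
    fp.seek(offset)
    pieces = []
    for line in fp:
        if line.startswith('>'):  # start of the next record
            break
        pieces.append(line.strip())
    return ''.join(pieces)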
f = open("Data/popAFs4_%s.csv" % chromosome, "w") f.write(sep.join(meta + pops) + "\n") omniChr = frq2[frq2['CHR'] == chromosome].sort_values( 3).reset_index().set_index(3) compatibilityFails = 0 freqStats = {} ## GNOMAD annotation of our AFs # First starting with lines in Gnomad vcf, as this is a vcf flatfile, # our genotype array is put into a dataframe gme22 = gme[gme['chrom'] == chromosome].set_index('pos') gme22 = gme22[~gme22.index.duplicated( keep='first')] ## removing duplicate entries (wrt. index) drops = [] for line in bgzf.open("%s/gnomad.genomes.r2.0.2.sites.chr%s.vcf.gz" % (gnomadDir, chromosome1)): line = line.decode() ## dealing with binary stuff if line.startswith("#"): continue gnomadLine = GnomADLine(line) if gnomadLine.indel: continue whichAlt = 0 ## normally just consider first Alternative allele uae = False fs = None if gnomadLine.pos in omniChr.index: drops.append(gnomadLine.pos) fs = FreqStats(omniChr.loc[gnomadLine.pos]) if fs.compatibilityCheck(gnomadLine): uae = True whichAlt = fs.whichAlt ## a bit hackish: whichAlt is wrt gnomad here gnomadLine.info2popfreqs(whichAlt)