def chunk_fastq_file(fastq_filename, new_filename, parse_rec):
    """
    Create a new FASTQ file from an existing one.

    :param str fastq_filename: the name of the original FASTQ file
    :param str new_filename: the name of the new FASTQ file
    :param class:`ParseRecord` parse_rec: the information describing where to extract
    :return:
    """
    try:
        os.remove(new_filename)
    except OSError:
        pass

    # copy the header from the original FASTQ file to the new one
    bytes_from_file(fastq_filename, new_filename, 0, parse_rec.header_size)

    if parse_rec.begin_read_offset > 0:
        # if there are reads before a chunk offset, we need to extract them
        b = bgzf.BgzfReader(fastq_filename)
        b2 = bgzf.BgzfWriter(new_filename, mode="a")
        b.seek(parse_rec.begin_read_offset)
        b2.write(b.read(parse_rec.begin_read_size))
        b2.close()

    # grab bgzf chunks from the OLD FASTQ file and append them to the NEW FASTQ file
    bytes_from_file(fastq_filename, new_filename, parse_rec.file_offset,
                    parse_rec.file_bytes)

    if parse_rec.end_read_offset > 0:
        # if there are reads after a chunk offset, we need to extract them
        b = bgzf.BgzfReader(fastq_filename)
        b2 = bgzf.BgzfWriter(new_filename, mode="a")
        b.seek(parse_rec.end_read_offset)
        b2.write(b.read(parse_rec.end_read_size))
        b2.close()
def filter_pe_fasq_by_len(fq_1, fq_2, minlen, prefix):
    '''filter pe reads by min length'''
    fq_1_ = prefix + ".gt" + str(minlen) + ".1.fq.gz"
    fq_2_ = prefix + ".gt" + str(minlen) + ".2.fq.gz"
    with bgzf.BgzfWriter(fq_1_, 'wb') as out_1, bgzf.BgzfWriter(fq_2_, 'wb') as out_2:
        with gzip.open(fq_1, 'rt') as in_1, gzip.open(fq_2, 'rt') as in_2:
            for rec_a, rec_b in zip(SeqIO.parse(in_1, 'fastq'),
                                    SeqIO.parse(in_2, 'fastq')):
                if (len(rec_a.seq) > minlen) and (len(rec_b.seq) > minlen):
                    SeqIO.write(rec_a, out_1, 'fastq')
                    SeqIO.write(rec_b, out_2, 'fastq')
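# A minimal usage sketch for the pair filter above, assuming the same imports
# the function relies on (gzip, Bio.SeqIO, Bio.bgzf) are already in scope;
# the input file names and the 50 bp cutoff are placeholders.
filter_pe_fasq_by_len("sample_R1.fastq.gz", "sample_R2.fastq.gz",
                      minlen=50, prefix="sample")
# writes sample.gt50.1.fq.gz and sample.gt50.2.fq.gz, keeping only pairs
# where both mates are longer than 50 bp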
def __call__(self):
    global F_Flag
    F_Flag = self.Fa_Flag
    FastQFlag = self.fqflag
    read_lists = []
    MinTime = time.time()
    FailedReads = 0
    ChrList = [[] for _ in range(512)]
    global Visits
    Visits = []
    ChannelDict = {str(il): () for il in range(1, 513)}
    Outdata = []
    global f5
    f5 = h5py.File(self.multi_fast5, 'r')
    f5.visit(Visits.append)
    reads_list_to_read = list(f5.keys())  # materialised so the first read can be indexed below
    RefStart = time.mktime(
        par.parse(f5[str(reads_list_to_read[0]) + '/tracking_id'].attrs[
            'exp_start_time'].decode('UTF-8')).timetuple())
    for r in reads_list_to_read:
        res = self.get_content(r, FastQFlag)
        Outdata.append(res)
    if FastQFlag == True:
        file_out = os.path.join(self.t_dir, 'tmp.' + str(self.Norder) + '.fastq.gz')
        Gzout = yielder(read_lists)
        with bgzf.BgzfWriter(file_out, "wb") as outgz:
            SeqIO.write(sequences=Gzout, handle=outgz, format="fastq")
    Outdata.append(RefStart)
    return Outdata
def output_vcf_population(self, control_size, test_size, male_odds, compression_level):
    """
    Output a population .vcf file and companion .fam file.

    :param compression_level: level of gzip compression (1-9)
    :param test_size: size of the cases/test group
    :param control_size: size of the control group
    :param male_odds: odds of a person being a biological male
    :return:
    """
    if not self.ordered_snps:
        raise Exception("No SNPs to Process! Exiting.")
    # pick deleterious groups for population size
    deleterious_group_list = PopulationFactory.pick_deleterious_groups(
        list(self.deleterious.values()), test_size)
    fam_data = self.generate_fam_file(control_size, test_size, male_odds,
                                      deleterious_group_list)
    main_file = self.population_dir + "population.vcf.gz"
    CHUNK_SIZE = 500000  # Defines work chunks that have a sync point after each one. Helps with memory issues.
    with bgzf.BgzfWriter(filename=main_file, mode='wt+',
                         compresslevel=compression_level) as f:
        header = gen_vcf_header(fam_data)
        f.write(header)
        print("Outputting VCF lines", flush=True)
        chunks = int(len(self.ordered_snps) / CHUNK_SIZE)
        if chunks < 1:
            chunks = 1
        for i, snps_list in enumerate(split_list(self.ordered_snps, chunks)):
            self.write_vcf_snps(fam_data, snps_list, f)
            print("%s Finished work chunk %i of %i." %
                  (datetime.now().strftime("%Y-%m-%d %H:%M"), i + 1, chunks),
                  flush=True)
    print("Finished VCF file output.", flush=True)
def __call__(self):
    file_out = os.path.join(self.t_dir, 'tmp.' + str(self.Norder) + '.fastq.gz')
    Gzout = get_content(self.datas)
    with bgzf.BgzfWriter(file_out, "wb") as outgz:
        SeqIO.write(sequences=Gzout, handle=outgz, format="fastq")
    return file_out
def __init__(self, path, mode='r'):
    """
    Store tabular information tied to genomic locations in a bgzipped file.

    Args:
        path (str): path to file
        mode (str): mode, r: read, w: write
    """
    self.path = path
    self.index_path = f'{path}.idx'
    self.prev_contig = None
    self.mode = mode
    self.index = {}
    if self.mode == 'w':
        self.bgzf_handle = bgzf.BgzfWriter(self.path, 'w')
        self.index_handle = open(self.index_path, 'wt')
    elif self.mode == 'r':
        if not os.path.exists(self.path):
            raise ValueError(f'BGZIP file missing at {self.path}')
        self.bgzf_handle = bgzf.BgzfReader(self.path, 'rt')
        if not os.path.exists(self.index_path):
            raise ValueError(f'BGZIP index file missing at {self.index_path}')
        self.index_handle = open(self.index_path, 'rt')
        for line in self.index_handle:
            contig, start = line.strip().split()
            self.index[contig] = int(start)
    else:
        raise ValueError('Mode can be r or w')
    self.cache = {}
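# A minimal sketch of the indexing pattern the constructor above relies on,
# using only Bio.bgzf (no assumptions about the rest of the class):
# BgzfWriter.tell() returns a 64-bit virtual offset that BgzfReader.seek()
# can jump back to, and that is what the companion .idx file stores per
# contig. File names and records here are placeholders.
from Bio import bgzf

with bgzf.BgzfWriter('demo.tab.bgz', 'w') as bg, open('demo.tab.bgz.idx', 'wt') as idx:
    for contig, rows in [('chr1', ['chr1\t100\t+\tA']), ('chr2', ['chr2\t7\t-\tC'])]:
        idx.write(f'{contig} {bg.tell()}\n')  # virtual offset where this contig starts
        for row in rows:
            bg.write(row + '\n')

index = {c: int(o) for c, o in (line.split() for line in open('demo.tab.bgz.idx'))}
reader = bgzf.BgzfReader('demo.tab.bgz', 'rt')
reader.seek(index['chr2'])   # jump straight to the first chr2 record
print(reader.readline().strip())
reader.close()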
def filter(logger, source_file, target_file, output_prefix, method):
    check_file_exists(source_file)
    check_file_exists(target_file)

    check_tool_exists("bedtools")
    check_tool_exists("tabix")

    tmp_bed = output_prefix + ".tmp.bed"
    method_command = "bedtools %s -a \"%s\" -b \"%s\" > %s" % (
        method, source_file, target_file, tmp_bed)
    runCmd(method_command, logger)

    if not os.path.isfile(tmp_bed):
        raise Exception("bedtools failed, no output file generated.")

    tmp_file = output_prefix + ".tmp.bed.bgz"
    logger.info("Writing dinucleotide to " + tmp_file + " ...")
    with bgzf.BgzfWriter(tmp_file, "wb") as fout:
        with open(tmp_bed, "rt") as fin:
            for line in fin:
                fout.write(line)
    os.remove(tmp_bed)

    output_file = output_prefix + ".bed.bgz"
    if os.path.exists(output_file):
        os.remove(output_file)
    os.rename(tmp_file, output_file)

    runCmd("tabix -p bed %s " % output_file, logger)

    count_file = output_prefix + ".count"
    dinucleotide_to_count(logger, output_file, count_file)

    logger.info("done.")
def main(args):
    if args.file:
        with open(args.file[0]) as f:
            data = f.read()
    elif not sys.stdin.isatty():
        data = sys.stdin.read()
    else:
        raise ValueError('No input data detected')
    w = bgzf.BgzfWriter(fileobj=sys.stdout.buffer)
    w.write(data)
    w.close()
def exract_diff_seq(diff_genes, fasta):
    gene_dict = {each.strip(): 1 for each in open(diff_genes)}
    seq_list = []
    for seq_record in SeqIO.parse(fasta, "fasta"):
        gene_id = re.search('gene=(\S+)', seq_record.description).groups()[0]
        if gene_id in gene_dict:
            seq_list.append(seq_record)
    d_path, d_name = os.path.split(os.path.abspath(diff_genes))
    d_prefix = os.path.splitext(d_name)[0]
    d_fa_path = os.path.join(d_path, '{p}.fa.gz'.format(p=d_prefix))
    with bgzf.BgzfWriter(d_fa_path, "wb") as outgz:
        SeqIO.write(sequences=seq_list, handle=outgz, format="fasta")
def __init__(self, path, mode='r', read_all=False):
    """
    Store tabular information tied to genomic locations in a bgzipped file.

    Args:
        path (str): path to file
        mode (str): mode, r: read, w: write
        read_all (bool): when enabled all data is read from the file
            and the handles are closed
    """
    self.path = path
    self.index_path = f'{path}.idx'
    self.prev_contig = None
    self.mode = mode
    self.index = {}
    self.cache = {}
    if self.mode == 'w':
        self.bgzf_handle = bgzf.BgzfWriter(self.path, 'w')
        self.index_handle = open(self.index_path, 'wt')
    elif self.mode == 'r':
        if not os.path.exists(self.path):
            raise ValueError(f'BGZIP file missing at {self.path}')
        self.bgzf_handle = bgzf.BgzfReader(self.path, 'rt')
        if not os.path.exists(self.index_path):
            raise ValueError(f'BGZIP index file missing at {self.index_path}')
        self.index_handle = open(self.index_path, 'rt')
        for line in self.index_handle:
            contig, start = line.strip().split()
            self.index[contig] = int(start)
        if read_all:
            for line in self.bgzf_handle:
                if len(line) == 0:
                    continue
                line_contig, line_pos, line_strand, rest = self.read_file_line(line)
                # print((line_pos, line_strand, rest))
                if line_contig not in self.cache:
                    self.cache[line_contig] = {}
                self.cache[line_contig][(line_pos, line_strand)] = rest
                cpos = line_pos
            self.bgzf_handle.close()
            self.bgzf_handle = None
            self.index_handle.close()
            self.index_handle = None
    else:
        raise ValueError('Mode can be r or w')
def sideEffect(self, filename, *args, **kwargs):
    if self.count <= 1:
        self.test.assertEqual('filename.fasta.bgz', filename)
        self.count += 1
        writerIO = BytesIO()
        writer = bgzf.BgzfWriter(fileobj=writerIO)
        writer.write(b'>id0\nAC\n')
        writer.flush()
        fileobj = BytesIO(writerIO.getvalue())
        fileobj.mode = 'rb'
        return bgzf.BgzfReader(fileobj=fileobj)
    else:
        self.test.fail(
            'Open called too many times. Filename: %r, Args: %r, '
            'Keyword args: %r.' % (filename, args, kwargs))
def main(vcf_input, output, remove_fields=[], keep_fields=[]):
    '''
    Remove INFO fields from VCF.

    Args:
        vcf_input:     input VCF file

        output:        VCF output file. Will write to STDOUT if not
                       provided.

        remove_fields: One or more INFO fields to remove. Can not be
                       used in conjunction with keep_fields argument.

        keep_fields:   One or more INFO fields to keep. All other INFO
                       fields defined in the VCF header will be removed.
                       Can not be used in conjunction with remove_fields
                       argument.
    '''
    if remove_fields and keep_fields:
        raise RuntimeError("remove_fields and keep_fields arguments are " +
                           "mutually exclusive.")
    vcf = VcfReader(vcf_input)
    if output is None:
        vcf_writer = sys.stdout
    elif output.endswith(('.gz', '.bgz')):
        from Bio import bgzf
        vcf_writer = bgzf.BgzfWriter(output)
    else:
        vcf_writer = open(output, 'w')
    new_head = []
    if keep_fields:
        remove_fields = [x for x in vcf.header.metadata['INFO'].keys()
                         if x not in keep_fields]
    for h in vcf.header.meta_header:
        match = info_re.match(h)
        if not match or match.group(1) not in remove_fields:
            new_head.append(h)
    vcf_writer.write("\n".join(new_head) + "\n")
    vcf_writer.write("\t".join(vcf.col_header) + "\n")
    for record in vcf:
        record.remove_info_fields(remove_fields)
        vcf_writer.write(str(record) + "\n")
    vcf_writer.close()
def reheader_fasta(fa_in, fa_out, header_function, in_gz, gz):
    if in_gz:
        in_h = gzip.open(fa_in, 'rt')
    else:
        in_h = open(fa_in, 'r')
    if gz:
        out_h = bgzf.BgzfWriter(fa_out, 'wb')
    else:
        out_h = open(fa_out, 'w')
    writer = FastaWriter(out_h)
    writer.write_header()
    for rec in FastaIterator(in_h, title2ids=header_function):
        writer.write_record(rec)
    writer.write_footer()
    out_h.close()
    in_h.close()
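# A minimal usage sketch for reheader_fasta above. It assumes Biopython's
# FastaIterator title2ids convention: the callable receives the raw header
# line (without the leading '>') and returns an (id, name, description)
# tuple. The file names and the "keep only the first token" rule are
# placeholders.
def first_token(title):
    token = title.split()[0]
    return token, token, ''   # drop everything after the first whitespace

reheader_fasta('genome.fa.gz', 'genome.renamed.fa.bgz',
               header_function=first_token, in_gz=True, gz=True)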
def rewrite(self, compressed_input_file, output_file):
    h = gzip.open(compressed_input_file, "rb")
    data = h.read()
    h.close()

    h = bgzf.BgzfWriter(output_file, "wb")
    h.write(data)
    h.close()  # Gives empty BGZF block as BAM EOF marker

    h = gzip.open(output_file)
    new_data = h.read()
    h.close()

    # Check the decompressed files agree
    self.assertTrue(new_data, "Empty BGZF file?")
    self.assertEqual(len(data), len(new_data))
    self.assertEqual(data, new_data)
def fasta2dinucleotide(logger, fasta_file, bed_file, output_prefix, is_test=False):
    check_file_exists(fasta_file)
    check_file_exists(bed_file)

    regions = read_coordinate_file(bed_file, "region", checkOverlap=True)
    chromRegionMap = {}
    for region in regions:
        chromRegionMap.setdefault(region.reference_name, []).append(region)

    tmp_file = output_prefix + ".tmp.bed.bgz"
    with bgzf.BgzfWriter(tmp_file, "wb") as fout:
        with open(fasta_file, "rt") as fin:
            for record in SeqIO.parse(fin, 'fasta'):
                id = record.id
                if id not in chromRegionMap:
                    continue
                logger.info("Extracting dinucleotide of " + id + " ...")
                seq = str(record.seq)
                catItems = chromRegionMap[id]
                for ci in catItems:
                    catSeq = seq[ci.reference_start:ci.reference_end].upper()
                    if ci.strand == '-':
                        catSeq = str(Seq(catSeq).reverse_complement())
                    for si in range(0, len(catSeq) - 2):
                        dinu = catSeq[si:(si + 2)].upper()
                        fout.write("%s\t%d\t%d\t%s\t%d\t%s\n" %
                                   (id, ci.reference_start + si,
                                    ci.reference_start + si + 2, dinu, 1, ci.strand))

    output_file = output_prefix + ".bed.bgz"
    if os.path.exists(output_file):
        os.remove(output_file)
    os.rename(tmp_file, output_file)

    runCmd("tabix -p bed %s " % output_file, logger)

    count_file = output_prefix + ".count"
    dinucleotide_to_count(logger, output_file, count_file)

    logger.info("done.")
def barcode_to_tag(input_file, output_file, barcode, verbose):
    samfile = pysam.AlignmentFile(input_file, "rb")
    header = str(samfile.header)

    barcode_length = len(regex.findall('B', barcode))
    umi_length = len(regex.findall('U', barcode))
    barcode_pattern = '('
    if barcode_length > 0:
        barcode_pattern += '(.?)BC:Z:[A-Z]{' + str(barcode_length) + '}'
    if umi_length > 0:
        barcode_pattern += '(.?)RX:Z:[A-Z]{' + str(umi_length) + '}'
    barcode_pattern += ')'

    total = 0
    wrote = 0
    dirname, basename = os.path.split(input_file)
    with tempfile.TemporaryFile(prefix=basename, dir=dirname) as tmp:
        with pysam.AlignmentFile(input_file, "rb") as infile:
            for read in infile:
                name_list = regex.split(barcode_pattern, read.query_name)
                if name_list[0] == '':
                    name_list[2] = name_list[2].replace('.', '', 1)
                tags = regex.match(barcode_pattern,
                                   read.query_name).group().replace('.', '', 1)
                read.query_name = ''.join([name_list[0], name_list[2]])
                # tags = tuple(name_list[1].replace('.', ':Z:').split(':Z:'))[1:]
                # read.tags = read.tags + [tags[x:x + 2] for x in range(0, len(tags), 2)]
                tags = tuple(tags.replace('.', ':Z:').split(':Z:'))
                read.tags = read.tags + [tags[x:x + 2] for x in range(0, len(tags), 2)]
                tmp.write((pysam.AlignedSegment.to_string(read) + '\n').encode('utf8'))
                total += 1

        tmp.seek(0)
        with bgzf.BgzfWriter(output_file, "wb") as outfile:
            outfile.write(header.encode('utf8'))
            for read in tmp:
                outfile.write(read)
                wrote += 1

    if verbose:
        print(total, "entries read from the input file.")
        print(wrote, "entries written to the output file.")

    return 0
def rewrite(self, compressed_input_file, output_file):
    with gzip.open(compressed_input_file, "rb") as h:
        data = h.read()

    with bgzf.BgzfWriter(output_file, "wb") as h:
        h.write(data)
        self.assertFalse(h.seekable())
        self.assertFalse(h.isatty())
        self.assertEqual(h.fileno(), h._handle.fileno())
    # Context manager should call close(),
    # gives empty BGZF block as BAM EOF marker

    with gzip.open(output_file) as h:
        new_data = h.read()

    # Check the decompressed files agree
    self.assertTrue(new_data, "Empty BGZF file?")
    self.assertEqual(len(data), len(new_data))
    self.assertEqual(data, new_data)
def rewrite(self, compressed_input_file, output_file):
    h = gzip.open(compressed_input_file, "rb")
    data = h.read()
    h.close()

    h = bgzf.BgzfWriter(output_file, "wb")
    h.write(data)
    self.assertFalse(h.seekable())
    self.assertFalse(h.isatty())
    self.assertEqual(h.fileno(), h._handle.fileno())
    h.close()  # Gives empty BGZF block as BAM EOF marker

    h = gzip.open(output_file)
    new_data = h.read()
    h.close()

    # Check the decompressed files agree
    self.assertTrue(new_data, "Empty BGZF file?")
    self.assertEqual(len(data), len(new_data))
    self.assertEqual(data, new_data)
def __call__(self, string):
    # the special argument "-" means sys.std{in,out}
    if string == '-':
        if 'r' in self._mode:
            return sys.stdin
        elif 'w' in self._mode:
            return sys.stdout
        else:
            raise ValueError('argument "-" with mode %r' % self._mode)

    # all other arguments are used as file names
    try:
        if string[-3:] == ".gz":
            from Bio import bgzf
            if 'r' in self._mode:
                return bgzf.BgzfReader(string, self._mode)
            elif 'w' in self._mode or 'a' in self._mode:
                return bgzf.BgzfWriter(string, self._mode)
        else:
            return open(string, self._mode, self._bufsize)
    except OSError as e:
        raise ArgumentTypeError("can't open '%s': %s" % (string, e))
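# A minimal usage sketch for the FileType-style opener above. GzFileType is a
# hypothetical name for the class whose __call__ is shown (only the method
# appears in the snippet, so the constructor arguments are assumed to mirror
# argparse.FileType: a mode string and an optional buffer size). argparse
# calls the instance once per argument string, so ".gz" paths come back as
# Bio.bgzf handles and "-" maps to stdin/stdout.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--vcf-in', type=GzFileType('r'))    # hypothetical constructor
parser.add_argument('--vcf-out', type=GzFileType('w'))
args = parser.parse_args(['--vcf-in', 'calls.vcf.gz', '--vcf-out', '-'])
# args.vcf_in is a bgzf.BgzfReader; args.vcf_out is sys.stdout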
def gzip_speed():
    gzip_pipe = subprocess.Popen(args="gzip -c > tmp_file.gz",
                                 shell=True, stdin=subprocess.PIPE)
    randos = []
    for i in range(20000):
        rands = numpy.random.rand(300)
        string = " ".join(map(lambda x: str(x), rands))
        randos.append(string)

    with timer.Timer(logger=print, name="OS GZip") as t:
        for r in randos:
            gzip_pipe.stdin.write(r.encode())
        gzip_pipe.stdin.close()
        gzip_pipe.wait()

    with timer.Timer(logger=print, name="GZipLib") as t, \
            gzip.open("tmp2_file.gz", 'wt+', compresslevel=4) as f:
        for r in randos:
            f.write(r)

    with timer.Timer(logger=print, name="BGZipLib") as t, \
            bgzf.BgzfWriter("tmp3_file.gz", 'wt+', compresslevel=4) as f:
        for r in randos:
            f.write(r)
def phase_segment(out_filename, res, tumor_recs_used_reg, de_ads):
    with bgzf.BgzfWriter(out_filename) as fout:
        for info, ads in zip(tumor_recs_used_reg[res.phased], de_ads[res.phased]):
            segment_first = True
            first_max = -1
            for rec, ad in zip(info, ads):
                line = f'{rec[0]}\t{rec[1]}\t{rec[2]}\t{rec[3]}\t{rec[4]}\t{rec[5]}\t{ad[0]}\t{ad[1]}'
                if first_max < 0:
                    if ad[0] > ad[1]:
                        first_max = 0
                    else:
                        first_max = 1
                if segment_first:
                    if first_max == 0:
                        if ad[0] > ad[1]:
                            line += '\t0/1'
                        else:
                            line += '\t1/0'
                    else:
                        if ad[0] > ad[1]:
                            line += '\t1/0'
                        else:
                            line += '\t0/1'
                    segment_first = False
                else:
                    if first_max == 0:
                        if ad[0] > ad[1]:
                            line += '\t0|1'
                        else:
                            line += '\t1|0'
                    else:
                        if ad[0] > ad[1]:
                            line += '\t1|0'
                        else:
                            line += '\t0|1'
                fout.write(line + '\n')
def bam2dinucleotide(logger, bamFile, output_prefix, genomeFastaFile,
                     mappingQuality=20, uniqueOnly=False, minCoverage=1,
                     isTest=False):
    check_file_exists(bamFile)
    check_file_exists(genomeFastaFile)

    logger.info("reading bam file %s ..." % bamFile)

    dinuItems = []
    count = 0
    with pysam.AlignmentFile(bamFile, "rb") as sf:
        for s in sf.fetch():
            count = count + 1
            if count % 1000000 == 0:
                logger.info(count)
                if isTest:
                    break

            if s.is_unmapped:
                continue

            if s.is_paired and (not s.is_read1):
                continue

            if s.mapping_quality < mappingQuality:
                continue

            if uniqueOnly:
                isUnique = True
                for tag in s.tags:
                    if tag[0] == 'XS':
                        isUnique = False
                        break
                if not isUnique:
                    continue

            if s.is_reverse:
                dinuItems.append(
                    DinucleotideItem(s.reference_name, s.reference_end,
                                     s.reference_end + 2, s.query_name,
                                     s.mapping_quality, "-", ""))
            else:
                dinuItems.append(
                    DinucleotideItem(s.reference_name, s.reference_start - 2,
                                     s.reference_start, s.query_name,
                                     s.mapping_quality, "+", ""))

    chrDinuMap = OrderedDict()
    for di in dinuItems:
        chrDinuMap.setdefault(di.reference_name, []).append(di)

    for chr in chrDinuMap.keys():
        values = chrDinuMap[chr]
        logger.info("sort %d dinucleotides in chromosome %s..." % (len(values), chr))
        values.sort(key=get_reference_start)
        logger.info("combine %d dinucleotides in chromosome %s..." % (len(values), chr))
        idx = len(values) - 1
        deleteList = set()
        while idx > 0:
            curDinu = values[idx]
            prev = idx - 1
            while prev >= 0:
                prevDinu = values[prev]
                if curDinu.reference_start != prevDinu.reference_start:
                    break
                if curDinu.strand == prevDinu.strand:
                    prevDinu.count = prevDinu.count + curDinu.count
                    deleteList.add(idx)
                    break
                prev = prev - 1
            idx = idx - 1
        chrDinuMap[chr] = [i for j, i in enumerate(values) if j not in deleteList]
        logger.info("after combine, there is %d dinucleotides in chromosome %s..." %
                    (len(chrDinuMap[chr]), chr))

    if minCoverage > 1:
        for chr in chrDinuMap.keys():
            values = chrDinuMap[chr]
            chrDinuMap[chr] = [v for v in values if v.count >= minCoverage]

    with open(genomeFastaFile, "rt") as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            id = record.id
            logger.info("Filling dinucleotide of " + id + " ...")
            if id in chrDinuMap.keys():
                seq = str(record.seq)
                seqlen = len(seq)
                chrDinuItems = chrDinuMap[id]
                for di in chrDinuItems:
                    if di.reference_start >= 0 and di.reference_end <= seqlen:
                        dinu = seq[di.reference_start:di.reference_end].upper()
                        if di.strand == "+":
                            dinu = str(Seq(dinu).reverse_complement())
                        di.dinucleotide = dinu

    tmp_file = output_prefix + ".tmp.bed.bgz"
    logger.info("Writing dinucleotide to " + tmp_file + " ...")
    with bgzf.BgzfWriter(tmp_file, "wb") as fout:
        for chrom in chrDinuMap.keys():
            diList = chrDinuMap[chrom]
            for s in diList:
                if (s.dinucleotide != "") and ('N' not in s.dinucleotide):
                    fout.write("%s\t%d\t%d\t%s\t%d\t%s\n" %
                               (s.reference_name, s.reference_start, s.reference_end,
                                s.dinucleotide, s.count, s.strand))

    output_file = output_prefix + ".bed.bgz"
    if os.path.exists(output_file):
        os.remove(output_file)
    os.rename(tmp_file, output_file)

    runCmd("tabix -p bed %s " % output_file, logger)

    count_file = output_prefix + ".count"
    dinucleotide_to_count(logger, output_file, count_file)

    logger.info("done.")
try:
    allele_info = open(my_parser().allele, "r")
except FileNotFoundError:
    bomb(f"Missing argument or '{my_parser().allele}' may be empty\n")

toto = my_parser().type_

if not my_parser().out:
    bomb('Missing argument, "-o PREFIX", "--out PREFIX"\n'
         'Error: run `./snprecode -h` for complete arguments list '
         'required to recode from FImpute\n')

fo = my_parser().out

# open headers
if toto == 1:
    geno_out = bgzf.BgzfWriter(fo + ".vcf.gz", "wb")
    # write header
    geno_out.write("".join('''##fileformat=VCFv4.2
##filedate=%s
##source="snprecode v1.0.4"
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
''' % (datetime.today().strftime('%Y%m%d'))))
elif toto == 2:
    geno_out = open(fo + ".ped", "w")
else:
    print('\nError!: Missing argument. Specify the recode type: `-t 1` (for VCF) or `-t 2` (for PED/MAP)')
    print('run `./snprecode -h` for complete arguments list required to recode from FImpute\n')
import gzip
import argparse

# NOTE: the Biopython imports below are added so this snippet runs on its own;
# the original fragment assumed they were already in scope.
from Bio import SeqIO, bgzf
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import SingleLetterAlphabet  # legacy Biopython (<1.78) alphabet

parser = argparse.ArgumentParser(prog='fastq_trim_umi')
parser.add_argument('-i', help='input fastq.gz file', dest='input_fastq_gz')
parser.add_argument('-o', help='output fastq.gz file', dest='output_fastq_gz')
parser.add_argument('-l', help='length of the UMI barcode', dest='umi_len',
                    default=12, type=int)
args = parser.parse_args()

ifilename = args.input_fastq_gz
ofilename = args.output_fastq_gz
umilen = args.umi_len

with gzip.open(ifilename, "rt") as handle, bgzf.BgzfWriter(ofilename, "wb") as fout:
    for rec in SeqIO.parse(handle, "fastq"):
        # move the leading UMI bases out of the sequence and into the read id
        umi = rec.seq[0:umilen]
        rid = rec.description + ":" + str(umi)
        rseq = Seq(str(rec.seq)[umilen:], SingleLetterAlphabet())
        rq = rec.letter_annotations["phred_quality"][umilen:]
        nrec = SeqRecord(rseq, id=rid, description="")
        nrec.letter_annotations["phred_quality"] = rq
        SeqIO.write(sequences=nrec, handle=fout, format="fastq")
args = parser.parse_args()

if DEBUG:
    args.input = "T:/Shared/Labs/Linton Lab/20180913_linton_exomeseq_2118_human_cutadapt/bwa_refine_hc_gvcf_hardfilter/result/linton_exomeseq_2118.pass.vcf"
    args.output = "T:/Shared/Labs/Linton Lab/20180913_linton_exomeseq_2118_human_cutadapt/bwa_refine_hc_gvcf_hardfilter_vep/result/linton_exomeseq_2118.pass.filtered.vcf"

percentage = float(args.percentage)
frequency = float(args.frequency)

logger = initialize_logger(args.output + ".log", 'filterVcf', args.debug)
logger.info(str(args))

basename = os.path.splitext(args.output)[0]
if args.output.endswith(".gz"):
    outputTemp = basename + ".tmp.gz"
    fout = bgzf.BgzfWriter(outputTemp, "wb")
    fdiscard = bgzf.BgzfWriter(basename + ".discard.gz", "wb")
else:
    outputTemp = basename + ".tmp"
    fout = open(outputTemp, "wt")
    fdiscard = open(basename + ".discard", "wt")

if args.input.endswith(".gz"):
    if is_version_2():
        fin = gzip.open(args.input, 'rb')
    else:
        fin = gzip.open(args.input, 'rt')
else:
    fin = open(args.input, "r")

with fout:
def __call__(self):
    FastQFlag = self.flag
    datas = self.datas
    read_lists = []
    MinTime = time.time()
    FailedReads = 0
    ChrList = []
    try:
        file_path_check = next(i for i in datas if 'pass' in i)
    except:
        try:
            file_path_check = next(i for i in datas if 'fail' in i)
        except:
            file_path_check = datas[0]
    p_check = Pathcheck(file_path_check)
    for ds in datas:
        try:
            res = get_content(ds, FastQFlag, p_check)
            ch = res[0][0]
            mu = res[0][1]
            ChrList.append(res[0][1:])
            NewTime = float(res[0][2])
            if NewTime < MinTime:
                MinTime = NewTime
            if FastQFlag == True:
                if float(res[0][4]) >= 7.0:
                    read_lists.append(res[1])
        except:
            del ds
    ChrList[:] = [
        ChrList[z] for z in (y[0] for y in sorted(enumerate(zip(*ChrList)[1]),
                                                  key=lambda z: z[1]))
    ]
    TimeVec = map(
        lambda x: int(
            time.strftime('%H',
                          time.localtime(float(x) - float(self.RefStart)))),
        zip(*ChrList)[1])
    # print MinTime, str(self.RefStart)
    # print ChrList
    # print TimeVec
    for e in range(len(TimeVec)):
        hour = TimeVec[e]
        if e > 1:
            if hour < TimeVec[e - 1]:
                TimeVec[e] = hour + 24
    ChrList = zip(TimeVec, list(zip(*ChrList)[2]), list(zip(*ChrList)[3]),
                  list(zip(*ChrList)[0]), list(zip(*ChrList)[4]))
    ReadPerChannel = len(ChrList)
    BasesPerChannel = sum(map(int, zip(*ChrList)[1]))
    MuxProductivity = {str(il): [] for il in range(1, 5)}
    for k in MuxProductivity.keys():
        MuxProd = {}
        MuxBase = {}
        MucList = map(lambda y: ChrList[y],
                      [i for i, x in enumerate(zip(*ChrList)[3]) if x == k])
        for hr, val, qual, muc, gcs in MucList:
            if float(qual) < 7.0:
                FailedReads += 1
            if str(hr) in MuxProd:
                MuxProd[str(hr)] += int(val)
                MuxBase[str(hr)] += 1
            else:
                MuxProd[str(hr)] = int(val)
                MuxBase[str(hr)] = 1
        MuxProductivity[k].append(
            dict(zip(MuxBase.keys(), zip(MuxBase.values(), MuxProd.values()))))
    if FastQFlag == True:
        file_out = os.path.join(self.t_dir, 'tmp.' + str(self.Norder) + '.fastq.gz')
        Gzout = yielder(read_lists)
        with bgzf.BgzfWriter(file_out, "wb") as outgz:
            SeqIO.write(sequences=Gzout, handle=outgz, format="fastq")
    ObjectOut = []
    ObjectOut.append(str(self.Norder))
    ObjectOut.append(str(ReadPerChannel))
    ObjectOut.append(str(BasesPerChannel))
    ObjectOut.append(str(FailedReads))
    ObjectOut.append(ChrList)
    ObjectOut.append(MuxProductivity)
    return ObjectOut
import sys
import re
import os
import gzip

from Bio import SeqIO, bgzf
from Bio.Seq import reverse_complement

rev_com = []
output = "rev_com_" + sys.argv[1]

with gzip.open(sys.argv[1], "rt") as handle:
    for my_seq in SeqIO.parse(handle, "fastq"):
        rev_com.append(my_seq.reverse_complement(id=my_seq.id,
                                                 description=my_seq.description))

with bgzf.BgzfWriter(output, "wb") as outgz:
    SeqIO.write(rev_com, outgz, "fastq")
def __init__(self, out_prefix, paired=False, bam_header=None, vcf_header=None,
             no_fastq=False, fasta_instead=False):
    self.fasta_instead = fasta_instead
    # TODO Eliminate paired end as an option for fastas. Plan is to create a write fasta method.
    if self.fasta_instead:
        fq1 = pathlib.Path(out_prefix + '.fasta.gz')
        fq2 = None
    else:
        fq1 = pathlib.Path(out_prefix + '_read1.fq.gz')
        fq2 = pathlib.Path(out_prefix + '_read2.fq.gz')
    bam = pathlib.Path(out_prefix + '_golden.bam')
    vcf = pathlib.Path(out_prefix + '_golden.vcf.gz')

    # TODO Make a fasta-specific method
    self.no_fastq = no_fastq
    if not self.no_fastq:
        self.fq1_file = bgzf.open(fq1, 'w')
        self.fq2_file = None
        if paired:
            self.fq2_file = bgzf.open(fq2, 'w')

    # VCF OUTPUT
    self.vcf_file = None
    if vcf_header is not None:
        self.vcf_file = bgzf.open(vcf, 'wb')

        # WRITE VCF HEADER
        self.vcf_file.write('##fileformat=VCFv4.1\n'.encode('utf-8'))
        reference = '##reference=' + vcf_header[0] + '\n'
        self.vcf_file.write(reference.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">\n'.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">\n'.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=VMX,Number=1,Type=String,Description="SNP is Missense in these Read Frames">\n'.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=VNX,Number=1,Type=String,Description="SNP is Nonsense in these Read Frames">\n'.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=VFX,Number=1,Type=String,Description="Indel Causes Frameshift">\n'.encode('utf-8'))
        self.vcf_file.write('##INFO=<ID=WP,Number=A,Type=Integer,Description="NEAT-GenReads ploidy indicator">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=DEL,Description="Deletion">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=DUP,Description="Duplication">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=INS,Description="Insertion of novel sequence">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=INV,Description="Inversion">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=CNV,Description="Copy number variable region">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=TRANS,Description="Translocation">\n'.encode('utf-8'))
        self.vcf_file.write('##ALT=<ID=INV-TRANS,Description="Inverted translocation">\n'.encode('utf-8'))
        # TODO add sample to vcf output
        self.vcf_file.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'.encode('utf-8'))

    # BAM OUTPUT
    self.bam_file = None
    if bam_header is not None:
        self.bam_file = bgzf.BgzfWriter(bam, 'w', compresslevel=BAM_COMPRESSION_LEVEL)

        # WRITE BAM HEADER
        self.bam_file.write("BAM\1")
        header = '@HD\tVN:1.5\tSO:coordinate\n'
        for n in bam_header[0]:
            header += '@SQ\tSN:' + n[0] + '\tLN:' + str(n[3]) + '\n'
        header += '@RG\tID:NEAT\tSM:NEAT\tLB:NEAT\tPL:NEAT\n'
        header_bytes = len(header)
        num_refs = len(bam_header[0])
        self.bam_file.write(pack('<i', header_bytes))
        self.bam_file.write(header)
        self.bam_file.write(pack('<i', num_refs))
        for n in bam_header[0]:
            l_name = len(n[0]) + 1
            self.bam_file.write(pack('<i', l_name))
            self.bam_file.write(n[0] + '\0')
            self.bam_file.write(pack('<i', n[3]))

    # buffers for more efficient writing
    self.fq1_buffer = []
    self.fq2_buffer = []
    self.bam_buffer = []
from Bio import SeqIO, bgzf
from gzip import open as gzopen
import random

fq1 = SeqIO.parse(gzopen(
    "/home/wanghm/whm/ATAC/S0821_05A_CHG036758-Lane41-PH1-7d-ACAGTGGT_L001_R1.fastq.gz",
    "rt"), format="fastq")
fq2 = SeqIO.parse(gzopen(
    "/home/wanghm/whm/ATAC/S0821_05A_CHG036758-Lane41-PH1-7d-ACAGTGGT_L001_R2.fastq.gz",
    "rt"), format="fastq")

handle_out_rep1_r1 = bgzf.BgzfWriter("/home/wanghm/whm/ATAC/split/sex_rep1_r1.fastq.gz", "ab")
handle_out_rep1_r2 = bgzf.BgzfWriter("/home/wanghm/whm/ATAC/split/sex_rep1_r2.fastq.gz", "ab")
rep1_count = 0
handle_out_rep2_r1 = bgzf.BgzfWriter("/home/wanghm/whm/ATAC/split/sex_rep2_r1.fastq.gz", "ab")
handle_out_rep2_r2 = bgzf.BgzfWriter("/home/wanghm/whm/ATAC/split/sex_rep2_r2.fastq.gz", "ab")
rep2_count = 0

ll = [[handle_out_rep1_r1, handle_out_rep1_r2, rep1_count],
      [handle_out_rep2_r1, handle_out_rep2_r2, rep2_count]]

reads_count = 42304520  # reads count in fastq file
for seq in zip(fq1, fq2):
    tmp_repo = random.choice(ll)  # choose random element in list
# import ipdb; ipdb.set_trace()
# sys.stderr.write('\nprocessing the reads...')
for reads_file in options.reads:
    # output filename
    output = reads_file.split('.')[0] + '.collapsed.fasta.gz'

    # dictionary to count multiplicities of unique read sequences as key values,
    # preserving the order of appearance
    unique_seqs = OrderedDict()
    with gzip.open(reads_file, 'rt') as fd:
        for r in SeqIO.parse(fd, 'fastq'):
            seq = str(r.seq)
            # if the key is there the count is returned, otherwise the count is set to zero
            _ = unique_seqs.setdefault(seq, int(0))
            unique_seqs[seq] += 1

    # save FASTA records one at a time, let the OS deal with the IO...
    with bgzf.BgzfWriter(output, 'wb') as fd:
        for n, (seq, count) in enumerate(unique_seqs.items()):
            SeqIO.write(
                SeqRecord(Seq(seq, IUPAC.ambiguous_dna),
                          id='read_' + str(n + 1) + '_x' + str(count),
                          description=''), fd, 'fasta')
# sys.stderr.write('done!\n')