class GenericFastqFileReader:
    """Sequential reader for four-line FASTQ records.

    Lines are pulled from a GenericFileReader opened on ``filename``; each
    call to read_entry() consumes up to four of them.
    """

    def __init__(self, filename):
        self.filename = filename
        self.gfr = GenericFileReader(self.filename)
        # Not read by any method of this class; kept so the attribute
        # remains available to existing callers.
        self.previous_name = None

    def close(self):
        """Close the underlying line reader."""
        self.gfr.close()

    def read_entry(self):
        """Read the next record.

        Returns:
            False when the input is already exhausted, otherwise a dict
            with the keys 'name', 'seq' and 'quality' (trailing whitespace
            removed from each value).

        A record cut short by end-of-file is still returned, with empty
        strings for the missing lines, after a single warning on stderr.
        """
        header = self.gfr.readline()
        if not header:
            return False
        seq = self.gfr.readline()
        separator = self.gfr.readline()
        quality = self.gfr.readline()
        if not (seq and separator and quality):
            # One warning per truncated record is enough; the message is the
            # same whichever of the three lines is missing.
            sys.stderr.write("Warning: Improperly terminated fastq file line count not a multiple of 4\n")
        header = header.rstrip()
        # The name is everything after '@' up to the first tab.
        m = re.match(r'^@([^\t]+)', header)
        if m:
            name = m.group(1)
        else:
            sys.stderr.write("Warning: Could not read name\n")
            # Previously this path crashed on m.group(1); fall back to the
            # raw header text so the record can still be returned.
            name = header[1:] if header.startswith('@') else header
        return {'name': name, 'seq': seq.rstrip(), 'quality': quality.rstrip()}
def main():
    """Rewrite a genepred file so that every transcript name is unique.

    Command-line arguments:
        gpd_infile   genepred file to read
        gpd_outfile  genepred file to write; a companion file named
                     <gpd_outfile>.key_file is written next to it

    Lines are grouped by their second tab-separated column.  A name that
    occurs once is kept as is; a name that occurs k times becomes
    name[1/k] ... name[k/k].  Each key-file line is the new name, a tab,
    and the original line.  Lines starting with '#' and blank lines are
    skipped.
    """
    parser = argparse.ArgumentParser(description="Make a gpd with unique transcript names and a key to their original gpd entry\n")
    parser.add_argument("gpd_infile", help="FILENAME genepred file")
    parser.add_argument("gpd_outfile", help="FILENAME genepred file")
    args = parser.parse_args()

    seen = {}
    gfr = GenericFileReader(args.gpd_infile)
    try:
        while True:
            line = gfr.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            line = line.rstrip()
            if not line:
                # A blank line has no name column; indexing it used to
                # raise IndexError.
                continue
            fields = line.split("\t")
            seen.setdefault(fields[1], []).append(line)
    finally:
        # Release the input even if a malformed line aborts the scan.
        gfr.close()

    with open(args.gpd_outfile, 'w') as of_gpd, \
            open(args.gpd_outfile + ".key_file", 'w') as of_key:
        for tx in seen:
            entries = seen[tx]
            total = len(entries)
            for number, entry in enumerate(entries, 1):
                name = tx
                if total > 1:
                    name = tx + '[' + str(number) + '/' + str(total) + ']'
                fields = entry.split("\t")
                fields[1] = name
                of_key.write(name + "\t" + entry + "\n")
                of_gpd.write("\t".join(fields) + "\n")
def main():
    """Rewrite a genepred file so that every transcript name is unique.

    Command-line arguments:
        gpd_infile   genepred file to read
        gpd_outfile  genepred file to write; a companion file named
                     <gpd_outfile>.key_file is written next to it

    Lines are grouped by their second tab-separated column.  A name that
    occurs once is kept as is; a name that occurs k times becomes
    name[1/k] ... name[k/k].  Each key-file line is the new name, a tab,
    and the original line.  Lines starting with '#' and blank lines are
    skipped.
    """
    parser = argparse.ArgumentParser(
        description=
        "Make a gpd with unique transcript names and a key to their original gpd entry\n"
    )
    parser.add_argument("gpd_infile", help="FILENAME genepred file")
    parser.add_argument("gpd_outfile", help="FILENAME genepred file")
    args = parser.parse_args()

    seen = {}
    gfr = GenericFileReader(args.gpd_infile)
    try:
        while True:
            line = gfr.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            line = line.rstrip()
            if not line:
                # A blank line has no name column; indexing it used to
                # raise IndexError.
                continue
            fields = line.split("\t")
            seen.setdefault(fields[1], []).append(line)
    finally:
        # Release the input even if a malformed line aborts the scan.
        gfr.close()

    with open(args.gpd_outfile, 'w') as of_gpd, \
            open(args.gpd_outfile + ".key_file", 'w') as of_key:
        for tx in seen:
            entries = seen[tx]
            total = len(entries)
            for number, entry in enumerate(entries, 1):
                name = tx
                if total > 1:
                    name = tx + '[' + str(number) + '/' + str(total) + ']'
                fields = entry.split("\t")
                fields[1] = name
                of_key.write(name + "\t" + entry + "\n")
                of_gpd.write("\t".join(fields) + "\n")
def set_mapping_counts(self, psl_filename):
    """Count the PSL lines seen for each query name.

    Reads ``psl_filename`` line by line, parses each line with
    PSLBasics.line_to_entry and tallies the entries by their 'qName'
    value.  Lines that fail to parse are reported on stderr and skipped.

    On success ``self.mapping_counts`` holds the qName -> count dict and
    ``self.mapping_counts_set`` is True.
    """
    qcnts = {}
    gfr0 = GenericFileReader(psl_filename)
    try:
        while True:
            line = gfr0.readline()
            if not line:
                break
            try:
                psle = PSLBasics.line_to_entry(line.rstrip())
            except Exception:
                # Exception rather than a bare except, so Ctrl-C and
                # SystemExit are not mistaken for a bad line.
                sys.stderr.write("Problem parsing line:\n" + line.rstrip() + "\n")
                continue
            qname = psle['qName']
            qcnts[qname] = qcnts.get(qname, 0) + 1
    finally:
        # Close the reader even when an error escapes the loop.
        gfr0.close()
    self.mapping_counts = qcnts
    # Raise the flag only once the counts are actually in place.
    self.mapping_counts_set = True
def set_mapping_counts(self, psl_filename):
    """Count the PSL lines seen for each query name.

    Reads ``psl_filename`` line by line, parses each line with
    PSLBasics.line_to_entry and tallies the entries by their 'qName'
    value.  Lines that fail to parse are reported on stderr and skipped.

    On success ``self.mapping_counts`` holds the qName -> count dict and
    ``self.mapping_counts_set`` is True.
    """
    qcnts = {}
    gfr0 = GenericFileReader(psl_filename)
    try:
        while True:
            line = gfr0.readline()
            if not line:
                break
            try:
                psle = PSLBasics.line_to_entry(line.rstrip())
            except Exception:
                # Exception rather than a bare except, so Ctrl-C and
                # SystemExit are not mistaken for a bad line.
                sys.stderr.write("Problem parsing line:\n" + line.rstrip() +
                                 "\n")
                continue
            qname = psle['qName']
            qcnts[qname] = qcnts.get(qname, 0) + 1
    finally:
        # Close the reader even when an error escapes the loop.
        gfr0.close()
    self.mapping_counts = qcnts
    # Raise the flag only once the counts are actually in place.
    self.mapping_counts_set = True
class GenericFastaFileReader:
    """Sequential reader for FASTA records.

    Lines are pulled from a GenericFileReader opened on ``filename``.  The
    header of the record currently being collected is remembered in
    ``previous_name`` between calls, because a record is only known to be
    complete once the next header (or end of input) has been read.
    """

    def __init__(self, filename):
        self.filename = filename
        self.gfr = GenericFileReader(self.filename)
        # Header text (without '>') of the record in progress, or None
        # when no header has been read yet / the input is finished.
        self.previous_name = None

    def close(self):
        """Close the underlying line reader."""
        self.gfr.close()

    def _make_entry(self, name, seq_parts, raw_parts):
        """Assemble the dict returned by read_entry from collected lines."""
        return {
            'name': name,
            'seq': ''.join(seq_parts),
            'original': '>' + name + "\n" + ''.join(raw_parts),
        }

    def read_entry(self):
        """Read the next record.

        Returns:
            None when no further record exists, otherwise a dict with
            'name' (header text after '>'), 'seq' (sequence lines joined
            with trailing whitespace removed) and 'original' (a '>' header
            line followed by the sequence lines exactly as read).
        """
        seq_parts = []
        raw_parts = []
        while True:
            newline = self.gfr.readline()
            if not newline:
                # End of input: flush the record in progress, if any.
                if self.previous_name is None:
                    return None
                entry = self._make_entry(self.previous_name, seq_parts, raw_parts)
                self.previous_name = None
                return entry
            m = re.match('^>(.*)$', newline.rstrip())
            if m:
                # Compare against None, not truthiness: a bare '>' header
                # gives an empty name that is still a real record.
                if self.previous_name is None:
                    # First header of the input.  Anything read before it
                    # belongs to no record, so drop it instead of letting
                    # it leak into this record's sequence.
                    self.previous_name = m.group(1)
                    seq_parts = []
                    raw_parts = []
                    continue
                entry = self._make_entry(self.previous_name, seq_parts, raw_parts)
                self.previous_name = m.group(1)
                return entry
            seq_parts.append(newline.rstrip())
            raw_parts.append(newline)
class GenericFastaFileReader:
    """Sequential reader for FASTA records.

    Lines are pulled from a GenericFileReader opened on ``filename``.  The
    header of the record currently being collected is remembered in
    ``previous_name`` between calls, because a record is only known to be
    complete once the next header (or end of input) has been read.
    """

    def __init__(self, filename):
        self.filename = filename
        self.gfr = GenericFileReader(self.filename)
        # Header text (without '>') of the record in progress, or None
        # when no header has been read yet / the input is finished.
        self.previous_name = None

    def close(self):
        """Close the underlying line reader."""
        self.gfr.close()

    def _make_entry(self, name, seq_parts, raw_parts):
        """Assemble the dict returned by read_entry from collected lines."""
        return {
            'name': name,
            'seq': ''.join(seq_parts),
            'original': '>' + name + "\n" + ''.join(raw_parts),
        }

    def read_entry(self):
        """Read the next record.

        Returns:
            None when no further record exists, otherwise a dict with
            'name' (header text after '>'), 'seq' (sequence lines joined
            with trailing whitespace removed) and 'original' (a '>' header
            line followed by the sequence lines exactly as read).
        """
        seq_parts = []
        raw_parts = []
        while True:
            newline = self.gfr.readline()
            if not newline:
                # End of input: flush the record in progress, if any.
                if self.previous_name is None:
                    return None
                entry = self._make_entry(self.previous_name, seq_parts, raw_parts)
                self.previous_name = None
                return entry
            m = re.match('^>(.*)$', newline.rstrip())
            if m:
                # Compare against None, not truthiness: a bare '>' header
                # gives an empty name that is still a real record.
                if self.previous_name is None:
                    # First header of the input.  Anything read before it
                    # belongs to no record, so drop it instead of letting
                    # it leak into this record's sequence.
                    self.previous_name = m.group(1)
                    seq_parts = []
                    raw_parts = []
                    continue
                entry = self._make_entry(self.previous_name, seq_parts, raw_parts)
                self.previous_name = m.group(1)
                return entry
            seq_parts.append(newline.rstrip())
            raw_parts.append(newline)
def main():
    """Split one FASTQ file, or two paired FASTQ files, into numbered chunks.

    Command-line arguments:
        size              maximum number of four-line records per output file
        output_directory  directory to create; it must not exist yet
        fastq_files       one or two input files; with a single input, '-'
                          reads from standard input

    Single-input chunks are written as <n>.fq, paired chunks as <n>_1.fq
    and <n>_2.fq, with n counting up from 1.  Chunk 1 is always created;
    later chunks are only created once there is a record to put in them,
    so an input whose record count is an exact multiple of ``size`` no
    longer leaves an empty trailing file behind.  Argument problems are
    reported on stderr and the function returns without writing anything.
    """
    parser = argparse.ArgumentParser(
        description=
        'Split FASTQ file(s) into smaller ones with as many entries as you specify'
    )
    parser.add_argument('size',
                        type=int,
                        help='Number of sequences to put into each file')
    parser.add_argument('output_directory',
                        help='Name of the directory to put sequences')
    parser.add_argument('fastq_files',
                        nargs='+',
                        help='FILENAME(S) for fastq files')
    args = parser.parse_args()
    if len(args.fastq_files) > 2:
        sys.stderr.write("ERROR only two fastq files at most are supported\n")
        return
    if os.path.exists(args.output_directory):
        sys.stderr.write("ERROR output directory exists already\n")
        return
    os.makedirs(args.output_directory)

    paired = len(args.fastq_files) == 2
    if paired:
        suffixes = ['_1.fq', '_2.fq']
    else:
        suffixes = ['.fq']
    out_base = args.output_directory.rstrip('/') + '/'

    readers = []  # one line reader per input file
    writers = []  # output handles of the chunk currently being filled

    def open_chunk(index):
        """Open the output file(s) for chunk number ``index``."""
        for suffix in suffixes:
            writers.append(open(out_base + str(index) + suffix, 'w'))

    def close_chunk():
        """Close and forget the currently open output file(s)."""
        while writers:
            writers.pop().close()

    try:
        for filename in args.fastq_files:
            # Standard input is only supported for a single, unpaired input.
            if not paired and filename == '-':
                readers.append(sys.stdin)
            else:
                readers.append(GenericFileReader(filename))
        out_iter = 1
        fcount = 0
        open_chunk(out_iter)
        while True:
            headers = [reader.readline() for reader in readers]
            if not all(headers):
                # With two inputs, one ending before the other means the
                # mates are out of step.
                if any(headers):
                    sys.stderr.write(
                        "WARNING paired file lengths appear different\n")
                break
            if not writers:
                open_chunk(out_iter)
            for reader, writer, header in zip(readers, writers, headers):
                # A FASTQ record is the header plus the next three lines.
                writer.write(header)
                writer.write(reader.readline())
                writer.write(reader.readline())
                writer.write(reader.readline())
            fcount += 1
            if args.size <= fcount:
                # Chunk is full; the next one is opened lazily above.
                fcount = 0
                out_iter += 1
                close_chunk()
    finally:
        close_chunk()
        for reader in readers:
            # Only close what this function opened; leave stdin alone.
            if reader is not sys.stdin:
                reader.close()
def main():
    """Split one FASTQ file, or two paired FASTQ files, into numbered chunks.

    Command-line arguments:
        size              maximum number of four-line records per output file
        output_directory  directory to create; it must not exist yet
        fastq_files       one or two input files; with a single input, '-'
                          reads from standard input

    Single-input chunks are written as <n>.fq, paired chunks as <n>_1.fq
    and <n>_2.fq, with n counting up from 1.  Chunk 1 is always created;
    later chunks are only created once there is a record to put in them,
    so an input whose record count is an exact multiple of ``size`` no
    longer leaves an empty trailing file behind.  Argument problems are
    reported on stderr and the function returns without writing anything.
    """
    parser = argparse.ArgumentParser(description='Split FASTQ file(s) into smaller ones with as many entries as you specify')
    parser.add_argument('size', type=int, help='Number of sequences to put into each file')
    parser.add_argument('output_directory', help='Name of the directory to put sequences')
    parser.add_argument('fastq_files', nargs='+', help='FILENAME(S) for fastq files')
    args = parser.parse_args()
    if len(args.fastq_files) > 2:
        sys.stderr.write("ERROR only two fastq files at most are supported\n")
        return
    if os.path.exists(args.output_directory):
        sys.stderr.write("ERROR output directory exists already\n")
        return
    os.makedirs(args.output_directory)

    paired = len(args.fastq_files) == 2
    if paired:
        suffixes = ['_1.fq', '_2.fq']
    else:
        suffixes = ['.fq']
    out_base = args.output_directory.rstrip('/') + '/'

    readers = []  # one line reader per input file
    writers = []  # output handles of the chunk currently being filled

    def open_chunk(index):
        """Open the output file(s) for chunk number ``index``."""
        for suffix in suffixes:
            writers.append(open(out_base + str(index) + suffix, 'w'))

    def close_chunk():
        """Close and forget the currently open output file(s)."""
        while writers:
            writers.pop().close()

    try:
        for filename in args.fastq_files:
            # Standard input is only supported for a single, unpaired input.
            if not paired and filename == '-':
                readers.append(sys.stdin)
            else:
                readers.append(GenericFileReader(filename))
        out_iter = 1
        fcount = 0
        open_chunk(out_iter)
        while True:
            headers = [reader.readline() for reader in readers]
            if not all(headers):
                # With two inputs, one ending before the other means the
                # mates are out of step.
                if any(headers):
                    sys.stderr.write("WARNING paired file lengths appear different\n")
                break
            if not writers:
                open_chunk(out_iter)
            for reader, writer, header in zip(readers, writers, headers):
                # A FASTQ record is the header plus the next three lines.
                writer.write(header)
                writer.write(reader.readline())
                writer.write(reader.readline())
                writer.write(reader.readline())
            fcount += 1
            if args.size <= fcount:
                # Chunk is full; the next one is opened lazily above.
                fcount = 0
                out_iter += 1
                close_chunk()
    finally:
        close_chunk()
        for reader in readers:
            # Only close what this function opened; leave stdin alone.
            if reader is not sys.stdin:
                reader.close()