def main():
    """Make a genepred file with unique transcript names plus a key file
    mapping each new unique name back to its original genepred entry.

    Duplicated transcript names become "name[i/n]"; the key file
    (gpd_outfile + ".key_file") holds one "new_name<TAB>original_line" row
    per output entry.
    """
    parser = argparse.ArgumentParser(
        description="Make a gpd with unique transcript names and a key to their original gpd entry\n")
    parser.add_argument("gpd_infile", help="FILENAME genepred file")
    parser.add_argument("gpd_outfile", help="FILENAME genepred file")
    args = parser.parse_args()

    # Bucket input lines by transcript name (column 2 of a genepred line).
    gfr = GenericFileReader(args.gpd_infile)
    seen = {}
    while True:
        line = gfr.readline()
        if not line:
            break
        if line.startswith('#'):  # skip comment lines
            continue
        line = line.rstrip()
        fields = line.split("\t")
        seen.setdefault(fields[1], []).append(line)
    gfr.close()

    # Fix: use context managers so both output files are flushed and closed
    # even if a write fails part-way through.
    with open(args.gpd_outfile, 'w') as of_gpd, \
         open(args.gpd_outfile + ".key_file", 'w') as of_key:
        for tx in seen:
            entries = seen[tx]
            total = len(entries)
            for i, entry in enumerate(entries):
                # Only rename when the transcript name is actually duplicated.
                name = tx if total == 1 else tx + '[' + str(i + 1) + '/' + str(total) + ']'
                fields = entry.split("\t")
                fields[1] = name
                of_key.write(name + "\t" + entry + "\n")
                of_gpd.write("\t".join(fields) + "\n")
def main():
    """Rename duplicate transcripts so every genepred entry has a unique name.

    Duplicates become "name[i/n]"; a companion ".key_file" maps each new
    name to its original genepred line.
    """
    parser = argparse.ArgumentParser(
        description="Make a gpd with unique transcript names and a key to their original gpd entry\n")
    parser.add_argument("gpd_infile", help="FILENAME genepred file")
    parser.add_argument("gpd_outfile", help="FILENAME genepred file")
    args = parser.parse_args()

    # Group the input lines by transcript name (second column).
    seen = {}
    gfr = GenericFileReader(args.gpd_infile)
    try:
        while True:
            line = gfr.readline()
            if not line:
                break
            if line.startswith('#'):  # '#' marks a comment line
                continue
            line = line.rstrip()
            name = line.split("\t")[1]
            if name not in seen:
                seen[name] = []
            seen[name].append(line)
    finally:
        gfr.close()

    # Fix: context managers guarantee the output files are closed/flushed
    # even when an error interrupts the writing loop.
    with open(args.gpd_outfile, 'w') as of_gpd, \
         open(args.gpd_outfile + ".key_file", 'w') as of_key:
        for tx in seen:
            n = len(seen[tx])
            for i, original_line in enumerate(seen[tx], 1):
                uniq = tx if n == 1 else tx + '[' + str(i) + '/' + str(n) + ']'
                fields = original_line.split("\t")
                fields[1] = uniq
                of_key.write(uniq + "\t" + original_line + "\n")
                of_gpd.write("\t".join(fields) + "\n")
def main():
    """Replace alignment splice junctions with nearby reference junctions.

    Reads a PSL (file or '-' for STDIN) and a reference genepred; for every
    alignment, junctions within --search_size of a reference junction are
    nudged onto it (via nudge()).  Output goes to STDOUT as genepred, or as
    a fake PSL when --output_fake_psl supplies a reference genome FASTA.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size', type=int, default=68,
                        help="INT min intron size")
    parser.add_argument('--min_local_support', type=int, default=0,
                        help="INT min number of junctions within search_size of a junction in order to count it")
    parser.add_argument('--search_size', type=int, default=10,
                        help="INT search space for reference")
    parser.add_argument('--output_fake_psl',
                        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    parser.add_argument('reference_genepred',
                        help="FASTAFILENAME for reference genepred")
    args = parser.parse_args()
    # NOTE: removed unused locals from the original (cpus, read_info,
    # fcount_total) -- none of them were ever read.
    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)
    # Read the reference genepred and index every junction by chromosome,
    # then donor (exon end) and acceptor (exon start + 1, i.e. 1-based).
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    ref = {}
    for e in [x.entry for x in gpf.entries]:
        if len(e['exonStarts']) <= 1:
            continue  # single-exon entries contribute no junctions
        if e['chrom'] not in ref:
            ref[e['chrom']] = {}
        for i in range(1, len(e['exonStarts'])):
            donor = e['exonEnds'][i - 1]
            acceptor = e['exonStarts'][i] + 1  # stored all junctions as 1-base
            if donor not in ref[e['chrom']]:
                ref[e['chrom']][donor] = {}
            if acceptor not in ref[e['chrom']][donor]:
                ref[e['chrom']][donor][acceptor] = e['strand']
    pf = GenericFileReader(args.psl)
    while True:
        line = pf.readline()
        if not line:
            break
        if re.match('^#', line):
            continue
        line = line.rstrip()
        pe = PSLBasics.line_to_entry(line)
        # Sanity check: block arrays must agree in length.
        if len(pe['tStarts']) != len(pe['blockSizes']) or len(pe['qStarts']) != len(pe['blockSizes']):
            sys.stderr.write("WARNING invalid psl\n")
            continue
        genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
        ge = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(genepred_line), args.min_intron_size)
        refjuns = {}
        if pe['tName'] in ref:
            refjuns = ref[pe['tName']]
        new_ge = nudge(pe, ge, refjuns, args)
        # Fix: the original used Python-2-only `print` statements;
        # sys.stdout.write behaves identically and is version-neutral.
        if args.output_fake_psl:
            sys.stdout.write(GenePredBasics.entry_to_fake_psl_line(new_ge, genome) + "\n")
        else:
            sys.stdout.write(GenePredBasics.entry_to_line(new_ge) + "\n")
    pf.close()  # fix: reader was never closed
def __init__(self, filename):
    """Open *filename* and load every non-comment line as a GenePredEntry.

    Populates self.entries with one parsed entry per input line.
    """
    self.filename = filename
    self.gfr = GenericFileReader(filename)
    self.entries = []
    while True:
        line = self.gfr.readline()
        if not line:
            break  # end of input
        if line.startswith('#'):
            continue  # comment line
        entry = GenePredEntry()
        entry.line_to_entry(line)
        self.entries.append(entry)
def set_mapping_counts(self, psl_filename):
    """Count how many PSL alignments each query name has in *psl_filename*.

    Stores the result in self.mapping_counts (qName -> int) and sets
    self.mapping_counts_set to True.  Unparseable lines are reported to
    stderr and skipped.
    """
    self.mapping_counts_set = True
    reader = GenericFileReader(psl_filename)
    qcnts = {}
    while True:
        line = reader.readline()
        if not line:
            break
        try:
            psle = PSLBasics.line_to_entry(line.rstrip())
        except Exception:  # fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
            sys.stderr.write("Problem parsing line:\n" + line.rstrip() + "\n")
            continue
        qcnts[psle['qName']] = qcnts.get(psle['qName'], 0) + 1
    reader.close()
    self.mapping_counts = qcnts
def set_mapping_counts(self, psl_filename):
    """Tally alignment counts per query name from a PSL file.

    Sets self.mapping_counts to a {qName: count} dict and flags
    self.mapping_counts_set.  Lines that fail to parse are warned about
    on stderr and skipped.
    """
    self.mapping_counts_set = True
    psl_reader = GenericFileReader(psl_filename)
    counts = {}
    while True:
        line = psl_reader.readline()
        if not line:
            break
        try:
            entry = PSLBasics.line_to_entry(line.rstrip())
        except Exception:  # fix: narrowed from bare `except:`
            sys.stderr.write("Problem parsing line:\n" + line.rstrip() + "\n")
            continue
        qname = entry['qName']
        counts[qname] = counts.get(qname, 0) + 1
    psl_reader.close()
    self.mapping_counts = counts
class GenericFastaFileReader:
    """Streams FASTA records one at a time through a GenericFileReader.

    read_entry() returns a dict with 'name' (header text after '>'),
    'seq' (sequence lines concatenated, newlines stripped) and
    'original' (the raw record text), or None when input is exhausted.
    """

    def __init__(self, filename):
        self.filename = filename
        self.gfr = GenericFileReader(self.filename)
        self.previous_name = None  # header carried over for the record in progress

    def close(self):
        self.gfr.close()

    def read_entry(self):
        """Return the next FASTA record as a dict, or None at end of input."""
        seq_parts = []
        raw_parts = []
        while True:
            line = self.gfr.readline()
            if not line:
                if not self.previous_name:
                    return None  # no record in progress: clean EOF
                # EOF terminates the record we were accumulating.
                name = self.previous_name
                self.previous_name = None
                return {'name': name,
                        'seq': ''.join(seq_parts),
                        'original': '>' + name + "\n" + ''.join(raw_parts)}
            header = re.match('^>(.*)$', line.rstrip())
            if header and not self.previous_name:
                # First header of the stream: remember it and keep reading.
                self.previous_name = header.group(1)
                continue
            if header:
                # The next header closes out the current record.
                name = self.previous_name
                self.previous_name = header.group(1)
                return {'name': name,
                        'seq': ''.join(seq_parts),
                        'original': '>' + name + "\n" + ''.join(raw_parts)}
            seq_parts.append(line.rstrip())
            raw_parts.append(line)
class GenericFastaFileReader:
    """Sequential FASTA record reader backed by GenericFileReader.

    Each record is returned as {'name', 'seq', 'original'};
    None signals end of input.
    """

    def __init__(self, filename):
        self.filename = filename
        self.gfr = GenericFileReader(self.filename)
        # Header line content for the record not yet emitted.
        self.previous_name = None

    def close(self):
        self.gfr.close()

    def read_entry(self):
        """Return dict(name, seq, original) for the next record, else None."""
        seq = ''
        raw = ''
        while True:
            chunk = self.gfr.readline()
            at_eof = not chunk
            if at_eof and not self.previous_name:
                return None  # nothing buffered and nothing left to read
            m = None if at_eof else re.match('^>(.*)$', chunk.rstrip())
            if at_eof or (m and self.previous_name):
                # Either EOF or a fresh header finishes the current record.
                record = {
                    'name': self.previous_name,
                    'seq': seq,
                    'original': '>' + self.previous_name + "\n" + raw,
                }
                self.previous_name = m.group(1) if m else None
                return record
            if m:
                # Special case: very first header of the stream.
                self.previous_name = m.group(1)
                continue
            seq += chunk.rstrip()
            raw += chunk
def __init__(self, filename):
    """Read *filename* and materialize all non-comment genepred entries
    into self.entries."""
    self.filename = filename
    self.gfr = GenericFileReader(filename)
    self.entries = []
    line = self.gfr.readline()
    while line:
        if not re.match('^#', line):  # '#' lines are comments
            entry = GenePredEntry()
            entry.line_to_entry(line)
            self.entries.append(entry)
        line = self.gfr.readline()
class GenericFastqFileReader:
    """Streams FASTQ records (4 lines each) through a GenericFileReader.

    read_entry() returns {'name', 'seq', 'quality'} for the next record,
    or False at end of input / on a malformed record.
    """

    def __init__(self, filename):
        self.filename = filename
        self.gfr = GenericFileReader(self.filename)
        # Kept for parity with the FASTA reader; not used by this class.
        self.previous_name = None

    def close(self):
        self.gfr.close()

    def read_entry(self):
        """Return the next FASTQ record, or False at EOF / on bad input.

        Fixes over the original: a truncated record (input not a multiple
        of 4 lines) now warns once and returns False instead of emitting a
        bogus partial entry, and an unparseable name line returns False
        after its warning instead of crashing on m.group(1).
        """
        line1 = self.gfr.readline()
        if not line1:
            return False  # clean end of input
        line2 = self.gfr.readline()
        line3 = self.gfr.readline()
        line4 = self.gfr.readline()
        if not line2 or not line3 or not line4:
            sys.stderr.write("Warning: Improperly terminated fastq file line count not a multiple of 4\n")
            return False
        # Name is everything after '@' up to the first tab.
        m = re.match('^@([^\t]+)', line1.rstrip())
        if not m:
            sys.stderr.write("Warning: Could not read name\n")
            return False
        return {'name': m.group(1),
                'seq': line2.rstrip(),
                'quality': line4.rstrip()}
def __init__(self, filename):
    """Remember the path and open it through GenericFileReader for streaming.

    previous_name starts as None: no record header has been seen yet.
    """
    self.filename = filename
    self.previous_name = None
    self.gfr = GenericFileReader(filename)
def main():
    """Split one or two (paired) FASTQ files into chunks of `size` records.

    Creates output_directory and writes numbered chunk files into it:
    1.fq, 2.fq, ... for a single input ('-' reads STDIN), or
    1_1.fq/1_2.fq, 2_1.fq/2_2.fq, ... for a pair.
    """
    parser = argparse.ArgumentParser(
        description='Split FASTQ file(s) into smaller ones with as many entries as you specify')
    parser.add_argument('size', type=int,
                        help='Number of sequences to put into each file')
    parser.add_argument('output_directory',
                        help='Name of the directory to put sequences')
    parser.add_argument('fastq_files', nargs='+',
                        help='FILENAME(S) for fastq files')
    args = parser.parse_args()
    if len(args.fastq_files) > 2:
        sys.stderr.write("ERROR only two fastq files at most are supported\n")
        return
    if os.path.exists(args.output_directory):
        sys.stderr.write("ERROR output directory exists already\n")
        return
    os.makedirs(args.output_directory)
    outdir = args.output_directory.rstrip('/')
    if len(args.fastq_files) == 1:
        out_iter = 1
        fcount = 0
        of = open(outdir + '/' + str(out_iter) + '.fq', 'w')
        gfr = sys.stdin
        if args.fastq_files[0] != '-':
            gfr = GenericFileReader(args.fastq_files[0])
        while True:
            lineA = gfr.readline()
            if not lineA:
                break
            # A FASTQ record is exactly four lines; copy them through verbatim.
            lineB = gfr.readline()
            lineC = gfr.readline()
            lineD = gfr.readline()
            of.write(lineA)
            of.write(lineB)
            of.write(lineC)
            of.write(lineD)
            fcount += 1
            if args.size <= fcount:
                fcount = 0
                out_iter += 1
                of.close()
                of = open(outdir + '/' + str(out_iter) + '.fq', 'w')
        of.close()  # fix: the last chunk file was never closed (unflushed output)
        gfr.close()
    else:  # we have two paired fastq files
        out_iter = 1
        fcount = 0
        of1 = open(outdir + '/' + str(out_iter) + '_1.fq', 'w')
        gfr1 = GenericFileReader(args.fastq_files[0])
        of2 = open(outdir + '/' + str(out_iter) + '_2.fq', 'w')
        gfr2 = GenericFileReader(args.fastq_files[1])
        while True:
            line1a = gfr1.readline()
            line2a = gfr2.readline()
            if not line1a or not line2a:
                if line1a or line2a:
                    sys.stderr.write("WARNING paired file lengths appear different\n")
                break
            line1b = gfr1.readline()
            line2b = gfr2.readline()
            line1c = gfr1.readline()
            line2c = gfr2.readline()
            line1d = gfr1.readline()
            line2d = gfr2.readline()
            of1.write(line1a)
            of2.write(line2a)
            of1.write(line1b)
            of2.write(line2b)
            of1.write(line1c)
            of2.write(line2c)
            of1.write(line1d)
            of2.write(line2d)
            fcount += 1
            if args.size <= fcount:
                fcount = 0
                out_iter += 1
                of1.close()
                of2.close()
                of1 = open(outdir + '/' + str(out_iter) + '_1.fq', 'w')
                of2 = open(outdir + '/' + str(out_iter) + '_2.fq', 'w')
        of1.close()  # fix: final pair of chunk files was never closed
        of2.close()
        gfr1.close()
        gfr2.close()
def main():
    """Split FASTQ input into chunk files of a fixed number of records.

    Single input: writes <dir>/1.fq, 2.fq, ...  ('-' means STDIN).
    Paired input: writes <dir>/1_1.fq + 1_2.fq, 2_1.fq + 2_2.fq, ...
    """
    parser = argparse.ArgumentParser(
        description='Split FASTQ file(s) into smaller ones with as many entries as you specify')
    parser.add_argument('size', type=int,
                        help='Number of sequences to put into each file')
    parser.add_argument('output_directory',
                        help='Name of the directory to put sequences')
    parser.add_argument('fastq_files', nargs='+',
                        help='FILENAME(S) for fastq files')
    args = parser.parse_args()
    if len(args.fastq_files) > 2:
        sys.stderr.write("ERROR only two fastq files at most are supported\n")
        return
    if os.path.exists(args.output_directory):
        sys.stderr.write("ERROR output directory exists already\n")
        return
    os.makedirs(args.output_directory)
    prefix = args.output_directory.rstrip('/') + '/'
    if len(args.fastq_files) == 1:
        out_iter = 1
        fcount = 0
        of = open(prefix + str(out_iter) + '.fq', 'w')
        gfr = sys.stdin
        if args.fastq_files[0] != '-':
            gfr = GenericFileReader(args.fastq_files[0])
        while True:
            lineA = gfr.readline()
            if not lineA:
                break
            # Copy the remaining three lines of the 4-line FASTQ record.
            lineB = gfr.readline()
            lineC = gfr.readline()
            lineD = gfr.readline()
            of.write(lineA)
            of.write(lineB)
            of.write(lineC)
            of.write(lineD)
            fcount += 1
            if args.size <= fcount:
                # Chunk full: roll over to the next numbered file.
                fcount = 0
                out_iter += 1
                of.close()
                of = open(prefix + str(out_iter) + '.fq', 'w')
        of.close()  # fix: last chunk was left open in the original
        gfr.close()
    else:  # two fastq files: split them in lockstep
        out_iter = 1
        fcount = 0
        of1 = open(prefix + str(out_iter) + '_1.fq', 'w')
        gfr1 = GenericFileReader(args.fastq_files[0])
        of2 = open(prefix + str(out_iter) + '_2.fq', 'w')
        gfr2 = GenericFileReader(args.fastq_files[1])
        while True:
            line1a = gfr1.readline()
            line2a = gfr2.readline()
            if not line1a or not line2a:
                if line1a or line2a:
                    sys.stderr.write("WARNING paired file lengths appear different\n")
                break
            line1b = gfr1.readline()
            line2b = gfr2.readline()
            line1c = gfr1.readline()
            line2c = gfr2.readline()
            line1d = gfr1.readline()
            line2d = gfr2.readline()
            of1.write(line1a)
            of2.write(line2a)
            of1.write(line1b)
            of2.write(line2b)
            of1.write(line1c)
            of2.write(line2c)
            of1.write(line1d)
            of2.write(line2d)
            fcount += 1
            if args.size <= fcount:
                fcount = 0
                out_iter += 1
                of1.close()
                of2.close()
                of1 = open(prefix + str(out_iter) + '_1.fq', 'w')
                of2 = open(prefix + str(out_iter) + '_2.fq', 'w')
        of1.close()  # fix: final chunk pair was left open in the original
        of2.close()
        gfr1.close()
        gfr2.close()
def main():
    """Snap alignment splice junctions onto close-by reference junctions.

    Input is a PSL stream (file or '-' for STDIN) and a reference genepred.
    Each alignment is converted to genepred, gap-smoothed, nudged against
    the reference junction index, and printed to STDOUT as genepred (or as
    a fake PSL line when --output_fake_psl names a genome FASTA).
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size', type=int, default=68,
                        help="INT min intron size")
    parser.add_argument('--min_local_support', type=int, default=0,
                        help="INT min number of junctions within search_size of a junction in order to count it")
    parser.add_argument('--search_size', type=int, default=10,
                        help="INT search space for reference")
    parser.add_argument('--output_fake_psl',
                        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    parser.add_argument('reference_genepred',
                        help="FASTAFILENAME for reference genepred")
    args = parser.parse_args()
    # Removed dead locals from the original (cpus, read_info, fcount_total).
    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)
    # Index reference junctions: chrom -> donor (exon end) -> acceptor
    # (exon start + 1; all junctions stored 1-based) -> strand.
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    ref = {}
    for e in [x.entry for x in gpf.entries]:
        if len(e['exonStarts']) <= 1:
            continue  # no junctions in a single-exon entry
        if e['chrom'] not in ref:
            ref[e['chrom']] = {}
        for i in range(1, len(e['exonStarts'])):
            donor = e['exonEnds'][i - 1]
            acceptor = e['exonStarts'][i] + 1
            if donor not in ref[e['chrom']]:
                ref[e['chrom']][donor] = {}
            if acceptor not in ref[e['chrom']][donor]:
                ref[e['chrom']][donor][acceptor] = e['strand']
    pf = GenericFileReader(args.psl)
    while True:
        line = pf.readline()
        if not line:
            break
        if re.match('^#', line):
            continue
        line = line.rstrip()
        pe = PSLBasics.line_to_entry(line)
        if len(pe['tStarts']) != len(pe['blockSizes']) or len(pe['qStarts']) != len(pe['blockSizes']):
            sys.stderr.write("WARNING invalid psl\n")
            continue
        genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
        ge = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(genepred_line), args.min_intron_size)
        refjuns = {}
        if pe['tName'] in ref:
            refjuns = ref[pe['tName']]
        new_ge = nudge(pe, ge, refjuns, args)
        # Fix: replaced Python-2-only `print` statements with the
        # behavior-identical, version-neutral sys.stdout.write.
        if args.output_fake_psl:
            sys.stdout.write(GenePredBasics.entry_to_fake_psl_line(new_ge, genome) + "\n")
        else:
            sys.stdout.write(GenePredBasics.entry_to_line(new_ge) + "\n")
    pf.close()  # fix: reader was never closed