def main(): parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file") parser.add_argument('input',help="PSLFILE or - for STIDN") parser.add_argument('reference',help="FASTAFILE reference genome") parser.add_argument('query',help="FASTAFILE query sequences") parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT") #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance") args = parser.parse_args() # Read in the reference genome sys.stderr.write("Reading in reference genome\n") g = read_fasta_into_hash(args.reference) sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n") inf = sys.stdin if args.input != '-': inf = open(args.input) fhr = FastaHandleReader(open(args.query)) last_fasta = fhr.read_entry() if not last_fasta: sys.stderr.write("ERROR: No query sequences\n") sys.exit() for line in inf: p = PSLBasics.PSL(line) if not p.validate(): sys.stderr.write("WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n") n = p.value('qName') if not last_fasta: sys.stderr.write("ERROR: Ran out of query sequences too soon. Are they sorted properly\n") sys.exit() while last_fasta['name'] != n: last_fasta = fhr.read_entry() p.set_query(last_fasta['seq']) p.set_reference_dictionary(g) print p.get_line() p.pretty_print(50) fhr.close()
def main():
  parser = argparse.ArgumentParser(description='Use reference junctions when they are close',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--min_intron_size',type=int,default=68,help="INT min intron size")
  parser.add_argument('--min_local_support',type=int,default=0,help="INT min number of junctions within search_size of a junction in order to count it")
  parser.add_argument('--search_size',type=int,default=10,help="INT search space for reference")
  parser.add_argument('--output_fake_psl',help="FASTAFILE reference genome to make a fake PSL output")
  parser.add_argument('psl',help="PSLFILENAME or '-' for STDIN")
  parser.add_argument('reference_genepred',help="GENEPRED FILENAME for reference genepred")
  args = parser.parse_args()
  cpus = multiprocessing.cpu_count()
  genome = {}
  if args.output_fake_psl:
    genome = read_fasta_into_hash(args.output_fake_psl)
  # Read in the reference genepred first
  gpf = GenePredBasics.GenePredFile(args.reference_genepred)
  # Sort entries by chromosome, storing all junctions as 1-based coordinates
  ref = {}
  for e in [x.entry for x in gpf.entries]:
    if len(e['exonStarts']) <= 1: continue
    if e['chrom'] not in ref:
      ref[e['chrom']] = {}
    for i in range(1,len(e['exonStarts'])):
      if e['exonEnds'][i-1] not in ref[e['chrom']]:
        ref[e['chrom']][e['exonEnds'][i-1]] = {}
      if e['exonStarts'][i]+1 not in ref[e['chrom']][e['exonEnds'][i-1]]:
        ref[e['chrom']][e['exonEnds'][i-1]][e['exonStarts'][i]+1] = e['strand']
  read_info = {}
  pf = GenericFileReader(args.psl)
  fcount_total = 0
  while True:
    line = pf.readline()
    if not line: break
    if re.match('^#',line): continue
    line = line.rstrip()
    pe = PSLBasics.line_to_entry(line)
    if len(pe['tStarts']) != len(pe['blockSizes']) or len(pe['qStarts']) != len(pe['blockSizes']):
      sys.stderr.write("WARNING invalid psl\n")
      continue
    genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
    ge = GenePredBasics.smooth_gaps(GenePredBasics.line_to_entry(genepred_line),args.min_intron_size)
    refjuns = {}
    if pe['tName'] in ref:
      refjuns = ref[pe['tName']]
    new_ge = nudge(pe,ge,refjuns,args)
    if args.output_fake_psl:
      new_psl_line = GenePredBasics.entry_to_fake_psl_line(new_ge,genome)
      print new_psl_line
    else:
      print GenePredBasics.entry_to_line(new_ge)
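
# The reference junction index built above is a nested dict keyed by
# chromosome, then donor coordinate (exon end, 1-based), then acceptor
# coordinate (next exon start + 1), with the strand as the value. A toy
# illustration of how nudge() can query it:
toy_ref = {'chr1': {1000: {1100: '+'}}}
toy_chrom, toy_donor, toy_acceptor = 'chr1', 1000, 1100
if toy_chrom in toy_ref and toy_donor in toy_ref[toy_chrom] \
   and toy_acceptor in toy_ref[toy_chrom][toy_donor]:
  print 'reference junction on strand '+toy_ref[toy_chrom][toy_donor][toy_acceptor]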
def main(): parser = argparse.ArgumentParser(description="For every genepred entry report its alignability",formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('input',help="Genepred can be gzipped or - for STDIN") parser.add_argument('-r','--reference',required=True,help="Reference fasta") parser.add_argument('-k','--fragment_size',default=100,type=int,help="Fragment size to try to align") parser.add_argument('-x','--hisat_index',required=True,help="HISAT index base name") parser.add_argument('--threads',type=int,default=cpu_count(),help="number of threads") parser.add_argument('--type',choices=['mean','median'],default='mean',help="How to bring together overlapping reads") parser.add_argument('--perbase',action='store_true') parser.add_argument('--output','-o',help="output file or leave unset for STDOUT") args = parser.parse_args() if args.input=='-': args.input=sys.stdin elif re.search('\.gz$',args.input): args.input = gzip.open(args.input) else: args.input = open(args.input) udir = os.path.dirname(os.path.realpath(__file__)) cmd2 = udir+'/genepred_counts_to_mappability.py -' cmd2 += ' --threads '+str(args.threads) cmd2 += ' -k '+str(args.fragment_size) if args.perbase: cmd2 += ' --perbase' if args.output: cmd2 += ' --output '+args.output if args.type: cmd2 += ' --type '+args.type p2 = Popen(cmd2.split(),stdin=PIPE) ref = read_fasta_into_hash(args.reference) cmd1 = 'hisat -x '+args.hisat_index+' -U - -f --reorder -p '+str(args.threads) p1 = Popen(cmd1.split(),stdin=PIPE,stdout=p2.stdin,stderr=null) #p1 = Popen(cmd1.split(),stdin=PIPE,stdout=p2.stdin) line_number = 0 for line in args.input: line_number +=1 gpd = GPD(line.rstrip()) #print gpd.entry['name'] #print gpd.length() if gpd.length() < args.fragment_size: continue seq = gpd.get_sequence(ref) for i in range(0,len(seq)-args.fragment_size+1): info = gpd.value('name')+"\t"+gpd.value('gene_name')+"\t"+str(line_number)+"\t"+str(len(seq))+"\t"+str(i) einfo = encode_name(info) p1.stdin.write('>'+einfo+"\n") p1.stdin.write(seq[i:i+args.fragment_size]+"\n") p1.communicate() p2.communicate()
def main(): parser = argparse.ArgumentParser( description="Correct the matches/mismatches and Ncount of a PSL file") parser.add_argument('input', help="PSLFILE or - for STIDN") parser.add_argument('reference', help="FASTAFILE reference genome") parser.add_argument('query', help="FASTAFILE query sequences") parser.add_argument('--minimum_intron_size', type=int, default=68, help="INT") #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance") args = parser.parse_args() # Read in the reference genome sys.stderr.write("Reading in reference genome\n") g = read_fasta_into_hash(args.reference) sys.stderr.write("Finished reading " + str(len(g.keys())) + " reference sequences\n") inf = sys.stdin if args.input != '-': inf = open(args.input) fhr = FastaHandleReader(open(args.query)) last_fasta = fhr.read_entry() if not last_fasta: sys.stderr.write("ERROR: No query sequences\n") sys.exit() for line in inf: p = PSLBasics.PSL(line) if not p.validate(): sys.stderr.write( "WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n" + line.rstrip() + "\n") n = p.value('qName') if not last_fasta: sys.stderr.write( "ERROR: Ran out of query sequences too soon. Are they sorted properly\n" ) sys.exit() while last_fasta['name'] != n: last_fasta = fhr.read_entry() p.set_query(last_fasta['seq']) p.set_reference_dictionary(g) print p.get_line() p.pretty_print(50) fhr.close()
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('ref_genome')
  parser.add_argument('phased_vcf')
  args = parser.parse_args()
  g = read_fasta_into_hash(args.ref_genome)
  # Two mutable copies of the genome, one per haplotype
  gL = {}
  gR = {}
  for chr in g:
    gL[chr] = [x for x in g[chr].upper()]
    gR[chr] = [x for x in g[chr].upper()]
  z = 0
  with open(args.phased_vcf) as inf:
    for line in inf:
      if re.match('#',line): continue
      z += 1
      sys.stderr.write(str(z)+"\r")
      f = line.rstrip().split("\t")
      chr = f[0]
      [n1,n2] = [int(x) for x in f[9].split('|')]
      if int(f[1]) > len(gL[chr]):
        sys.stderr.write(line)
        sys.exit()
      # Substitute the REF (f[3]) or ALT (f[4]) allele at the 1-based position
      if n1 == 0:
        gL[chr][int(f[1])-1] = f[3]
      else:
        gL[chr][int(f[1])-1] = f[4]
      if n2 == 0:
        gR[chr][int(f[1])-1] = f[3]
      else:
        gR[chr][int(f[1])-1] = f[4]
  sys.stderr.write("\nalmost done\n")
  ofL = open('L.fa','w')
  for chr in sorted(gL.keys()):
    ofL.write(">"+chr+"\n")
    ofL.write("".join(gL[chr])+"\n")
  ofL.close()
  ofR = open('R.fa','w')
  for chr in sorted(gR.keys()):
    ofR.write(">"+chr+"\n")
    ofR.write("".join(gR[chr])+"\n")
  ofR.close()
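
# The haplotype split above assumes a phased GT field like "0|1" in column 10
# of the VCF. A toy line (illustrative values) showing what gets substituted
# where; coordinates are 1-based:
toy_vcf = "chr1\t5\t.\tA\tG\t.\t.\t.\tGT\t0|1"
toy_f = toy_vcf.split("\t")
toy_n1, toy_n2 = [int(x) for x in toy_f[9].split('|')]
# haplotype L gets toy_f[3] ('A') since toy_n1 == 0;
# haplotype R gets toy_f[4] ('G') since toy_n2 == 1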
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('reference_genome')
  parser.add_argument('transcripts_genepred')
  parser.add_argument('--out_gpd',help="fusion genepred",required=True)
  parser.add_argument('--out_fasta',help="fusion fasta",required=True)
  parser.add_argument('--fusion_count',type=int,default=1000,help="Create this many fusions; the max is the number of genes/2.")
  args = parser.parse_args()
  ref = read_fasta_into_hash(args.reference_genome)
  of_gpd = open(args.out_gpd,'w')
  of_fasta = open(args.out_fasta,'w')
  genes = {}
  with open(args.transcripts_genepred) as inf:
    for line in inf:
      gpd = GPD(line.rstrip())
      if gpd.value('exonCount') <= 1: continue
      if gpd.value('gene_name') not in genes:
        genes[gpd.value('gene_name')] = []
      genes[gpd.value('gene_name')].append(gpd)
  gene_names = genes.keys()
  fusion_count = args.fusion_count
  shuffle(gene_names)
  # Draw disjoint gene pairs from the shuffled list
  pairs = []
  while True:
    if len(pairs) == fusion_count: break
    if len(gene_names) < 2: break
    pair = [gene_names[0],gene_names[1]]
    pairs.append(pair)
    gene_names.pop(0)
    gene_names.pop(0)
  for pair in pairs:
    [gpds,ars] = get_random_gpds_from_pair(pair,genes,ref)
    print ars.name
    of_fasta.write(ars.get_fasta())
    for gpd in gpds:
      of_gpd.write(gpd+"\n")
  of_gpd.close()
  of_fasta.close()
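
# Example invocation for the fusion generator above (the script name and
# filenames are illustrative placeholders, not names from this repository):
#   python simulate_fusions.py genome.fa transcripts.gpd \
#     --out_gpd fusions.gpd --out_fasta fusions.fa --fusion_count 500
# Each emitted fusion pairs two distinct multi-exon genes drawn without
# replacement from the shuffled gene list.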
def main():
  parser = argparse.ArgumentParser(description='Create artificial reference sequences from a genepred')
  parser.add_argument('gpd_file')
  parser.add_argument('reference_fasta')
  parser.add_argument('-o','--output',help="output file to write to or STDOUT if not set")
  args = parser.parse_args()
  of = sys.stdout
  if args.output:
    of = open(args.output,'w')
  f = read_fasta_into_hash(args.reference_fasta)
  with open(args.gpd_file) as inf:
    for line in inf:
      gpd = GenePredBasics.GenePredEntry()
      gpd.line_to_entry(line.rstrip())
      ars = ARS()
      # One Bed per exon defines the bounds of the artificial reference
      beds = []
      for i in range(0,gpd.value('exonCount')):
        b = Bed(gpd.value('chrom'),gpd.value('exonStarts')[i],gpd.value('exonEnds')[i],gpd.value('strand'))
        beds.append(b)
      ars.set_bounds(beds)
      ars.set_name(gpd.value('name'))
      ars.set_sequence_from_original_reference_hash(f)
      of.write(ars.get_fasta())
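
# Bed above is used purely as a (chrom, start, end, strand) container. A
# minimal stand-in with that shape, assuming BED-style 0-based half-open
# coordinates as genepred exonStarts/exonEnds provide; the repository's own
# Bed class may carry more functionality:
class BedSketch(object):
  def __init__(self, chrom, start, end, strand='+'):
    self.chrom = chrom
    self.start = start
    self.end = end
    self.strand = strand
  def length(self):
    return self.end - self.start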
def main(): parser = argparse.ArgumentParser( description="Create a simulated RNA-seq dataset") parser.add_argument('reference_genome', help="The reference genome.") parser.add_argument( 'transcripts_genepred', help= "A genepred file describing the transcripts. Each transcript name must be unique." ) group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--uniform_expression', action='store_true', help="Uniform distribution of transcript expression") group.add_argument( '--isoform_expression', help= "The transcript expression in TSV format <Transcript name> tab <Expression>" ) group.add_argument( '--cufflinks_isoform_expression', help= "The expression of the isoforms or - for a uniform distribution of transcript expression" ) group2 = parser.add_mutually_exclusive_group() group2.add_argument('--long_reads_only', action='store_true') group2.add_argument('--short_reads_only', action='store_true') group2.add_argument('--output', help="Directory name for output") parser.add_argument('--short_read_count', type=int, default=10000, help="INT number of short reads") parser.add_argument('--short_read_length', type=int, default=101, help="INT length of the short reads") parser.add_argument('--long_read_count', type=int, default=4000, help="INT default number of long reads") parser.add_argument('--no_errors', action='store_true') parser.add_argument('--threads', type=int, default=1) args = parser.parse_args() if args.output: args.output = args.output.rstrip('/') fq_prof_pacbio_ccs95 = None fq_prof_pacbio_subreads = None fq_prof_illumina = None if not args.no_errors: fq_prof_pacbio_ccs95 = default_pacbio_ccs95() fq_prof_pacbio_subreads = default_pacbio_subreads() fq_prof_illumina = default_illumina() ref = read_fasta_into_hash(args.reference_genome) txn = Transcriptome() txn.set_reference_genome_dictionary(ref) with open(args.transcripts_genepred) as inf: for line in inf: if line[0] == '#': continue txn.add_genepred_line(line.rstrip()) if args.isoform_expression: sys.stderr.write("Reading expression from a TSV\n") with open(args.isoform_expression) as inf: line1 = inf.readline() for line in inf: f = line.rstrip().split("\t") txn.add_expression(f[0], float(f[1])) elif args.uniform_expression: sys.stderr.write("Using uniform expression model\n") elif args.cufflinks_isoform_expression: sys.stderr.write("Using cufflinks expression\n") with open(args.cufflinks_isoform_expression) as inf: line1 = inf.readline() for line in inf: f = line.rstrip().split("\t") txn.add_expression(f[0], float(f[9])) sys.stderr.write("have transcriptome\n") for n in txn.ref_hash.keys(): del txn.ref_hash[n] rbe = SimulationBasics.RandomTranscriptomeEmitter(txn) # Now we have the transcriptomes set #Now our dataset is set up if args.short_reads_only: rbe.set_gaussian_fragmentation_default_hiseq() for zi in range(0, args.short_read_count): [name, seq] = rbe.emit_short_read(args.short_read_length) if args.no_errors: print "@SRSIM" + str(zi + 1) print seq print "+" print 'I' * len(seq) else: l1perm = fq_prof_illumina.create_fastq_and_permute_sequence( seq) print "@SRSIM" + str(zi + 1) print l1perm['seq'] print "+" print l1perm['qual'] return if args.long_reads_only: rbe.set_gaussian_fragmentation_default_pacbio() for zi in range(0, args.long_read_count): [name, seq] = rbe.emit_long_read() if args.no_errors: g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str( zi + 1) + '/ccs' print "@" + g print seq print "+" print 'I' * len(seq) else: g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str( zi + 1) 
+ '/ccs' seqperm = fq_prof_pacbio_ccs95.create_fastq_and_permute_sequence( seq) print "@" + g print seqperm['seq'] print "+" print seqperm['qual'] return if not os.path.exists(args.output): os.makedirs(args.output) rbe.set_gaussian_fragmentation_default_hiseq() # Lets prepare to output now sys.stderr.write("Sequencing short reads\n") global left_handle global right_handle left_handle = gzip.open(args.output + "/SR_1.fq.gz", 'wb') right_handle = gzip.open(args.output + "/SR_2.fq.gz", 'wb') buffer_size = 10000 buffer = [] if args.threads > 1: p = Pool(processes=args.threads) z = 0 for i in range(0, args.short_read_count): z = i + 1 if z % 1000 == 0: sys.stderr.write(str(z) + "\r") buffer.append(z) if len(buffer) >= buffer_size: if args.threads <= 1: v = process_short_read_buffer(buffer[:], rbe, args, fq_prof_illumina) do_short(v) else: p.apply_async(process_short_read_buffer, args=(buffer[:], rbe, args, fq_prof_illumina), callback=do_short) buffer = [] if len(buffer) > 0: if args.threads <= 1: v = process_short_read_buffer(buffer[:], rbe, args, fq_prof_illumina) do_short(v) else: p.apply_async(process_short_read_buffer, args=(buffer[:], rbe, args, fq_prof_illumina), callback=do_short) buffer = [] if args.threads > 1: p.close() p.join() global greport of = open(args.output + "/SR_report.txt", 'w') for name in greport: of.write("\t".join([str(x) for x in greport[name]]) + "\n") of.close() greport = {} sys.stderr.write("\nFinished sequencing short reads\n") left_handle.close() right_handle.close() # Now lets create the long read set rbe.set_gaussian_fragmentation_default_pacbio() sys.stderr.write("Sequencing ccs long reads\n") global long_handle long_handle = gzip.open(args.output + "/LR_ccs.fq.gz", 'wb') buffer_size = 1000 buffer = [] if args.threads > 1: p = Pool(processes=args.threads) z = 0 for i in range(0, args.long_read_count): z = i + 1 if z % 100 == 0: sys.stderr.write(str(z) + "\r") buffer.append(z) if len(buffer) >= buffer_size: if args.threads <= 1: v = process_long_reads(buffer[:], rbe, args, fq_prof_pacbio_ccs95, 'ccs') do_long(v) else: p.apply_async(process_long_reads, args=(buffer[:], rbe, args, fq_prof_pacbio_ccs95, 'ccs'), callback=do_long) buffer = [] if len(buffer) > 0: if args.threads <= 1: v = process_long_reads(buffer[:], rbe, args, fq_prof_pacbio_ccs95, 'ccs') do_long(v) else: p.apply_async(process_long_reads, args=(buffer[:], rbe, args, fq_prof_pacbio_ccs95, 'ccs'), callback=do_long) buffer = [] if args.threads > 1: p.close() p.join() long_handle.close() of = open(args.output + "/LR_ccs_report.txt", 'w') for name in greport: of.write("\t".join([str(x) for x in greport[name]]) + "\n") of.close() greport = {} sys.stderr.write("\nFinished sequencing ccs long reads\n") sys.stderr.write("Sequencing long sub reads\n") long_handle = gzip.open(args.output + "/LR_sub.fq.gz", 'wb') buffer_size = 1000 buffer = [] if args.threads > 1: p = Pool(processes=args.threads) for i in range(z, z + args.long_read_count): z = i + 1 if z % 100 == 0: sys.stderr.write(str(z) + "\r") buffer.append(z) if len(buffer) >= buffer_size: if args.threads <= 1: v = process_long_reads(buffer[:], rbe, args, fq_prof_pacbio_subreads, 'sub') do_long(v) else: p.apply_async(process_long_reads, args=(buffer[:], rbe, args, fq_prof_pacbio_subreads, 'sub'), callback=do_long) buffer = [] if len(buffer) > 0: if args.threads <= 1: v = process_long_reads(buffer[:], rbe, args, fq_prof_pacbio_subreads, 'sub') do_long(v) else: p.apply_async(process_long_reads, args=(buffer[:], rbe, args, fq_prof_pacbio_subreads, 'sub'), 
callback=do_long) buffer = [] if args.threads > 1: p.close() p.join() long_handle.close() of = open(args.output + "/LR_sub_report.txt", 'w') for name in greport: of.write("\t".join([str(x) for x in greport[name]]) + "\n") of.close() greport = {} sys.stderr.write("\nFinished sequencing long sub reads\n") combo = {} with open(args.output + "/SR_report.txt") as inf: for line in inf: f = line.rstrip().split("\t") [name, express, left] = f if name not in combo: combo[name] = {} combo[name]['express'] = express combo[name]['left'] = 0 combo[name]['left'] += int(left) with open(args.output + "/LR_ccs_report.txt") as inf: for line in inf: f = line.rstrip().split("\t") [name, express, left] = f if name not in combo: combo[name] = {} combo[name]['express'] = express combo[name]['left'] = 0 combo[name]['left'] += int(left) with open(args.output + "/LR_sub_report.txt") as inf: for line in inf: f = line.rstrip().split("\t") [name, express, left] = f if name not in combo: combo[name] = {} combo[name]['express'] = express combo[name]['left'] = 0 combo[name]['left'] += int(left) of = open(args.output + "/LR_SR_combo_report.txt", 'w') for name in sorted(combo): of.write(name + "\t" + combo[name]['express'] + "\t" + str(combo[name]['left']) + "\n") of.close()
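
# The short- and long-read loops above share one pattern: batch work-item ids
# into a buffer and flush each full buffer to a worker, either inline or via
# Pool.apply_async with a callback that serializes writes in the parent.
# A stripped-down, self-contained sketch of that pattern:
from multiprocessing import Pool as _SketchPool
def sketch_worker(batch):
  return [x*x for x in batch]  # stand-in for a read-emitting function
def sketch_writer(results):
  pass  # runs in the parent process, so shared file handles are safe here
def run_buffered_sketch(n_items, buffer_size, threads):
  p = _SketchPool(processes=threads)
  buf = []
  for i in range(1,n_items+1):
    buf.append(i)
    if len(buf) >= buffer_size:
      p.apply_async(sketch_worker,args=(buf[:],),callback=sketch_writer)
      buf = []
  if buf:
    p.apply_async(sketch_worker,args=(buf[:],),callback=sketch_writer)
  p.close()
  p.join()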
def main(): parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file") parser.add_argument('input',help="PSLFILE or - for STIDN") parser.add_argument('reference',help="FASTAFILE reference genome") parser.add_argument('query',help="FASTAFILE query sequences") parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT") #parser.add_argument('--ordered_query',action='store_true',help="The query psl and fasta are both ordered by query name for optimal performance") args = parser.parse_args() # Read in the reference genome sys.stderr.write("Reading in reference genome\n") g = read_fasta_into_hash(args.reference) sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n") inf = sys.stdin if args.input != '-': inf = open(args.input) fhr = FastaHandleReader(open(args.query)) last_fasta = fhr.read_entry() if not last_fasta: sys.stderr.write("ERROR: No query sequences\n") sys.exit() for line in inf: p = PSLBasics.PSL(line) if not p.validate(): sys.stderr.write("WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n") n = p.value('qName') if not last_fasta: sys.stderr.write("ERROR: Ran out of query sequences too soon. Are they sorted properly\n") sys.exit() while last_fasta['name'] != n: last_fasta = fhr.read_entry() p.set_query(last_fasta['seq']) p.set_reference_dictionary(g) p.correct_stats() print p.get_line() continue f = last_fasta nCount = 0 matches = 0 misMatches = 0 prev_qE = 0 prev_tE = 0 qNumInsert = 0 qBaseInsert = 0 tNumInsert = 0 tBaseInsert = 0 for i in range(p.value('blockCount')): blen = p.value('blockSizes')[i] qS = p.value('qStarts')[i] #query start qE = qS + blen #query end tS = p.value('tStarts')[i] #target start tE = tS + blen #target end #Work on gaps if prev_qE > 0 or prev_tE > 0: #if its not our first time through tgap = tS-prev_tE if tgap < args.minimum_intron_size and tgap > 0: tNumInsert += 1 tBaseInsert += tgap qgap = qS-prev_qE if qgap > 0: qNumInsert += 1 qBaseInsert += qgap query = f['seq'] if p.value('strand') == '-': query = rc(f['seq']) qseq = query[qS:qE].upper() rseq = g[p.value('tName')][tS:tE].upper() #print qseq+"\n"+rseq+"\n" for j in range(0,blen): if qseq[j] == 'N': nCount += 1 elif qseq[j] == rseq[j]: matches += 1 else: misMatches += 1 prev_qE = qE prev_tE = tE p.entry['matches'] = matches p.entry['misMatches'] = misMatches p.entry['nCount'] = nCount p.entry['qNumInsert'] = qNumInsert p.entry['qBaseInsert'] = qBaseInsert p.entry['tNumInsert'] = tNumInsert p.entry['tBaseInsert'] = tBaseInsert p.entry['qSize'] = len(query) p.entry['tSize'] = len(g[p.value('tName')]) print p.get_line() #p.pretty_print(100) fhr.close()
def main(): parser = argparse.ArgumentParser(description="Create a simulated RNA-seq dataset") parser.add_argument('reference_genome',help="The reference genome.") parser.add_argument('transcripts_genepred',help="A genepred file describing the transcripts. Each transcript name must be unique.") group = parser.add_mutually_exclusive_group(required=True) group.add_argument('--uniform_expression',action='store_true',help="Uniform distribution of transcript expression") group.add_argument('--isoform_expression',help="The transcript expression in TSV format <Transcript name> tab <Expression>") group.add_argument('--cufflinks_isoform_expression',help="The expression of the isoforms or - for a uniform distribution of transcript expression") group2 = parser.add_mutually_exclusive_group() group2.add_argument('--long_reads_only',action='store_true') group2.add_argument('--short_reads_only',action='store_true') group2.add_argument('--output',help="Directory name for output") parser.add_argument('--short_read_count',type=int,default=10000,help="INT number of short reads") parser.add_argument('--short_read_length',type=int,default=101,help="INT length of the short reads") parser.add_argument('--long_read_count',type=int,default=4000,help="INT default number of long reads") parser.add_argument('--no_errors',action='store_true') parser.add_argument('--threads',type=int,default=1) args = parser.parse_args() if args.output: args.output = args.output.rstrip('/') fq_prof_pacbio_ccs95 = None fq_prof_pacbio_subreads = None fq_prof_illumina = None if not args.no_errors: fq_prof_pacbio_ccs95 = default_pacbio_ccs95() fq_prof_pacbio_subreads = default_pacbio_subreads() fq_prof_illumina = default_illumina() ref = read_fasta_into_hash(args.reference_genome) txn = Transcriptome() txn.set_reference_genome_dictionary(ref) with open(args.transcripts_genepred) as inf: for line in inf: if line[0]=='#': continue txn.add_genepred_line(line.rstrip()) if args.isoform_expression: sys.stderr.write("Reading expression from a TSV\n") with open(args.isoform_expression) as inf: line1 = inf.readline() for line in inf: f = line.rstrip().split("\t") txn.add_expression(f[0],float(f[1])) elif args.uniform_expression: sys.stderr.write("Using uniform expression model\n") elif args.cufflinks_isoform_expression: sys.stderr.write("Using cufflinks expression\n") with open(args.cufflinks_isoform_expression) as inf: line1 = inf.readline() for line in inf: f = line.rstrip().split("\t") txn.add_expression(f[0],float(f[9])) sys.stderr.write("have transcriptome\n") for n in txn.ref_hash.keys(): del txn.ref_hash[n] rbe = SimulationBasics.RandomTranscriptomeEmitter(txn) # Now we have the transcriptomes set #Now our dataset is set up if args.short_reads_only: rbe.set_gaussian_fragmentation_default_hiseq() for zi in range(0,args.short_read_count): [name,seq] = rbe.emit_short_read(args.short_read_length) if args.no_errors: print "@SRSIM"+str(zi+1) print seq print "+" print 'I'*len(seq) else: l1perm = fq_prof_illumina.create_fastq_and_permute_sequence(seq) print "@SRSIM"+str(zi+1) print l1perm['seq'] print "+" print l1perm['qual'] return if args.long_reads_only: rbe.set_gaussian_fragmentation_default_pacbio() for zi in range(0,args.long_read_count): [name,seq] = rbe.emit_long_read() if args.no_errors: g = 'm150101_010101_11111_c111111111111111111_s1_p0/'+str(zi+1)+'/ccs' print "@"+g print seq print "+" print 'I'*len(seq) else: g = 'm150101_010101_11111_c111111111111111111_s1_p0/'+str(zi+1)+'/ccs' seqperm = 
fq_prof_pacbio_ccs95.create_fastq_and_permute_sequence(seq) print "@"+g print seqperm['seq'] print "+" print seqperm['qual'] return if not os.path.exists(args.output): os.makedirs(args.output) rbe.set_gaussian_fragmentation_default_hiseq() # Lets prepare to output now sys.stderr.write("Sequencing short reads\n") global left_handle global right_handle left_handle = gzip.open(args.output+"/SR_1.fq.gz",'wb') right_handle = gzip.open(args.output+"/SR_2.fq.gz",'wb') buffer_size = 10000 buffer = [] if args.threads > 1: p = Pool(processes=args.threads) z = 0 for i in range(0,args.short_read_count): z = i+1 if z %1000==0: sys.stderr.write(str(z)+"\r") buffer.append(z) if len(buffer) >= buffer_size: if args.threads <= 1: v = process_short_read_buffer(buffer[:],rbe,args,fq_prof_illumina) do_short(v) else: p.apply_async(process_short_read_buffer,args=(buffer[:],rbe,args,fq_prof_illumina),callback=do_short) buffer = [] if len(buffer) > 0: if args.threads <= 1: v = process_short_read_buffer(buffer[:],rbe,args,fq_prof_illumina) do_short(v) else: p.apply_async(process_short_read_buffer,args=(buffer[:],rbe,args,fq_prof_illumina),callback=do_short) buffer = [] if args.threads > 1: p.close() p.join() global greport of = open(args.output+"/SR_report.txt",'w') for name in greport: of.write("\t".join([str(x) for x in greport[name]])+"\n") of.close() greport = {} sys.stderr.write("\nFinished sequencing short reads\n") left_handle.close() right_handle.close() # Now lets create the long read set rbe.set_gaussian_fragmentation_default_pacbio() sys.stderr.write("Sequencing ccs long reads\n") global long_handle long_handle = gzip.open(args.output+"/LR_ccs.fq.gz",'wb') buffer_size = 1000 buffer = [] if args.threads > 1: p = Pool(processes=args.threads) z = 0 for i in range(0,args.long_read_count): z = i+1 if z %100==0: sys.stderr.write(str(z)+"\r") buffer.append(z) if len(buffer) >= buffer_size: if args.threads <= 1: v = process_long_reads(buffer[:],rbe,args,fq_prof_pacbio_ccs95,'ccs') do_long(v) else: p.apply_async(process_long_reads,args=(buffer[:],rbe,args,fq_prof_pacbio_ccs95,'ccs'),callback=do_long) buffer = [] if len(buffer) > 0: if args.threads <= 1: v = process_long_reads(buffer[:],rbe,args,fq_prof_pacbio_ccs95,'ccs') do_long(v) else: p.apply_async(process_long_reads,args=(buffer[:],rbe,args,fq_prof_pacbio_ccs95,'ccs'),callback=do_long) buffer = [] if args.threads > 1: p.close() p.join() long_handle.close() of = open(args.output+"/LR_ccs_report.txt",'w') for name in greport: of.write("\t".join([str(x) for x in greport[name]])+"\n") of.close() greport = {} sys.stderr.write("\nFinished sequencing ccs long reads\n") sys.stderr.write("Sequencing long sub reads\n") long_handle = gzip.open(args.output+"/LR_sub.fq.gz",'wb') buffer_size = 1000 buffer = [] if args.threads > 1: p = Pool(processes=args.threads) for i in range(z,z+args.long_read_count): z = i+1 if z %100==0: sys.stderr.write(str(z)+"\r") buffer.append(z) if len(buffer) >= buffer_size: if args.threads <= 1: v = process_long_reads(buffer[:],rbe,args,fq_prof_pacbio_subreads,'sub') do_long(v) else: p.apply_async(process_long_reads,args=(buffer[:],rbe,args,fq_prof_pacbio_subreads,'sub'),callback=do_long) buffer = [] if len(buffer) > 0: if args.threads <= 1: v = process_long_reads(buffer[:],rbe,args,fq_prof_pacbio_subreads,'sub') do_long(v) else: p.apply_async(process_long_reads,args=(buffer[:],rbe,args,fq_prof_pacbio_subreads,'sub'),callback=do_long) buffer = [] if args.threads > 1: p.close() p.join() long_handle.close() of = 
open(args.output+"/LR_sub_report.txt",'w') for name in greport: of.write("\t".join([str(x) for x in greport[name]])+"\n") of.close() greport = {} sys.stderr.write("\nFinished sequencing long sub reads\n") combo = {} with open(args.output+"/SR_report.txt") as inf: for line in inf: f = line.rstrip().split("\t") [name,express,left] = f if name not in combo: combo[name] = {} combo[name]['express'] = express combo[name]['left'] = 0 combo[name]['left'] += int(left) with open(args.output+"/LR_ccs_report.txt") as inf: for line in inf: f = line.rstrip().split("\t") [name,express,left] = f if name not in combo: combo[name] = {} combo[name]['express'] = express combo[name]['left'] = 0 combo[name]['left'] += int(left) with open(args.output+"/LR_sub_report.txt") as inf: for line in inf: f = line.rstrip().split("\t") [name,express,left] = f if name not in combo: combo[name] = {} combo[name]['express'] = express combo[name]['left'] = 0 combo[name]['left'] += int(left) of = open(args.output+"/LR_SR_combo_report.txt",'w') for name in sorted(combo): of.write(name+"\t"+combo[name]['express']+"\t"+str(combo[name]['left'])+"\n") of.close()
def main(): parser = argparse.ArgumentParser(description="splice together partial alignments") group1 = parser.add_mutually_exclusive_group(required=True) group1.add_argument('--fastq_reads') group1.add_argument('--fasta_reads') parser.add_argument('--genome',help="FASTA reference genome",required=True) parser.add_argument('--genepred',help="Transcriptome genepred") parser.add_argument('--max_intron_size',type=int,default=100000,help="INT maximum intron size") parser.add_argument('--min_intron_size',type=int,default=68,help="INT minimum intron size") parser.add_argument('--max_gap_size',type=int,default=10,help="INT gap size in query to join") parser.add_argument('--max_search_expand',type=int,default=10,help="INT max search space to expand search for junction") parser.add_argument('--direction_specific',action='store_true',help="The direction of the transcript is known and properly oriented already") parser.add_argument('--threads',type=int,default=0,help="INT number of threads to use default cpu_count") parser.add_argument('-o','--output',default='-',help="FILENAME output results to here rather than STDOUT which is default") parser.add_argument('input_alignment',help="FILENAME input .psl file or '-' for STDIN") args = parser.parse_args() # Read our reference genome sys.stderr.write("Reading reference\n") ref = read_fasta_into_hash(args.genome) # Make sure our reads are unique sys.stderr.write("Checking for unqiuely named reads\n") reads = check_for_uniquely_named_reads(args) # does a hard exit and error if there are any names repeated sys.stderr.write("Reads are uniquely named\n") # Set number of threads to use cpu_count = multiprocessing.cpu_count() if args.threads > 0: cpu_count = args.threads #Set reference splices (if any are available) reference_splices = {} if args.genepred: sys.stderr.write("Reading reference splices from genepred\n") reference_splices = get_reference_splices(args) sys.stderr.write("Reading alignments into loci\n") # Get locus division (first stage) # Each read (qName) is separated # Then each locus will be specific to at chromosome (tName) # Then by (strand), but keep in mind this is the is based on the read # Each locus should be specific to a direction but we don't necessarily # know direction based on the data we have thus far. 
inf = sys.stdin if args.input_alignment != '-': inf = open(args.input_alignment,'r') loci = {} for line in inf: line = line.rstrip() if re.match('^#',line): continue psl = PSLBasics.line_to_entry(line) if psl['qName'] not in loci: loci[psl['qName']] = {} if psl['tName'] not in loci[psl['qName']]: loci[psl['qName']][psl['tName']] = {} if psl['strand'] not in loci[psl['qName']][psl['tName']]: loci[psl['qName']][psl['tName']][psl['strand']] = {} if psl['tStarts'][0] not in loci[psl['qName']][psl['tName']][psl['strand']]: loci[psl['qName']][psl['tName']][psl['strand']][psl['tStarts'][0]] = [] loci[psl['qName']][psl['tName']][psl['strand']][psl['tStarts'][0]].append(psl) sys.stderr.write("breaking loci by genomic distance\n") for qname in loci: for chr in loci[qname]: for strand in loci[qname][chr]: #print qname + "\t" + chr + "\t" + strand starts = loci[qname][chr][strand].keys() current_set = [] locus_sets = [] last_end = -1*(args.max_intron_size+2) for start in sorted(starts): for e in loci[qname][chr][strand][start]: start = e['tStarts'][0]+1 # base-1 start of start of alignment if start > last_end+args.max_intron_size: # we have the start of a new set if len(current_set) > 0: locus_sets.append(current_set) current_set = [] last_end = e['tStarts'][len(e['tStarts'])-1]+e['blockSizes'][len(e['tStarts'])-1] current_set.append(e) if len(current_set) > 0: locus_sets.append(current_set) loci[qname][chr][strand] = locus_sets # replace what was there with these ordered sets locus_total = 0 for qname in loci: for chr in loci[qname]: for strand in loci[qname][chr]: for locus_set in loci[qname][chr][strand]: locus_total+=1 sys.stderr.write("Work on each read in each locus with "+str(cpu_count)+" CPUs\n") p = multiprocessing.Pool(processes=cpu_count) locus_count = 0 for qname in loci: for chr in loci[qname]: for strand in loci[qname][chr]: #print qname + "\t" + chr + "\t" + strand for locus_set in loci[qname][chr][strand]: locus_count += 1 onum = len(locus_set) # send blank reference splices unless we have some rsplices = {} if chr in reference_splices: rsplices = reference_splices[chr] #p.apply_async(process_locus_set,(locus_set,args,rsplices,ref[chr],reads[qname],locus_total,locus_count),callback=do_locus_callback) r1 = execute_locus(locus_set,args,rsplices,ref[chr],reads[qname],locus_total,locus_count) do_locus_callback(r1) #nnum = len(new_locus_set) #print str(onum) + " to " + str(nnum) #for e in new_locus_set: # print PSLBasics.entry_to_line(e) p.close() p.join() sys.stderr.write("\nfinished\n") ofh = sys.stdout if not args.output == '-': ofh = open(args.output,'w') for line in combo_results: ofh.write(line)
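
# The locus-breaking step above groups alignments whose genomic starts fall
# within max_intron_size of the previous alignment's end. The same idea on
# plain (start,end) tuples, as a self-contained sketch:
def group_by_distance(intervals, max_gap):
  groups = []
  current = []
  last_end = None
  for (start, end) in sorted(intervals):
    if last_end is not None and start > last_end + max_gap:
      groups.append(current)
      current = []
    current.append((start, end))
    last_end = end
  if current: groups.append(current)
  return groups
# group_by_distance([(1,5),(8,12),(500,510)], 100)
#   -> [[(1,5),(8,12)], [(500,510)]]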
def main():
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  cpus = multiprocessing.cpu_count()
  parser.add_argument('--ref_genome',required=True,help='(required) FASTA filename of reference genome')
  parser.add_argument('--bwa_index',required=True,help='(required) BWA Index')
  parser.add_argument('--min_read_size',type=int,default=30,help='INT minimum read size to consider')
  parser.add_argument('--test_size',type=int,default=5000,help='INT number of sequences to test')
  parser.add_argument('--min_test_size',type=int,default=500,help='INT disregard any parameter sets that do not produce at least this number of sequences prior to mapping.')
  parser.add_argument('--left_trim_range',help='start:end:increment, default is 0:[read_length]:5')
  parser.add_argument('--right_trim_range',help='start:end:increment, default is 0:[read_length]:5')
  parser.add_argument('--quality_number_range',help='start:end:increment, default is [qual_min]:[qual_max]:5')
  parser.add_argument('--quality_fail_count_range',help='start:end:increment, default is 0:[read_length]:5')
  parser.add_argument('--mapped_mismatch_range',help='start:end:increment, default is 0:3:1')
  parser.add_argument('--ignore_mapped_mismatches',action='store_true')
  parser.add_argument('--ignore_quality',action='store_true')
  parser.add_argument('--threads',type=int,default=cpus,help='INT of threads to run that defaults to cpu_count')
  parser.add_argument('--tempdir',default='/tmp/',help='Directory of your preferred temporary directory')
  parser.add_argument('-o',help='FILENAME for output')
  parser.add_argument('fastq_file',help='FILENAME for fastq file (can be .gz)')
  args = parser.parse_args()
  maxcnt = args.test_size
  mincnt = args.min_test_size
  sys.stderr.write("Testing up to "+str(maxcnt)+" reads.\n")
  sys.stderr.write("Require parameters leave at least "+str(mincnt)+" reads.\n")
  min_read_size = args.min_read_size
  sys.stderr.write("Requiring QC parameters produce a minimum read length of "+str(min_read_size)+"\n")
  man = multiprocessing.Manager()
  Q = man.Queue()
  ifile = args.bwa_index
  sys.stderr.write("BWA index: "+ifile+"\n")
  refgenome = args.ref_genome
  sys.stderr.write("Ref Genome: "+refgenome+"\n")
  if args.threads: cpus = args.threads
  sys.stderr.write("Using "+str(cpus)+" threads\n")
  sys.stderr.write("reading reference genome\n")
  g = read_fasta_into_hash(refgenome)
  gz = {}
  cman = multiprocessing.Manager()
  cQ = man.Queue()
  pc = multiprocessing.Pool(processes=cpus)
  cresults = []
  sys.stderr.write("compressing reference genome\n")
  for name in g:
    pc.apply_async(comp,[name,g[name],cQ,len(g)])
  pc.close()
  pc.join()
  sys.stderr.write("\n")
  while not cQ.empty():
    [name,zseq] = cQ.get()
    gz[name] = zseq
  sys.stderr.write("finished processing reference genome\n")
  [entries,stats] = read_fastq(args.fastq_file,maxcnt)
  tstart = args.tempdir.rstrip('/')
  tdir = tstart.rstrip('/')+'/'+'weirathe.'+str(randint(1,100000000))
  if not os.path.exists(tdir):
    os.makedirs(tdir)
  z = 0
  max_l_trim = stats['lenmax']
  max_r_trim = stats['lenmax']
  min_l_trim = 0
  min_r_trim = 0
  l_trim_iter = 5
  r_trim_iter = 5
  if args.left_trim_range:
    m = re.match('(\d+):(\d+):(\d+)',args.left_trim_range)
    if not m:
      sys.stderr.write("Error: malformed left trim range "+args.left_trim_range+"\n")
      return
    max_l_trim = int(m.group(2))
    min_l_trim = int(m.group(1))
    l_trim_iter = int(m.group(3))
  if args.right_trim_range:
    m = re.match('(\d+):(\d+):(\d+)',args.right_trim_range)
    if not m:
      sys.stderr.write("Error: malformed right trim range "+args.right_trim_range+"\n")
      return
    max_r_trim = int(m.group(2))
    min_r_trim = int(m.group(1))
    r_trim_iter = int(m.group(3))
  max_q_num = stats['qmax']
  max_q_fail = stats['lenmax']
  min_q_num = stats['qmin']
  min_q_fail = 0
  q_num_iter = 5
  q_fail_iter = 5
  if args.quality_number_range:
    m = re.match('(\d+):(\d+):(\d+)',args.quality_number_range)
    if not m:
      sys.stderr.write("Error: malformed quality number range "+args.quality_number_range+"\n")
      return
    max_q_num = int(m.group(2))
    min_q_num = int(m.group(1))
    q_num_iter = int(m.group(3))
  if args.quality_fail_count_range:
    m = re.match('(\d+):(\d+):(\d+)',args.quality_fail_count_range)
    if not m:
      sys.stderr.write("Error: malformed quality fail count range "+args.quality_fail_count_range+"\n")
      return
    max_q_fail = int(m.group(2))
    min_q_fail = int(m.group(1))
    q_fail_iter = int(m.group(3))
  if args.ignore_quality:
    max_q_fail = stats['lenmax']
    min_q_fail = stats['lenmax']
    q_fail_iter = 1
    max_q_num = stats['qmax']
    min_q_num = stats['qmax']
    q_num_iter = 1
  max_mismatch = 3
  min_mismatch = 0
  mismatch_iter = 1
  if args.mapped_mismatch_range:
    m = re.match('(\d+):(\d+):(\d+)',args.mapped_mismatch_range)
    if not m:
      sys.stderr.write("Error: malformed mapped mismatch tolerance range "+args.mapped_mismatch_range+"\n")
      return
    max_mismatch = int(m.group(2))
    min_mismatch = int(m.group(1))
    mismatch_iter = int(m.group(3))  # was mistakenly assigned to an unused q_mismatch variable
  if args.ignore_mapped_mismatches:
    min_mismatch = stats['lenmax']
    max_mismatch = stats['lenmax']
    mismatch_iter = 1
  flist = []
  run_params = {}
  run_stats = {}
  sys.stderr.write("Left trim search space: "+str(min_l_trim)+":"+str(min([stats['lenmax'],max_l_trim]))+":"+str(l_trim_iter)+"\n")
  sys.stderr.write("Right trim search space: "+str(min_r_trim)+":"+str(min([stats['lenmax'],max_r_trim]))+":"+str(r_trim_iter)+"\n")
  sys.stderr.write("Quality number search space: "+str(max(min_q_num,stats['qmin']))+":"+str(min(max_q_num,stats['qmax']))+":"+str(q_num_iter)+"\n")
  sys.stderr.write("Quality fail count search space: "+str(min_q_fail)+":"+str(min(stats['lenmax'],max_q_fail))+":"+str(q_fail_iter)+"\n")
  sys.stderr.write("Max mapped mismatch search space: "+str(min_mismatch)+":"+str(min(stats['lenmax'],max_mismatch))+":"+str(mismatch_iter)+"\n")
  # Enumerate the full grid of QC parameter combinations; each combination z
  # gets its own filtered fastq in the temp directory
  for l_cut in range(min_l_trim,min([stats['lenmax'],max_l_trim])+1,l_trim_iter):
    for r_cut in range(min_r_trim,min([stats['lenmax'],max_r_trim])+1,r_trim_iter):
      for q_floor in range(max(min_q_num,stats['qmin']),min(max_q_num,stats['qmax'])+1,q_num_iter):
        for failure_limit in range(min(min_q_fail,stats['lenmax']-l_cut-r_cut),min(stats['lenmax']-l_cut-r_cut,max_q_fail)+1,q_fail_iter):
          for max_allowed_mismatches in range(min_mismatch,max_mismatch+1,mismatch_iter):
            z += 1
            run_params[z] = {}
            run_params[z]['l_cut'] = l_cut
            run_params[z]['r_cut'] = r_cut
            run_params[z]['q_floor'] = q_floor
            run_params[z]['failure_limit'] = failure_limit
            run_params[z]['max_allowed_mismatches'] = max_allowed_mismatches
            run_stats[z] = {}
            run_stats[z]['after_qc_reads'] = 0
            run_stats[z]['after_qc_bases'] = 0
            of = open(tdir+'/'+str(z)+'.fq','w')
            k = 0
            scnt = 0
            for e in entries:
              seq = e['seq']
              seq = left_trim(seq,l_cut)
              seq = right_trim(seq,r_cut)
              qual = e['quality']
              qual = left_trim(qual,l_cut)
              qual = right_trim(qual,r_cut)
              if len(seq) < min_read_size: continue
              failure_count = 0
              for i in range(0,len(qual)):
                if seq[i].upper() == 'N':
                  failure_count += 1
                elif ord(qual[i]) < q_floor:
                  failure_count += 1
              if failure_count > failure_limit: continue
              k += 1
              scnt += 1
              run_stats[z]['after_qc_reads'] += 1
              run_stats[z]['after_qc_bases'] += len(seq)
              of.write("@s_"+str(k)+"\n")
              of.write(seq+"\n")
              of.write('+'+"\n")
              of.write(qual+"\n")
            of.close()
            if scnt < mincnt:
              # not enough sequences left after filtering to be worth mapping
              os.remove(tdir+'/'+str(z)+'.fq')
            else:
              flist.append(z)
  sys.stderr.write("total of "+str(len(flist))+" params\n")
  p = multiprocessing.Pool(processes=cpus)
  results = []
  for z in flist:
    p.apply_async(check_parameters,(z,gz,ifile,tdir,run_params[z]['max_allowed_mismatches'],Q,len(flist)))
  p.close()
  p.join()
  sys.stderr.write("\n")
  run_results = {}
  while True:
    if Q.empty(): break
    [z, reads, bases] = Q.get()
    run_results[z] = {}
    run_results[z]['after_mapped_reads'] = reads
    run_results[z]['after_mapped_bases'] = bases
  header  = "left_cut_count\tright_cut_count\tmin_quality_value\tmax_quality_failure_count\t"
  header += "max_mapped_mismatch_count\toriginal_read_count\toriginal_base_count\t"
  header += "post_qc_read_count\tpost_qc_base_count\tmapped_reads\tmapped_bases"
  if args.o:
    of = open(args.o,'w')
    of.write(header+"\n")
  else:
    print header
  for z in sorted(run_results.keys()):
    ostring  = str(run_params[z]['l_cut']) + "\t" + str(run_params[z]['r_cut']) + "\t"
    ostring += str(run_params[z]['q_floor']) + "\t" + str(run_params[z]['failure_limit']) + "\t"
    ostring += str(run_params[z]['max_allowed_mismatches']) + "\t"
    ostring += str(stats['readcount']) + "\t" + str(stats['basecount']) + "\t"
    ostring += str(run_stats[z]['after_qc_reads']) + "\t" + str(run_stats[z]['after_qc_bases']) + "\t"
    ostring += str(run_results[z]['after_mapped_reads']) + "\t" + str(run_results[z]['after_mapped_bases'])
    if args.o:
      of.write(ostring+"\n")
    else:
      print ostring
  if args.o: of.close()
  rmtree(tdir)
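
# left_trim/right_trim above are simple slicing helpers applied identically to
# the sequence and its quality string. A sketch of the assumed behavior:
def left_trim_sketch(s, n):
  return s[n:] if n > 0 else s
def right_trim_sketch(s, n):
  return s[:-n] if n > 0 else s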
def load_from_inputs(args):
  # Read in the VCF file
  sys.stderr.write("Reading in the VCF file\n")
  alleles = {}
  with open(args.inputs[1]) as inf:
    for line in inf:
      vcf = VCF(line)
      if not vcf.is_snp(): continue
      g = vcf.get_phased_genotype()
      if not g: continue
      if vcf.value('chrom') not in alleles:
        alleles[vcf.value('chrom')] = {}
      if vcf.value('pos') in alleles[vcf.value('chrom')]:
        sys.stderr.write("WARNING: seeing the same position twice.\n"+line.rstrip()+"\n")
      alleles[vcf.value('chrom')][vcf.value('pos')] = g  # set our left and right
  sys.stderr.write("Reading in the reference genome\n")
  ref = read_fasta_into_hash(args.inputs[0])
  res1 = []
  res2 = []
  p = None
  sys.stderr.write("Introducing VCF changes to reference sequences\n")
  # Pretty memory intensive, so don't go with all possible threads
  if args.threads > 1:
    p = Pool(processes=max(1,int(args.threads/4)))
  for chrom in ref:
    # handle the case where there is no allele information
    if chrom not in alleles:
      r1q = Queue()
      r1q.put([0,chrom,ref[chrom]])
      res1.append(r1q)
      r2q = Queue()
      r2q.put([0,chrom,ref[chrom]])
      res2.append(r2q)
    elif args.threads > 1:
      res1.append(p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],0,chrom)))
      res2.append(p.apply_async(adjust_reference_genome,args=(alleles[chrom],ref[chrom],1,chrom)))
    else:
      # wrap the serial results in Queues so .get() works uniformly below
      r1q = Queue()
      r1q.put(adjust_reference_genome(alleles[chrom],ref[chrom],0,chrom))
      res1.append(r1q)
      r2q = Queue()
      r2q.put(adjust_reference_genome(alleles[chrom],ref[chrom],1,chrom))
      res2.append(r2q)
  if args.threads > 1:
    p.close()
    p.join()
  # now we can fill reference 1 with all our new sequences
  ref1 = {}
  c1 = 0
  for i in range(0,len(res1)):
    res = res1[i].get()
    c1 += res[0]
    ref1[res[1]] = res[2]
  # now we can fill reference 2 with all our new sequences
  ref2 = {}
  c2 = 0
  for i in range(0,len(res2)):
    res = res2[i].get()
    c2 += res[0]
    ref2[res[1]] = res[2]
  sys.stderr.write("Made "+str(c1)+"|"+str(c2)+" changes to the reference\n")
  # Now ref1 and ref2 are the diploid sources of the transcriptome
  gpdnames = {}
  txn1 = Transcriptome()
  txn2 = Transcriptome()
  txn1.set_reference_genome_dictionary(ref1)
  txn2.set_reference_genome_dictionary(ref2)
  with open(args.inputs[2]) as inf:
    for line in inf:
      if line[0] == '#': continue
      txn1.add_genepred_line(line.rstrip())
      txn2.add_genepred_line(line.rstrip())
      gpd = GenePredEntry(line.rstrip())
      gpdnames[gpd.value('name')] = gpd.value('gene_name')
  # The transcriptomes are set but we don't really need the references anymore.
  # Empty our big memory things.
  txn1.ref_hash = None
  txn2.ref_hash = None
  for chrom in ref1.keys(): del ref1[chrom]
  for chrom in ref2.keys(): del ref2[chrom]
  for chrom in ref.keys(): del ref[chrom]
  if not args.locus_by_gene_name:
    [locus2name,name2locus] = get_loci(args.inputs[2])
  else:
    # set locus by gene name
    sys.stderr.write("Organizing loci by gene name\n")
    locus2name = {}
    name2locus = {}
    numname = {}
    m = 0
    for name in sorted(gpdnames):
      gene = gpdnames[name]
      if gene not in numname:
        m += 1
        numname[gene] = m
      num = numname[gene]
      if num not in locus2name:
        locus2name[num] = set()
      locus2name[num].add(name)
      name2locus[name] = num
    sys.stderr.write("Ended with "+str(len(locus2name.keys()))+" loci\n")
  if args.isoform_expression:
    sys.stderr.write("Reading expression from a TSV\n")
    with open(args.isoform_expression) as inf:
      line1 = inf.readline()  # skip the header
      for line in inf:
        f = line.rstrip().split("\t")
        txn1.add_expression(f[0],float(f[1]))
        txn2.add_expression(f[0],float(f[1]))
  elif args.cufflinks_isoform_expression:
    sys.stderr.write("Using cufflinks expression\n")
    cuffz = 0
    with open(args.cufflinks_isoform_expression) as inf:
      line1 = inf.readline()  # skip the header
      for line in inf:
        cuffz += 1
        sys.stderr.write(str(cuffz)+" cufflinks entries processed\r")
        f = line.rstrip().split("\t")
        txn1.add_expression_no_update(f[0],float(f[9]))
        txn2.add_expression_no_update(f[0],float(f[9]))
    txn1.update_expression()
    txn2.update_expression()
    sys.stderr.write("\n")
  elif args.uniform_expression:
    sys.stderr.write("Using uniform expression model\n")
  else:
    sys.stderr.write("Warning: isoform expression not specified, using uniform expression model.\n")
  # Now we have the transcriptomes set
  rhos = {}  # The ASE of allele 1 (the left side)
  randos = {}
  if args.seed:
    random.seed(args.seed)
  for z in locus2name:
    randos[z] = random.random()
  sys.stderr.write("Setting rho for each transcript\n")
  # Set rho (allele-specific expression) for each transcript
  for tname in sorted(txn1.transcripts):
    if args.ASE_identical or args.ASE_identical == 0:
      rhos[tname] = float(args.ASE_identical)
    elif args.ASE_isoform_random:
      rhos[tname] = random.random()
    else:
      # we must be on locus random
      rhos[tname] = randos[name2locus[tname]]
  # Now our dataset is set up
  rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1,txn2)
  rbe.gene_names = gpdnames
  rbe.name2locus = name2locus
  rbe.set_transcriptome1_rho(rhos)
  return rbe
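
# adjust_reference_genome() is called above with the signature
# (alleles_for_chrom, sequence, haplotype_index, chrom) and returns
# [change_count, chrom, new_sequence]. A sketch under the assumption that
# each phased genotype is an indexable pair of allele bases keyed by a
# 1-based position; the repository's real implementation may differ:
def adjust_reference_genome_sketch(alleles, seq, hap_index, chrom):
  bases = list(seq)
  changes = 0
  for pos in alleles:
    new_base = alleles[pos][hap_index]
    if bases[pos-1].upper() != new_base.upper():
      bases[pos-1] = new_base
      changes += 1
  return [changes, chrom, ''.join(bases)]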
def main(): parser = argparse.ArgumentParser( description="splice together partial alignments") group1 = parser.add_mutually_exclusive_group(required=True) group1.add_argument('--fastq_reads') group1.add_argument('--fasta_reads') parser.add_argument('--genome', help="FASTA reference genome", required=True) parser.add_argument('--genepred', help="Transcriptome genepred") parser.add_argument('--max_intron_size', type=int, default=100000, help="INT maximum intron size") parser.add_argument('--min_intron_size', type=int, default=68, help="INT minimum intron size") parser.add_argument('--max_gap_size', type=int, default=10, help="INT gap size in query to join") parser.add_argument( '--max_search_expand', type=int, default=10, help="INT max search space to expand search for junction") parser.add_argument( '--direction_specific', action='store_true', help= "The direction of the transcript is known and properly oriented already" ) parser.add_argument('--threads', type=int, default=0, help="INT number of threads to use default cpu_count") parser.add_argument( '-o', '--output', default='-', help= "FILENAME output results to here rather than STDOUT which is default") parser.add_argument('input_alignment', help="FILENAME input .psl file or '-' for STDIN") args = parser.parse_args() # Read our reference genome sys.stderr.write("Reading reference\n") ref = read_fasta_into_hash(args.genome) # Make sure our reads are unique sys.stderr.write("Checking for unqiuely named reads\n") reads = check_for_uniquely_named_reads( args) # does a hard exit and error if there are any names repeated sys.stderr.write("Reads are uniquely named\n") # Set number of threads to use cpu_count = multiprocessing.cpu_count() if args.threads > 0: cpu_count = args.threads #Set reference splices (if any are available) reference_splices = {} if args.genepred: sys.stderr.write("Reading reference splices from genepred\n") reference_splices = get_reference_splices(args) sys.stderr.write("Reading alignments into loci\n") # Get locus division (first stage) # Each read (qName) is separated # Then each locus will be specific to at chromosome (tName) # Then by (strand), but keep in mind this is the is based on the read # Each locus should be specific to a direction but we don't necessarily # know direction based on the data we have thus far. 
inf = sys.stdin if args.input_alignment != '-': inf = open(args.input_alignment, 'r') loci = {} for line in inf: line = line.rstrip() if re.match('^#', line): continue psl = PSLBasics.line_to_entry(line) if psl['qName'] not in loci: loci[psl['qName']] = {} if psl['tName'] not in loci[psl['qName']]: loci[psl['qName']][psl['tName']] = {} if psl['strand'] not in loci[psl['qName']][psl['tName']]: loci[psl['qName']][psl['tName']][psl['strand']] = {} if psl['tStarts'][0] not in loci[psl['qName']][psl['tName']][ psl['strand']]: loci[psl['qName']][psl['tName']][psl['strand']][psl['tStarts'] [0]] = [] loci[psl['qName']][psl['tName']][psl['strand']][psl['tStarts'] [0]].append(psl) sys.stderr.write("breaking loci by genomic distance\n") for qname in loci: for chr in loci[qname]: for strand in loci[qname][chr]: #print qname + "\t" + chr + "\t" + strand starts = loci[qname][chr][strand].keys() current_set = [] locus_sets = [] last_end = -1 * (args.max_intron_size + 2) for start in sorted(starts): for e in loci[qname][chr][strand][start]: start = e['tStarts'][ 0] + 1 # base-1 start of start of alignment if start > last_end + args.max_intron_size: # we have the start of a new set if len(current_set) > 0: locus_sets.append(current_set) current_set = [] last_end = e['tStarts'][len(e['tStarts']) - 1] + e['blockSizes'][ len(e['tStarts']) - 1] current_set.append(e) if len(current_set) > 0: locus_sets.append(current_set) loci[qname][chr][ strand] = locus_sets # replace what was there with these ordered sets locus_total = 0 for qname in loci: for chr in loci[qname]: for strand in loci[qname][chr]: for locus_set in loci[qname][chr][strand]: locus_total += 1 sys.stderr.write("Work on each read in each locus with " + str(cpu_count) + " CPUs\n") p = multiprocessing.Pool(processes=cpu_count) locus_count = 0 for qname in loci: for chr in loci[qname]: for strand in loci[qname][chr]: #print qname + "\t" + chr + "\t" + strand for locus_set in loci[qname][chr][strand]: locus_count += 1 onum = len(locus_set) # send blank reference splices unless we have some rsplices = {} if chr in reference_splices: rsplices = reference_splices[chr] #p.apply_async(process_locus_set,(locus_set,args,rsplices,ref[chr],reads[qname],locus_total,locus_count),callback=do_locus_callback) r1 = execute_locus(locus_set, args, rsplices, ref[chr], reads[qname], locus_total, locus_count) do_locus_callback(r1) #nnum = len(new_locus_set) #print str(onum) + " to " + str(nnum) #for e in new_locus_set: # print PSLBasics.entry_to_line(e) p.close() p.join() sys.stderr.write("\nfinished\n") ofh = sys.stdout if not args.output == '-': ofh = open(args.output, 'w') for line in combo_results: ofh.write(line)
def main(): parser = argparse.ArgumentParser( description='Use reference junctions when they are close', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--min_intron_size', type=int, default=68, help="INT min intron size") parser.add_argument( '--min_local_support', type=int, default=0, help= "INT min number of junctions within search_size of a junction in order to count it" ) parser.add_argument('--search_size', type=int, default=10, help="INT search space for reference") parser.add_argument( '--output_fake_psl', help="FASTAFILE reference genome to make a fake PSL output") parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN") parser.add_argument('reference_genepred', help="FASTAFILENAME for reference genepred") args = parser.parse_args() cpus = multiprocessing.cpu_count() genome = {} if args.output_fake_psl: genome = read_fasta_into_hash(args.output_fake_psl) #read in the reference genepred first gpf = GenePredBasics.GenePredFile(args.reference_genepred) #lets sort entries by chromosome ref = {} for e in [x.entry for x in gpf.entries]: if len(e['exonStarts']) <= 1: continue if e['chrom'] not in ref: ref[e['chrom']] = {} for i in range(1, len(e['exonStarts'])): if e['exonEnds'][i - 1] not in ref[e['chrom']]: ref[e['chrom']][e['exonEnds'][i - 1]] = {} if e['exonStarts'][i] + 1 not in ref[e['chrom']][e['exonEnds'][i - 1]]: ref[e['chrom']][e['exonEnds'][i - 1]][e['exonStarts'][i] + 1] = e['strand'] #Stored all junctions as 1-base read_info = {} pf = GenericFileReader(args.psl) fcount_total = 0 while True: line = pf.readline() if not line: break if re.match('^#', line): continue line = line.rstrip() pe = PSLBasics.line_to_entry(line) if len(pe['tStarts']) != len(pe['blockSizes']) or len( pe['qStarts']) != len(pe['blockSizes']): sys.stderr.write("WARNING invalid psl\n") continue genepred_line = PSLBasics.convert_entry_to_genepred_line(pe) ge = GenePredBasics.smooth_gaps( GenePredBasics.line_to_entry(genepred_line), args.min_intron_size) refjuns = {} if pe['tName'] in ref: refjuns = ref[pe['tName']] new_ge = nudge(pe, ge, refjuns, args) if args.output_fake_psl: new_psl_line = GenePredBasics.entry_to_fake_psl_line( new_ge, genome) print new_psl_line else: print GenePredBasics.entry_to_line(new_ge)
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('--only_output_alternates', action='store_true',
    help="When selected, the original coordinates are not output; only the alternates are output")
  parser.add_argument('--long_form', action='store_true',
    help="Add an additional column to the beginning of the output indicating whether it is an original or alternate splice coordinate")
  parser.add_argument('GenomeFastaFile', nargs=1,
    help="FILENAME Fasta format file of the reference genome")
  parser.add_argument('SpliceSiteFile', nargs=1,
    help="FILENAME Splice site file in TSV format: <Left chrom> <Left coord (base-1)> <Left dir [+-]> <Right chrom> <Right coord (base-1)> <Right dir [+-]>. The coordinates indicate the base inside the exon proximal to the splice. Direction indicates the transcription direction on the chromosome for that side of the splice. Base-1 means coordinate 1 is the first base of the sequence.")
  of = sys.stdout
  args = parser.parse_args()
  golds = []
  with open(args.SpliceSiteFile[0]) as inf:
    for line in inf:
      f = line.rstrip().split()
      t = {'l': {}, 'r': {}}
      t['l']['chr'] = f[0]
      t['l']['coord'] = int(f[1])
      t['l']['dir'] = f[2]
      t['r']['chr'] = f[3]
      t['r']['coord'] = int(f[4])
      t['r']['dir'] = f[5]
      golds.append(t)
  ref = read_fasta_into_hash(args.GenomeFastaFile[0])
  lens = {}
  for chr in ref:
    lens[chr] = len(ref[chr])
  for g in golds:
    l_chrom = g['l']['chr']
    r_chrom = g['r']['chr']
    l_start = g['l']['coord']
    r_start = g['r']['coord']
    l_dir = g['l']['dir']
    r_dir = g['r']['dir']
    # print the main case
    if not args.only_output_alternates:
      startstring = ''
      if args.long_form:
        startstring = "original\t"
      of.write(startstring + l_chrom + "\t" + str(l_start) + "\t" + l_dir + "\t"
               + r_chrom + "\t" + str(r_start) + "\t" + r_dir + "\n")
    # check upstream: shift the junction one base at a time; the base removed
    # from the left exon must match the base added to the right exon
    l_base = l_start
    r_base = r_start
    while True:
      if l_dir == '+':
        l_base -= 1
        if l_base < 1: break
        left_bases = str(ref[l_chrom][l_base])
      else:
        l_base += 1
        if l_base > lens[l_chrom]: break
        left_bases = rc(str(ref[l_chrom][l_base - 2]))
      if r_dir == '+':
        r_base -= 1
        if r_base < 1: break
        right_bases = str(ref[r_chrom][r_base - 1])
      else:
        r_base += 1
        if r_base > lens[r_chrom]: break
        right_bases = rc(str(ref[r_chrom][r_base - 1]))
      if left_bases != right_bases: break
      startstring = ''
      if args.long_form:
        startstring = "alternate\t"
      of.write(startstring + l_chrom + "\t" + str(l_base) + "\t" + l_dir + "\t"
               + r_chrom + "\t" + str(r_base) + "\t" + r_dir + "\n")
    # check downstream: the base added to the left exon must match the base
    # removed from the right exon
    l_base = l_start
    r_base = r_start
    while True:
      if l_dir == '+':
        l_base += 1
        if l_base > lens[l_chrom]: break
        left_bases = str(ref[l_chrom][l_base - 1])
      else:
        l_base -= 1
        if l_base < 1: break
        left_bases = rc(str(ref[l_chrom][l_base - 1]))
      if r_dir == '+':
        r_base += 1
        if r_base > lens[r_chrom]: break
        right_bases = str(ref[r_chrom][r_base - 2])
      else:
        r_base -= 1
        if r_base < 1: break  # decrementing can only run off the start of the chromosome
        right_bases = rc(str(ref[r_chrom][r_base]))
      if left_bases != right_bases: break
      startstring = ''
      if args.long_form:
        startstring = "alternate\t"
      of.write(startstring + l_chrom + "\t" + str(l_base) + "\t" + l_dir + "\t"
               + r_chrom + "\t" + str(r_base) + "\t" + r_dir + "\n")
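# The equivalence test in the two while-loops above can be illustrated on a toy
# plus/plus junction: sliding the junction upstream one base at a time stays
# sequence-equivalent while the base dropped from the left exon matches the
# base gained by the right exon. upstream_equivalents and the toy genome are
# hypothetical; the real script additionally handles strand and two chromosomes.
def upstream_equivalents(seq, l_coord, r_coord):
  """seq: genome string; l_coord/r_coord: base-1 junction coordinates, both '+'."""
  alts = []
  l = l_coord
  r = r_coord
  while l > 1 and r > 1:
    l -= 1
    r -= 1
    # seq[l] is the base removed from the left exon (base-1 position l+1);
    # seq[r-1] is the base added to the right exon (base-1 position r)
    if seq[l] != seq[r - 1]: break
    alts.append((l, r))
  return alts

if __name__ == "__main__":
  genome = "AAGGTCCCCAGGTAA"
  # left exon ends at base 3, right exon starts at base 12
  print(upstream_equivalents(genome, 3, 12))  # [(2, 11), (1, 10)]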
def load_from_inputs(args):
  # Read in the VCF file
  sys.stderr.write("Reading in the VCF file\n")
  alleles = {}
  # args.inputs[1] is the phased VCF
  with open(args.inputs[1]) as inf:
    for line in inf:
      vcf = VCF(line)
      if not vcf.is_snp(): continue
      g = vcf.get_phased_genotype()
      if not g: continue
      if vcf.value('chrom') not in alleles:
        alleles[vcf.value('chrom')] = {}
      if vcf.value('pos') in alleles[vcf.value('chrom')]:
        sys.stderr.write("WARNING: seeing the same position twice.\n" + line.rstrip() + "\n")
      alleles[vcf.value('chrom')][vcf.value('pos')] = g  # set our left and right alleles
  sys.stderr.write("Reading in the reference genome\n")
  # args.inputs[0] is the reference genome
  ref = read_fasta_into_hash(args.inputs[0])
  res1 = []
  res2 = []
  p = None
  sys.stderr.write("Introducing VCF changes to reference sequences\n")
  # Pretty memory intensive, so don't go with all possible threads
  if args.threads > 1:
    p = Pool(processes=max(1, int(args.threads / 4)))
  for chrom in ref:
    # handle the case where there is no allele information
    if chrom not in alleles:
      r1q = Queue()
      r1q.put([0, chrom, ref[chrom]])
      res1.append(r1q)
      r2q = Queue()
      r2q.put([0, chrom, ref[chrom]])
      res2.append(r2q)
    elif args.threads > 1:
      res1.append(p.apply_async(adjust_reference_genome,
                                args=(alleles[chrom], ref[chrom], 0, chrom)))
      res2.append(p.apply_async(adjust_reference_genome,
                                args=(alleles[chrom], ref[chrom], 1, chrom)))
    else:
      # single threaded: wrap results in a Queue so .get() works either way
      r1q = Queue()
      r1q.put(adjust_reference_genome(alleles[chrom], ref[chrom], 0, chrom))
      res1.append(r1q)
      r2q = Queue()
      r2q.put(adjust_reference_genome(alleles[chrom], ref[chrom], 1, chrom))
      res2.append(r2q)
  if args.threads > 1:
    p.close()
    p.join()
  # now we can fill reference 1 with all our new sequences
  ref1 = {}
  c1 = 0
  for i in range(0, len(res1)):
    res = res1[i].get()
    c1 += res[0]
    ref1[res[1]] = res[2]
  # now we can fill reference 2 with all our new sequences
  ref2 = {}
  c2 = 0
  for i in range(0, len(res2)):
    res = res2[i].get()
    c2 += res[0]
    ref2[res[1]] = res[2]
  sys.stderr.write("Made " + str(c1) + "|" + str(c2) + " changes to the reference\n")
  # Now ref1 and ref2 are the diploid sources of the transcriptome
  gpdnames = {}
  txn1 = Transcriptome()
  txn2 = Transcriptome()
  txn1.set_reference_genome_dictionary(ref1)
  txn2.set_reference_genome_dictionary(ref2)
  # args.inputs[2] is the transcripts genepred
  with open(args.inputs[2]) as inf:
    for line in inf:
      if line[0] == '#': continue
      txn1.add_genepred_line(line.rstrip())
      txn2.add_genepred_line(line.rstrip())
      gpd = GenePredEntry(line.rstrip())
      gpdnames[gpd.value('name')] = gpd.value('gene_name')
  # The transcriptomes are set, so we don't really need the references anymore
  # Empty our big memory things
  txn1.ref_hash = None
  txn2.ref_hash = None
  for chrom in ref1.keys(): del ref1[chrom]
  for chrom in ref2.keys(): del ref2[chrom]
  for chrom in ref.keys(): del ref[chrom]
  if not args.locus_by_gene_name:
    [locus2name, name2locus] = get_loci(args.inputs[2])
  else:
    # set locus by gene name
    sys.stderr.write("Organizing loci by gene name\n")
    locus2name = {}
    name2locus = {}
    numname = {}
    m = 0
    for name in sorted(gpdnames):
      gene = gpdnames[name]
      if gene not in numname:
        m += 1
        numname[gene] = m
      num = numname[gene]
      if num not in locus2name:
        locus2name[num] = set()
      locus2name[num].add(name)
      name2locus[name] = num
    sys.stderr.write("Ended with " + str(len(locus2name.keys())) + " loci\n")
  if args.isoform_expression:
    sys.stderr.write("Reading expression from a TSV\n")
    with open(args.isoform_expression) as inf:
      line1 = inf.readline()  # skip the header line
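# A minimal sketch of what adjust_reference_genome is assumed to do with each
# chromosome: substitute the phased allele for one haplotype at every SNP and
# report the change count in the same [count, chrom, sequence] shape consumed
# by the .get() loops above. apply_phased_snps and the (left, right) tuple
# format for the phased genotype are assumptions for illustration, not the
# library's actual API.
def apply_phased_snps(alleles, seq, hap_index, chrom):
  seqlist = list(seq)
  changes = 0
  for pos in alleles:  # positions are base-1
    base = alleles[pos][hap_index]  # hap_index 0 = left allele, 1 = right
    if seqlist[pos - 1].upper() != base.upper():
      seqlist[pos - 1] = base
      changes += 1
  return [changes, chrom, ''.join(seqlist)]

if __name__ == "__main__":
  print(apply_phased_snps({2: ('T', 'C')}, "AAAA", 0, "chr1"))  # [1, 'chr1', 'ATAA']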
      for line in inf:
        f = line.rstrip().split("\t")
        txn1.add_expression(f[0], float(f[1]))
        txn2.add_expression(f[0], float(f[1]))
  elif args.cufflinks_isoform_expression:
    sys.stderr.write("Using cufflinks expression\n")
    cuffz = 0
    with open(args.cufflinks_isoform_expression) as inf:
      line1 = inf.readline()  # skip the header line
      for line in inf:
        cuffz += 1
        sys.stderr.write(str(cuffz) + " cufflinks entries processed\r")
        f = line.rstrip().split("\t")
        txn1.add_expression_no_update(f[0], float(f[9]))  # f[9] is the FPKM column
        txn2.add_expression_no_update(f[0], float(f[9]))
    txn1.update_expression()
    txn2.update_expression()
    sys.stderr.write("\n")
  elif args.uniform_expression:
    sys.stderr.write("Using uniform expression model\n")
  else:
    sys.stderr.write("WARNING: isoform expression not specified, using uniform expression model.\n")
  # Now we have the transcriptomes set
  rhos = {}  # The ASE of allele 1 (the left side)
  randos = {}
  if args.seed:
    random.seed(args.seed)
  for z in locus2name:
    randos[z] = random.random()
  sys.stderr.write("Setting rho for each transcript\n")
  # Let's set rho for ASE for each transcript
  for tname in sorted(txn1.transcripts):
    if args.ASE_identical or args.ASE_identical == 0:
      rhos[tname] = float(args.ASE_identical)
    elif args.ASE_isoform_random:
      rhos[tname] = random.random()
    else:
      # we must be on locus random
      rhos[tname] = randos[name2locus[tname]]
  # Now our dataset is set up
  rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter(txn1, txn2)
  rbe.gene_names = gpdnames
  rbe.name2locus = name2locus
  rbe.set_transcriptome1_rho(rhos)
  return rbe
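# The rho assignment above supports three modes; this standalone sketch, with
# hypothetical names, shows the behavior under the assumption that rho is the
# fraction of reads drawn from allele 1: a fixed value everywhere, an
# independent draw per isoform, or one shared draw per locus (the default).
import random

def assign_rhos(name2locus, ase_identical=None, per_isoform=False, seed=None):
  if seed is not None:
    random.seed(seed)
  randos = {}
  for locus in sorted(set(name2locus.values())):  # sorted for reproducible draws
    randos[locus] = random.random()
  rhos = {}
  for tname in sorted(name2locus):
    if ase_identical is not None:
      rhos[tname] = float(ase_identical)  # identical allele balance everywhere
    elif per_isoform:
      rhos[tname] = random.random()  # independent draw per isoform
    else:
      rhos[tname] = randos[name2locus[tname]]  # isoforms in a locus share a draw
  return rhos

if __name__ == "__main__":
  # tx1 and tx2 share locus 1, so they get the same rho
  print(assign_rhos({'tx1': 1, 'tx2': 1, 'tx3': 2}, seed=42))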
def main():
  parser = argparse.ArgumentParser(
    description="For every genepred entry report its alignability",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('input', help="Genepred can be gzipped or - for STDIN")
  parser.add_argument('-r', '--reference', required=True, help="Reference fasta")
  parser.add_argument('-k', '--fragment_size', default=100, type=int,
                      help="Fragment size to try to align")
  parser.add_argument('-x', '--hisat_index', required=True, help="HISAT index base name")
  parser.add_argument('--threads', type=int, default=cpu_count(),
                      help="number of threads")
  parser.add_argument('--type', choices=['mean', 'median'], default='mean',
                      help="How to bring together overlapping reads")
  parser.add_argument('--perbase', action='store_true')
  parser.add_argument('--output', '-o', help="output file or leave unset for STDOUT")
  args = parser.parse_args()
  if args.input == '-':
    args.input = sys.stdin
  elif re.search(r'\.gz$', args.input):
    args.input = gzip.open(args.input)
  else:
    args.input = open(args.input)
  udir = os.path.dirname(os.path.realpath(__file__))
  # downstream command turns the alignment counts into mappability
  cmd2 = udir + '/genepred_counts_to_mappability.py -'
  cmd2 += ' --threads ' + str(args.threads)
  cmd2 += ' -k ' + str(args.fragment_size)
  if args.perbase: cmd2 += ' --perbase'
  if args.output: cmd2 += ' --output ' + args.output
  if args.type: cmd2 += ' --type ' + args.type
  p2 = Popen(cmd2.split(), stdin=PIPE)
  ref = read_fasta_into_hash(args.reference)
  cmd1 = 'hisat -x ' + args.hisat_index + ' -U - -f --reorder -p ' + str(args.threads)
  null = open(os.devnull, 'w')  # discard hisat's stderr chatter
  p1 = Popen(cmd1.split(), stdin=PIPE, stdout=p2.stdin, stderr=null)
  line_number = 0
  for line in args.input:
    line_number += 1
    gpd = GPD(line.rstrip())
    if gpd.length() < args.fragment_size: continue
    seq = gpd.get_sequence(ref)
    # emit every fragment_size-mer of the transcript as a fasta entry whose
    # name encodes where it came from
    for i in range(0, len(seq) - args.fragment_size + 1):
      info = gpd.value('name') + "\t" + gpd.value('gene_name') + "\t" + \
             str(line_number) + "\t" + str(len(seq)) + "\t" + str(i)
      einfo = encode_name(info)
      p1.stdin.write('>' + einfo + "\n")
      p1.stdin.write(seq[i:i + args.fragment_size] + "\n")
  p1.communicate()
  p2.communicate()
  null.close()
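# The two-stage pipe above (fragments -> hisat -> counting script) follows a
# general subprocess pattern worth isolating: feed records to the first
# process and let its stdout stream straight into the second. This sketch
# swaps in tr and grep as stand-ins so it runs anywhere; the structure, not
# the commands, is the point.
import os
from subprocess import Popen, PIPE

null = open(os.devnull, 'w')
p2 = Popen(['grep', 'A'], stdin=PIPE, universal_newlines=True)  # downstream consumer
p1 = Popen(['tr', 'a', 'A'], stdin=PIPE, stdout=p2.stdin,
           stderr=null, universal_newlines=True)  # upstream producer
for frag in ['acgt\n', 'ggcc\n']:
  p1.stdin.write(frag)
p1.communicate()  # close p1's stdin and wait; tr's output has flushed into grep
p2.communicate()  # close our copy of p2's stdin so grep sees EOF and exits
null.close()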