def make_exons(args, thread_index, thread_count):
    """Process this worker's share of the alignments and write the results.

    Streams the output of ``samtools view -F 4`` for ``args.sam_file``,
    keeps only the lines assigned to this worker, and writes whatever
    ``get_exons_from_seqs`` returns for each of them to
    ``<args.tempdir>/bedpart.<thread_index>.bed``.

    Lines are numbered from 1 as they arrive; line ``z`` is handled by the
    worker for which ``z % thread_count == thread_index``.

    Args:
        args: Parsed options. Reads ``sam_file``, ``reference_genome``,
            ``tempdir`` and ``use_secondary_alignments``.
        thread_index: Index of this worker, compared against
            ``z % thread_count``.
        thread_count: Total number of workers sharing the input.

    Raises:
        SystemExit: With status 1 if an ``XA:Z`` entry does not match the
            expected ``name,[+-]pos,cigar`` layout.
        OSError: If ``samtools`` cannot be started or the output file
            cannot be opened.
    """
    # Build an argument list instead of a shell string so that file names
    # containing spaces or shell metacharacters are passed through intact.
    command = ['samtools', 'view', '-F', '4']
    if not re.search(r'\.bam$', args.sam_file):
        # Anything not named *.bam is handed to samtools as text SAM.
        command.append('-S')
    command.append(args.sam_file)

    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.reference_genome:
        spcf.set_genome(args.reference_genome)

    # Compiled once; both patterns are applied to every processed line.
    xa_tag = re.compile(r'XA:Z:(\S+)')
    xa_entry = re.compile(r'([^,]+),([+-])(\d+),([^,]+)')

    fname = args.tempdir + '/bedpart.' + str(thread_index) + '.bed'
    with open(fname, 'w') as of:
        # universal_newlines=True makes the pipe yield text on both
        # Python 2 and Python 3, which the str regexes below require.
        sampipe = subprocess.Popen(
            command, stdout=subprocess.PIPE, universal_newlines=True)
        try:
            with sampipe.stdout as inf:
                for z, line in enumerate(inf, 1):
                    if z % thread_count != thread_index:
                        continue
                    line = line.rstrip()
                    if SamBasics.is_header(line):
                        continue
                    d = SamBasics.sam_line_to_dictionary(line)
                    # SAM flag bit 16 marks a reverse-strand alignment.
                    strand = '-' if SamBasics.check_flag(d['flag'], 16) else '+'
                    seqs = [[d['qname'], d['rname'], strand, d['pos'],
                             d['cigar']]]
                    if args.use_secondary_alignments:
                        m = xa_tag.search(line)
                        if m:
                            for secondary in m.group(1).rstrip(";").split(";"):
                                m1 = xa_entry.match(secondary)
                                if not m1:
                                    sys.stderr.write(
                                        "strange secondary format "
                                        + secondary + "\n")
                                    # Non-zero status: this is a failure.
                                    sys.exit(1)
                                seqs.append([
                                    d['qname'],
                                    m1.group(1),
                                    m1.group(2),
                                    int(m1.group(3)),
                                    m1.group(4),
                                ])
                    of.write(get_exons_from_seqs(seqs, d, spcf))
        finally:
            # stdout is already closed here, so this only reaps the child.
            sampipe.wait()
def main():
    """Report the mapping distance of properly paired reads.

    Reads a query-ordered SAM alignment (a file name or ``-`` for stdin),
    converts both mates of each pair to PSL, and records for every accepted
    pair the larger of ``p2.tEnd - p1.tStart`` and ``p1.tEnd - p2.tStart``.
    A running "count mean stddev" line is written to stderr every 1000
    accepted pairs and once more at the end.

    A pair is accepted only when exactly two entries were read for the
    query, neither is unmapped (flag 4), both are flagged as properly
    paired (flag 2), one is the first mate (flag 64) and the other the
    second mate (flag 128), and both convert to PSL.
    """
    parser = argparse.ArgumentParser(
        description=
        "Find mapping distance of paired end reads. Takes an ordered (by query) alignment to a transcriptome.\nSomething that works for an input thus far is like:\nhisat --reorder -x mytranscriptome -1 my_1.fastq -2 my_2.fastq | this_script.py -"
    )
    parser.add_argument(
        'input_sam',
        help="SAMFILE ordered alignment a transcriptome or - for stdin")
    args = parser.parse_args()

    from_stdin = args.input_sam == '-'
    inf = sys.stdin if from_stdin else open(args.input_sam)

    def report(values):
        # Overwrite the current stderr line with the running statistics.
        sys.stderr.write(
            str(len(values)) + " " + str(int(mean(values))) + " " +
            str(int(stddev(values))) + " \r")

    data = []
    try:
        msr = SamBasics.MultiEntrySamReader(inf)
        spcf = SamBasics.SAMtoPSLconversionFactory()
        sys.stderr.write("Pairs Mean Stddev\n")
        while True:
            entries = msr.read_entries()
            if not entries:
                break
            if len(entries) != 2:
                continue
            e1, e2 = entries
            if e1.check_flag(4) or e2.check_flag(4):
                continue
            # Both mates must be flagged as properly paired. The previous
            # condition, `not e1.check_flag(2) and e2.check_flag(2)`, only
            # rejected the case where e1 lacked the flag and e2 had it.
            if not (e1.check_flag(2) and e2.check_flag(2)):
                continue
            first_then_second = e1.check_flag(64) and e2.check_flag(128)
            second_then_first = e1.check_flag(128) and e2.check_flag(64)
            if not (first_then_second or second_then_first):
                continue
            p1 = spcf.convert_line(e1.get_line())
            p2 = spcf.convert_line(e2.get_line())
            if not p1 or not p2:
                continue
            p1 = PSLBasics.PSL(p1)
            p2 = PSLBasics.PSL(p2)
            dist = max(
                p2.value('tEnd') - p1.value('tStart'),
                p1.value('tEnd') - p2.value('tStart'))
            data.append(dist)
            if len(data) % 1000 == 0:
                report(data)
    finally:
        # Only close what was opened here; leave sys.stdin alone.
        if not from_stdin:
            inf.close()

    if len(data) >= 2:
        report(data)
    else:
        # NOTE(review): mean/stddev are defined outside this block; with
        # fewer than two samples they are not called, and placeholders are
        # printed instead of risking a division by zero.
        sys.stderr.write(str(len(data)) + " NA NA \r")
    sys.stderr.write("\n")
def main():
    """Convert a SAM file into PSL lines.

    Reads SAM text from ``infile`` (or stdin for ``-``). Header lines are
    fed to the conversion factory; every other line is converted with
    ``convert_line`` and, when a PSL string comes back, written to the
    output (``--output`` or stdout).

    Optional behavior:
      * ``--give_unique_names`` replaces each written qName with ``Q<n>``,
        where ``n`` counts every PSL line written so far.
      * ``--output_fasta`` / ``--output_fastq`` additionally write the read
        of each converted primary alignment; for flag-16 alignments the
        sequence is passed through ``rc`` and the quality string reversed.
      * ``--get_secondary_alignments``, ``--get_alternative_alignments``
        and ``--get_all_alignments`` also convert the lines returned by
        ``SamBasics.get_secondary_alignments`` and/or
        ``SamBasics.get_alternative_alignments`` for each input line.

    Raises:
        SystemExit: With status 1 when read output is combined with
            non-primary alignments, or when a FASTQ record's sequence and
            quality lengths differ.
    """
    parser = argparse.ArgumentParser(
        description="Convert a sam file into a psl file")
    parser.add_argument('--genome',
                        help="FASTA input file of reference genome")
    parser.add_argument('--get_secondary_alignments',
                        action='store_true',
                        help="Report SA:Z secondary alignments as well")
    parser.add_argument('--get_alternative_alignments',
                        action='store_true',
                        help="Report XA:Z alternative alignments as well")
    parser.add_argument(
        '--get_all_alignments',
        action='store_true',
        help="Report SA:Z and XA:Z alternative alignments as well")
    parser.add_argument('--give_unique_names',
                        action='store_true',
                        help="Output query names will be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--output_fasta',
        help=
        "FILENAME to save an outgoing fasta. Only works for primary alignments."
    )
    group.add_argument(
        '--output_fastq',
        help=
        "FILENAME to save an outgoing fastq. Only works for primary alignments."
    )
    parser.add_argument('infile', help="FILENAME input file or '-' for STDIN")
    parser.add_argument('-o',
                        '--output',
                        help="FILENAME for the output, STDOUT if not set.")
    args = parser.parse_args()

    want_secondary = args.get_secondary_alignments or args.get_all_alignments
    want_alternative = (args.get_alternative_alignments
                        or args.get_all_alignments)
    reads_path = args.output_fasta or args.output_fastq

    if reads_path and (want_secondary or want_alternative):
        sys.stderr.write(
            "ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n"
        )
        # Non-zero status so callers can detect the usage error.
        sys.exit(1)

    # Functions that yield extra SAM lines for one input line, in the
    # order they are to be converted (secondary first, then alternative).
    extra_sources = []
    if want_secondary:
        extra_sources.append(SamBasics.get_secondary_alignments)
    if want_alternative:
        extra_sources.append(SamBasics.get_alternative_alignments)

    inf = sys.stdin if args.infile == '-' else open(args.infile)
    of = None
    off = None
    try:
        of = open(args.output, 'w') if args.output else sys.stdout
        spcf = SamBasics.SAMtoPSLconversionFactory()
        if args.genome:
            spcf.set_genome(args.genome)
        if reads_path:
            off = open(reads_path, 'w')

        def emit(psl_line, serial):
            # Write one PSL record, renaming the query if requested.
            record = PSL(psl_line)
            if args.give_unique_names:
                record.entry['qName'] = 'Q' + str(serial)
            of.write(record.get_line() + "\n")
            return record

        z = 0  # number of PSL lines written so far
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                spcf.read_header_line(line)
                continue

            # Primary alignment.
            psl = spcf.convert_line(line)
            if psl:
                z += 1
                pobj = emit(psl, z)
                if off is not None:
                    sam = SamBasics.SAM(line)
                    sequence = sam.value('seq').upper()
                    quality = sam.value('qual')
                    if sam.check_flag(16):
                        # Flag 16: restore the read's original orientation.
                        sequence = rc(sequence)
                        quality = quality[::-1]
                    if args.output_fasta:
                        off.write(">" + pobj.value('qName') + "\n" +
                                  sequence + "\n")
                    elif len(sequence) == len(quality):
                        off.write("@" + pobj.value('qName') + "\n" +
                                  sequence + "\n" + "+\n" + quality + "\n")
                    else:
                        sys.stderr.write("ERROR: sequence " + sequence +
                                         " length (" + str(len(sequence)) +
                                         ") doesnt match quality " + quality +
                                         " length (" + str(len(quality)) +
                                         ")\n")
                        sys.exit(1)

            # Secondary / alternative alignments reported on the same line.
            for fetch in extra_sources:
                for samline in fetch(line):
                    psl = spcf.convert_line(samline)
                    if psl:
                        z += 1
                        emit(psl, z)
    finally:
        # Close only the handles opened here; the standard streams are
        # left open for the caller, but stdout is flushed.
        if off is not None:
            off.close()
        if of is not None:
            if of is sys.stdout:
                of.flush()
            else:
                of.close()
        if inf is not sys.stdin:
            inf.close()