def main():
  """Rewrite a SAM file for tools that expect splicemap-style SAM records.

  Reads the file named by the in_sam argument ('-' means STDIN) and writes to
  STDOUT.  Header lines pass through unchanged and alignments with flag bit 4
  (unmapped) are dropped.  For every other alignment the flag column becomes
  "16" when bit 16 is set and "0" otherwise, and columns 5, 7, 8 and 9 are
  overwritten with "0", "*", "0" and "0".
  """
  parser = argparse.ArgumentParser(description="Make sam file compatible with tools counting on a splicemap format sam file.")
  parser.add_argument('in_sam',help="FILENAME of sam file, or '-' for STDIN")
  args = parser.parse_args()
  read_stdin = args.in_sam == '-'
  inf = sys.stdin if read_stdin else open(args.in_sam)
  try:
    for line in inf:
      line = line.rstrip()
      if SamBasics.is_header(line):
        sys.stdout.write(line+"\n")
        continue
      e = SamBasics.sam_line_to_dictionary(line)
      if SamBasics.check_flag(e['flag'],4):
        continue # skip the unmapped reads
      f = line.split("\t")
      # Keep only the strand bit of the original flag.
      f[1] = "16" if SamBasics.check_flag(e['flag'],16) else "0"
      f[4] = "0"
      f[6] = "*"
      f[7] = "0"
      f[8] = "0"
      sys.stdout.write("\t".join(f)+"\n")
  finally:
    # Only close a handle this function opened; leave STDIN alone.
    if not read_stdin:
      inf.close()
Exemple #2
0
def main():
    """Rewrite a SAM file for tools that expect splicemap-style SAM records.

    Reads the file named by the in_sam argument ('-' means STDIN) and writes
    to STDOUT.  Header lines pass through unchanged and alignments with flag
    bit 4 (unmapped) are dropped.  For every other alignment the flag column
    becomes "16" when bit 16 is set and "0" otherwise, and columns 5, 7, 8
    and 9 are overwritten with "0", "*", "0" and "0".
    """
    parser = argparse.ArgumentParser(
        description=
        "Make sam file compatible with tools counting on a splicemap format sam file."
    )
    parser.add_argument('in_sam',
                        help="FILENAME of sam file, or '-' for STDIN")
    args = parser.parse_args()
    opened_here = args.in_sam != '-'
    inf = open(args.in_sam) if opened_here else sys.stdin
    try:
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                sys.stdout.write(line + "\n")
                continue
            e = SamBasics.sam_line_to_dictionary(line)
            if SamBasics.check_flag(e['flag'], 4):
                continue  # skip the unmapped reads
            f = line.split("\t")
            # Keep only the strand bit of the original flag.
            if SamBasics.check_flag(e['flag'], 16):
                f[1] = "16"
            else:
                f[1] = "0"
            f[4] = "0"
            f[6] = "*"
            f[7] = "0"
            f[8] = "0"
            sys.stdout.write("\t".join(f) + "\n")
    finally:
        # Never close STDIN; only release the file opened above.
        if opened_here:
            inf.close()
def get_exons_from_seqs(seqs, d, spcf, min_intron_size=68):
    """Build tab-delimited exon lines for every alignment of one read.

    Args:
        seqs: list of [qname, rname, strand, pos, cigar] lists.  The first
            item is labelled primary ('P'), every later one secondary ('S').
        d: SAM entry dictionary for the read.  It is copied for each
            alignment and never modified.
        spcf: object whose convert_line() turns a SAM line into a PSL line.
        min_intron_size: deletions shorter than this are counted as indels,
            and the same value is handed to GenePredBasics.smooth_gaps.
            Defaults to 68, the value that used to be hard-coded here.

    Returns:
        A string holding one newline-terminated line per exon with the
        fields: chrom, exon start, exon end, strand, name, total length of
        'M' operations, indel total, 'P' or 'S', and the length of a leading
        'S'/'H' clip (0 if none).  Returns '' when seqs is empty.
    """
    # CIGAR operations left out of the placeholder sequence length.
    # NOTE(review): 'P' (padding) consumes no query bases in the SAM spec
    # either but is not skipped here -- confirm padded CIGARs never reach us.
    skips = set(['H', 'D', 'N'])
    rows = []
    for sind, seq in enumerate(seqs, 1):
        psec = 'P' if sind == 1 else 'S'  # primary or secondary
        d1 = d.copy()
        d1['rname'] = seq[1]
        d1['flag'] = 0 if seq[2] == '+' else 16
        d1['pos'] = seq[3]
        d1['cigar'] = seq[4]
        cigar_array = SamBasics.parse_cigar(seq[4])
        d1['cigar_array'] = cigar_array
        total_length = 0
        possible_matches = 0
        indels = 0
        qstart = 0
        if cigar_array[0]['op'] in ('S', 'H'):
            qstart = cigar_array[0]['val']
        for ce in cigar_array:
            op = ce['op']
            if op not in skips:
                total_length += ce['val']
            if op == 'M':
                possible_matches += ce['val']
            elif op == 'I':
                indels += ce['val']
            elif op == 'D' and ce['val'] < min_intron_size:
                indels += ce['val']
        # The real bases are not needed downstream; a run of N of the right
        # length stands in for the read sequence.
        d1['seq'] = 'N' * total_length
        nline = SamBasics.entry_to_line(d1)
        pline = spcf.convert_line(nline)
        pentry = PSLBasics.line_to_entry(pline)
        gline = PSLBasics.convert_entry_to_genepred_line(pentry)
        gentry = GenePredBasics.line_to_entry(gline)
        gsmooth = GenePredBasics.smooth_gaps(gentry, min_intron_size)
        for i in range(0, len(gsmooth['exonStarts'])):
            rows.append("\t".join([
                gsmooth['chrom'],
                str(gsmooth['exonStarts'][i]),
                str(gsmooth['exonEnds'][i]),
                gsmooth['strand'],
                gsmooth['name'],
                str(possible_matches),
                str(indels),
                psec,
                str(qstart),
            ]) + "\n")
    return "".join(rows)
def get_exons_from_seqs(seqs,d,spcf,min_intron_size=68):
  """Build tab-delimited exon lines for every alignment of one read.

  Args:
    seqs: list of [qname, rname, strand, pos, cigar] lists.  The first item
      is labelled primary ('P'), every later one secondary ('S').
    d: SAM entry dictionary for the read.  It is copied for each alignment
      and never modified.
    spcf: object whose convert_line() turns a SAM line into a PSL line.
    min_intron_size: deletions shorter than this are counted as indels, and
      the same value is handed to GenePredBasics.smooth_gaps.  Defaults to
      68, the value that used to be hard-coded here.

  Returns:
    A string holding one newline-terminated line per exon with the fields:
    chrom, exon start, exon end, strand, name, total length of 'M'
    operations, indel total, 'P' or 'S', and the length of a leading 'S'/'H'
    clip (0 if none).  Returns '' when seqs is empty.
  """
  # CIGAR operations left out of the placeholder sequence length.
  # NOTE(review): 'P' (padding) consumes no query bases in the SAM spec
  # either but is not skipped here -- confirm padded CIGARs never reach us.
  skips = set(['H','D','N'])
  out_lines = []
  sind = 0
  for seq in seqs:
    sind += 1
    psec = 'S' if sind > 1 else 'P' #primary or secondary
    d1 = d.copy()
    d1['rname'] = seq[1]
    d1['flag'] = 0 if seq[2] == '+' else 16
    d1['pos'] = seq[3]
    d1['cigar'] = seq[4]
    d1['cigar_array'] = SamBasics.parse_cigar(seq[4])
    total_length = 0
    possible_matches = 0
    indels = 0
    qstart = 0
    first = d1['cigar_array'][0]
    if first['op'] == 'S' or first['op'] == 'H':
      qstart = first['val']
    for ce in d1['cigar_array']:
      if ce['op'] not in skips:
        total_length += ce['val']
      if ce['op'] == 'M':
        possible_matches += ce['val']
      elif ce['op'] == 'I':
        indels += ce['val']
      elif ce['op'] == 'D' and ce['val'] < min_intron_size:
        indels += ce['val']
    # The real bases are not needed downstream; a run of N of the right
    # length stands in for the read sequence.
    d1['seq'] = 'N'*total_length
    nline = SamBasics.entry_to_line(d1)
    pline = spcf.convert_line(nline)
    pentry = PSLBasics.line_to_entry(pline)
    gline = PSLBasics.convert_entry_to_genepred_line(pentry)
    gentry = GenePredBasics.line_to_entry(gline)
    gsmooth = GenePredBasics.smooth_gaps(gentry,min_intron_size)
    # Fields shared by every exon line of this alignment.
    tail = [gsmooth['strand'],gsmooth['name'],str(possible_matches),str(indels),psec,str(qstart)]
    for i in range(0,len(gsmooth['exonStarts'])):
      head = [gsmooth['chrom'],str(gsmooth['exonStarts'][i]),str(gsmooth['exonEnds'][i])]
      out_lines.append("\t".join(head+tail)+"\n")
  return "".join(out_lines)
def make_exons(args, thread_index, thread_count):
    """Write exon lines for this worker's share of the mapped alignments.

    Streams ``samtools view -F 4`` over args.sam_file ('-S' is added unless
    the name ends in .bam), keeps each output line whose 1-based ordinal z
    satisfies ``z % thread_count == thread_index`` and writes the result of
    get_exons_from_seqs() for it to
    ``<args.tempdir>/bedpart.<thread_index>.bed``.

    Args:
        args: parsed options; sam_file, reference_genome, tempdir and
            use_secondary_alignments are read.
        thread_index: remainder that selects the lines this call handles.
        thread_count: modulus used for that selection.

    Exits the process with status 1 when an XA:Z entry cannot be parsed.
    """
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    cmd = ['samtools', 'view', '-F', '4']
    if not re.search(r'\.bam$', args.sam_file):
        cmd.append('-S')
    cmd.append(args.sam_file)
    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.reference_genome:
        spcf.set_genome(args.reference_genome)
    sampipe = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    fname = args.tempdir + '/bedpart.' + str(thread_index) + '.bed'
    xa_tag = re.compile(r'XA:Z:(\S+)')
    xa_entry = re.compile(r'([^,]+),([+-])(\d+),([^,]+)')
    z = 0
    with open(fname, 'w') as of:
        with sampipe.stdout as inf:
            for line in inf:
                z += 1
                if z % thread_count != thread_index:
                    continue
                line = line.rstrip()
                if SamBasics.is_header(line):
                    continue
                d = SamBasics.sam_line_to_dictionary(line)
                strand = '-' if SamBasics.check_flag(d['flag'], 16) else '+'
                # First element is the alignment on this line; XA:Z
                # entries are appended after it.
                seqs = [[d['qname'], d['rname'], strand, d['pos'],
                         d['cigar']]]
                m = xa_tag.search(line)
                if m and args.use_secondary_alignments:
                    for secondary in m.group(1).rstrip(";").split(";"):
                        m1 = xa_entry.match(secondary)
                        if not m1:
                            sys.stderr.write("strange secondary format " +
                                             secondary + "\n")
                            # Non-zero status so callers can see the failure.
                            sys.exit(1)
                        seqs.append([
                            d['qname'],
                            m1.group(1),
                            m1.group(2),
                            int(m1.group(3)),
                            m1.group(4)
                        ])
                of.write(get_exons_from_seqs(seqs, d, spcf))
    # Reap the samtools child so it does not linger as a zombie.
    sampipe.wait()
def main():
    """Count the distinct mapped reads in a SAM or BAM file.

    Each mapped alignment from ``samtools view`` contributes "<qname>.1"
    when flag bit 64 is set or "<qname>.2" when bit 128 is set; the names are
    piped through ``sort | uniq | wc -l``.  The count is printed to STDOUT,
    or written to ``<input without .sam/.bam>.mapped_reads`` when
    --add_report is given.

    Exits with status 1 when --add_report is used on a name that does not end
    in .sam or .bam, or when a mapped alignment has neither mate bit set.
    """
    parser = argparse.ArgumentParser(
        description="Get read counts from sam or bam.")
    parser.add_argument('input', help="FILENAME sam or bam")
    # Help text now names the suffix that is actually written below.
    parser.add_argument(
        '--add_report',
        action='store_true',
        help="make a new file where we replace sam or bam with a .mapped_reads"
    )
    args = parser.parse_args()
    baseinput = None
    if args.add_report:
        # Anchored so that e.g. "x.bam.txt" is rejected instead of accepted.
        m = re.match(r'(.+)\.[bs]am$', args.input)
        if not m:
            sys.stderr.write("bad inputfile type should be .bam or .sam\n")
            sys.exit(1)
        baseinput = m.group(1)
    # An argument list (no shell) keeps unusual file names intact.
    view_cmd = ['samtools', 'view']
    if re.search(r'\.sam$', args.input):
        view_cmd.append('-S')
    view_cmd.append(args.input)
    z = 0
    p = subprocess.Popen('sort | uniq | wc -l',
                         shell=True,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    view = subprocess.Popen(view_cmd, stdout=subprocess.PIPE)
    with view.stdout as inf:
        for line in inf:
            z += 1
            if z % 100000 == 0:
                sys.stderr.write(str(z) + " alignments processed\r")
            line = line.rstrip()
            d = SamBasics.sam_line_to_dictionary(line)
            if SamBasics.check_flag(d['flag'], 4):
                continue
            if SamBasics.check_flag(d['flag'], 64):
                p.stdin.write(d['qname'] + '.1' + "\n")
            elif SamBasics.check_flag(d['flag'], 128):
                p.stdin.write(d['qname'] + '.2' + "\n")
            else:
                sys.stderr.write("Unrecognized\n")
                sys.exit(1)
    view.wait()
    sys.stderr.write("\n")
    # communicate() closes the pipe's stdin and collects the wc output.
    aligned_reads = int(p.communicate()[0].rstrip())
    if args.add_report:
        with open(baseinput + '.mapped_reads', 'w') as of:
            of.write(str(aligned_reads) + "\n")
        return
    sys.stdout.write(str(aligned_reads) + "\n")
def main():
    """Report mean and standard deviation of paired-end mapping spans.

    Reads query-ordered SAM entries (file or '-' for STDIN) through
    SamBasics.MultiEntrySamReader.  A group is used only when it holds
    exactly two entries, neither is unmapped (bit 4), both carry bit 2, and
    one is flagged 64 and the other 128.  Both entries are converted to PSL
    and the span is the larger of ``p2.tEnd - p1.tStart`` and
    ``p1.tEnd - p2.tStart``.  A running "Pairs / Mean / Stddev" line is
    written to STDERR every 1000 pairs and once more at the end.
    """
    parser = argparse.ArgumentParser(
        description=
        "Find mapping distance of paired end reads.  Takes an ordered (by query) alignment to a transcriptome.\nSomething that works for an input thus far is like:\nhisat --reorder -x mytranscriptome -1 my_1.fastq -2 my_2.fastq | this_script.py -"
    )
    parser.add_argument(
        'input_sam',
        help="SAMFILE ordered alignment a transcriptome or - for stdin")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input_sam != '-':
        inf = open(args.input_sam)
    msr = SamBasics.MultiEntrySamReader(inf)
    spcf = SamBasics.SAMtoPSLconversionFactory()
    data = []
    sys.stderr.write("Pairs    Mean    Stddev\n")
    while True:
        entries = msr.read_entries()
        if not entries: break
        if len(entries) != 2: continue
        e1, e2 = entries
        if e1.check_flag(4) or e2.check_flag(4): continue
        # Both mates must carry bit 2.  The previous test,
        # "not e1.check_flag(2) and e2.check_flag(2)", bound "not" to e1 only
        # and so let pairs through when neither mate had the bit.
        if not (e1.check_flag(2) and e2.check_flag(2)): continue
        first_then_second = e1.check_flag(64) and e2.check_flag(128)
        second_then_first = e1.check_flag(128) and e2.check_flag(64)
        if not (first_then_second or second_then_first):
            continue
        p1 = spcf.convert_line(e1.get_line())
        p2 = spcf.convert_line(e2.get_line())
        if not p1 or not p2: continue
        p1 = PSLBasics.PSL(p1)
        p2 = PSLBasics.PSL(p2)
        dist = max(
            p2.value('tEnd') - p1.value('tStart'),
            p1.value('tEnd') - p2.value('tStart'))
        data.append(dist)
        if len(data) % 1000 == 0:
            sys.stderr.write(
                str(len(data)) + "    " + str(int(mean(data))) + "    " +
                str(int(stddev(data))) + "              \r")
    if inf is not sys.stdin:
        inf.close()
    if len(data) > 1:
        summary = (str(len(data)) + "    " + str(int(mean(data))) + "    " +
                   str(int(stddev(data))))
    else:
        # Too few pairs for the statistics helpers; report the count only
        # instead of failing after the whole input has been read.
        summary = str(len(data)) + "    NA    NA"
    sys.stderr.write(summary + "              \r")
    sys.stderr.write("\n")
def make_exons(args,thread_index,thread_count):
  """Write exon lines for this worker's share of the mapped alignments.

  Streams ``samtools view -F 4`` over args.sam_file ('-S' is added unless the
  name ends in .bam), keeps each output line whose 1-based ordinal z
  satisfies ``z % thread_count == thread_index`` and writes the result of
  get_exons_from_seqs() for it to
  ``<args.tempdir>/bedpart.<thread_index>.bed``.

  Args:
    args: parsed options; sam_file, reference_genome, tempdir and
      use_secondary_alignments are read.
    thread_index: remainder that selects the lines this call handles.
    thread_count: modulus used for that selection.

  Exits the process with status 1 when an XA:Z entry cannot be parsed.
  """
  # An argument list (no shell) keeps paths with spaces or shell
  # metacharacters intact.
  cmd = ['samtools','view','-F','4']
  if not re.search(r'\.bam$',args.sam_file):
    cmd.append('-S')
  cmd.append(args.sam_file)
  spcf = SamBasics.SAMtoPSLconversionFactory()
  if args.reference_genome:
    spcf.set_genome(args.reference_genome)
  sampipe = subprocess.Popen(cmd,stdout=subprocess.PIPE)
  fname = args.tempdir+'/bedpart.'+str(thread_index)+'.bed'
  z = 0
  with open(fname,'w') as of:
    with sampipe.stdout as inf:
      for line in inf:
        z += 1
        if z%thread_count != thread_index: continue
        line = line.rstrip()
        if SamBasics.is_header(line):
          continue
        d = SamBasics.sam_line_to_dictionary(line)
        strand = '+'
        if SamBasics.check_flag(d['flag'],16):
          strand = '-'
        # First element is the alignment on this line; XA:Z entries follow.
        seqs = [[d['qname'], d['rname'], strand, d['pos'], d['cigar']]]
        m = re.search(r'XA:Z:(\S+)',line)
        if m and args.use_secondary_alignments:
          for secondary in m.group(1).rstrip(";").split(";"):
            m1 = re.match(r'([^,]+),([+-])(\d+),([^,]+)',secondary)
            if not m1:
              sys.stderr.write("strange secondary format "+secondary+"\n")
              # Non-zero status so callers can see the failure.
              sys.exit(1)
            seqs.append([d['qname'], m1.group(1),m1.group(2),int(m1.group(3)),m1.group(4)])
        exons = get_exons_from_seqs(seqs,d,spcf)
        of.write(exons)
  # Reap the samtools child so it does not linger as a zombie.
  sampipe.wait()
def main():
  """Convert a PSL file (optionally gzipped) into SAM.

  A SAM header built from the reference FASTA is written first, followed by
  one SAM line per PSL line that the conversion factory accepts.  Output
  goes to the -o file when given, otherwise to STDOUT; progress messages go
  to STDERR.  Lines for which the factory returns nothing are counted and
  reported as skipped.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('psl',help="FILENAME of psl file (can be gzipped)")
  parser.add_argument('refgenome',help="FASTA of the reference genome")
  parser.add_argument('--min_intron_size',default=68,type=int,help="INT minimum intron size")
  parser.add_argument('--fastq_reads',help="FASTQ of the reads")
  parser.add_argument('--fasta_reads',help="FASTA of the reads")
  parser.add_argument('--skip_directionless_splice',action='store_true',help='only output reads where canonical splice sites indicate direciton if junctions are present')
  parser.add_argument('-o',help="FILENAME to save sam output")
  args = parser.parse_args()

  pscf = SamBasics.PSLtoSAMconversionFactory()
  pscf.set_min_intron_size(args.min_intron_size)
  sys.stderr.write("Creating header from reference fasta\n")
  if args.skip_directionless_splice:
    pscf.set_skip_directionless_splice()
  header = SamBasics.construct_header_from_reference_fasta(args.refgenome)

  # One handle serves both the header and the records.
  out = open(args.o,'w') if args.o else sys.stdout
  out.write(header)

  sys.stderr.write("setting reference fasta for conversion\n")
  pscf.set_reference_genome(args.refgenome)
  sys.stderr.write("determining mapping counts from psl\n")
  pscf.set_mapping_counts(args.psl)
  sys.stderr.write("Establishing library of reads\n")
  if args.fastq_reads:
    pscf.set_read_fastq(args.fastq_reads)
  elif args.fasta_reads:
    pscf.set_read_fasta(args.fasta_reads)

  sys.stderr.write("Performing conversion\n")
  opener = gzip.open if args.psl.endswith('.gz') else open
  gfr = opener(args.psl)
  skipped = 0
  for line in gfr:
    samline = pscf.convert_line(line.rstrip())
    if samline:
      out.write(samline+"\n")
      continue
    # happens if we are skipping directionless splice
    skipped += 1
    sys.stderr.write("\rskipping directionless splice ("+str(skipped)+")            ")
  if args.o:
    out.close()
  gfr.close()
  sys.stderr.write("\n")
def main():
  """Count the distinct mapped reads in a SAM or BAM file.

  Each mapped alignment from ``samtools view`` contributes "<qname>.1" when
  flag bit 64 is set or "<qname>.2" when bit 128 is set; the names are piped
  through ``sort | uniq | wc -l``.  The count is printed to STDOUT, or
  written to ``<input without .sam/.bam>.mapped_reads`` when --add_report is
  given.

  Exits with status 1 when --add_report is used on a name that does not end
  in .sam or .bam, or when a mapped alignment has neither mate bit set.
  """
  parser = argparse.ArgumentParser(description="Get read counts from sam or bam.")
  parser.add_argument('input',help="FILENAME sam or bam")
  # Help text now names the suffix that is actually written below.
  parser.add_argument('--add_report',action='store_true',help="make a new file where we replace sam or bam with a .mapped_reads")
  args = parser.parse_args()
  baseinput = None
  if args.add_report:
    # Anchored so that e.g. "x.bam.txt" is rejected instead of accepted.
    m = re.match(r'(.+)\.[bs]am$',args.input)
    if not m:
      sys.stderr.write("bad inputfile type should be .bam or .sam\n")
      sys.exit(1)
    baseinput = m.group(1)
  # An argument list (no shell) keeps unusual file names intact.
  view_cmd = ['samtools','view']
  if re.search(r'\.sam$',args.input):
    view_cmd.append('-S')
  view_cmd.append(args.input)
  z = 0
  p = subprocess.Popen('sort | uniq | wc -l',shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE)
  view = subprocess.Popen(view_cmd,stdout=subprocess.PIPE)
  with view.stdout as inf:
    for line in inf:
      z += 1
      if z%100000 == 0:
        sys.stderr.write(str(z)+" alignments processed\r")
      line = line.rstrip()
      d = SamBasics.sam_line_to_dictionary(line)
      if SamBasics.check_flag(d['flag'],4):
        continue
      if SamBasics.check_flag(d['flag'],64):
        p.stdin.write(d['qname']+'.1'+"\n")
      elif SamBasics.check_flag(d['flag'],128):
        p.stdin.write(d['qname']+'.2'+"\n")
      else:
        sys.stderr.write("Unrecognized\n")
        sys.exit(1)
  view.wait()
  sys.stderr.write("\n")
  # communicate() closes the pipe's stdin and collects the wc output.
  aligned_reads = int(p.communicate()[0].rstrip())
  if args.add_report:
    with open(baseinput+'.mapped_reads','w') as of:
      of.write(str(aligned_reads)+"\n")
    return
  sys.stdout.write(str(aligned_reads)+"\n")
 def read_sam_file(self,filename):
   """Record an observation from every fourth line read from filename.

   Lines are pulled from SamBasics.GenericSamReader in groups of four and
   the fourth line of each complete group is handed to
   self.record_observation.  Reading stops at the first line that is empty
   after rstrip(), or once self.max_read_count groups have been recorded.
   """
   reader = SamBasics.GenericSamReader(filename)
   recorded = 0
   while recorded < self.max_read_count:
     group = []
     for _ in range(4):
       text = reader.readline().rstrip()
       if not text: break
       group.append(text)
     # An incomplete group means the input ran out.
     if len(group) < 4: break
     self.record_observation(group[3])
     recorded += 1
   reader.close()
Exemple #12
0
def _intron_bounds(pos, cigar_array, min_intron_size):
  """Return [intron_start, next_exon_start, next_op_length] per long intron.

  Walks the CIGAR from pos, advancing the reference coordinate only for the
  operations that consume reference bases in the SAM spec (M, D, N, =, X).
  Each 'N' of at least min_intron_size yields one entry; next_op_length is
  the length of the operation following the 'N', or 0 if there is none.
  """
  consumes_reference = ('M','D','N','=','X')
  bounds = []
  current_loc = pos
  for i in range(0,len(cigar_array)):
    ce = cigar_array[i]
    if ce['op'] == 'N' and ce['val'] >= min_intron_size:
      lbound = current_loc # should be the intron start base index-1
      current_loc += ce['val']
      rbound = current_loc # should be the second exon start base index-1
      right_size = 0
      if i+1 < len(cigar_array):
        right_size = cigar_array[i+1]['val']
      bounds.append([lbound,rbound,right_size])
    elif ce['op'] in consumes_reference:
      # Includes an 'N' shorter than min_intron_size, which still spans
      # reference bases.  S, H, P and I do not move the reference position.
      current_loc += ce['val']
  return bounds

def main():
  """Read a SAM file and write its splice junctions as a bed track.

  For every mapped alignment on a chromosome present in the reference, each
  'N' CIGAR operation of at least --min_intron_size is treated as an intron.
  The two bases at each end of the intron are taken from the reference;
  junctions whose flanks satisfy neither is_canon() nor is_revcanon() are
  skipped with a warning.  Each remaining junction becomes a 12-column bed
  line with two 50-base blocks, and its name is
  ``(reads)[right-size range_distinct positions](unique/multi-mapped reads)``.
  Output goes to --output or STDOUT, preceded by a track line when at least
  one junction was found.
  """
  parser = argparse.ArgumentParser(description="Read a sam file and output a bed file in the format of junction_color.bed")
  parser.add_argument('-o','--output',help='FILENAME is output')
  parser.add_argument('--min_intron_size',type=int,default=68,help='minimum intron size')
  parser.add_argument('infile',help='FILENAME of sam file or "-" for STDIN')
  parser.add_argument('reference_genome',help='FILENAME of the reference genome')
  args = parser.parse_args()

  # get our reference genome
  sys.stderr.write("reading reference genome\n")
  with open(args.reference_genome) as genome_fh:
    g = FastaData(genome_fh.read())
  sys.stderr.write("finished reading reference genome\n")
  # Built once instead of calling g.keys() for every alignment.
  known_chroms = set(g.keys())

  inf = sys.stdin
  read_mapping_count = {}
  junctions = {}
  if args.infile != '-':
    inf = open(args.infile)
  sys.stderr.write("reading through sam file\n")
  zall = 0
  zn = 0
  for line in inf:
    line = line.rstrip()
    if SamBasics.is_header(line): continue
    d = SamBasics.sam_line_to_dictionary(line)
    chrom = d['rname']
    if chrom =='*': continue
    if chrom not in known_chroms:
      sys.stderr.write("WARNING: "+chrom+" not in reference, skipping\n")
      continue
    if SamBasics.check_flag(d['flag'],0x4): #check if its unmapped
      continue  # we can ignore the unmapped things for now
    mate = 'U'
    if SamBasics.check_flag(d['flag'],0x40):
      mate = 'L'
    elif SamBasics.check_flag(d['flag'],0x80):
      mate = 'R'
    actual_read = d['qname']+"\t"+mate
    read_mapping_count[actual_read] = read_mapping_count.get(actual_read,0)+1
    bounds = _intron_bounds(d['pos'],d['cigar_array'],args.min_intron_size)
    if not bounds: continue # there are no splices to report here
    for bound in bounds:
      zall += 1
      # First two and last two bases of the intron.
      intronflank = g[chrom][bound[0]-1:bound[0]+1].upper() + '-' + \
                    g[chrom][bound[1]-3:bound[1]-1].upper()
      strand = ''
      if is_canon(intronflank): # its a positive strand
        strand = '+'
      elif is_revcanon(intronflank): # its a negative strand
        strand = '-'
      else:
        # We can't deal with the non-canonical splice sorry
        zn += 1
        sys.stderr.write("WARNING skipping non-canonical splice ("+str(zn)+"/"+str(zall)+")\r")
        continue
      # If we are still in we have successfully found a splice
      out_start = bound[0]-51
      out_end = bound[1]+49
      bed = [
        chrom,
        str(out_start),
        str(out_end),
        '*', # name, filled in once all reads have been seen
        '50', # score
        strand,
        str(out_start), # thickStart
        str(out_end), # thickEnd
        '0,0,0', # rgb
        '2', # block count
        '50,50', # block sizes
        '0,'+str(bound[1]-bound[0]+50), # block starts
      ]
      entry = "\t".join(bed)
      if entry not in junctions:
        junctions[entry] = {'reads':set(),'positions':set(),'right_sizes':set()}
      junctions[entry]['reads'].add(actual_read)
      junctions[entry]['positions'].add(d['pos'])
      junctions[entry]['right_sizes'].add(bound[2])
  if args.infile != '-':
    inf.close()
  sys.stderr.write("\n")
  sys.stderr.write("finished reading sam\n")
  of = sys.stdout
  if args.output:
    of = open(args.output,'w')
  try:
    if len(junctions) > 0: # if we have stuff lets print a header
      of.write("track\tname=junctions\tdescription=\"SpliceMap junctions\" itemRgb=\"On\"\n")
    for entry in junctions:
      info = junctions[entry]
      nR = len(info['reads'])
      width = max(info['right_sizes'])-min(info['right_sizes'])
      nNR = len(info['positions'])
      nUR = 0
      nMR = 0
      for read in info['reads']:
        if read_mapping_count[read] == 1:
          nUR += 1
        elif read_mapping_count[read] > 1:
          nMR += 1
        else:
          sys.stderr.write("ERROR: nonsense read count\n")
          return
      name = '('+str(nR)+')['+str(width)+'_'+str(nNR)+']('+str(nUR)+'/'+str(nMR)+')'
      bed = entry.split("\t")
      bed[3] = name
      of.write("\t".join(bed)+"\n")
  finally:
    # Flush and release the output file on every exit path; keep STDOUT open.
    if args.output:
      of.close()
def main():
    """Convert a SAM file into PSL lines.

    Header lines are fed to the conversion factory; every other line is
    converted and written to --output (or STDOUT) when the conversion
    yields a result.  With --give_unique_names each written entry is renamed
    'Q<n>', n counting written entries.  --output_fasta / --output_fastq
    additionally save the read of each converted line, reverse-complemented
    (and quality reversed) when flag bit 16 is set.  The
    --get_*_alignments options also convert the lines returned by
    SamBasics.get_secondary_alignments / get_alternative_alignments.

    Exits with status 1 when fasta/fastq output is combined with any
    --get_*_alignments option, or when a fastq record's sequence and
    quality lengths differ.
    """
    parser = argparse.ArgumentParser(
        description="Convert a sam file into a psl file")
    parser.add_argument('--genome',
                        help="FASTA input file of reference genome")
    parser.add_argument('--get_secondary_alignments',
                        action='store_true',
                        help="Report SA:Z secondary alignments as well")
    parser.add_argument('--get_alternative_alignments',
                        action='store_true',
                        help="Report XA:Z alternative alignments as well")
    parser.add_argument(
        '--get_all_alignments',
        action='store_true',
        help="Report SA:Z and XA:Z alternative alignments as well")
    parser.add_argument('--give_unique_names',
                        action='store_true',
                        help="Output query names will be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--output_fasta',
        help=
        "FILENAME to save an outgoing fasta.  Only works for primary alignments."
    )
    group.add_argument(
        '--output_fastq',
        help=
        "FILENAME to save an outgoing fastq.  Only works for primary alignments."
    )
    parser.add_argument('infile', help="FILENAME input file or '-' for STDIN")
    parser.add_argument('-o',
                        '--output',
                        help="FILENAME for the output, STDOUT if not set.")
    args = parser.parse_args()
    wants_reads = args.output_fasta or args.output_fastq
    wants_extra = (args.get_secondary_alignments
                   or args.get_alternative_alignments
                   or args.get_all_alignments)
    if wants_reads and wants_extra:
        sys.stderr.write(
            "ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n"
        )
        sys.exit(1)
    inf = sys.stdin
    if args.infile != '-':
        inf = open(args.infile)
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.genome: spcf.set_genome(args.genome)
    off = None
    if args.output_fasta:
        off = open(args.output_fasta, 'w')
    if args.output_fastq:
        off = open(args.output_fastq, 'w')
    # Sources of extra SAM lines, consulted in this order for every record.
    extra_sources = []
    if args.get_secondary_alignments or args.get_all_alignments:
        extra_sources.append(SamBasics.get_secondary_alignments)
    if args.get_alternative_alignments or args.get_all_alignments:
        extra_sources.append(SamBasics.get_alternative_alignments)
    z = 0
    for line in inf:
        line = line.rstrip()
        if SamBasics.is_header(line):
            spcf.read_header_line(line)
            continue
        # We have a line to convert
        psl = spcf.convert_line(line)
        if psl:
            pobj = PSL(psl)
            z += 1
            if args.give_unique_names:
                pobj.entry['qName'] = 'Q' + str(z)
            of.write(pobj.get_line() + "\n")
            if wants_reads:
                sam = SamBasics.SAM(line)
                sequence = sam.value('seq').upper()
                quality = sam.value('qual')
                if sam.check_flag(16):
                    sequence = rc(sequence)
                    quality = quality[::-1]
                if args.output_fasta:
                    off.write(">" + pobj.value('qName') + "\n" + sequence +
                              "\n")
                elif len(sequence) == len(quality):
                    off.write("@" + pobj.value('qName') + "\n" + sequence +
                              "\n" + "+\n" + quality + "\n")
                else:
                    sys.stderr.write("ERROR: sequence " + sequence +
                                     " length (" + str(len(sequence)) +
                                     ") doesnt match quality " + quality +
                                     " length (" + str(len(quality)) +
                                     ")\n")
                    sys.exit(1)
        # Lets look for secondary / alternative alignments to convert
        for get_lines in extra_sources:
            for samline in get_lines(line):
                psl = spcf.convert_line(samline)
                if psl:
                    z += 1
                    pobj = PSL(psl)
                    if args.give_unique_names:
                        pobj.entry['qName'] = 'Q' + str(z)
                    of.write(pobj.get_line() + "\n")
    if off is not None:
        # Previously left open, so buffered reads could be lost.
        off.close()
    inf.close()
    of.close()
def main():
  """Convert a SAM file into PSL lines.

  Header lines are fed to the conversion factory; every other line is
  converted and written to --output (or STDOUT) when the conversion yields a
  result.  With --give_unique_names each written entry is renamed 'Q<n>', n
  counting written entries.  --output_fasta / --output_fastq additionally
  save the read of each converted line, reverse-complemented (and quality
  reversed) when flag bit 16 is set.  The --get_*_alignments options also
  convert the lines returned by SamBasics.get_secondary_alignments /
  get_alternative_alignments.

  Exits with status 1 when fasta/fastq output is combined with any
  --get_*_alignments option, or when a fastq record's sequence and quality
  lengths differ.
  """
  parser = argparse.ArgumentParser(description="Convert a sam file into a psl file")
  parser.add_argument('--genome',help="FASTA input file of reference genome")
  parser.add_argument('--get_secondary_alignments',action='store_true',help="Report SA:Z secondary alignments as well")
  parser.add_argument('--get_alternative_alignments',action='store_true',help="Report XA:Z alternative alignments as well")
  parser.add_argument('--get_all_alignments',action='store_true',help="Report SA:Z and XA:Z alternative alignments as well")
  parser.add_argument('--give_unique_names',action='store_true',help="Output query names will be unique.")
  group = parser.add_mutually_exclusive_group()
  group.add_argument('--output_fasta',help="FILENAME to save an outgoing fasta.  Only works for primary alignments.")
  group.add_argument('--output_fastq',help="FILENAME to save an outgoing fastq.  Only works for primary alignments.")
  parser.add_argument('infile',help="FILENAME input file or '-' for STDIN")
  parser.add_argument('-o','--output',help="FILENAME for the output, STDOUT if not set.")
  args = parser.parse_args()
  save_reads = args.output_fasta or args.output_fastq
  if save_reads and (args.get_secondary_alignments or args.get_alternative_alignments or args.get_all_alignments):
    sys.stderr.write("ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n")
    sys.exit(1)
  inf = sys.stdin
  if args.infile != '-':
    inf = open(args.infile)
  of = sys.stdout
  if args.output:
    of = open(args.output,'w')
  spcf = SamBasics.SAMtoPSLconversionFactory()
  if args.genome: spcf.set_genome(args.genome)
  off = None
  if args.output_fasta:
    off = open(args.output_fasta,'w')
  if args.output_fastq:
    off = open(args.output_fastq,'w')
  # Sources of extra SAM lines, consulted in this order for every record.
  extra_sources = []
  if args.get_secondary_alignments or args.get_all_alignments:
    extra_sources.append(SamBasics.get_secondary_alignments)
  if args.get_alternative_alignments or args.get_all_alignments:
    extra_sources.append(SamBasics.get_alternative_alignments)
  z = 0
  for line in inf:
    line = line.rstrip()
    if SamBasics.is_header(line):
      spcf.read_header_line(line)
      continue
    # We have a line to convert
    psl = spcf.convert_line(line)
    if psl:
      pobj = PSL(psl)
      z += 1
      if args.give_unique_names:
        pobj.entry['qName'] = 'Q'+str(z)
      of.write(pobj.get_line()+"\n")
      if save_reads:
        sam = SamBasics.SAM(line)
        sequence = sam.value('seq').upper()
        quality = sam.value('qual')
        if sam.check_flag(16):
          sequence = rc(sequence)
          quality = quality[::-1]
        if args.output_fasta:
          off.write(">"+pobj.value('qName')+"\n"+sequence+"\n")
        elif len(sequence) == len(quality):
          off.write("@"+pobj.value('qName')+"\n"+sequence+"\n"+"+\n"+quality+"\n")
        else:
          sys.stderr.write("ERROR: sequence "+sequence+" length ("+str(len(sequence))+") doesnt match quality "+quality+" length ("+str(len(quality))+")\n")
          sys.exit(1)
    # Lets look for secondary / alternative alignments to convert
    for get_lines in extra_sources:
      for samline in get_lines(line):
        psl = spcf.convert_line(samline)
        if psl:
          z += 1
          pobj = PSL(psl)
          if args.give_unique_names:
            pobj.entry['qName'] = 'Q'+str(z)
          of.write(pobj.get_line()+"\n")
  if off is not None:
    # Previously left open, so buffered reads could be lost.
    off.close()
  inf.close()
  of.close()
Exemple #15
0
def main():
    """Convert a PSL alignment file to SAM.

    Command-line arguments:
      psl                          PSL input; opened with gzip when the name
                                   ends in '.gz'
      refgenome                    reference genome FASTA, used both to build
                                   the SAM header and for the conversion
      --min_intron_size            forwarded to the conversion factory
      --fastq_reads/--fasta_reads  optional read library (FASTQ wins if both
                                   are given)
      --skip_directionless_splice  forwarded to the conversion factory
      -o                           output SAM file; STDOUT when omitted

    Progress messages go to STDERR.  The output file and the PSL input are
    closed even when the conversion raises part-way through.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('psl', help="FILENAME of psl file (can be gzipped)")
    parser.add_argument('refgenome', help="FASTA of the reference genome")
    parser.add_argument('--min_intron_size',
                        default=68,
                        type=int,
                        help="INT minimum intron size")
    parser.add_argument('--fastq_reads', help="FASTQ of the reads")
    parser.add_argument('--fasta_reads', help="FASTA of the reads")
    parser.add_argument(
        '--skip_directionless_splice',
        action='store_true',
        help=
        'only output reads where canonical splice sites indicate direciton if junctions are present'
    )
    parser.add_argument('-o', help="FILENAME to save sam output")
    args = parser.parse_args()

    pscf = SamBasics.PSLtoSAMconversionFactory()
    pscf.set_min_intron_size(args.min_intron_size)
    sys.stderr.write("Creating header from reference fasta\n")
    if args.skip_directionless_splice:
        pscf.set_skip_directionless_splice()
    header = SamBasics.construct_header_from_reference_fasta(args.refgenome)

    # Route all output through one handle so the header and the records
    # share a single code path whether or not -o was given.
    of = open(args.o, 'w') if args.o else sys.stdout
    gfr = None
    try:
        of.write(header)
        sys.stderr.write("setting reference fasta for conversion\n")
        pscf.set_reference_genome(args.refgenome)
        sys.stderr.write("determining mapping counts from psl\n")
        pscf.set_mapping_counts(args.psl)
        sys.stderr.write("Establishing library of reads\n")
        if args.fastq_reads:
            pscf.set_read_fastq(args.fastq_reads)
        elif args.fasta_reads:
            pscf.set_read_fasta(args.fasta_reads)
        sys.stderr.write("Performing conversion\n")
        if args.psl.endswith('.gz'):
            gfr = gzip.open(args.psl)
        else:
            gfr = open(args.psl)
        skipped = 0
        for line in gfr:
            samline = pscf.convert_line(line.rstrip())
            if not samline:
                # happens if we are skipping directionless splice
                skipped += 1
                sys.stderr.write("\rskipping directionless splice (" +
                                 str(skipped) + ")            ")
                continue
            of.write(samline + "\n")
    finally:
        # Never close STDOUT on behalf of the caller; only files we opened.
        if of is not sys.stdout:
            of.close()
        if gfr is not None:
            gfr.close()
    sys.stderr.write("\n")
def _score_alignment(d, g):
    """Walk one alignment's CIGAR and tally aligned length and mismatches.

    d: parsed SAM record; the keys read here are 'pos', 'rname', 'seq' and
       'cigar_array' (a sequence of {'op': ..., 'val': ...} entries).
    g: dict mapping reference name -> reference sequence.

    Returns (aligned_length, mismatches).  aligned_length sums the M/X/=
    operations.  mismatches counts differing bases inside those operations
    plus every soft-clipped (S) or inserted (I) base.
    """
    aligned_length = 0
    mismatches = 0
    read_index = 1           # 1-based cursor into the read sequence
    chrom_index = d['pos']   # 1-based cursor on the reference
    for e in d['cigar_array']:
        op = e['op']
        val = e['val']
        if op in ('M', 'X', '='):
            aligned_length += val
            refseq = g[d['rname']][chrom_index-1:chrom_index-1+val].upper()
            readseq = d['seq'][read_index-1:read_index-1+val].upper()
            mismatches += sum(1 for r, q in zip(refseq, readseq) if r != q)
            # Positions that cannot be compared because a slice came back
            # short (e.g. the alignment runs off the end of the reference)
            # are counted as mismatches instead of raising IndexError.
            mismatches += val - min(len(refseq), len(readseq))
            read_index += val
            chrom_index += val
        elif op in ('S', 'I'):
            mismatches += val
            read_index += val
        elif op in ('N', 'D'):
            chrom_index += val
        elif op == 'H':
            # Hard-clipped bases are absent from SEQ and, per the SAM
            # specification, consume no reference either.  Advancing the
            # reference cursor here would shift every later comparison.
            pass
        else:
            sys.stderr.write("warning: strange SAM op\n")
    return aligned_length, mismatches


def _collect_best_alignments(sam_stream, g, max_allowed_mismatches):
    """Read SAM lines from sam_stream and keep the longest alignment per read.

    Lines whose reference name (third column) is '*' are skipped, as are
    alignments with more than max_allowed_mismatches mismatches.

    Returns a dict: qname -> {'alignment_length': int, 'mismatches': int}.
    """
    reads = {}
    while True:
        line = sam_stream.readline()
        if not line:
            break
        f = line.rstrip().split("\t")
        if f[2] == '*':
            continue
        d = SamBasics.sam_line_to_dictionary(line)
        sumlen, mismatches = _score_alignment(d, g)
        if mismatches > max_allowed_mismatches:
            continue
        best = reads.setdefault(d['qname'],
                                {'alignment_length': 0, 'mismatches': 0})
        # save the biggest sum for the read name
        if sumlen > best['alignment_length']:
            best['alignment_length'] = sumlen
            best['mismatches'] = mismatches
    return reads


def check_parameters(z, gz, ifile, tdir, max_allowed_mismatches, Q, fsize):
    """Align one FASTQ with ``bwa mem`` and report how much of it mapped.

    z:      job identifier; the reads are taken from ``<tdir>/<z>.fq`` and z
            is echoed back as the first element of the result.
    gz:     dict mapping reference name -> zlib-compressed reference sequence.
    ifile:  first positional argument handed to ``bwa mem``
            (presumably the bwa index prefix -- confirm against the caller).
    tdir:   directory holding the per-job FASTQ files.
    max_allowed_mismatches: alignments with more mismatches are ignored.
    Q:      queue-like object; ``[z, mapped_reads, mapped_bases]`` is put on
            it and ``Q.qsize()`` drives the progress display.
    fsize:  total shown as the denominator of the progress display.

    The output of ``bwa mem`` is piped through ``samtools view -S -``.  Both
    child processes are waited on and all pipe/devnull handles are closed
    before returning, including when parsing raises.
    """
    g = {n: zlib.decompress(gz[n]) for n in gz}
    # Explicit argv lists so that paths containing whitespace stay intact.
    cmd1 = ["bwa", "mem", ifile, tdir + '/' + str(z) + '.fq']
    cmd2 = ["samtools", "view", "-S", "-"]
    FNULL = open(os.devnull, 'w')
    try:
        stream1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, stderr=FNULL)
        try:
            stream2 = subprocess.Popen(cmd2, stdin=stream1.stdout,
                                       stdout=subprocess.PIPE, stderr=FNULL)
            # Drop our copy of the pipe so bwa sees SIGPIPE if samtools exits.
            stream1.stdout.close()
            try:
                reads = _collect_best_alignments(stream2.stdout, g,
                                                 max_allowed_mismatches)
            finally:
                stream2.stdout.close()
                stream2.wait()
        finally:
            # Closing again is harmless; it matters when samtools failed to
            # start and the pipe above was never closed.
            stream1.stdout.close()
            stream1.wait()
    finally:
        FNULL.close()

    mapped_reads = len(reads)
    mapped_bases = sum(r['alignment_length'] for r in reads.values())
    res = [z, mapped_reads, mapped_bases]
    Q.put(res)
    progress = Q.qsize()
    sys.stderr.write('\r'+(' '*40))
    sys.stderr.write('\r'+str(progress)+"/"+str(fsize))
    sys.stderr.flush()
    return