Exemple #1
0
def main():
    """Rewrite a SAM file so splicemap-style counting tools can read it.

    Reads the SAM file named on the command line (or STDIN when the
    argument is '-') and writes the result to STDOUT:

    * header lines (as judged by ``SamBasics.is_header``) are echoed
      unchanged;
    * alignments for which ``SamBasics.check_flag(flag, 4)`` is true are
      dropped (unmapped reads);
    * for every other alignment, column 2 (FLAG) becomes "16" when
      ``check_flag(flag, 16)`` is true and "0" otherwise, column 5 is set
      to "0", column 7 to "*", and columns 8 and 9 to "0".
    """
    parser = argparse.ArgumentParser(
        description=
        "Make sam file compatible with tools counting on a splicemap format sam file."
    )
    parser.add_argument('in_sam',
                        help="FILENAME of sam file, or '-' for STDIN")
    args = parser.parse_args()

    from_stdin = args.in_sam == '-'
    inf = sys.stdin if from_stdin else open(args.in_sam)
    try:
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                # A parenthesised single-argument print behaves the same
                # on Python 2 and Python 3.
                print(line)
                continue
            e = SamBasics.sam_line_to_dictionary(line)
            if SamBasics.check_flag(e['flag'], 4):
                continue  # skip the unmapped reads
            f = line.split("\t")
            # Keep only the strand information in the flag column.
            f[1] = "16" if SamBasics.check_flag(e['flag'], 16) else "0"
            f[4] = "0"
            f[6] = "*"
            f[7] = "0"
            f[8] = "0"
            print("\t".join(f))
    finally:
        # Only close handles we opened ourselves; leave STDIN alone.
        if not from_stdin:
            inf.close()
def main():
  """Rewrite a SAM file so splicemap-style counting tools can read it.

  Reads the SAM file named on the command line (or STDIN when the
  argument is '-') and writes the result to STDOUT:

  * header lines (as judged by ``SamBasics.is_header``) are echoed
    unchanged;
  * alignments for which ``SamBasics.check_flag(flag, 4)`` is true are
    dropped (unmapped reads);
  * for every other alignment, column 2 (FLAG) becomes "16" when
    ``check_flag(flag, 16)`` is true and "0" otherwise, column 5 is set
    to "0", column 7 to "*", and columns 8 and 9 to "0".
  """
  parser = argparse.ArgumentParser(description="Make sam file compatible with tools counting on a splicemap format sam file.")
  parser.add_argument('in_sam',help="FILENAME of sam file, or '-' for STDIN")
  args = parser.parse_args()

  from_stdin = args.in_sam == '-'
  inf = sys.stdin if from_stdin else open(args.in_sam)
  try:
    for line in inf:
      line = line.rstrip()
      if SamBasics.is_header(line):
        # A parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print(line)
        continue
      e = SamBasics.sam_line_to_dictionary(line)
      if SamBasics.check_flag(e['flag'],4):
        continue # skip the unmapped reads
      f = line.split("\t")
      # Keep only the strand information in the flag column.
      f[1] = "16" if SamBasics.check_flag(e['flag'],16) else "0"
      f[4] = "0"
      f[6] = "*"
      f[7] = "0"
      f[8] = "0"
      print("\t".join(f))
  finally:
    # Only close handles we opened ourselves; leave STDIN alone.
    if not from_stdin:
      inf.close()
def main():
    """Count distinct mapped mates in a SAM or BAM file.

    Alignments are streamed from ``samtools view`` (with ``-S`` when the
    input name ends in ``.sam``).  For every alignment that is not
    flagged 4 (unmapped), ``<qname>.1`` (flag 64) or ``<qname>.2``
    (flag 128) is fed to a ``sort | uniq | wc -l`` pipeline, whose output
    is the number of distinct mapped mates.

    The count is printed to STDOUT, or, with ``--add_report``, written to
    ``<input without .sam/.bam>.mapped_reads``.

    Exits with a non-zero status when ``--add_report`` is given for an
    input that does not end in ``.sam``/``.bam``, or when a mapped
    alignment carries neither flag 64 nor flag 128.
    """
    parser = argparse.ArgumentParser(
        description="Get read counts from sam or bam.")
    parser.add_argument('input', help="FILENAME sam or bam")
    parser.add_argument(
        '--add_report',
        action='store_true',
        # The report really is written with a .mapped_reads suffix (see the
        # end of this function), so the help text says so.
        help="make a new file where we replace sam or bam with a .mapped_reads"
    )
    args = parser.parse_args()

    baseinput = None
    if args.add_report:
        # Anchor at the end so e.g. "reads.bam.bak" is rejected instead of
        # silently being treated as "reads".
        m = re.match(r'(.+)\.[bs]am$', args.input)
        if not m:
            sys.stderr.write("bad inputfile type should be .bam or .sam\n")
            sys.exit(1)
        baseinput = m.group(1)

    # Build argv as a list so filenames containing spaces or shell
    # metacharacters reach samtools intact.
    view_cmd = ['samtools', 'view']
    if re.search(r'\.sam$', args.input):
        view_cmd.append('-S')
    view_cmd.append(args.input)

    z = 0
    p = subprocess.Popen('sort | uniq | wc -l',
                         shell=True,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE)
    sampipe = subprocess.Popen(view_cmd, stdout=subprocess.PIPE)
    with sampipe.stdout as inf:
        for line in inf:
            z += 1
            if z % 100000 == 0:
                sys.stderr.write(str(z) + " alignments processed\r")
            line = line.rstrip()
            d = SamBasics.sam_line_to_dictionary(line)
            if SamBasics.check_flag(d['flag'], 4):
                continue  # unmapped, nothing to count
            if SamBasics.check_flag(d['flag'], 64):
                p.stdin.write(d['qname'] + '.1' + "\n")
            elif SamBasics.check_flag(d['flag'], 128):
                p.stdin.write(d['qname'] + '.2' + "\n")
            else:
                sys.stderr.write("Unrecognized\n")
                sys.exit(1)
    sampipe.wait()  # reap the samtools child
    sys.stderr.write("\n")
    # communicate() closes the pipeline's stdin, letting sort finish.
    aligned_reads = int(p.communicate()[0].rstrip())
    if args.add_report:
        with open(baseinput + '.mapped_reads', 'w') as of:
            of.write(str(aligned_reads) + "\n")
        return
    print(aligned_reads)
def main():
  """Count distinct mapped mates in a SAM or BAM file.

  Alignments are streamed from ``samtools view`` (with ``-S`` when the
  input name ends in ``.sam``).  For every alignment that is not flagged
  4 (unmapped), ``<qname>.1`` (flag 64) or ``<qname>.2`` (flag 128) is
  fed to a ``sort | uniq | wc -l`` pipeline, whose output is the number
  of distinct mapped mates.

  The count is printed to STDOUT, or, with ``--add_report``, written to
  ``<input without .sam/.bam>.mapped_reads``.

  Exits with a non-zero status when ``--add_report`` is given for an
  input that does not end in ``.sam``/``.bam``, or when a mapped
  alignment carries neither flag 64 nor flag 128.
  """
  parser = argparse.ArgumentParser(description="Get read counts from sam or bam.")
  parser.add_argument('input',help="FILENAME sam or bam")
  # The report really is written with a .mapped_reads suffix (see the end
  # of this function), so the help text says so.
  parser.add_argument('--add_report',action='store_true',help="make a new file where we replace sam or bam with a .mapped_reads")
  args = parser.parse_args()

  baseinput = None
  if args.add_report:
    # Anchor at the end so e.g. "reads.bam.bak" is rejected instead of
    # silently being treated as "reads".
    m = re.match(r'(.+)\.[bs]am$',args.input)
    if not m:
      sys.stderr.write("bad inputfile type should be .bam or .sam\n")
      sys.exit(1)
    baseinput = m.group(1)

  # Build argv as a list so filenames containing spaces or shell
  # metacharacters reach samtools intact.
  view_cmd = ['samtools','view']
  if re.search(r'\.sam$',args.input):
    view_cmd.append('-S')
  view_cmd.append(args.input)

  z = 0
  p = subprocess.Popen('sort | uniq | wc -l',shell=True,stdin=subprocess.PIPE,stdout=subprocess.PIPE)
  sampipe = subprocess.Popen(view_cmd,stdout=subprocess.PIPE)
  with sampipe.stdout as inf:
    for line in inf:
      z += 1
      if z%100000 == 0:
        sys.stderr.write(str(z)+" alignments processed\r")
      line = line.rstrip()
      d = SamBasics.sam_line_to_dictionary(line)
      if SamBasics.check_flag(d['flag'],4):
        continue # unmapped, nothing to count
      if SamBasics.check_flag(d['flag'],64):
        p.stdin.write(d['qname']+'.1'+"\n")
      elif SamBasics.check_flag(d['flag'],128):
        p.stdin.write(d['qname']+'.2'+"\n")
      else:
        sys.stderr.write("Unrecognized\n")
        sys.exit(1)
  sampipe.wait() # reap the samtools child
  sys.stderr.write("\n")
  # communicate() closes the pipeline's stdin, letting sort finish.
  aligned_reads = int(p.communicate()[0].rstrip())
  if args.add_report:
    with open(baseinput+'.mapped_reads','w') as of:
      of.write(str(aligned_reads)+"\n")
    return
  print(aligned_reads)
def make_exons(args, thread_index, thread_count):
    """Write this worker's share of the alignments as exon records.

    Mapped alignments are streamed from ``samtools view -F 4`` (adding
    ``-S`` unless the file name ends in ``.bam``).  Output lines are
    numbered from 1; this worker handles only those whose number modulo
    ``thread_count`` equals ``thread_index``.  For each handled alignment
    the primary placement, plus any ``XA:Z:`` placements when
    ``args.use_secondary_alignments`` is set, is passed to
    ``get_exons_from_seqs`` and the returned text is appended to
    ``<args.tempdir>/bedpart.<thread_index>.bed``.

    Args:
        args: parsed options; ``sam_file``, ``reference_genome``,
            ``tempdir`` and ``use_secondary_alignments`` are read.
        thread_index: index of this worker.
        thread_count: total number of workers sharing the input.

    Exits with a non-zero status if an ``XA:Z:`` entry cannot be parsed.
    """
    # Build argv as a list so filenames containing spaces or shell
    # metacharacters reach samtools intact.
    view_cmd = ['samtools', 'view', '-F', '4']
    if not re.search(r'\.bam$', args.sam_file):
        view_cmd.append('-S')
    view_cmd.append(args.sam_file)

    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.reference_genome:
        spcf.set_genome(args.reference_genome)

    xa_tag = re.compile(r'XA:Z:(\S+)')
    xa_entry = re.compile(r'([^,]+),([+-])(\d+),([^,]+)')

    sampipe = subprocess.Popen(view_cmd, stdout=subprocess.PIPE)
    fname = args.tempdir + '/bedpart.' + str(thread_index) + '.bed'
    z = 0
    with open(fname, 'w') as of:
        with sampipe.stdout as inf:
            for line in inf:
                z += 1
                if z % thread_count != thread_index:
                    continue  # another worker owns this line
                line = line.rstrip()
                if SamBasics.is_header(line):
                    continue
                d = SamBasics.sam_line_to_dictionary(line)
                strand = '-' if SamBasics.check_flag(d['flag'], 16) else '+'
                seqs = [[d['qname'], d['rname'], strand, d['pos'],
                         d['cigar']]]
                if args.use_secondary_alignments:
                    m = xa_tag.search(line)
                    if m:
                        for secondary in m.group(1).rstrip(";").split(";"):
                            m1 = xa_entry.match(secondary)
                            if not m1:
                                sys.stderr.write("strange secondary format " +
                                                 secondary + "\n")
                                sys.exit(1)
                            seqs.append([
                                d['qname'],
                                m1.group(1),
                                m1.group(2),
                                int(m1.group(3)),
                                m1.group(4),
                            ])
                of.write(get_exons_from_seqs(seqs, d, spcf))
    sampipe.wait()  # reap the samtools child
def make_exons(args,thread_index,thread_count):
  """Write this worker's share of the alignments as exon records.

  Mapped alignments are streamed from ``samtools view -F 4`` (adding
  ``-S`` unless the file name ends in ``.bam``).  Output lines are
  numbered from 1; this worker handles only those whose number modulo
  ``thread_count`` equals ``thread_index``.  For each handled alignment
  the primary placement, plus any ``XA:Z:`` placements when
  ``args.use_secondary_alignments`` is set, is passed to
  ``get_exons_from_seqs`` and the returned text is appended to
  ``<args.tempdir>/bedpart.<thread_index>.bed``.

  Args:
    args: parsed options; ``sam_file``, ``reference_genome``,
      ``tempdir`` and ``use_secondary_alignments`` are read.
    thread_index: index of this worker.
    thread_count: total number of workers sharing the input.

  Exits with a non-zero status if an ``XA:Z:`` entry cannot be parsed.
  """
  # Build argv as a list so filenames containing spaces or shell
  # metacharacters reach samtools intact.
  view_cmd = ['samtools','view','-F','4']
  if not re.search(r'\.bam$',args.sam_file):
    view_cmd.append('-S')
  view_cmd.append(args.sam_file)

  spcf = SamBasics.SAMtoPSLconversionFactory()
  if args.reference_genome:
    spcf.set_genome(args.reference_genome)

  xa_tag = re.compile(r'XA:Z:(\S+)')
  xa_entry = re.compile(r'([^,]+),([+-])(\d+),([^,]+)')

  sampipe = subprocess.Popen(view_cmd,stdout=subprocess.PIPE)
  fname = args.tempdir+'/bedpart.'+str(thread_index)+'.bed'
  z = 0
  with open(fname,'w') as of:
    with sampipe.stdout as inf:
      for line in inf:
        z += 1
        if z%thread_count != thread_index:
          continue # another worker owns this line
        line = line.rstrip()
        if SamBasics.is_header(line):
          continue
        d = SamBasics.sam_line_to_dictionary(line)
        strand = '-' if SamBasics.check_flag(d['flag'],16) else '+'
        seqs = [[d['qname'], d['rname'], strand, d['pos'], d['cigar']]]
        if args.use_secondary_alignments:
          m = xa_tag.search(line)
          if m:
            for secondary in m.group(1).rstrip(";").split(";"):
              m1 = xa_entry.match(secondary)
              if not m1:
                sys.stderr.write("strange secondary format "+secondary+"\n")
                sys.exit(1)
              seqs.append([d['qname'], m1.group(1),m1.group(2),int(m1.group(3)),m1.group(4)])
        of.write(get_exons_from_seqs(seqs,d,spcf))
  sampipe.wait() # reap the samtools child
Exemple #7
0
def _junction_bounds(cigar_array, start_loc, min_intron_size):
  """Return [intron_start, next_exon_start, right_size] for each N gap
  of at least min_intron_size found while walking cigar_array from
  start_loc.  right_size is the length of the operation after the gap.
  """
  bounds = []
  current_loc = start_loc
  for i, ce in enumerate(cigar_array):
    # Only M, D, N, = and X move along the reference.  Clipping (S/H),
    # padding (P) and insertions (I) must not shift the coordinate.
    if ce['op'] not in ('M', 'D', 'N', '=', 'X'):
      continue
    if ce['op'] == 'N' and ce['val'] >= min_intron_size:
      # NOTE(review): assumes an N operation is always followed by
      # another operation; a CIGAR ending in N raises IndexError here.
      right_size = cigar_array[i+1]['val']
      bounds.append([current_loc, current_loc + ce['val'], right_size])
    # Short N gaps are not reported but still span reference bases.
    current_loc += ce['val']
  return bounds


def main():
  """Read a SAM file and write its splice junctions as a BED track.

  The reference genome is loaded through ``FastaData``.  Every mapped
  alignment whose reference name is present in the genome is scanned for
  N gaps of at least ``--min_intron_size``.  The two bases at each end
  of a gap are taken from the reference; gaps that are neither
  ``is_canon`` nor ``is_revcanon`` are skipped with a warning.  Each
  remaining junction becomes a 12-column BED line running from 51 below
  the intron start to 49 above the next exon start, with two 50-base
  blocks.

  The BED name column is ``(nR)[width_nNR](nUR/nMR)`` where nR is the
  number of distinct reads on the junction, width is the spread (max
  minus min) of the sizes of the operation right of the gap, nNR is the
  number of distinct alignment start positions, and nUR / nMR count
  those reads that were seen once / more than once in the whole input.

  Output goes to ``--output`` when given, otherwise STDOUT.
  """
  parser = argparse.ArgumentParser(description="Read a sam file and output a bed file in the format of junction_color.bed")
  parser.add_argument('-o','--output',help='FILENAME is output')
  parser.add_argument('--min_intron_size',type=int,default=68,help='minimum intron size')
  parser.add_argument('infile',help='FILENAME of sam file or "-" for STDIN')
  parser.add_argument('reference_genome',help='FILENAME of the reference genome')
  args = parser.parse_args()

  # get our reference genome
  sys.stderr.write("reading reference genome\n")
  with open(args.reference_genome) as genome_file:
    g = FastaData(genome_file.read())
  sys.stderr.write("finished reading reference genome\n")
  # Build the membership set once instead of calling g.keys() per line.
  known_chroms = set(g.keys())

  read_mapping_count = {}
  junctions = {}
  from_stdin = args.infile == '-'
  inf = sys.stdin if from_stdin else open(args.infile)
  sys.stderr.write("reading through sam file\n")
  zall = 0
  zn = 0
  try:
    # readline-based iteration, as in the original loop.
    for line in iter(inf.readline, ''):
      line = line.rstrip()
      if SamBasics.is_header(line): continue
      d = SamBasics.sam_line_to_dictionary(line)
      chrom = d['rname']
      if chrom == '*': continue
      if chrom not in known_chroms:
        sys.stderr.write("WARNING: "+chrom+" not in reference, skipping\n")
        continue
      if SamBasics.check_flag(d['flag'],0x4): # check if its unmapped
        continue  # we can ignore the unmapped things for now
      mate = 'U'
      if SamBasics.check_flag(d['flag'],0x40):
        mate = 'L'
      elif SamBasics.check_flag(d['flag'],0x80):
        mate = 'R'
      actual_read = d['qname']+"\t"+mate
      read_mapping_count[actual_read] = read_mapping_count.get(actual_read, 0) + 1

      bounds = _junction_bounds(d['cigar_array'], d['pos'], args.min_intron_size)
      if not bounds: continue # there are no splices to report here

      for lbound, rbound, right_size in bounds:
        zall += 1
        # lbound and rbound are 1-based, so these slices take the first
        # two and the last two bases of the intron.
        intronflank = g[chrom][lbound-1:lbound+1].upper() + '-' + \
                      g[chrom][rbound-3:rbound-1].upper()
        if is_canon(intronflank): # its a positive strand
          strand = '+'
        elif is_revcanon(intronflank): # its a negative strand
          strand = '-'
        else:
          # We can't deal with the non-canonical splice sorry
          zn += 1
          sys.stderr.write("WARNING skipping non-canonical splice ("+str(zn)+"/"+str(zall)+")\r")
          continue
        # If we are still in we have successfully found a splice
        out_start = str(lbound-51)
        out_end = str(rbound+49)
        bed = [
          chrom,                      # chrom
          out_start,                  # chromStart
          out_end,                    # chromEnd
          '*',                        # name, filled in when writing
          '50',                       # score
          strand,                     # strand
          out_start,                  # thickStart
          out_end,                    # thickEnd
          '0,0,0',                    # itemRgb
          '2',                        # blockCount
          '50,50',                    # blockSizes
          '0,'+str(rbound-lbound+50), # blockStarts
        ]
        entry = "\t".join(bed)
        if entry not in junctions:
          junctions[entry] = {'reads': set(), 'positions': set(), 'right_sizes': set()}
        junctions[entry]['reads'].add(actual_read)
        junctions[entry]['positions'].add(d['pos'])
        junctions[entry]['right_sizes'].add(right_size)
  finally:
    # Only close handles we opened ourselves; leave STDIN alone.
    if not from_stdin:
      inf.close()
  sys.stderr.write("\n")
  sys.stderr.write("finished reading sam\n")

  of = open(args.output,'w') if args.output else sys.stdout
  try:
    if len(junctions) > 0: # if we have stuff lets print a header
      of.write("track\tname=junctions\tdescription=\"SpliceMap junctions\" itemRgb=\"On\"\n")
    for entry in junctions:
      info = junctions[entry]
      nR = len(info['reads'])
      width = max(info['right_sizes'])-min(info['right_sizes'])
      nNR = len(info['positions'])
      nUR = 0
      nMR = 0
      for read in info['reads']:
        if read_mapping_count[read] == 1:
          nUR += 1
        elif read_mapping_count[read] > 1:
          nMR += 1
        else:
          sys.stderr.write("ERROR: nonsense read count\n")
          return
      name = '('+str(nR)+')['+str(width)+'_'+str(nNR)+']('+str(nUR)+'/'+str(nMR)+')'
      bed = entry.split("\t")
      bed[3] = name
      of.write("\t".join(bed)+"\n")
  finally:
    if args.output:
      of.close()