Example #1
0
def main():
    """Rewrite a SAM file for tools that expect SpliceMap-style SAM records.

    Reads SAM text from the file named by the ``in_sam`` argument (STDIN when
    it is '-') and writes to STDOUT:

    * header lines are passed through unchanged;
    * reads with flag bit 4 set (unmapped) are dropped;
    * for every other read the flag field becomes "16" when bit 16 is set and
      "0" otherwise, and fields 5, 7, 8 and 9 (1-based; MAPQ, RNEXT, PNEXT
      and TLEN in the SAM specification) are overwritten with "0", "*", "0"
      and "0".
    """
    parser = argparse.ArgumentParser(
        description=
        "Make sam file compatible with tools counting on a splicemap format sam file."
    )
    parser.add_argument('in_sam',
                        help="FILENAME of sam file, or '-' for STDIN")
    args = parser.parse_args()

    use_stdin = args.in_sam == '-'
    inf = sys.stdin if use_stdin else open(args.in_sam)
    out = sys.stdout
    try:
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                out.write(line + "\n")
                continue
            e = SamBasics.sam_line_to_dictionary(line)
            if SamBasics.check_flag(e['flag'], 4):
                continue  # skip the unmapped reads
            f = line.split("\t")
            # Keep only the strand bit of the flag.
            f[1] = "16" if SamBasics.check_flag(e['flag'], 16) else "0"
            f[4] = "0"
            f[6] = "*"
            f[7] = "0"
            f[8] = "0"
            out.write("\t".join(f) + "\n")
    finally:
        # Only close what this function opened; STDIN belongs to the caller.
        if not use_stdin:
            inf.close()
def main():
  """Make a SAM file readable by tools that count on SpliceMap-format SAM.

  Input is the file given as ``in_sam`` or STDIN when that argument is '-';
  output goes to STDOUT.  Header lines are echoed as-is, reads whose flag has
  bit 4 set (unmapped) are skipped, and every remaining read is emitted with
  its flag reduced to "16" (bit 16 set) or "0" and with fields 5, 7, 8 and 9
  (1-based) replaced by "0", "*", "0" and "0".
  """
  parser = argparse.ArgumentParser(description="Make sam file compatible with tools counting on a splicemap format sam file.")
  parser.add_argument('in_sam',help="FILENAME of sam file, or '-' for STDIN")
  args = parser.parse_args()

  opened = None
  inf = sys.stdin
  if args.in_sam != '-':
    opened = open(args.in_sam)
    inf = opened
  try:
    for line in inf:
      line = line.rstrip()
      if SamBasics.is_header(line):
        sys.stdout.write(line+"\n")
        continue
      e = SamBasics.sam_line_to_dictionary(line)
      if SamBasics.check_flag(e['flag'],4):
        continue # skip the unmapped reads
      f = line.split("\t")
      if SamBasics.check_flag(e['flag'],16):
        f[1] = "16"
      else:
        f[1] = "0"
      # Blank out mapping quality and mate information.
      f[4] = "0"
      f[6] = "*"
      f[7] = "0"
      f[8] = "0"
      sys.stdout.write("\t".join(f)+"\n")
  finally:
    # The input file used to be left open; STDIN is never closed here.
    if opened is not None:
      opened.close()
def make_exons(args, thread_index, thread_count):
    """Write this worker's share of exon records to its own BED part file.

    Streams ``samtools view -F 4`` output for ``args.sam_file`` (adding ``-S``
    unless the name ends in ``.bam``), keeps every line whose 1-based line
    number modulo ``thread_count`` equals ``thread_index``, and writes the
    string returned by ``get_exons_from_seqs`` for that line to
    ``<args.tempdir>/bedpart.<thread_index>.bed``.

    When ``args.use_secondary_alignments`` is set, each entry of the line's
    ``XA:Z:`` tag is passed along as an additional alignment.

    Args:
        args: parsed options; reads ``sam_file``, ``reference_genome``,
            ``tempdir`` and ``use_secondary_alignments``.
        thread_index: remainder selecting the lines handled by this worker.
        thread_count: total number of workers the input is split across.

    Raises:
        SystemExit: with status 1 when an ``XA:Z:`` entry cannot be parsed.
    """
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    cmd = ['samtools', 'view', '-F', '4']
    if not args.sam_file.endswith('.bam'):
        cmd.append('-S')
    cmd.append(args.sam_file)

    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.reference_genome:
        spcf.set_genome(args.reference_genome)

    xa_tag = re.compile(r'XA:Z:(\S+)')
    xa_entry = re.compile(r'([^,]+),([+-])(\d+),([^,]+)')
    fname = args.tempdir + '/bedpart.' + str(thread_index) + '.bed'

    sampipe = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    try:
        with open(fname, 'w') as of:
            for z, line in enumerate(sampipe.stdout, 1):
                if z % thread_count != thread_index:
                    continue
                line = line.rstrip()
                if SamBasics.is_header(line):
                    continue
                d = SamBasics.sam_line_to_dictionary(line)
                strand = '-' if SamBasics.check_flag(d['flag'], 16) else '+'
                seqs = [[d['qname'], d['rname'], strand, d['pos'], d['cigar']]]
                m = xa_tag.search(line) if args.use_secondary_alignments else None
                if m:
                    for secondary in m.group(1).rstrip(";").split(";"):
                        m1 = xa_entry.match(secondary)
                        if not m1:
                            sys.stderr.write("strange secondary format " +
                                             secondary + "\n")
                            sys.exit(1)
                        seqs.append([
                            d['qname'],
                            m1.group(1),
                            m1.group(2),
                            int(m1.group(3)),
                            m1.group(4)
                        ])
                of.write(get_exons_from_seqs(seqs, d, spcf))
    finally:
        # Close the pipe before waiting so samtools cannot block on a full
        # pipe, then reap the child so it does not linger as a zombie.
        sampipe.stdout.close()
        sampipe.wait()
def make_exons(args,thread_index,thread_count):
  """Convert one worker's slice of the alignments into a BED part file.

  Runs ``samtools view -F 4`` on ``args.sam_file`` (with ``-S`` unless the
  name ends in ``.bam``).  Lines are numbered from 1; those whose number
  modulo ``thread_count`` equals ``thread_index`` are converted with
  ``get_exons_from_seqs`` and the result is written to
  ``<args.tempdir>/bedpart.<thread_index>.bed``.  With
  ``args.use_secondary_alignments`` set, every entry of an ``XA:Z:`` tag is
  added as an extra alignment for the read.

  Exits with status 1 if an ``XA:Z:`` entry does not look like
  ``name,<+|->pos,cigar``.
  """
  # Build argv directly instead of a shell string so unusual file names
  # are passed to samtools verbatim.
  cmd = ['samtools','view','-F','4']
  if not args.sam_file.endswith('.bam'):
    cmd.append('-S')
  cmd.append(args.sam_file)
  spcf = SamBasics.SAMtoPSLconversionFactory()
  if args.reference_genome:
    spcf.set_genome(args.reference_genome)
  fname = args.tempdir+'/bedpart.'+str(thread_index)+'.bed'
  find_xa = re.compile(r'XA:Z:(\S+)').search
  parse_xa = re.compile(r'([^,]+),([+-])(\d+),([^,]+)').match
  sampipe = subprocess.Popen(cmd,stdout=subprocess.PIPE)
  try:
    with open(fname,'w') as of:
      z = 0
      for line in sampipe.stdout:
        z += 1
        if z%thread_count != thread_index: continue
        line = line.rstrip()
        if SamBasics.is_header(line):
          continue
        d = SamBasics.sam_line_to_dictionary(line)
        strand = '+'
        if SamBasics.check_flag(d['flag'],16):
          strand = '-'
        seqs = [[d['qname'], d['rname'], strand, d['pos'], d['cigar']]]
        if args.use_secondary_alignments:
          m = find_xa(line)
          secondaries = m.group(1).rstrip(";").split(";") if m else []
          for secondary in secondaries:
            m1 = parse_xa(secondary)
            if not m1:
              sys.stderr.write("strange secondary format "+secondary+"\n")
              sys.exit(1)
            seqs.append([d['qname'], m1.group(1),m1.group(2),int(m1.group(3)),m1.group(4)])
        exons = get_exons_from_seqs(seqs,d,spcf)
        of.write(exons)
  finally:
    # Always release the pipe and reap samtools, including on early exit.
    sampipe.stdout.close()
    sampipe.wait()
Example #5
0
def _intron_bounds(cigar_array, start_loc, min_intron_size):
  """Return [lbound, rbound, right_size] for each intron of at least min_intron_size.

  lbound/rbound are the reference cursor before/after the 'N' operation and
  right_size is the length of the CIGAR operation that follows it.
  """
  bounds = []
  current_loc = start_loc
  for i, ce in enumerate(cigar_array):
    op = ce['op']
    if op == 'N':
      if ce['val'] >= min_intron_size:
        # cursor before the intron / cursor at the start of the next exon
        bounds.append([current_loc, current_loc + ce['val'], cigar_array[i+1]['val']])
      # A skipped region consumes reference whether or not it is reported.
      current_loc += ce['val']
    elif op in ('M', 'D', '=', 'X'):
      # Per the SAM specification only M, D, N, = and X consume reference;
      # I, S, H and P must not move the reference cursor.
      current_loc += ce['val']
  return bounds

def _junction_bed_entry(chrom, lbound, rbound, strand):
  """Build the 12-column BED line for one junction, with '*' as a placeholder name."""
  out_start = lbound-51
  out_end = rbound+49
  bed = [
    chrom,
    str(out_start),
    str(out_end),
    '*', # the name is filled in once all reads are counted
    str(50),
    strand,
    str(out_start),
    str(out_end),
    '0,0,0',
    str(2),
    '50,50',
    '0,'+str(rbound-lbound+50),
  ]
  return "\t".join(bed)

def main():
  """Read a SAM file and write its splice junctions as a junction_color-style BED.

  The reference genome FASTA is loaded into a FastaData object, then the SAM
  input (file or STDIN when 'infile' is '-') is scanned.  Header lines, reads
  with rname '*', reads on chromosomes missing from the reference and reads
  with flag 0x4 are skipped.  For each remaining read every 'N' CIGAR
  operation of at least --min_intron_size is turned into a junction; the
  strand comes from is_canon / is_revcanon applied to the bases flanking the
  intron, and junctions matching neither are skipped with a warning.

  Each distinct junction is written once (to --output or STDOUT) with the
  name '(nR)[width_nNR](nUR/nMR)': number of supporting reads, spread of the
  right-hand block sizes, number of distinct read start positions, and the
  numbers of supporting reads seen once / more than once in the input.
  """
  parser = argparse.ArgumentParser(description="Read a sam file and output a bed file in the format of junction_color.bed")
  parser.add_argument('-o','--output',help='FILENAME is output')
  parser.add_argument('--min_intron_size',type=int,default=68,help='minimum intron size')
  parser.add_argument('infile',help='FILENAME of sam file or "-" for STDIN')
  parser.add_argument('reference_genome',help='FILENAME of the reference genome')
  args = parser.parse_args()

  # get our reference genome
  sys.stderr.write("reading reference genome\n")
  with open(args.reference_genome) as genome_handle:
    g = FastaData(genome_handle.read())
  sys.stderr.write("finished reading reference genome\n")
  # Look the chromosome names up once instead of calling keys() per read.
  known_chroms = set(g.keys())

  read_mapping_count = {}
  junctions = {}
  use_stdin = args.infile == '-'
  inf = sys.stdin if use_stdin else open(args.infile)
  sys.stderr.write("reading through sam file\n")
  zall = 0
  zn = 0
  try:
    # readline() (not file iteration) so piped input is handled line by line
    for line in iter(inf.readline, ''):
      line = line.rstrip()
      if SamBasics.is_header(line): continue
      d = SamBasics.sam_line_to_dictionary(line)
      chrom = d['rname']
      if chrom == '*': continue
      if chrom not in known_chroms:
        sys.stderr.write("WARNING: "+chrom+" not in reference, skipping\n")
        continue
      if SamBasics.check_flag(d['flag'],0x4): # unmapped
        continue  # we can ignore the unmapped things for now
      mate = 'U'
      if SamBasics.check_flag(d['flag'],0x40):
        mate = 'L'
      elif SamBasics.check_flag(d['flag'],0x80):
        mate = 'R'
      actual_read = d['qname']+"\t"+mate
      read_mapping_count[actual_read] = read_mapping_count.get(actual_read,0)+1
      for lbound, rbound, right_size in _intron_bounds(d['cigar_array'],d['pos'],args.min_intron_size):
        zall += 1
        intronflank = g[chrom][lbound-1:lbound+1].upper() + '-' + \
                      g[chrom][rbound-3:rbound-1].upper()
        if is_canon(intronflank): # its a positive strand
          strand = '+'
        elif is_revcanon(intronflank): # its a negative strand
          strand = '-'
        else:
          # We can't deal with the non-canonical splice sorry
          zn += 1
          sys.stderr.write("WARNING skipping non-canonical splice ("+str(zn)+"/"+str(zall)+")\r")
          continue
        # If we are still in we have successfully found a splice
        entry = _junction_bed_entry(chrom,lbound,rbound,strand)
        if entry not in junctions:
          junctions[entry] = {'reads':set(),'positions':set(),'right_sizes':set()}
        junctions[entry]['reads'].add(actual_read)
        junctions[entry]['positions'].add(d['pos'])
        junctions[entry]['right_sizes'].add(right_size)
  finally:
    if not use_stdin:
      inf.close()
  sys.stderr.write("\n")
  sys.stderr.write("finished reading sam\n")

  of = open(args.output,'w') if args.output else sys.stdout
  try:
    if junctions: # if we have stuff lets print a header
      of.write("track\tname=junctions\tdescription=\"SpliceMap junctions\" itemRgb=\"On\"\n")
    for entry in junctions:
      info = junctions[entry]
      nR = len(info['reads'])
      width = max(info['right_sizes'])-min(info['right_sizes'])
      nNR = len(info['positions'])
      nUR = 0
      nMR = 0
      for read in info['reads']:
        if read_mapping_count[read] == 1:
          nUR += 1
        elif read_mapping_count[read] > 1:
          nMR += 1
        else:
          sys.stderr.write("ERROR: nonsense read count\n")
          return
      name = '('+str(nR)+')['+str(width)+'_'+str(nNR)+']('+str(nUR)+'/'+str(nMR)+')'
      bed = entry.split("\t")
      bed[3] = name
      of.write("\t".join(bed)+"\n")
  finally:
    # Close the output even on the early return above; leave STDOUT alone.
    if args.output:
      of.close()
def main():
    """Convert a SAM file into PSL, optionally exporting the reads as FASTA/FASTQ.

    SAM lines are read from ``infile`` (STDIN when '-').  Header lines are fed
    to the conversion factory; every other line is converted with
    ``convert_line`` and, when that yields a result, written as a PSL line to
    ``--output`` (STDOUT if not set).  With ``--give_unique_names`` each
    written record is renamed 'Q<n>', n counting written records from 1.

    ``--output_fasta`` / ``--output_fastq`` additionally write each converted
    primary read, reverse-complemented (and with reversed qualities) when
    flag bit 16 is set.  They cannot be combined with the secondary or
    alternative alignment options.

    Exits with status 1 on an invalid option combination or when a FASTQ
    record's sequence and quality lengths differ.
    """
    parser = argparse.ArgumentParser(
        description="Convert a sam file into a psl file")
    parser.add_argument('--genome',
                        help="FASTA input file of reference genome")
    parser.add_argument('--get_secondary_alignments',
                        action='store_true',
                        help="Report SA:Z secondary alignments as well")
    parser.add_argument('--get_alternative_alignments',
                        action='store_true',
                        help="Report XA:Z alternative alignments as well")
    parser.add_argument(
        '--get_all_alignments',
        action='store_true',
        help="Report SA:Z and XA:Z alternative alignments as well")
    parser.add_argument('--give_unique_names',
                        action='store_true',
                        help="Output query names will be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--output_fasta',
        help=
        "FILENAME to save an outgoing fasta.  Only works for primary alignments."
    )
    group.add_argument(
        '--output_fastq',
        help=
        "FILENAME to save an outgoing fastq.  Only works for primary alignments."
    )
    parser.add_argument('infile', help="FILENAME input file or '-' for STDIN")
    parser.add_argument('-o',
                        '--output',
                        help="FILENAME for the output, STDOUT if not set.")
    args = parser.parse_args()

    wants_reads = args.output_fasta or args.output_fastq
    wants_extra = (args.get_secondary_alignments
                   or args.get_alternative_alignments
                   or args.get_all_alignments)
    if wants_reads and wants_extra:
        sys.stderr.write(
            "ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n"
        )
        sys.exit(1)

    # Extra alignment sources, in the order their records are written.
    extra_sources = []
    if args.get_secondary_alignments or args.get_all_alignments:
        extra_sources.append(SamBasics.get_secondary_alignments)
    if args.get_alternative_alignments or args.get_all_alignments:
        extra_sources.append(SamBasics.get_alternative_alignments)

    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.genome: spcf.set_genome(args.genome)

    opened = []  # every handle this function opened and must close
    try:
        inf = sys.stdin
        if args.infile != '-':
            inf = open(args.infile)
            opened.append(inf)
        of = sys.stdout
        if args.output:
            of = open(args.output, 'w')
            opened.append(of)
        off = None
        if args.output_fasta:
            off = open(args.output_fasta, 'w')
            opened.append(off)
        if args.output_fastq:
            off = open(args.output_fastq, 'w')
            opened.append(off)

        z = 0
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                spcf.read_header_line(line)
                continue
            # We have a line to convert
            psl = spcf.convert_line(line)
            if psl:
                pobj = PSL(psl)
                z += 1
                if args.give_unique_names:
                    pobj.entry['qName'] = 'Q' + str(z)
                of.write(pobj.get_line() + "\n")
                if off is not None:
                    sam = SamBasics.SAM(line)
                    sequence = sam.value('seq').upper()
                    quality = sam.value('qual')
                    if sam.check_flag(16):
                        # Reverse-strand hit: restore the read's own orientation.
                        sequence = rc(sequence)
                        quality = quality[::-1]
                    if args.output_fasta:
                        off.write(">" + pobj.value('qName') + "\n" + sequence +
                                  "\n")
                    elif len(sequence) == len(quality):
                        off.write("@" + pobj.value('qName') + "\n" + sequence +
                                  "\n" + "+\n" + quality + "\n")
                    else:
                        sys.stderr.write("ERROR: sequence " + sequence +
                                         " length (" + str(len(sequence)) +
                                         ") doesnt match quality " + quality +
                                         " length (" + str(len(quality)) +
                                         ")\n")
                        sys.exit(1)
            # Lets look for secondary / alternative alignments to convert
            for get_alignments in extra_sources:
                for samline in get_alignments(line):
                    psl = spcf.convert_line(samline)
                    if psl:
                        z += 1
                        pobj = PSL(psl)
                        if args.give_unique_names:
                            pobj.entry['qName'] = 'Q' + str(z)
                        of.write(pobj.get_line() + "\n")
        of.flush()
    finally:
        # Close only what was opened here (STDIN/STDOUT are left usable),
        # including the FASTA/FASTQ file that used to be left open.
        for handle in opened:
            handle.close()
def main():
  """Convert SAM alignments to PSL lines, with optional FASTA/FASTQ export.

  Reads SAM from 'infile' (STDIN for '-'), passes header lines to the
  SAM-to-PSL conversion factory and writes one PSL line per successfully
  converted alignment to --output or STDOUT.  --get_secondary_alignments,
  --get_alternative_alignments and --get_all_alignments also convert the
  alignments returned by SamBasics.get_secondary_alignments /
  get_alternative_alignments for each line.  --give_unique_names renames each
  written record to 'Q<n>'.  --output_fasta / --output_fastq write the
  primary reads (reverse-complemented when flag bit 16 is set) and are
  rejected together with any of the extra-alignment options.

  Exits with status 1 on that option conflict or on a FASTQ record whose
  sequence and quality lengths differ.
  """
  parser = argparse.ArgumentParser(description="Convert a sam file into a psl file")
  parser.add_argument('--genome',help="FASTA input file of reference genome")
  parser.add_argument('--get_secondary_alignments',action='store_true',help="Report SA:Z secondary alignments as well")
  parser.add_argument('--get_alternative_alignments',action='store_true',help="Report XA:Z alternative alignments as well")
  parser.add_argument('--get_all_alignments',action='store_true',help="Report SA:Z and XA:Z alternative alignments as well")
  parser.add_argument('--give_unique_names',action='store_true',help="Output query names will be unique.")
  group = parser.add_mutually_exclusive_group()
  group.add_argument('--output_fasta',help="FILENAME to save an outgoing fasta.  Only works for primary alignments.")
  group.add_argument('--output_fastq',help="FILENAME to save an outgoing fastq.  Only works for primary alignments.")
  parser.add_argument('infile',help="FILENAME input file or '-' for STDIN")
  parser.add_argument('-o','--output',help="FILENAME for the output, STDOUT if not set.")
  args = parser.parse_args()
  if (args.output_fasta or args.output_fastq) and (args.get_secondary_alignments or args.get_alternative_alignments or args.get_all_alignments):
    sys.stderr.write("ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n")
    sys.exit(1)

  # Functions yielding extra SAM lines per input line, in output order.
  extra_sources = []
  if args.get_secondary_alignments or args.get_all_alignments:
    extra_sources.append(SamBasics.get_secondary_alignments)
  if args.get_alternative_alignments or args.get_all_alignments:
    extra_sources.append(SamBasics.get_alternative_alignments)

  spcf = SamBasics.SAMtoPSLconversionFactory()
  if args.genome: spcf.set_genome(args.genome)

  inf = sys.stdin
  of = sys.stdout
  off = None
  try:
    if args.infile != '-':
      inf = open(args.infile)
    if args.output:
      of = open(args.output,'w')
    if args.output_fasta:
      off = open(args.output_fasta,'w')
    if args.output_fastq:
      off = open(args.output_fastq,'w')
    z = 0
    for line in inf:
      line = line.rstrip()
      if SamBasics.is_header(line):
        spcf.read_header_line(line)
        continue
      # We have a line to convert
      psl = spcf.convert_line(line)
      if psl:
        pobj = PSL(psl)
        z += 1
        if args.give_unique_names:
          pobj.entry['qName'] = 'Q'+str(z)
        of.write(pobj.get_line()+"\n")
        if off is not None:
          sam = SamBasics.SAM(line)
          sequence = sam.value('seq').upper()
          quality = sam.value('qual')
          if sam.check_flag(16):
            # stored reverse-complemented; turn it back into the read as sequenced
            sequence = rc(sequence)
            quality = quality[::-1]
          if args.output_fasta:
            off.write(">"+pobj.value('qName')+"\n"+sequence+"\n")
          elif len(sequence) == len(quality):
            off.write("@"+pobj.value('qName')+"\n"+sequence+"\n"+"+\n"+quality+"\n")
          else:
            sys.stderr.write("ERROR: sequence "+sequence+" length ("+str(len(sequence))+") doesnt match quality "+quality+" length ("+str(len(quality))+")\n")
            sys.exit(1)
      # Lets look for secondary and alternative alignments to convert
      for get_alignments in extra_sources:
        for samline in get_alignments(line):
          psl = spcf.convert_line(samline)
          if psl:
            z += 1
            pobj = PSL(psl)
            if args.give_unique_names:
              pobj.entry['qName'] = 'Q'+str(z)
            of.write(pobj.get_line()+"\n")
    of.flush()
  finally:
    # Release only the files opened above; the standard streams stay open
    # for the caller.  The FASTA/FASTQ handle was previously never closed.
    if inf is not sys.stdin:
      inf.close()
    if of is not sys.stdout:
      of.close()
    if off is not None:
      off.close()