def write_genepred_to_fasta(gpd_filename,ref_fasta,out_fasta):
  ofile = open(out_fasta,'w')
  ref = SequenceBasics.read_fasta_into_hash(ref_fasta)
  with open(gpd_filename) as f:
    for line in f:
      if re.match('^#',line): continue
      d = line_to_entry(line)
      if d['chrom'] in ref:
        seq = ''
        for i in range(0,d['exonCount']):
          seq = seq+ref[d['chrom']][d['exonStarts'][i]:d['exonEnds'][i]]
        if d['strand'] == '-': seq = SequenceBasics.rc(seq)
        ofile.write(">"+str(d['name'])+"\n"+seq.upper()+"\n")
  ofile.close()
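A minimal usage sketch of the function above: each genePred row becomes one FASTA record whose sequence is the concatenation of its exons, reverse complemented when the strand is '-'. The file names are placeholders, and line_to_entry plus SequenceBasics are assumed to be importable from this module:

# hypothetical invocation; 'genes.gpd', 'genome.fa' and 'transcripts.fa' are placeholder paths
write_genepred_to_fasta('genes.gpd', 'genome.fa', 'transcripts.fa')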
Example no. 2
def convert_directionless_gpd_alignment_to_reference(sam_filename,genepred_filename,out_map):
  conv = GenePredBasics.get_directionless_gpd_conversion(genepred_filename)
  ofile = open(out_map,'w')
  with open(sam_filename) as samfile:
    for line in samfile:
      line = line.rstrip()
      if re.match(r'^@[A-Z][A-Z]\s',line): continue #skip header
      d = sam_line_to_dictionary(line)
      if d['rname'] == '*': continue #skip unmapped
      startposition = d['pos']-1
      readcoord = []
      z = 0
      for entry in d['cigar_array']:
        if re.match('[MISX=]',entry['op']):  # all the entries that map to the read
          for i in range(0,entry['val']):
            if re.match('[M=X]',entry['op']): #all the entries that match the reference alignment
              readcoord.append(conv[d['rname']]['coordinates'][startposition+z])
              z+=1
            # lets ignore insertions for now
            #else:
            #  readcoord.append('*')
        if re.match('[DNH]',entry['op']):
          z+= entry['val']      
      abbrev = conv[d['rname']]['chrom']+':'+SequenceBasics.collapse_coordinate_array(readcoord)
      ofile.write(d['qname'] + "\t" + d['rname'] + "\t" + abbrev + "\n")
  ofile.close()
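The core of this conversion is the CIGAR walk: M/=/X operations consume one transcript position each (looked up in the conversion table), D/N/H advance the position without emitting anything, and insertions are skipped. A self-contained sketch of that walk, using the same {'op', 'val'} entry format as sam_line_to_dictionary but with made-up values:

def walk_cigar(cigar_array, start):
  # return the 0-based target positions consumed by M/=/X operations;
  # entries mirror the {'op': ..., 'val': ...} dictionaries used above
  positions = []
  z = 0
  for entry in cigar_array:
    if entry['op'] in ('M', '=', 'X'):
      for _ in range(entry['val']):
        positions.append(start + z)
        z += 1
    elif entry['op'] in ('D', 'N', 'H'):
      z += entry['val']
    # 'I' and 'S' consume only the read, so neither counter moves here
  return positions

# hypothetical 5M 3N 5M alignment starting at position 100
print(walk_cigar([{'op':'M','val':5},{'op':'N','val':3},{'op':'M','val':5}], 100))
# [100, 101, 102, 103, 104, 108, 109, 110, 111, 112]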
Example no. 3
def get_alternative_alignments(in_sam_line):
    f = in_sam_line.rstrip().split("\t")
    if len(f) <= 12:
        return []  # move on if there are no optional tags
    enstring = "\t".join(f[x] for x in range(11, len(f)))
    m = re.search(r'XA:Z:(\S+)', enstring)
    if not m:
        return []  # move on if there's no XA:Z tag
    secondary_alignments = m.group(1)
    aligns = secondary_alignments.split(';')
    bwalike = re.compile(r'^([^,]+),(\d+),([+-]),([^,]+),(\d+),(\d+)$')
    otherlike = re.compile(r'^([^,]+),([+-])(\d+),([^,]+),(\d+),(\d+)$')
    otherlike2 = re.compile(r'^([^,]+),([+-])(\d+),([^,]+),(\d+)$')
    output = []
    for align in aligns:
        if align == '':
            continue  # I guess you can have empty segments and we should ignore them
        m1 = bwalike.match(align)
        m2 = otherlike.match(align)
        m3 = otherlike2.match(align)
        if m1:
            chr = m1.group(1)
            pos = m1.group(2)
            strand = m1.group(3)
            cigar = m1.group(4)
            mapQ = m1.group(5)
            nm = m1.group(6)
        elif m2:
            chr = m2.group(1)
            pos = m2.group(3)
            strand = m2.group(2)
            cigar = m2.group(4)
            mapQ = m2.group(5)
            nm = m2.group(6)
        elif m3:
            chr = m3.group(1)
            pos = m3.group(3)
            strand = m3.group(2)
            cigar = m3.group(4)
            mapQ = m3.group(5)
            nm = 0
        else:
            sys.stderr.write("WARNING: unable to parse secondary alignment\n" +
                             align + "\n")
            sys.exit()
        flag = '0'
        seq = f[9]
        phred = f[10]
        if strand == '-':
            flag = '16'
            seq = SequenceBasics.rc(seq)
            phred = phred[::-1]
        # note: the reverse-complemented seq/phred computed above are not emitted;
        # RNEXT, SEQ and QUAL are written as placeholders below
        samline = f[0]+"\t"+flag+"\t"+chr+"\t"+pos+"\t"+mapQ+"\t"+cigar+"\t"\
               + "*\t0\t0\t*\t*"
        output.append(samline)
    return output
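The third pattern (otherlike2) matches the XA:Z layout bwa normally emits, chr,(+|-)pos,CIGAR followed by one numeric field, with segments separated by ';'; the other two appear to cover variant layouts with an extra field. A self-contained check against a made-up segment:

import re

otherlike2 = re.compile(r'^([^,]+),([+-])(\d+),([^,]+),(\d+)$')
m = otherlike2.match('chr1,+10500,76M,2')   # hypothetical XA:Z segment
if m:
  chrom, strand, pos, cigar, last = m.groups()
  print(chrom, strand, pos, cigar, last)    # chr1 + 10500 76M 2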
Example no. 6
 def set_read_fasta(self, read_fasta_file):
     self.reads_set = True
     gfr = SequenceBasics.GenericFastaFileReader(read_fasta_file)
     self.reads = {}
     while True:
         e = gfr.read_entry()
         if not e: break
         if e['name'] in self.reads:
             sys.stderr.write(
                 "Warning duplicate name in fasta file, could be big problems on sequence assignment.\n"
             )
         self.reads[e['name']] = e['seq'].upper()
     gfr.close()
     return
 def add_genepred_line(self, inline):
     if not self.ref_hash:
         sys.stderr.write(
             "ERROR: Must assign a reference genome dictionary first\n")
         sys.exit()
     gpd = GenePredBasics.GenePredEntry(inline)
     if gpd.value('name') in self.transcripts:
         sys.stderr.write("WARNING: " + inline +
                          " transcript was already set\n")
     seq = ''
     for i in range(0, gpd.value('exonCount')):
         seq += self.ref_hash[gpd.value('chrom')][
             gpd.value('exonStarts')[i]:gpd.value('exonEnds')[i]].upper()
     if gpd.value('strand') == '-': seq = SequenceBasics.rc(seq)
     self.transcripts[gpd.value('name')] = seq
     return
 def __init__(self,genomefasta,vcffile):
   self.var_by_chr = {}
   with open(vcffile) as inf:
     for line in inf:
       line = line.rstrip()
       if re.match('^#',line): continue
       f = line.split("\t")
       chrom = f[0]
       pos = int(f[1])
       reference = f[3]
       alternate = f[4]
       if chrom not in self.var_by_chr:
         self.var_by_chr[chrom] = {}
       self.var_by_chr[chrom][pos] = {'ref': reference, 'alt': alternate}
   self.ref_genome = SequenceBasics.read_fasta_into_hash(genomefasta)
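The loader above keeps one {'ref', 'alt'} record per chromosome and 1-based VCF position (a later variant at the same position overwrites an earlier one). A lookup sketch with hypothetical contents:

var_by_chr = {'chr1': {12345: {'ref': 'A', 'alt': 'G'},
                       12400: {'ref': 'C', 'alt': 'T'}}}
chrom, pos = 'chr1', 12345
if chrom in var_by_chr and pos in var_by_chr[chrom]:
  print(var_by_chr[chrom][pos]['ref'] + '>' + var_by_chr[chrom][pos]['alt'])   # A>G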
 def emit_paired_short_read(self,read_length):
   [name,seq] = self.emit()
   # Get the sequence name first
   flipped_seq = random_flip(seq)
   # Use fragmentation if its enabled
   frag_seq = flipped_seq
   if self.gaussian_fragmentation:
     frag_len = max(self.gaussian_fragmentation['minimum'],int(random.gauss(self.gaussian_fragmentation['mu'],self.gaussian_fragmentation['sigma'])))
     if frag_len == 0:
       return [name, 'N'*read_length, 'N'*read_length]
     frag_seq = random_fragment(flipped_seq,frag_len)
   
   l1 = frag_seq[0:read_length]
   if len(l1) < read_length:
     l1 = l1 + 'N'*(read_length-len(l1))
   rc_frag_seq = SequenceBasics.rc(frag_seq)
   r1 = rc_frag_seq[0:read_length]
   if len(r1) < read_length:
     r1 = r1 + 'N'*(read_length-len(r1))
   return [name,l1,r1]
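emit(), random_flip() and random_fragment() are defined elsewhere in this module; the pairing step itself only needs the (possibly fragmented) sequence. A stand-alone sketch of that step, with a local rc() standing in for SequenceBasics.rc:

def rc(seq):
  # minimal reverse-complement stand-in for SequenceBasics.rc
  comp = {'A':'T','C':'G','G':'C','T':'A','N':'N'}
  return ''.join(comp.get(b,'N') for b in reversed(seq.upper()))

def paired_reads(frag_seq, read_length):
  # left mate from the 5' end of the fragment, right mate from the 5' end of its
  # reverse complement; both padded with N out to read_length, as above
  l1 = frag_seq[0:read_length]
  l1 = l1 + 'N'*(read_length-len(l1))
  r1 = rc(frag_seq)[0:read_length]
  r1 = r1 + 'N'*(read_length-len(r1))
  return [l1, r1]

print(paired_reads('ACGTACGTAC', 6))   # ['ACGTAC', 'GTACGT']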
Example no. 11
 def set_genome(self, fasta):
     self.genome = SequenceBasics.read_fasta_into_hash(fasta)
Example no. 12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("bwa_bam",
                        help="BAMFILE or - for sam streamed to stdin")
    parser.add_argument('-o',
                        '--output',
                        help="OUTFILE or if not set use STDOUT")
    group1 = parser.add_mutually_exclusive_group(required=True)
    group1.add_argument("--query_fasta", help="FASTA for query sequences")
    group1.add_argument("--query_fastq", help="FASTQ for query sequences")
    parser.add_argument("--ref",
                        required=True,
                        help="FASTA for reference genome")
    parser.add_argument(
        "-S",
        "--size",
        help="linux sort option S, in kb if units not specified")
    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument("--tempdir",
                        default='/tmp',
                        help="DIR to store a temp directory")
    group2.add_argument("--specific_tempdir",
                        help="Exact DIR to work in, is not deleted")
    parser.add_argument(
        "--get_all_alignments",
        action='store_true',
        help="Be exhaustive in retrieving secondary and chimeric alignments")
    args = parser.parse_args()

    # Manage your tempdir
    # put it into args.tempdir
    if not args.specific_tempdir:
        rnum = random.randint(1, 100000000)
        args.tempdir = args.tempdir.rstrip('/') + '/weirathe.' + str(rnum)
    else:
        args.tempdir = args.specific_tempdir.rstrip('/')
    if not os.path.exists(args.tempdir):
        os.makedirs(args.tempdir)

    inf = sys.stdin
    if args.bwa_bam != '-':
        p1 = subprocess.Popen(("samtools view " + args.bwa_bam).split(),
                              stdout=subprocess.PIPE)
        inf = p1.stdout
    # 1. Now we can start the process.  First convert the sam to a psl
    cmd = "sam_to_psl.py -"
    if args.get_all_alignments:
        cmd += " --get_all_alignments"

    #No matter what we don't need to see the exact same alignment more than once
    cmd += " | sort -T " + args.tempdir
    if args.size:
        cmd += " -S " + args.size
    cmd += " | uniq"

    #Sort the alignment based on query name
    cmd += " | sort_psl.py - --tempdir " + args.tempdir
    if args.size:
        cmd += " -S " + args.size

    #Put the fragmented alignments together
    cmd += " | defragment_PSL_alignments.py - | sort_psl.py - --tempdir " + args.tempdir + " -o " + args.tempdir + "/1.psl"
    if args.size:
        cmd += " -S " + args.size
    p2 = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE)
    for line in inf:
        p2.stdin.write(line)
    p2.communicate()
    if args.bwa_bam != '-':
        p1.communicate()

    # 2. Get a sorted fasta file
    cmd = "sort_fasta.py - --tempdir " + args.tempdir + " -o " + args.tempdir + '/2.query.fa'
    if args.size:
        cmd += " -S " + args.size
    p3 = subprocess.Popen(cmd.split(), stdin=subprocess.PIPE)
    if args.query_fasta:
        gfr = SequenceBasics.GenericFastaFileReader(args.query_fasta)
    else:
        gfr = SequenceBasics.GenericFastqFileReader(args.query_fastq)
    while True:
        e = gfr.read_entry()
        if not e: break
        p3.stdin.write('>' + e['name'] + "\n" + e['seq'] + "\n")
    p3.communicate()
    gfr.close()

    # 3. Fix the psl and output the results
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    cmd = "fix_psl_stats.py " + args.tempdir + '/1.psl ' + args.ref + ' ' + args.tempdir + '/2.query.fa | sort_psl.py - --tempdir ' + args.tempdir
    if args.size:
        cmd += " -S " + args.size
    p4 = subprocess.Popen(cmd, shell=True, stdout=of)
    p4.communicate()

    # Clean up your temporary directory if you aren't in a specific one.
    if not args.specific_tempdir:
        rmtree(args.tempdir)
    return
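In short, main() streams the SAM records (from stdin or samtools view) through sam_to_psl.py, a sort/uniq pass, sort_psl.py and defragment_PSL_alignments.py, then fixes the stats with fix_psl_stats.py against the reference and the sorted query FASTA. The random-suffix working directory it builds could equally come from the standard library; a sketch of that alternative (not the script's own code):

import os
import tempfile
from shutil import rmtree

def make_workdir(base='/tmp', specific=None):
  # mirrors main()'s tempdir handling, but uses tempfile.mkdtemp for the
  # throwaway case; 'specific' plays the role of --specific_tempdir
  if specific:
    path = specific.rstrip('/')
    if not os.path.exists(path):
      os.makedirs(path)
    return path, False            # caller should keep this directory
  return tempfile.mkdtemp(prefix='weirathe.', dir=base), True

workdir, disposable = make_workdir()
print(workdir)
if disposable:
  rmtree(workdir)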
Example no. 13
 def get_sequence(self,ref_fasta_hash):
   seq = ''.join([\
         ref_fasta_hash[self.value('chrom')][self.value('exonStarts')[i]:self.value('exonEnds')[i]]\
         for i in range(0,self.value('exonCount'))])
   if self.value('strand') == '-':  return SequenceBasics.rc(seq).upper()
   return seq.upper()
Example no. 14
def random_flip(seq):
  if random.random() < 0.5:
    return seq
  return SequenceBasics.rc(seq)
Example no. 17
  def convert_line(self,psl_line,query_sequence=None,quality_sequence=None):
    try:
      pe = PSLBasics.line_to_entry(psl_line)
    except Exception:
      sys.stderr.write("Problem parsing line:\n"+psl_line.rstrip()+"\n")
      return False
    if len(pe['tStarts']) != len(pe['blockSizes']):
      sys.stderr.write("Warning invalid psl entry: "+pe['qName']+"\n")
      return False
    #work on the positive strand case first
    cigar = '*'
    blocks = len(pe['blockSizes'])
    starts = pe['qStarts']
    #if pe['strand'] == '-':
    #  starts = [x for x in reversed(pe['qStarts_actual'])]
    #  print 'isrev'
    q_coord_start = starts[0]+1 # base-1 converted starting position
    q_coord_end = starts[blocks-1]+pe['blockSizes'][blocks-1] # base-1 position
    t_coord_start = pe['tStarts'][0]+1 # base-1 converted starting position
    t_coord_end = pe['tStarts'][blocks-1]+pe['blockSizes'][blocks-1] # base-1 position
    if pe['qName'] not in self.reads and self.reads_set is True:
      sys.stderr.write("Warning: qName "+pe['qName']+" was not found in reads\n")
    # we will clip the query sequence to begin and end from the aligned region
    #q_seq = ''
    #if self.reads_set:
    #  q_seq = self.reads[pe['qName']]

    # 1. Get the new query to output
    q_seq_trimmed = '*'
    if self.reads_set or query_sequence:
      q_seq_trimmed = query_sequence
      if not query_sequence: # get it from the archive we loaded if we didn't give it
        q_seq_trimmed = self.reads[pe['qName']]
      if pe['strand'] == '-':
        q_seq_trimmed = SequenceBasics.rc(q_seq_trimmed)
      q_seq_trimmed = q_seq_trimmed[q_coord_start-1:q_coord_end]

    qual_trimmed = '*'
    if self.qualities_set or quality_sequence:
      qual_trimmed = quality_sequence
      if not quality_sequence:
        qual_trimmed = self.qualities[pe['qName']]
      if pe['strand'] == '-':
        qual_trimmed = qual_trimmed[::-1]
      qual_trimmed = qual_trimmed[q_coord_start-1:q_coord_end]
    # 2. Get the cigar string to output
    prev_diff = t_coord_start-q_coord_start
    cigar = ''
    #for i in range(0,blocks):
    #  current_diff = pe['tStarts'][i]-starts[i]
    #  delta = current_diff - prev_diff
    #  #print delta
    #  if delta >= self.min_intron_size:
    #    cigar += str(abs(delta))+'N'
    #  elif delta > 0: # we have a
    #    cigar += str(abs(delta))+'D'
    #  elif delta < 0: # we have a
    #    cigar += str(abs(delta))+'I'
    #  cigar += str(pe['blockSizes'][i])+'M' # our matches
    #  #print current_diff
    #  prev_diff = current_diff
    qstarts = [x-pe['qStarts'][0] for x in pe['qStarts']]
    tstarts = [x-pe['tStarts'][0] for x in pe['tStarts']]
    query_index = 0
    target_index = 0
    junctions = []
    for i in range(0,blocks):
      qdif = qstarts[i] - query_index
      tdif = tstarts[i] - target_index
      if qdif > 0:  # we have to insert
        cigar += str(qdif) + 'I'
      if tdif > self.min_intron_size: # we have an intron
        cigar += str(tdif) + 'N'
        junctions.append(i)
      elif tdif > 0: # we have to delete
        cigar += str(tdif) + 'D'
      cigar += str(pe['blockSizes'][i]) + 'M'
      query_index = qstarts[i]+pe['blockSizes'][i]
      target_index = tstarts[i]+pe['blockSizes'][i]
    ### cigar done
    # inspect junctions if we have a ref_genome
    spliceflag_set = False
    if self.ref_genome_set:
      canon = 0
      revcanon = 0
      for i in junctions: #blocks following a junction
        left_num = pe['tStarts'][i-1]+pe['blockSizes'][i-1]
        left_val = self.ref_genome[pe['tName']][left_num:left_num+2].upper()
        right_num = pe['tStarts'][i]-2 # last two intron bases before the acceptor block
        right_val = self.ref_genome[pe['tName']][right_num:right_num+2].upper()
        junc = left_val + '-' + right_val
        if junc in self.canonical: canon += 1
        if junc in self.revcanonical: revcanon += 1
      if canon > revcanon: 
        spliceflag_set = True
        spliceflag = '+'
      elif revcanon > canon:
        spliceflag_set = True
        spliceflag = '-'
    # if we have junctions, and we should be setting direction but 
    # we can't figure out the direction skip ambiguous direction
    if len(junctions) > 0 and self.skip_directionless_splice and spliceflag_set == False:
      return False
    samline =  pe['qName'] + "\t"        # 1. QNAME
    if pe['strand'] == '-':
      samline += '16' + "\t"             # 2. FLAG
    else:
      samline += '0' + "\t"
    samline += pe['tName'] + "\t"        # 3. RNAME
    samline += str(t_coord_start) + "\t" # 4. POS
    samline += '0' + "\t"                # 5. MAPQ
    samline += cigar + "\t"         # 6. CIGAR
    samline += '*' + "\t"           # 7. RNEXT
    samline += '0' + "\t"           # 8. PNEXT
    samline += '0' + "\t"           # 9. TLEN
    samline += q_seq_trimmed + "\t" # 10. SEQ
    samline += qual_trimmed + "\t"  # 11. QUAL
    if spliceflag_set:
      samline += 'XS:A:'+spliceflag + "\t"
    if self.ref_genome_set:
      samline += 'NH:i:'+str(self.mapping_counts[pe['qName']]) + "\t"
    samline += 'XC:i:'+str(len(junctions)) + "\t"
    samline += 'NM:i:0'
    return samline
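The CIGAR construction above can be read in isolation: after normalising qStarts/tStarts to their first block, a gap on the query side becomes an insertion and a gap on the target side becomes a deletion, or an intron ('N') once it exceeds min_intron_size. A distilled, self-contained version of that loop with hypothetical block values (the default min_intron_size below is arbitrary; the class uses self.min_intron_size):

def psl_blocks_to_cigar(block_sizes, q_starts, t_starts, min_intron_size=68):
  # distilled from convert_line above; returns a CIGAR string for the blocks
  qstarts = [x - q_starts[0] for x in q_starts]
  tstarts = [x - t_starts[0] for x in t_starts]
  cigar = ''
  query_index = 0
  target_index = 0
  for i in range(0, len(block_sizes)):
    qdif = qstarts[i] - query_index
    tdif = tstarts[i] - target_index
    if qdif > 0:
      cigar += str(qdif) + 'I'
    if tdif > min_intron_size:
      cigar += str(tdif) + 'N'
    elif tdif > 0:
      cigar += str(tdif) + 'D'
    cigar += str(block_sizes[i]) + 'M'
    query_index = qstarts[i] + block_sizes[i]
    target_index = tstarts[i] + block_sizes[i]
  return cigar

# two 50 bp blocks separated by a 500 bp gap on the target
print(psl_blocks_to_cigar([50, 50], [0, 50], [1000, 1550]))   # 50M500N50M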