Example #1
0
def encl_sam_genotype(sam_path, arg_dict):
    """Tally realigned copy numbers over the reads of a SAM file.

    Each non-header record's tenth column (SEQ in the SAM format) is passed
    to ``expansion_aware_realign`` and the copy number it returns is counted.

    Args:
        sam_path: Path to a tab-delimited SAM file.
        arg_dict: Settings dictionary; the keys read here are ``'exp_dir'``,
            ``'read_len'`` and ``'motif'``.

    Returns:
        A pair ``(nCopy_list, freq_list)``: the distinct copy numbers that
        were observed and, at the same index, the fraction of processed
        reads that yielded each one.  Both lists are empty when the file
        contains no read records.
    """
    exp_dir = arg_dict['exp_dir']
    read_len = arg_dict['read_len']
    motif = arg_dict['motif']
    score_dict = {'match': 3, 'mismatch': -1, 'gap': -3}
    verbose = False
    pre, post = extract_pre_post_flank(exp_dir, read_len)

    nCopy_dict = {}
    total_count = 0
    with open(sam_path, 'r') as encl_handle:
        for record in csv.reader(encl_handle, dialect='excel-tab'):
            # Blank rows carry no fields; rows whose first field starts
            # with '@' are SAM header lines.  Neither holds a read.
            if not record or record[0].startswith('@'):
                continue
            # Only the copy number is needed; position and score are unused.
            nCopy, _pos, _score = expansion_aware_realign(
                record[9], pre, post, motif, score_dict, verbose)
            nCopy_dict[nCopy] = nCopy_dict.get(nCopy, 0) + 1
            total_count += 1

    # Build the result once, after all reads are counted.  Doing it here
    # (rather than per record) also guarantees both names are bound when
    # the file is empty.
    nCopy_list = list(nCopy_dict)
    freq_list = [float(nCopy_dict[nCopy]) / total_count
                 for nCopy in nCopy_list]
    return nCopy_list, freq_list
Example #2
0
     print TLEN
     nc_col = 'nc:i:' + str(0)
     ps_col = 'ps:i:' + str(0)
     sc_col = 'sc:i:' + str(0)
     rc_col = 'rc:Z:' + 'unknown'
     is_col = 'is:i:' + str(int(np.abs(TLEN)))
     out_sam_handle.write(
         '\t'.join(row + [nc_col, ps_col, sc_col, rc_col, is_col]) +
         '\n')
 elif (RNAME == chrom and POS <= locus_start - read_len and RNEXT == '=' and PNEXT <= locus_start) or \
  (RNAME == chrom and POS >= locus_end and RNEXT == '=' and PNEXT >= locus_end):
     # ignoring read pairs that are both before or after STR region.
     pass
 else:
     # We realign first read to check if it's pre or post flanking
     nCopy, pos, score = expansion_aware_realign(
         SEQ, pre, post, motif, score_dict, verbose)
     nCopy_rev, pos_rev, score_rev = expansion_aware_realign(
         reverse_strand(SEQ), pre, post, motif, score_dict, verbose)
     if score_rev > score:
         nCopy = nCopy_rev
         pos = pos_rev
         score = score_rev
         SEQ = reverse_strand(SEQ)
     read_class = classify_realigned_read(SEQ, motif, pos, nCopy,
                                          score, score_dict,
                                          read_len, margin, verbose)
     # We're allowing flanking from both sides, but one side should be mapped to the correct location for this to work.
     # i.e. we don't rescue the case that both reads are flanking, and both are mapped to a different locus.
     nc_col = 'nc:i:' + str(nCopy)
     ps_col = 'ps:i:' + str(pos)
     sc_col = 'sc:i:' + str(score)
# Scan a SAM file and print realignment diagnostics for every read mapped
# in or near the locus.
# NOTE(review): in_sam, locus_bed, read_len, pre, post, motif and score_dict
# are not bound in this fragment; they must be defined earlier -- confirm.
# This is Python 2 code (bare `print` statements).
verbose = False  # forwarded to both helper calls below
margin = 2  # forwarded to classify_realigned_read; meaning not visible here
chrom, locus_start, locus_end = extract_locus_info(locus_bed)

with open(in_sam, 'r') as in_sam_handle:
    for record in in_sam_handle:
        # Lines starting with '@' (SAM header lines) are skipped; copying
        # them to an output file is disabled.
        if record[0] == '@':
            # out_sam_handle.write(record)
            pass
        else:
            # Whitespace split; the indices below follow the SAM column
            # order (row[2]=RNAME, row[3]=POS, row[9]=SEQ).
            # NOTE(review): a whitespace-only line gives row == [] and
            # row[2] then raises IndexError.
            row = record.split()

            # Keep reads on `chrom` whose start position lies in
            # [locus_start - read_len, locus_end].
            if (row[2] == chrom and int(row[3]) >= locus_start - read_len and
                    int(row[3]) <= locus_end):  # Reads mapped within region
                sample = row[9]
                nCopy, pos, score = expansion_aware_realign(
                    sample, pre, post, motif, score_dict, verbose)
                read_class = classify_realigned_read(sample, motif, pos, nCopy,
                                                     score, score_dict,
                                                     read_len, margin, verbose)
                # Diagnostic dump: class, copy number, score, position,
                # the read sequence, then an empty separator line.
                print '>> ', read_class, ':'
                print '>> nCopy =', nCopy
                print '>> score =', score
                print '>> pos =', pos
                print sample
                print

            # if (row[2] == chrom and row[6] == '=') or (row[6] == chrom):		# checking correct chrom for mate
            # 	if int(row[7]) <= locus_start:									# checking if mate is before STR region
            # 		if row[2] != chrom:
            # 			out_sam_handle.write(record)
            # 			print row[9]