def extractFromSAM(sam_fname, qnames_fname):
    """Print to stdout all alignments from a SAM file whose query name is
    listed in a qnames file.

    sam_fname    -- path to the SAM file to filter.
    qnames_fname -- path to a text file with one query name per line.
    """
    sys.stderr.write('\nLoading qnames file!')
    # Collect qnames into a set for O(1) membership tests.
    # BUGFIX: rstrip('\n') instead of slicing off the last character
    # (qname[:-1]), so a missing trailing newline on the last line does not
    # silently corrupt that qname.
    qnames_set = set()
    with open(qnames_fname, 'rU') as qnames_f:
        for line in qnames_f:
            qnames_set.add(line.rstrip('\n'))

    sys.stderr.write('\nLoading SAM file!')
    # Loading SAM file into hash
    # Keeping only SAM lines with regular CIGAR string, and sorting them
    # according to position
    qnames_with_multiple_alignments = {}
    [sam_hash, sam_hash_num_lines, sam_hash_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(sam_fname, qnames_with_multiple_alignments)

    sys.stderr.write('\nExtracting ...')
    # Keys in the dictionary (hash) correspond to qnames.
    for (samline_key, samline_list) in sam_hash.items():
        if samline_key in qnames_set:
            for samline in samline_list:
                sys.stdout.write(samline.original_line + '\n')

    sys.stderr.write('\nFinished!')
def analyze_chimeric_SAM(filename):
    """Scan a SAM file for chimeric alignments, print the qnames of chimeric
    reads to stdout, and write occurrence/read counts to stderr.

    filename -- path to a file with a '.sam'/'.SAM' extension.
    Raises Exception when the extension is not SAM.
    """
    _, extension = os.path.splitext(filename)
    occurrence_count = 0
    total_reads = 0
    chimeric_reads = {}
    if extension != '.SAM' and extension != '.sam':
        raise Exception('File format need to be SAM!: %s' % extension)

    # Loading SAM file into hash
    # Keeping only SAM lines with regular CIGAR string, and sorting them
    # according to position
    qnames_with_multiple_alignments = {}
    [sam_hash, sam_hash_num_lines, sam_hash_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(filename, qnames_with_multiple_alignments)

    for samline_list in sam_hash.values():
        total_reads += 1
        for samline in samline_list:
            # A chimeric alignment is detected through the module-level
            # test_flag bit being set in the SAM FLAG field.
            if samline.flag & test_flag > 0:
                chimeric_reads[samline.qname] = 1
                occurrence_count += 1

    # KK: printing out only the names of chimeric reads
    for name in chimeric_reads:
        sys.stdout.write('%s\n' % name)

    sys.stderr.write('\n Count occurences: %d' % occurrence_count)
    sys.stderr.write('\n Count reads: %d' % len(chimeric_reads))
    sys.stderr.write('\n Count all reads: %d' % total_reads)
def main():
    """Command-line entry point: measure base-level mapping accuracy of a
    query SAM file against a reference SAM file at the positions listed in a
    VCF file. Usage: <query.sam> <reference.sam> <variants.vcf>."""
    if len(sys.argv) != 4:
        verbose_usage_and_exit()

    query_sam = sys.argv[1]
    reference_sam = sys.argv[2]
    vcf_file = sys.argv[3]

    sys.stderr.write('Loading query SAM file...\n')
    hashed_query, num_queries, num_unique_queries = \
        utility_sam.HashSAMWithFilter(query_sam, {})

    sys.stderr.write('Loading reference SAM file...\n')
    hashed_reference, num_references, num_unique_references = \
        utility_sam.HashSAMWithFilter(reference_sam, {})

    sys.stderr.write('Loading positions from the VCF file...\n')
    positions, ref_bases, alt_bases = parse_vcf_positions(vcf_file)

    # Summary files are written next to the VCF, with its extension stripped.
    out_summary_prefix = os.path.splitext(vcf_file)[0]

    sys.stderr.write('Starting the counting process...\n')
    accuracy, accuracy_called_bases = utility_sam.CountCorrectlyMappedBasesAtPositions(
        hashed_query, hashed_reference, positions, ref_bases, alt_bases,
        out_summary_prefix=out_summary_prefix)

    sys.stderr.write('Accuracy: %.2f\n' % accuracy)
    sys.stderr.write('Accuracy (only called bases): %.2f\n' % accuracy_called_bases)
def _bbmap_deletions_to_skips(cigar, pattern):
    """Return cigar with every deletion ('D') of 10 or more bases replaced by
    a reference skip ('N') of the same length.

    BBMap marks intron gaps in RNA alignments with deletions; downstream code
    expects introns to be represented with 'N' operations.
    """
    newcigar = ''
    for op in re.findall(pattern, cigar):
        optype = op[1]
        if optype == 'D' and int(op[0]) >= 10:
            optype = 'N'
        newcigar += op[0] + optype
    return newcigar


def load_and_process_SAM(sam_file, BBMapFormat=False):
    """Load a SAM file and group its alignments into split-alignment lists.

    Returns a list of lists of SAM lines: each inner list holds the partial
    alignments that together constitute one (possibly split) alignment of a
    single query, sorted by the position of the first partial alignment.

    sam_file    -- path to the SAM file to load.
    BBMapFormat -- if True, alignments of '<qname>_part...' queries are merged
                   back under the original qname (BBMap splits long queries).
    """
    # Loading SAM file into hash
    # Keeping only SAM lines with regular CIGAR string, and sorting them
    # according to position
    qnames_with_multiple_alignments = {}
    [sam_hash, sam_hash_num_lines, sam_hash_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(sam_file, qnames_with_multiple_alignments)

    # If BBMapFormat is set to True, all samlines referring to the same query
    # are collected together, removing the '_part' suffix from query names.
    if BBMapFormat:
        new_sam_hash = {}
        for (qname, sam_lines) in sam_hash.items():
            pos = qname.find('_part')
            if pos > -1:
                origQname = qname[:pos]
            else:
                origQname = qname
            if origQname not in new_sam_hash:
                new_sam_hash[origQname] = sam_lines
            else:
                # BUGFIX: the merged list was previously computed and then
                # discarded (the hash entry was overwritten with sam_lines
                # alone), silently dropping alignments collected so far.
                new_sam_hash[origQname] = sam_lines + new_sam_hash[origQname]
        sam_hash = new_sam_hash

    # NOTE: This is a quick and dirty solution
    # Setting this to true so that large deletions are turned into Ns
    # BBMap marks intron RNA alignment gaps with deletions!
    BBMapFormat = True

    # Reorganizing SAM lines, removing unmapped queries, leaving only the
    # first alignment and other alignments that possibly constitute a split
    # alignment together with the first one.
    samlines = []
    pattern = r'(\d+)(.)'  # One CIGAR operation: a count plus a type letter.
    for (samline_key, samline_list) in sam_hash.items():
        # If the first alignment doesn't have a regular cigar string, skip it.
        if samline_list[0].cigar == '*' or samline_list[0].cigar == '':
            continue

        if BBMapFormat:
            samline_list[0].cigar = _bbmap_deletions_to_skips(
                samline_list[0].cigar, pattern)

        operations = re.findall(pattern, samline_list[0].cigar)

        # Ns cannot appear as the first or the last operation.
        split = False
        for op in operations[1:-1]:
            if op[1] == 'N':
                split = True
                break

        if split:
            # Transform a split alignment (Ns in the middle) into multiple
            # partial alignments with clipping.
            temp_samline_list = []
            posread = 0
            posref = 0   # NOTE: never read afterwards; kept for symmetry.
            newcigar = ''
            readlength = samline_list[0].CalcReadLengthFromCigar()
            new_samline = copy.deepcopy(samline_list[0])
            mapping_pos = new_samline.pos
            clipped_bases = new_samline.pos - new_samline.clipped_pos
            hclip_seq = 0    # With hard clipping: how much of seq to remove.
            clip_type = 'S'  # Soft clipping by default.
            for op in operations:
                if op[1] == 'N' and int(op[0]) > 1:
                    # Close the current partial alignment, always using soft
                    # clipping at the end.
                    newcigar += '%dS' % (readlength - posread)
                    new_samline.cigar = newcigar
                    # This samline keeps its position; the next one is shifted
                    # by the size of the N operation (plus earlier operations).
                    temp_samline_list.append(new_samline)
                    new_samline = copy.deepcopy(samline_list[0])
                    mapping_pos += int(op[0])
                    new_samline.pos = mapping_pos
                    new_samline.clipped_pos = new_samline.pos - clipped_bases
                    posref += int(op[0])
                    if clip_type == 'H':
                        new_samline.seq = new_samline.seq[hclip_seq:]
                    newcigar = '%d%c' % (posread, clip_type)
                else:
                    # Expand the current partial alignment.
                    newcigar += op[0] + op[1]
                    if op[1] in ('D', 'N'):
                        posref += int(op[0])
                        mapping_pos += int(op[0])
                    elif op[1] == 'I':
                        posread += int(op[0])
                        # Everything besides deletes and Ns will be clipped in
                        # the next partial alignment; adjust both pos and
                        # clipped pos.
                        clipped_bases += int(op[0])
                        hclip_seq += int(op[0])
                    elif op[1] in ('S', 'H'):
                        clip_type = op[1]
                        # Clipped bases cannot appear in the middle of the
                        # original cigar string and have already been added to
                        # the position, so mapping_pos and clipped_bases stay.
                        # TODO: probably differentiate hard vs soft clipping.
                        posread += int(op[0])
                        posref += int(op[0])
                    else:
                        posref += int(op[0])
                        posread += int(op[0])
                        clipped_bases += int(op[0])
                        mapping_pos += int(op[0])
                        hclip_seq += int(op[0])
            new_samline.cigar = newcigar
            temp_samline_list.append(new_samline)
            samlines.append(temp_samline_list)
        else:
            # Keep the first alignment, then look through the others and see
            # if they could form a split alignment together with it.
            temp_samline_list = [samline_list[0]]
            for samline in samline_list[1:]:
                if BBMapFormat:
                    samline.cigar = _bbmap_deletions_to_skips(samline.cigar,
                                                              pattern)
                join_split_alignment(temp_samline_list, samline)
            samlines.append(temp_samline_list)

    # Sorting SAM lines according to the position of the first alignment.
    samlines.sort(key=lambda samline: samline[0].pos)
    return samlines
def CompareTwoSAMs(sam_file1, sam_file2, distance_threshold, out_summary_prefix=''):
    """Compare the best alignments of shared qnames between two SAM files.

    For every qname present in both files (mapped in both), the distance
    between the clipped mapping positions of the best alignment in each file
    is computed; alignments within distance_threshold bp count as "same".
    Writes a textual summary to stdout and, when out_summary_prefix is
    non-empty, several CSV/SAM report files prefixed with it.

    sam_file1, sam_file2 -- paths to the two SAM files to compare.
    distance_threshold   -- max bp distance for two mappings to count as same.
    out_summary_prefix   -- path prefix for the report files ('' disables the
                            main summary files but NOT the qname CSV/SAM
                            outputs written at the end).
    """
    # print 'Loading first SAM file...';
    qnames_with_multiple_alignments = {}
    # print sam_file1, sam_file2, distance_threshold, out_summary_prefix
    sys.stderr.write('Loading the first SAM file into hash...\n')
    [sam_hash1, sam_hash1_num_lines, sam_hash1_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(sam_file1, qnames_with_multiple_alignments)
    sam_headers1 = utility_sam.LoadOnlySAMHeaders(sam_file1)
    sys.stderr.write('Loading the second SAM file into hash...\n')
    [sam_hash2, sam_hash2_num_lines, sam_hash2_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(sam_file2, qnames_with_multiple_alignments)
    sam_headers2 = utility_sam.LoadOnlySAMHeaders(sam_file2)

    # Counters and accumulators for the comparison statistics.
    not_in_sam_file1 = 0
    not_in_sam_file2 = 0
    num_different_reference = 0
    num_different_orientation = 0
    num_not_mapped_1 = 0
    num_not_mapped_2 = 0
    num_mapped_1 = 0
    num_mapped_2 = 0
    qname_to_distance_hash = {}
    # qname_to_pos = {};
    distance_count_hash = {}
    distance_to_qname_hash = {}
    distance_to_sam_hash = {}
    shared_qnames = {}
    num_processed = 0
    qnames_not_in_sam_file1 = []
    qnames_not_in_sam_file2 = []

    # Pass 1: iterate qnames of SAM 1, match them against SAM 2.
    for qname in sam_hash1.keys():
        num_processed += 1
        if ((num_processed % 1000) == 0):
            sys.stderr.write('\rProcessed %d alignments...' % num_processed)
        if (len(sam_hash1[qname]) > 0 and sam_hash1[qname][0].IsMapped() == True):
            num_mapped_1 += 1
        # TODO: THIS NEEDS TO BE REMOVED OR IMPLEMENTED SOMEHOW DIFFERENTLY!!
        # The point of this was that, BLASR doesn't conform to the SAM
        # standard, and makes it difficult to uniformly evaluate the results!
        # if 'blasr' in sam_file1.lower():
        #     qname = '/'.join(qname.split('/')[:-1]);
        sam_line_list1 = sam_hash1[qname]
        sam_line_list_2 = []
        try:
            # NOTE(review): bare except used as a "qname missing from
            # sam_hash2" test (KeyError); it would also mask other errors.
            sam_line_list2 = sam_hash2[qname]
            # Present in SAM 2 but unmapped there counts as "not in SAM 2".
            if (len(sam_line_list2) > 0 and sam_line_list2[0].IsMapped() == False):
                not_in_sam_file2 += 1
                qnames_not_in_sam_file2.append(
                    [sam_line_list1[0].evalue, qname])
        except:
            not_in_sam_file2 += 1
            qnames_not_in_sam_file2.append([sam_line_list1[0].evalue, qname])
            continue
        # Best alignment first: mapped before unmapped, then by descending
        # chosen_quality.
        sorted_sam_line_list1 = sorted(
            sam_line_list1,
            key=lambda sam_line: (
                (not sam_line.IsMapped()), -sam_line.chosen_quality))
        sorted_sam_line_list2 = sorted(
            sam_line_list2,
            key=lambda sam_line: (
                (not sam_line.IsMapped()), -sam_line.chosen_quality))
        if (len(sorted_sam_line_list1) > 0 and len(sorted_sam_line_list2) > 0):
            if (sorted_sam_line_list1[0].IsMapped() == False):
                num_not_mapped_1 += 1
            if (sorted_sam_line_list2[0].IsMapped() == False):
                num_not_mapped_2 += 1
            if (sorted_sam_line_list1[0].IsMapped() == False or
                    sorted_sam_line_list2[0].IsMapped() == False):
                continue
            # Substring containment (either direction) tolerates reference
            # name prefixes/suffixes differing between the two files.
            if (not ((sorted_sam_line_list1[0].rname in sorted_sam_line_list2[0].rname) or
                     (sorted_sam_line_list2[0].rname in sorted_sam_line_list1[0].rname))):
                num_different_reference += 1
                continue
            if (sorted_sam_line_list1[0].IsReverse() != sorted_sam_line_list2[0].IsReverse()):
                num_different_orientation += 1
                continue
            # Distance between the clipped mapping positions of the two best
            # alignments.
            distance = abs(sorted_sam_line_list1[0].clipped_pos -
                           sorted_sam_line_list2[0].clipped_pos)
            if (not (qname in shared_qnames)):
                shared_qnames[qname] = 1
            qname_to_distance_hash[qname] = distance
            # qname_to_pos[qname] = [sorted_sam_line_list1[0].clipped_pos, sorted_sam_line_list2[0].clipped_pos];
            if (distance in distance_count_hash):
                distance_count_hash[distance] += 1
                distance_to_qname_hash[distance].append(qname)
                distance_to_sam_hash[distance].append(sorted_sam_line_list1[0])
            else:
                distance_count_hash[distance] = 1
                distance_to_qname_hash[distance] = [qname]
                distance_to_sam_hash[distance] = [sorted_sam_line_list1[0]]
        else:
            if (len(sorted_sam_line_list1) == 0):
                not_in_sam_file1 += 1
                # qnames_not_in_sam_file1.append(qname);
            if (len(sorted_sam_line_list2) == 0):
                not_in_sam_file2 += 1
                # qnames_not_in_sam_file2.append(qname);
            sys.stderr.write(
                'Warning: Something odd with qname "%s". Qname present in both files, but lists are empty.\n'
                % (qname))
            continue

    # min_distance = -1;
    # i = 0;
    # for sam_line1 in sorted_sam_line_list1:
    #     for sam_line2 in sorted_sam_line_list2:
    #         distance = abs(sam_line1.clipped_pos - sam_line2.clipped_pos);
    #         if (i == 0 or distance < min_distance):
    #             min_distance = distance;
    #         i += 1;
    # distance_hash[qname] = min_distance;

    sys.stderr.write('\n')
    sys.stderr.write(
        'Counting qnames present in sam_file2 that are missing from sam_file1...\n'
    )
    # Pass 2: iterate qnames of SAM 2 to find ones missing/unmapped in SAM 1.
    num_processed = 0
    for qname in sam_hash2.keys():
        num_processed += 1
        if ((num_processed % 1000) == 0):
            sys.stderr.write('\rProcessed %d alignments...' % num_processed)
        sam_hash2_list = sam_hash2[qname]
        if (len(sam_hash2_list) > 0):
            if (sam_hash2_list[0].IsMapped() == True):
                num_mapped_2 += 1
            try:
                # NOTE(review): bare except again doubles as a KeyError check.
                sam_hash1_list = sam_hash1[qname]
                if (sam_hash1_list[0].IsMapped() == False):
                    not_in_sam_file1 += 1
                    qnames_not_in_sam_file1.append(
                        [sam_hash2_list[0].evalue, qname])
            except:
                not_in_sam_file1 += 1
                qnames_not_in_sam_file1.append(
                    [sam_hash2_list[0].evalue, qname])
    # if (len(sam_hash2_list) > 0):
    #     if (sam_hash2_list[0].IsMapped() == True):
    #         num_mapped_2 += 1;
    #     if (qname in sam_hash1.keys()):
    #         pass;
    #     else:
    #         not_in_sam_file1 += 1;
    #         qnames_not_in_sam_file1.append(qname);

    sys.stderr.write('\n')
    sys.stderr.write('\n')

    # Open the three summary output files (main, distance==0, distance>5000).
    fp_out = None
    fp_out_lt0bp = None
    fp_out_gt5000bp = None
    out_file = out_summary_prefix + '.csv'
    out_file_lt0bp = out_summary_prefix + '_lt0bp.csv'
    out_file_gt5000bp = out_summary_prefix + '_gt5000bp.csv'
    if (out_summary_prefix != ''):
        try:
            fp_out = open(out_file, 'w')
            fp_out_lt0bp = open(out_file_lt0bp, 'w')
            fp_out_gt5000bp = open(out_file_gt5000bp, 'w')
        except IOError:
            sys.stderr.write(
                '[%s] ERROR: Could not open file "%s" for writing!\n' %
                (__name__, out_file))
            return
            # exit(1);

    # Header portion of the summary, echoed to stdout and the main CSV.
    summary_line = ''
    summary_line += 'SAM file 1: %s\n' % sam_file1
    summary_line += 'SAM file 2: %s\n' % sam_file2
    summary_line += 'Number of qnames not present in SAM file 1: %d\n' % (
        not_in_sam_file1)
    summary_line += 'Number of qnames not present in SAM file 2: %d\n' % (
        not_in_sam_file2)
    summary_line += 'Number of qnames mapped to different references: %d\n' % (
        num_different_reference)
    summary_line += 'Number of alignments of different orientation: %d\n' % (
        num_different_orientation)
    summary_line += 'Number of shared qnames: %d\n' % (len(
        shared_qnames.keys()))
    summary_line += 'Mapped in SAM 1: %d\n' % (num_mapped_1)
    summary_line += 'Unmapped in SAM 1: %d\n' % (num_not_mapped_1)
    summary_line += 'Mapped in SAM 2: %d\n' % (num_mapped_2)
    summary_line += 'Unmapped in SAM 2: %d\n' % (num_not_mapped_2)
    summary_line += '\n'
    # Minimum read length for a qname to appear in the lt0bp/gt5000bp lists.
    length_threshold = 9000
    sys.stdout.write(summary_line)
    if (out_summary_prefix != ''):
        fp_out.write(summary_line)

    # Per-distance breakdown: one line per distinct distance value.
    summary_line = ''
    summary_line_lt0bp = ''
    summary_line_gt5000bp = ''
    num_same_alignments = 0
    i = 0
    # while i < len(distance_to_qname_hash.keys()
    # print distance_to_qname_hash;
    for distance in sorted(distance_to_qname_hash.keys()):
        # Longest reads first within one distance bucket.
        sorted_by_length = sorted(distance_to_sam_hash[distance],
                                  reverse=True,
                                  key=lambda sam_line: len(sam_line.seq))
        # sorted_qnames = ['%s <%d, %d>' % (single_sam_line.qname, len(single_sam_line.seq), single_sam_line.mapq) for single_sam_line in sorted_by_length];
        # positions = qname_to_pos[distance_to_qname_hash[distance]];
        # sorted_qnames = ['%s <len:%d, SAM1:%d, SAM2:%d>' % (single_sam_line.qname, len(single_sam_line.seq), positions[0], positions[1]) for single_sam_line in sorted_by_length];
        sorted_qnames = [
            '%s <len:%d, pos:%d>' % (single_sam_line.qname, len(
                single_sam_line.seq), single_sam_line.clipped_pos)
            for single_sam_line in sorted_by_length
        ]
        sorted_qnames_above_length = [
            ('%s' % (single_sam_line.qname))
            for single_sam_line in sorted_by_length
            if (len(single_sam_line.seq) > length_threshold)
        ]
        if (distance == 0):
            summary_line_lt0bp = ' \\\n'.join(sorted_qnames_above_length)
        if (distance > 5000):
            if (len(summary_line_gt5000bp) > 0):
                summary_line_gt5000bp += ' \\\n'
            summary_line_gt5000bp += ' \\\n'.join(sorted_qnames_above_length)
        # sorted_qnames = [str(len(single_sam_line.seq)) for single_sam_line in sorted(distance_to_sam_hash[distance], reverse=True, key=lambda sam_line: len(sam_line.seq))];
        # summary_line = str(distance) + '\t' + str(len(distance_to_qname_hash[distance])) + '\t' + '\t'.join(distance_to_qname_hash[distance]) + '\n';
        summary_line = str(distance) + '\t' + str(
            len(distance_to_qname_hash[distance])) + '\t' + '\t'.join(
                sorted_qnames) + '\n'
        if (distance <= distance_threshold):
            num_same_alignments += len(distance_to_qname_hash[distance])
        # sys.stdout.write(summary_line);
        if (out_summary_prefix != ''):
            fp_out.write(summary_line)

    # Final counts, relative to the requested distance threshold.
    summary_line = ''
    summary_line = 'Distance threshold to consider mappings same: %d\n' % distance_threshold
    summary_line += 'Number of same mappings: %d\n' % num_same_alignments
    summary_line += '(verbose) Number of same mappings: %d (%.2f%% in SAM1 / %.2f%% in SAM2) within %d bp distance.\n' % (
        num_same_alignments,
        100.0 * float(num_same_alignments) / float(num_mapped_1 + num_not_mapped_1),
        100.0 * float(num_same_alignments) / float(num_mapped_2 + num_not_mapped_2),
        distance_threshold)
    summary_line += '\n'
    sys.stdout.write(summary_line)
    if (out_summary_prefix != ''):
        fp_out.write(summary_line)
        fp_out_lt0bp.write(summary_line_lt0bp)
        fp_out_gt5000bp.write(summary_line_gt5000bp)
    summary_line = ''
    summary_line_lt0bp = ''
    summary_line_gt5000bp = ''

    # Per-file qname reports: CSVs with missing qnames, plus SAM files with
    # the corresponding alignments (missing-from-other-file and shared).
    sam1_basename = os.path.splitext(os.path.basename(sam_file1))[0]
    sam2_basename = os.path.splitext(os.path.basename(sam_file2))[0]
    out_file_qnames_only_in_sam2 = out_summary_prefix + '_qnames_only_in_%s.csv' % (
        sam2_basename)
    out_file_qnames_only_in_sam1 = out_summary_prefix + '_qnames_only_in_%s.csv' % (
        sam1_basename)
    out_file_qnames_only_in_sam2_as_sam = out_summary_prefix + '_qnames_only_in_%s.sam' % (
        sam2_basename)
    out_file_qnames_only_in_sam1_as_sam = out_summary_prefix + '_qnames_only_in_%s.sam' % (
        sam1_basename)
    out_file_qnames_in_both_sam1_as_sam = out_summary_prefix + '_qnames_in_both-alignments_from_%s.sam' % (
        sam1_basename)
    out_file_qnames_in_both_sam2_as_sam = out_summary_prefix + '_qnames_in_both-alignments_from_%s.sam' % (
        sam2_basename)
    summary_line += 'Output files:\n'
    summary_line += '\t%s\n' % (out_file_qnames_only_in_sam1)
    summary_line += '\t%s\n' % (out_file_qnames_only_in_sam2)
    summary_line += '\t%s\n' % (out_file_qnames_only_in_sam1_as_sam)
    summary_line += '\t%s\n' % (out_file_qnames_only_in_sam2_as_sam)
    summary_line += '\t%s\n' % (out_file_qnames_in_both_sam1_as_sam)
    summary_line += '\t%s\n' % (out_file_qnames_in_both_sam2_as_sam)
    try:
        fp_out_qnames_only_in_sam2 = open(out_file_qnames_only_in_sam2, 'w')
        fp_out_qnames_only_in_sam1 = open(out_file_qnames_only_in_sam1, 'w')
        # fp_out_qnames_only_in_sam2.write('\n'.join(qnames_not_in_sam_file1) + '\n');
        # Entries are [evalue, qname] pairs, sorted by evalue.
        fp_out_qnames_only_in_sam2.write('\n'.join([
            '%e\t%s' % (value[0], value[1])
            for value in sorted(qnames_not_in_sam_file1, key=lambda x: x[0])
        ]) + '\n')
        fp_out_qnames_only_in_sam1.write('\n'.join([
            '%e\t%s' % (value[0], value[1])
            for value in sorted(qnames_not_in_sam_file2, key=lambda x: x[0])
        ]) + '\n')
        fp_out_qnames_only_in_sam2.close()
        fp_out_qnames_only_in_sam1.close()
        fp_out1 = open(out_file_qnames_only_in_sam2_as_sam, 'w')
        fp_out1.write('\n'.join(sam_headers2) + '\n')
        for value in sorted(qnames_not_in_sam_file1, key=lambda x: x[0]):
            fp_out1.write('\n'.join([
                sam_line.original_line for sam_line in sam_hash2[value[1]]
            ]) + '\n')
        fp_out1.close()
        fp_out2 = open(out_file_qnames_only_in_sam1_as_sam, 'w')
        fp_out2.write('\n'.join(sam_headers1) + '\n')
        for value in sorted(qnames_not_in_sam_file2, key=lambda x: x[0]):
            fp_out2.write('\n'.join([
                sam_line.original_line for sam_line in sam_hash1[value[1]]
            ]) + '\n')
        fp_out2.close()
        fp_out1 = open(out_file_qnames_in_both_sam1_as_sam, 'w')
        fp_out1.write('\n'.join(sam_headers1) + '\n')
        for value in shared_qnames:
            fp_out1.write('\n'.join(
                [sam_line.original_line for sam_line in sam_hash1[value]]) +
                '\n')
        fp_out1.close()
        fp_out2 = open(out_file_qnames_in_both_sam2_as_sam, 'w')
        fp_out2.write('\n'.join(sam_headers2) + '\n')
        for value in shared_qnames:
            fp_out2.write('\n'.join(
                [sam_line.original_line for sam_line in sam_hash2[value]]) +
                '\n')
        fp_out2.close()
    except IOError:
        sys.stderr.write(
            'ERROR: Could not open file(s) for writing! Either "%s" or "%s".\n'
            % (out_file_qnames_only_in_sam2, out_file_qnames_only_in_sam1))

    if (out_summary_prefix != ''):
        fp_out.close()
        fp_out_lt0bp.close()
        fp_out_gt5000bp.close()
def _count_cigar_matches(samline, refseq):
    """Return the number of matched bases ('M' or '=' operations) in the
    extended CIGAR of samline computed against refseq.

    Unknown CIGAR operation characters are reported to stderr and ignored.
    """
    # Regular expression separating the CIGAR string into individual
    # operations: a digit count followed by one operation character.
    pattern = r'(\d+)(.)'
    num_match = 0
    extcigar = samline.CalcExtendedCIGAR(refseq)
    for op in re.findall(pattern, extcigar):
        if op[1] in ('M', '='):
            num_match += int(op[0])
        elif op[1] in ('I', 'D', 'X', 'N', 'S', 'H', 'P'):
            pass
        else:
            sys.stderr.write('\nERROR: Invalid CIGAR string operation (%s)' %
                             op[1])
    return num_match


def test_cigars(samfile, fastaref):
    """For every alignment in samfile, count matching bases at the reported
    position as well as at position +1 and -1, and print which of the three
    yields the most matches (detects off-by-one mapping positions).

    samfile  -- SAM file with the mappings to test.
    fastaref -- FASTA reference the mappings refer to.
    Raises Exception on a chromosome name missing from the reference.
    """
    paramdict = {}
    report = EvalReport(ReportType.TEMP_REPORT)
    sys.stderr.write('\n(%s) Loading and processing FASTA reference ... ' %
                     datetime.now().time().isoformat())
    [chromname2seq, headers, seqs, quals
     ] = load_and_process_reference(fastaref, paramdict, report)

    sys.stderr.write(
        '\n(%s) Loading and processing SAM file with mappings ... ' %
        datetime.now().time().isoformat())
    qnames_with_multiple_alignments = {}
    [sam_hash, sam_hash_num_lines, sam_hash_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(samfile, qnames_with_multiple_alignments)

    sys.stdout.write('\nTYPE\tQNAME\tMAX numMatch\tLENGTH\tFLAG\n')
    for (samline_key, samline_list) in sam_hash.items():
        # Skip queries whose primary alignment has no regular CIGAR string.
        if samline_list[0].cigar != '*' and samline_list[0].cigar != '':
            for samline in samline_list:
                chromname = getChromName(samline.rname)
                if chromname not in chromname2seq:
                    raise Exception(
                        '\nERROR: Unknown chromosome name in SAM file! (chromname:"%s", samline.rname:"%s")'
                        % (chromname, samline.rname))
                chromidx = chromname2seq[chromname]
                length = samline.CalcReadLengthFromCigar()
                numMatch = numMatch1 = numMatch2 = 0
                flag = -1
                pos = samline.pos
                try:
                    flag = samline.flag
                    # Matches at the reported position, then at pos +/- 1
                    # (samline.pos is temporarily mutated for the latter two).
                    numMatch = _count_cigar_matches(samline, seqs[chromidx])
                    samline.pos = pos + 1
                    numMatch1 = _count_cigar_matches(samline, seqs[chromidx])
                    samline.pos = pos - 1
                    numMatch2 = _count_cigar_matches(samline, seqs[chromidx])
                except Exception as Argument:
                    sys.stderr.write(
                        'ERROR: querry/ref/pos/message = %s/%s/%d/%s \n' %
                        (samline.qname, samline.rname, samline.pos, Argument))
                finally:
                    # BUGFIX: restore the original position; it was previously
                    # left at pos - 1, mutating the shared sam_hash entries.
                    samline.pos = pos
                if (numMatch > numMatch1 and numMatch > numMatch2):
                    sys.stdout.write('REGULAR\t%s\t%d\t%d\t%d\n' %
                                     (samline.qname, numMatch, length, flag))
                elif (numMatch1 > numMatch and numMatch1 > numMatch2):
                    sys.stdout.write('PLUS ONE\t%s\t%d\t%d\t%d\n' %
                                     (samline.qname, numMatch1, length, flag))
                elif (numMatch2 > numMatch and numMatch2 > numMatch1):
                    sys.stdout.write('MINUS ONE\t%s\t%d\t%d\t%d\n' %
                                     (samline.qname, numMatch2, length, flag))
                else:
                    sys.stdout.write('NONE\t%s\t%d\t%d\t%d\n' %
                                     (samline.qname, numMatch, length, flag))
def _trim_header(header):
    """Return header cut at the first space, or the whole header when it
    contains no space.

    BUGFIX: the previous code used header[:header.find(' ')], which silently
    drops the LAST character of any header without a space (find returns -1).
    """
    return header.split(' ', 1)[0]


def scara_analyze(scaffolds_file, reference_file, output_folder):
    """Analyze a scaffolding result: map scaffolds onto a reference with
    Minimap2, report scaffold<->reference mapping lists, and generate Gepard
    dot plots for every mapped pair (and per reference).

    scaffolds_file -- FASTA/FASTQ file with the scaffolds.
    reference_file -- FASTA/FASTQ reference.
    output_folder  -- folder (relative to CWD) receiving all output; created
                      when missing.
    """
    sys.stderr.write('\nSTARTING SCAFFOLDING ANALYSIS SCRIPT')
    output_folder_path = os.path.join(os.getcwd(), output_folder)

    ### STEP 0. Checking paths and folders
    if not os.path.exists(scaffolds_file):
        sys.stderr.write('\nScaffolds file does not exist (%s)! Exiting ...' %
                         scaffolds_file)
        return
    elif not os.path.exists(reference_file):
        sys.stderr.write('\nReference file does not exist (%s)! Exiting ...' %
                         reference_file)
        return
    elif not os.path.exists(output_folder):
        sys.stderr.write('\nOutput folder does not exist (%s)! Creating it ...'
                         % output_folder)
        os.mkdir(output_folder_path)

    ### STEP 1. Running Minimap2
    sys.stderr.write('\nCALCULATING MAPPINGS BETWEEN SCAFFOLDS AND REFERENCE!')
    minimap2_output_file = os.path.join(output_folder_path,
                                        'scaffolds2reference.sam')
    if os.path.exists(minimap2_output_file):
        # Reruns reuse the existing mapping file.
        sys.stderr.write('\nMapping file already present! Skipping ...!')
    else:
        cmd = '%s %s %s %s > %s' % (MINIMAP2, default_MM2options,
                                    reference_file, scaffolds_file,
                                    minimap2_output_file)
        sys.stderr.write('\nRUNNING COMMAND: %s' % cmd)
        (status, output) = commands.getstatusoutput(cmd)
        logfile = os.path.join(output_folder_path, 'Minimap2_r2r.log')
        with open(logfile, 'w') as lfile:
            lfile.write(output)

    ### STEP 2. Load and analyze Minimap2 file
    # Loading SAM file into a dictionary
    # Keeping only SAM lines with regular CIGAR string, and sorting them
    # according to position
    sys.stderr.write('\nANALYZING MAPPINGS!')
    qnames_with_multiple_alignments = {}
    [sam_hash, sam_hash_num_lines, sam_hash_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(minimap2_output_file,
                                       qnames_with_multiple_alignments)

    # Load scaffolds and reference, cutting headers at the first space.
    [theaders, sseqs, squals] = read_fastq(scaffolds_file)
    sheaders = [_trim_header(theader) for theader in theaders]
    [theaders, rseqs, rquals] = read_fastq(reference_file)
    rheaders = [_trim_header(theader) for theader in theaders]

    # scaffold_mappings: scaffold -> list of reference parts it maps to.
    # reference_mappings: reference -> list of scaffolds mapped to it.
    scaffold_mappings = {}
    reference_mappings = {}
    for sheader in sheaders:
        scaffold_mappings[sheader] = []
    for rheader in rheaders:
        reference_mappings[rheader] = []

    for (qname, sam_lines) in sam_hash.items():
        for samline in sam_lines:
            # Skip samlines with invalid CIGAR
            if samline.cigar == '*':
                continue
            # The mapping can run in either direction; determine which side
            # of the alignment is the scaffold and which is the reference.
            if qname in sheaders:
                sname = qname
                rname = samline.rname
            elif qname in rheaders:
                sname = samline.rname
                rname = qname
            else:
                sys.stderr.write(
                    '\nERROR: Invalid query name in mappings file (%s)!' %
                    qname)
                return
            if rname not in scaffold_mappings[sname]:
                scaffold_mappings[sname].append(rname)
            if sname not in reference_mappings[rname]:
                reference_mappings[rname].append(sname)

    # Print scaffold-reference mappings (both dictionaries)
    found_double_mappings = False
    found_zero_mappings = False
    mapping_analysis_file = os.path.join(output_folder_path,
                                         'mapping_analysis.txt')
    with open(mapping_analysis_file, 'w') as mafile:
        mafile.write('SCAFFOLD: REFERENCE LIST\n')
        for sname, rname_list in scaffold_mappings.items():
            if len(rname_list) > 1:
                found_double_mappings = True
            if len(rname_list) == 0:
                found_zero_mappings = True
            mafile.write('%s: %s\n' % (sname, ', '.join(rname_list)))
        mafile.write('REFERENCE: SCAFFOLD LIST\n')
        for rname, sname_list in reference_mappings.items():
            mafile.write('%s: %s\n' % (rname, ', '.join(sname_list)))
    if found_double_mappings:
        sys.stderr.write(
            '\nWARNING: Found scaffolds mapped to multiple references!')
    if found_zero_mappings:
        sys.stderr.write('\nWARNING: Found unmapped scaffolds!')

    ### STEP 3. Generate gepard dot plots for all mappings between scaffolds
    ### and references
    sys.stderr.write('\nGENERATING DOT PLOTS FOR SCAFFOLDS!')
    # Create a separate fasta file for each scaffold.
    # ROBUSTNESS: every mkdir below is guarded with exists(), so a rerun over
    # an existing output folder does not crash (the Minimap2 step above
    # already supports reruns).
    scaffolds_folder = os.path.join(output_folder_path, 'scaffolds')
    if not os.path.exists(scaffolds_folder):
        os.mkdir(scaffolds_folder)
    for sheader, sseq in zip(sheaders, sseqs):
        sfilename = os.path.join(scaffolds_folder, sheader + '.fasta')
        with open(sfilename, 'w') as sfile:
            sfile.write('>%s\n%s\n' % (sheader, sseq))

    # Create a separate fasta file for each reference.
    referencess_folder = os.path.join(output_folder_path, 'references')
    if not os.path.exists(referencess_folder):
        os.mkdir(referencess_folder)
    for rheader, rseq in zip(rheaders, rseqs):
        rfilename = os.path.join(referencess_folder, rheader + '.fasta')
        with open(rfilename, 'w') as rfile:
            rfile.write('>%s\n%s\n' % (rheader, rseq))

    # Generate dot plots, one per scaffold/reference pair.
    gepard_folder = os.path.join(output_folder_path, 'scaff_gepard')
    if not os.path.exists(gepard_folder):
        os.mkdir(gepard_folder)
    for sname, rname_list in scaffold_mappings.items():
        for rname in rname_list:
            sfilename = os.path.join(scaffolds_folder, sname + '.fasta')
            if not os.path.exists(sfilename):
                sys.stderr.write('\nERROR: Scaffold fasta file not found: %s' %
                                 sfilename)
            rfilename = os.path.join(referencess_folder, rname + '.fasta')
            if not os.path.exists(rfilename):
                sys.stderr.write(
                    '\nERROR: Reference fasta file not found: %s' % rfilename)
            gepard_file = os.path.join(gepard_folder,
                                       '%s_%s.png' % (sname, rname))
            cmd = 'java -cp %s org.gepard.client.cmdline.CommandLine -seq1 %s -seq2 %s -matrix %s -outfile %s' \
                  % (GEPARD_JAR, sfilename, rfilename, GEPARD_MATRIX, gepard_file)
            sys.stderr.write('\nRUNNING COMMAND: %s' % cmd)
            (status, output) = commands.getstatusoutput(cmd)

    ### STEP 3.1 Generate additional dotplots, one for each reference against
    ### all scaffolds mapped to it
    sys.stderr.write('\nGENERATING DOT PLOTS FOR REFERENCES!')
    gepard_folder2 = os.path.join(output_folder_path, 'ref_gepard')
    if not os.path.exists(gepard_folder2):
        os.mkdir(gepard_folder2)
    for rname, sname_list in reference_mappings.items():
        if len(sname_list) > 0:
            # Concatenate all scaffolds mapped to this reference into one
            # fasta, then plot it against the reference.
            sfilename = os.path.join(gepard_folder2,
                                     '%s_scaffolds.fasta' % rname)
            rfilename = os.path.join(referencess_folder, rname + '.fasta')
            with open(sfilename, 'w') as sfile:
                for sheader, sseq in zip(sheaders, sseqs):
                    if sheader in sname_list:
                        sfile.write('>%s\n%s\n' % (sheader, sseq))
            gepard_file2 = os.path.join(gepard_folder2,
                                        '%s_scaffolds.png' % rname)
            cmd = 'java -cp %s org.gepard.client.cmdline.CommandLine -seq1 %s -seq2 %s -matrix %s -outfile %s' \
                  % (GEPARD_JAR, sfilename, rfilename, GEPARD_MATRIX, gepard_file2)
            sys.stderr.write('\nRUNNING COMMAND: %s' % cmd)
            (status, output) = commands.getstatusoutput(cmd)

    return