Esempio n. 1
0
def extractFromSAM(sam_fname, qnames_fname):
    """Print to stdout every SAM line whose qname is listed in a qnames file.

    Parameters:
        sam_fname    - path to the SAM file to filter
        qnames_fname - path to a text file with one qname per line

    Side effects: writes matching SAM lines to stdout and progress
    messages to stderr.
    """
    sys.stderr.write('\nLoading qnames file!')
    # Build a set for O(1) membership tests.  rstrip('\n') instead of
    # the old qname[:-1] so the last line is handled correctly even when
    # the file has no trailing newline (the explicit close() inside the
    # 'with' block was also redundant and has been dropped).
    with open(qnames_fname, 'rU') as qnames_f:
        qnames_set = set(line.rstrip('\n') for line in qnames_f)

    sys.stderr.write('\nLoading SAM file!')
    # Loading SAM file into hash
    # Keeping only SAM lines with regular CIGAR string, and sorting them according to position
    qnames_with_multiple_alignments = {}
    [sam_hash, sam_hash_num_lines, sam_hash_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(sam_fname,
                                       qnames_with_multiple_alignments)

    sys.stderr.write('\nExtracting ...')
    # Keys in the dictionary (hash) correspond to qnames.
    # .items() works on both Python 2 and 3 (iteritems was py2-only).
    for (samline_key, samline_list) in sam_hash.items():
        if samline_key in qnames_set:
            for samline in samline_list:
                sys.stdout.write(samline.original_line + '\n')

    sys.stderr.write('\nFinished!')
Esempio n. 2
0
def analyze_chimeric_SAM(filename):
    """Scan a SAM file and report reads flagged as chimeric.

    Parameters:
        filename - path to a file with a .sam/.SAM extension

    Side effects: prints chimeric qnames to stdout and summary counts
    to stderr.

    Raises:
        Exception - if the file extension is not .sam/.SAM.
    """
    fname, fext = os.path.splitext(filename)
    count = 0        # number of chimeric alignment lines seen
    count_all = 0    # number of distinct qnames processed
    chimeric_reads = set()

    if fext != '.SAM' and fext != '.sam':
        # Typo fix in the message: "need" -> "needs".
        raise Exception('File format needs to be SAM!: %s' % fext)

    # Loading SAM file into hash
    # Keeping only SAM lines with regular CIGAR string, and sorting them according to position
    qnames_with_multiple_alignments = {}
    [sam_hash, sam_hash_num_lines, sam_hash_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(filename,
                                       qnames_with_multiple_alignments)

    # .items() works on both Python 2 and 3 (iteritems was py2-only).
    for (samline_key, samline_list) in sam_hash.items():
        count_all += 1
        for samline in samline_list:
            flag = samline.flag
            # Detecting a chimeric alignment from a SAM file.
            # NOTE(review): test_flag is a module-level constant not
            # visible here; presumably the SAM supplementary-alignment
            # flag bit (0x800) -- TODO confirm against the caller.
            if flag & test_flag > 0:
                chimeric_reads.add(samline.qname)
                count += 1

    # KK: printing out only the names of chimeric reads
    for name in chimeric_reads:
        sys.stdout.write('%s\n' % name)

    # Typo fix: "occurences" -> "occurrences".
    sys.stderr.write('\n Count occurrences: %d' % count)
    sys.stderr.write('\n Count reads: %d' % len(chimeric_reads))
    sys.stderr.write('\n Count all reads: %d' % count_all)
def main():
    """Entry point: count correctly mapped bases at VCF positions.

    Expects exactly three command-line arguments: a query SAM file,
    a reference SAM file and a VCF file listing positions of interest.
    """
    if len(sys.argv) != 4:
        verbose_usage_and_exit()

    query_sam = sys.argv[1]
    reference_sam = sys.argv[2]
    vcf_file = sys.argv[3]

    sys.stderr.write('Loading query SAM file...\n')
    [hashed_query, num_queries, num_unique_queries] = utility_sam.HashSAMWithFilter(query_sam, {})
    sys.stderr.write('Loading reference SAM file...\n')
    [hashed_reference, num_references, num_unique_references] = utility_sam.HashSAMWithFilter(reference_sam, {})
    sys.stderr.write('Loading positions from the VCF file...\n')
    [positions, ref_bases, alt_bases] = parse_vcf_positions(vcf_file)

    # Output files share the VCF file's path without its extension.
    out_summary_prefix = os.path.splitext(vcf_file)[0]

    sys.stderr.write('Starting the counting process...\n')
    [accuracy, accuracy_called_bases] = utility_sam.CountCorrectlyMappedBasesAtPositions(
        hashed_query, hashed_reference, positions, ref_bases, alt_bases,
        out_summary_prefix=out_summary_prefix)
    sys.stderr.write('Accuracy: %.2f\n' % accuracy)
    sys.stderr.write('Accuracy (only called bases): %.2f\n' % accuracy_called_bases)
Esempio n. 4
0
def load_and_process_SAM(sam_file, BBMapFormat=False):
    """Load a SAM file and reorganize its alignments for evaluation.

    The file is hashed by qname (keeping only lines with a regular CIGAR
    string).  For each query, the first alignment is kept together with
    any other alignments that could constitute a split alignment with
    it; alignments whose CIGAR contains long N (intron) operations are
    broken into multiple partial alignments with clipping.

    Parameters:
        sam_file    - path to the SAM file to load
        BBMapFormat - if True, alignments whose qnames differ only by a
                      '_part...' suffix are merged under the base qname

    Returns:
        A list of lists of samlines, sorted by the position of the
        first alignment in each list.
    """
    # Loading SAM file into hash
    # Keeping only SAM lines with regular CIGAR string, and sorting them according to position
    qnames_with_multiple_alignments = {}
    [sam_hash, sam_hash_num_lines, sam_hash_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(sam_file,
                                       qnames_with_multiple_alignments)

    # If variable BBMapFormat is set to true, all samfiles referring to the same query will be collected together
    # Stil have to decide what to do with query names, currently removing '_part'
    if BBMapFormat:
        new_sam_hash = {}
        for (qname, sam_lines) in sam_hash.items():
            pos = qname.find('_part')
            if pos > -1:
                origQname = qname[:pos]
            else:
                origQname = qname
            if origQname not in new_sam_hash:
                new_sam_hash[origQname] = sam_lines
            else:
                # BUGFIX: the merged list was previously computed and then
                # discarded (the unmerged sam_lines were stored instead);
                # store the concatenation so no alignments are lost.
                new_sam_lines = sam_lines + new_sam_hash[origQname]
                new_sam_hash[origQname] = new_sam_lines

        sam_hash = new_sam_hash

    # NOTE: This is a quick and dirty solution
    # Setting this to true so that large deletions are turned into Ns
    # BBMap marks intron RNA alignment gaps with deletions!
    BBMapFormat = True

    # Reorganizing SAM lines, removing unmapped queries, leaving only the first alignment and
    # other alignments that possibly costitute a split alignment together with the first one
    samlines = []
    cnt = 0
    # Raw string for the regex that splits a CIGAR into (length, op) pairs.
    pattern = r'(\d+)(.)'
    for (samline_key, samline_list) in sam_hash.items():
        cnt += 1
        if samline_list[0].cigar != '*' and samline_list[
                0].cigar != '':  # if the first alignment doesn't have a regular cigar string, skip

            if BBMapFormat:
                # All deletes that are 10 or more bases are replaced with Ns of the same length
                operations = re.findall(pattern, samline_list[0].cigar)
                newcigar = ''
                for op in operations:
                    op1 = op[1]
                    op0 = op[0]
                    if op[1] == 'D' and int(op[0]) >= 10:
                        op1 = 'N'
                    newcigar += op0 + op1
                samline_list[0].cigar = newcigar

            operations = re.findall(pattern, samline_list[0].cigar)
            split = False

            # Ns cannot appear as the first or the last operation.
            for op in operations[1:-1]:
                if op[1] == 'N':
                    split = True
                    break
            # If the first alignment is split (had Ns in the middle), keep only the first alignment and drop the others
            if split:
                # Transform split alignments containing Ns into multiple alignments with clipping
                temp_samline_list = []
                posread = 0
                posref = 0  # NOTE: I don't seem to be using this, probably should remove it
                newcigar = ''
                readlength = samline_list[0].CalcReadLengthFromCigar()
                new_samline = copy.deepcopy(samline_list[0])
                mapping_pos = new_samline.pos
                clipped_bases = new_samline.pos - new_samline.clipped_pos
                hclip_seq = 0  # Used with hard clipping, how big part of sequence should be removed
                clip_type = 'S'  # Soft_clipping by default
                for op in operations:
                    if op[1] == 'N' and int(
                            op[0]) > 1:  # Create a new alignment with clipping
                        newcigar += '%dS' % (
                            readlength - posread
                        )  # Always use soft clipping at the end
                        new_samline.cigar = newcigar
                        # After some deliberation, I concluded that this samline doesn't have to have its position changed
                        # The next samline does, and by the size of N operation in cigar string + any operations before
                        temp_samline_list.append(new_samline)
                        new_samline = copy.deepcopy(samline_list[0])
                        mapping_pos += int(op[0])
                        new_samline.pos = mapping_pos
                        new_samline.clipped_pos = new_samline.pos - clipped_bases
                        posref += int(op[0])
                        if clip_type == 'H':
                            new_samline.seq = new_samline.seq[hclip_seq:]
                        newcigar = '%d%c' % (posread, clip_type)
                    else:  # Expand a current alignment
                        newcigar += op[0] + op[1]
                        if op[1] in ('D', 'N'):
                            posref += int(op[0])
                            mapping_pos += int(op[0])
                        elif op[1] == 'I':
                            posread += int(op[0])
                            # Everything besides deletes and Ns will be clipped in the next partial alignment
                            # Therefore have to adjust both pos and clipped pos
                            clipped_bases += int(op[0])
                            hclip_seq += int(op[0])
                        elif op[1] in ('S', 'H'):
                            clip_type = op[1]
                            # Clipped bases can not appear in the middle of the original cigar string
                            # And they have already been added to the position,
                            # so I shouldn't adjust my mapping_pos and clipped_bases again
                            # TODO: I should probably diferentiate between hars and soft clipping
                            posread += int(op[0])
                            posref += int(op[0])
                        else:
                            posref += int(op[0])
                            posread += int(op[0])
                            clipped_bases += int(op[0])
                            mapping_pos += int(op[0])
                            hclip_seq += int(op[0])

                new_samline.cigar = newcigar
                temp_samline_list.append(new_samline)

                samlines.append(temp_samline_list)
            else:
                temp_samline_list = [
                    samline_list[0]
                ]  # add the first alignment to the temp list
                multi_alignment = False
                for samline in samline_list[
                        1:]:  # look through other alignments and see if they could form a split alignment with the current temp_samline_list
                    if BBMapFormat:
                        # All deletes that are 10 or more bases are replaced with Ns of the same length
                        operations = re.findall(pattern, samline.cigar)
                        newcigar = ''
                        for op in operations:
                            op0 = op[0]
                            op1 = op[1]
                            if op[1] == 'D' and int(op[0]) >= 10:
                                op1 = 'N'
                            newcigar += op0 + op1
                        samline.cigar = newcigar
                    if not join_split_alignment(temp_samline_list, samline):
                        multi_alignment = True

                samlines.append(temp_samline_list)
        else:
            pass

    # Sorting SAM lines according to the position of the first alignment
    samlines.sort(key=lambda samline: samline[0].pos)

    return samlines
Esempio n. 5
0
def CompareTwoSAMs(sam_file1,
                   sam_file2,
                   distance_threshold,
                   out_summary_prefix=''):
    """Compare alignments of the same reads from two SAM files.

    For every qname mapped in both files, the distance between the
    clipped mapping positions of the best (highest-quality, mapped)
    alignments is computed.  Reads missing from one file, mapped to a
    different reference, or mapped in a different orientation are
    counted separately.

    Parameters:
        sam_file1, sam_file2 - paths of the SAM files to compare
        distance_threshold   - maximum distance (bp) for two mappings
                               to be considered the same
        out_summary_prefix   - prefix for the CSV/SAM output files;
                               '' disables file output entirely

    Side effects: writes a summary to stdout and progress to stderr;
    when out_summary_prefix is given, writes several CSV and SAM files
    using that prefix.
    """
    qnames_with_multiple_alignments = {}
    sys.stderr.write('Loading the first SAM file into hash...\n')
    [sam_hash1, sam_hash1_num_lines, sam_hash1_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(sam_file1,
                                       qnames_with_multiple_alignments)
    sam_headers1 = utility_sam.LoadOnlySAMHeaders(sam_file1)
    sys.stderr.write('Loading the second SAM file into hash...\n')
    [sam_hash2, sam_hash2_num_lines, sam_hash2_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(sam_file2,
                                       qnames_with_multiple_alignments)
    sam_headers2 = utility_sam.LoadOnlySAMHeaders(sam_file2)

    not_in_sam_file1 = 0
    not_in_sam_file2 = 0

    num_different_reference = 0
    num_different_orientation = 0
    num_not_mapped_1 = 0
    num_not_mapped_2 = 0
    num_mapped_1 = 0
    num_mapped_2 = 0

    qname_to_distance_hash = {}
    distance_count_hash = {}
    distance_to_qname_hash = {}
    distance_to_sam_hash = {}
    shared_qnames = {}

    num_processed = 0

    # Each entry is [evalue, qname] so lists can be sorted by e-value.
    qnames_not_in_sam_file1 = []
    qnames_not_in_sam_file2 = []

    for qname in sam_hash1.keys():
        num_processed += 1
        if ((num_processed % 1000) == 0):
            sys.stderr.write('\rProcessed %d alignments...' % num_processed)

        if (len(sam_hash1[qname]) > 0
                and sam_hash1[qname][0].IsMapped() == True):
            num_mapped_1 += 1

        # TODO: THIS NEEDS TO BE REMOVED OR IMPLEMENTED SOMEHOW DIFFERENTLY!!
        # The point of this was that, BLASR doesn't conform to the SAM standard, and makes it difficult to
        # uniformly evaluate the results!
        # if 'blasr' in sam_file1.lower():
        # 	qname = '/'.join(qname.split('/')[:-1]);

        sam_line_list1 = sam_hash1[qname]

        try:
            sam_line_list2 = sam_hash2[qname]
            if (len(sam_line_list2) > 0
                    and sam_line_list2[0].IsMapped() == False):
                not_in_sam_file2 += 1
                qnames_not_in_sam_file2.append(
                    [sam_line_list1[0].evalue, qname])
        # BUGFIX: was a bare "except:"; narrowed to the exceptions this
        # block can actually raise (missing qname / empty list1).
        except (KeyError, IndexError):
            not_in_sam_file2 += 1
            qnames_not_in_sam_file2.append([sam_line_list1[0].evalue, qname])
            continue

        # Best alignment first: mapped before unmapped, then by quality.
        sorted_sam_line_list1 = sorted(
            sam_line_list1,
            key=lambda sam_line: (
                (not sam_line.IsMapped()), -sam_line.chosen_quality))
        sorted_sam_line_list2 = sorted(
            sam_line_list2,
            key=lambda sam_line: (
                (not sam_line.IsMapped()), -sam_line.chosen_quality))
        if (len(sorted_sam_line_list1) > 0 and len(sorted_sam_line_list2) > 0):

            if (sorted_sam_line_list1[0].IsMapped() == False):
                num_not_mapped_1 += 1
            if (sorted_sam_line_list2[0].IsMapped() == False):
                num_not_mapped_2 += 1
            if (sorted_sam_line_list1[0].IsMapped() == False
                    or sorted_sam_line_list2[0].IsMapped() == False):
                continue

            # rnames are compared by substring containment in either
            # direction, to tolerate abbreviated chromosome names.
            if (not ((sorted_sam_line_list1[0].rname
                      in sorted_sam_line_list2[0].rname) or
                     (sorted_sam_line_list2[0].rname
                      in sorted_sam_line_list1[0].rname))):
                num_different_reference += 1
                continue
            if (sorted_sam_line_list1[0].IsReverse() !=
                    sorted_sam_line_list2[0].IsReverse()):
                num_different_orientation += 1
                continue

            distance = abs(sorted_sam_line_list1[0].clipped_pos -
                           sorted_sam_line_list2[0].clipped_pos)
            if (not (qname in shared_qnames)):
                shared_qnames[qname] = 1
            qname_to_distance_hash[qname] = distance
            if (distance in distance_count_hash):
                distance_count_hash[distance] += 1
                distance_to_qname_hash[distance].append(qname)
                distance_to_sam_hash[distance].append(sorted_sam_line_list1[0])
            else:
                distance_count_hash[distance] = 1
                distance_to_qname_hash[distance] = [qname]
                distance_to_sam_hash[distance] = [sorted_sam_line_list1[0]]
        else:
            if (len(sorted_sam_line_list1) == 0):
                not_in_sam_file1 += 1
            if (len(sorted_sam_line_list2) == 0):
                not_in_sam_file2 += 1
            sys.stderr.write(
                'Warning: Something odd with qname "%s". Qname present in both files, but lists are empty.\n'
                % (qname))
            continue

    sys.stderr.write('\n')
    sys.stderr.write(
        'Counting qnames present in sam_file2 that are missing from sam_file1...\n'
    )
    num_processed = 0
    for qname in sam_hash2.keys():
        num_processed += 1
        if ((num_processed % 1000) == 0):
            sys.stderr.write('\rProcessed %d alignments...' % num_processed)
        sam_hash2_list = sam_hash2[qname]
        if (len(sam_hash2_list) > 0):
            if (sam_hash2_list[0].IsMapped() == True):
                num_mapped_2 += 1

                try:
                    sam_hash1_list = sam_hash1[qname]
                    if (sam_hash1_list[0].IsMapped() == False):
                        not_in_sam_file1 += 1
                        qnames_not_in_sam_file1.append(
                            [sam_hash2_list[0].evalue, qname])
                # BUGFIX: was a bare "except:"; narrowed to the
                # exceptions this block can actually raise.
                except (KeyError, IndexError):
                    not_in_sam_file1 += 1
                    qnames_not_in_sam_file1.append(
                        [sam_hash2_list[0].evalue, qname])

    sys.stderr.write('\n')
    sys.stderr.write('\n')

    fp_out = None
    fp_out_lt0bp = None
    fp_out_gt5000bp = None
    out_file = out_summary_prefix + '.csv'
    out_file_lt0bp = out_summary_prefix + '_lt0bp.csv'
    out_file_gt5000bp = out_summary_prefix + '_gt5000bp.csv'

    if (out_summary_prefix != ''):
        try:
            fp_out = open(out_file, 'w')
            fp_out_lt0bp = open(out_file_lt0bp, 'w')
            fp_out_gt5000bp = open(out_file_gt5000bp, 'w')
        except IOError:
            sys.stderr.write(
                '[%s] ERROR: Could not open file "%s" for writing!\n' %
                (__name__, out_file))
            return

    summary_line = ''
    summary_line += 'SAM file 1: %s\n' % sam_file1
    summary_line += 'SAM file 2: %s\n' % sam_file2
    summary_line += 'Number of qnames not present in SAM file 1: %d\n' % (
        not_in_sam_file1)
    summary_line += 'Number of qnames not present in SAM file 2: %d\n' % (
        not_in_sam_file2)
    summary_line += 'Number of qnames mapped to different references: %d\n' % (
        num_different_reference)
    summary_line += 'Number of alignments of different orientation: %d\n' % (
        num_different_orientation)
    summary_line += 'Number of shared qnames: %d\n' % (len(
        shared_qnames.keys()))
    summary_line += 'Mapped in SAM 1: %d\n' % (num_mapped_1)
    summary_line += 'Unmapped in SAM 1: %d\n' % (num_not_mapped_1)
    summary_line += 'Mapped in SAM 2: %d\n' % (num_mapped_2)
    summary_line += 'Unmapped in SAM 2: %d\n' % (num_not_mapped_2)
    summary_line += '\n'

    # Only reads longer than this are listed in the lt0bp/gt5000bp files.
    length_threshold = 9000

    sys.stdout.write(summary_line)
    if (out_summary_prefix != ''):
        fp_out.write(summary_line)
    summary_line = ''

    summary_line_lt0bp = ''
    summary_line_gt5000bp = ''

    num_same_alignments = 0
    for distance in sorted(distance_to_qname_hash.keys()):
        sorted_by_length = sorted(distance_to_sam_hash[distance],
                                  reverse=True,
                                  key=lambda sam_line: len(sam_line.seq))
        sorted_qnames = [
            '%s <len:%d, pos:%d>' %
            (single_sam_line.qname, len(
                single_sam_line.seq), single_sam_line.clipped_pos)
            for single_sam_line in sorted_by_length
        ]

        sorted_qnames_above_length = [
            ('%s' % (single_sam_line.qname))
            for single_sam_line in sorted_by_length
            if (len(single_sam_line.seq) > length_threshold)
        ]
        if (distance == 0):
            summary_line_lt0bp = ' \\\n'.join(sorted_qnames_above_length)
        if (distance > 5000):
            if (len(summary_line_gt5000bp) > 0):
                summary_line_gt5000bp += ' \\\n'
            summary_line_gt5000bp += ' \\\n'.join(sorted_qnames_above_length)

        summary_line = str(distance) + '\t' + str(
            len(distance_to_qname_hash[distance])) + '\t' + '\t'.join(
                sorted_qnames) + '\n'
        if (distance <= distance_threshold):
            num_same_alignments += len(distance_to_qname_hash[distance])

        if (out_summary_prefix != ''):
            fp_out.write(summary_line)
        summary_line = ''

    summary_line = 'Distance threshold to consider mappings same: %d\n' % distance_threshold
    summary_line += 'Number of same mappings: %d\n' % num_same_alignments
    summary_line += '(verbose) Number of same mappings: %d (%.2f%% in SAM1 / %.2f%% in SAM2) within %d bp distance.\n' % (
        num_same_alignments, 100.0 * float(num_same_alignments) /
        float(num_mapped_1 + num_not_mapped_1), 100.0 *
        float(num_same_alignments) / float(num_mapped_2 + num_not_mapped_2),
        distance_threshold)
    summary_line += '\n'
    sys.stdout.write(summary_line)
    if (out_summary_prefix != ''):
        fp_out.write(summary_line)
        fp_out_lt0bp.write(summary_line_lt0bp)
        fp_out_gt5000bp.write(summary_line_gt5000bp)
        summary_line = ''
        summary_line_lt0bp = ''
        summary_line_gt5000bp = ''

        sam1_basename = os.path.splitext(os.path.basename(sam_file1))[0]
        sam2_basename = os.path.splitext(os.path.basename(sam_file2))[0]

        out_file_qnames_only_in_sam2 = out_summary_prefix + '_qnames_only_in_%s.csv' % (
            sam2_basename)
        out_file_qnames_only_in_sam1 = out_summary_prefix + '_qnames_only_in_%s.csv' % (
            sam1_basename)
        out_file_qnames_only_in_sam2_as_sam = out_summary_prefix + '_qnames_only_in_%s.sam' % (
            sam2_basename)
        out_file_qnames_only_in_sam1_as_sam = out_summary_prefix + '_qnames_only_in_%s.sam' % (
            sam1_basename)
        out_file_qnames_in_both_sam1_as_sam = out_summary_prefix + '_qnames_in_both-alignments_from_%s.sam' % (
            sam1_basename)
        out_file_qnames_in_both_sam2_as_sam = out_summary_prefix + '_qnames_in_both-alignments_from_%s.sam' % (
            sam2_basename)

        summary_line += 'Output files:\n'
        summary_line += '\t%s\n' % (out_file_qnames_only_in_sam1)
        summary_line += '\t%s\n' % (out_file_qnames_only_in_sam2)
        summary_line += '\t%s\n' % (out_file_qnames_only_in_sam1_as_sam)
        summary_line += '\t%s\n' % (out_file_qnames_only_in_sam2_as_sam)
        summary_line += '\t%s\n' % (out_file_qnames_in_both_sam1_as_sam)
        summary_line += '\t%s\n' % (out_file_qnames_in_both_sam2_as_sam)

        try:
            fp_out_qnames_only_in_sam2 = open(out_file_qnames_only_in_sam2,
                                              'w')
            fp_out_qnames_only_in_sam1 = open(out_file_qnames_only_in_sam1,
                                              'w')
            fp_out_qnames_only_in_sam2.write('\n'.join([
                '%e\t%s' % (value[0], value[1])
                for value in sorted(qnames_not_in_sam_file1,
                                    key=lambda x: x[0])
            ]) + '\n')
            fp_out_qnames_only_in_sam1.write('\n'.join([
                '%e\t%s' % (value[0], value[1])
                for value in sorted(qnames_not_in_sam_file2,
                                    key=lambda x: x[0])
            ]) + '\n')
            fp_out_qnames_only_in_sam2.close()
            fp_out_qnames_only_in_sam1.close()

            fp_out1 = open(out_file_qnames_only_in_sam2_as_sam, 'w')
            fp_out1.write('\n'.join(sam_headers2) + '\n')
            for value in sorted(qnames_not_in_sam_file1, key=lambda x: x[0]):
                fp_out1.write('\n'.join([
                    sam_line.original_line for sam_line in sam_hash2[value[1]]
                ]) + '\n')
            fp_out1.close()

            fp_out2 = open(out_file_qnames_only_in_sam1_as_sam, 'w')
            fp_out2.write('\n'.join(sam_headers1) + '\n')
            for value in sorted(qnames_not_in_sam_file2, key=lambda x: x[0]):
                fp_out2.write('\n'.join([
                    sam_line.original_line for sam_line in sam_hash1[value[1]]
                ]) + '\n')
            fp_out2.close()

            fp_out1 = open(out_file_qnames_in_both_sam1_as_sam, 'w')
            fp_out1.write('\n'.join(sam_headers1) + '\n')
            for value in shared_qnames:
                fp_out1.write('\n'.join(
                    [sam_line.original_line
                     for sam_line in sam_hash1[value]]) + '\n')
            fp_out1.close()

            fp_out2 = open(out_file_qnames_in_both_sam2_as_sam, 'w')
            fp_out2.write('\n'.join(sam_headers2) + '\n')
            for value in shared_qnames:
                fp_out2.write('\n'.join(
                    [sam_line.original_line
                     for sam_line in sam_hash2[value]]) + '\n')
            fp_out2.close()

        except IOError:
            sys.stderr.write(
                'ERROR: Could not open file(s) for writing! Either "%s" or "%s".\n'
                % (out_file_qnames_only_in_sam2, out_file_qnames_only_in_sam1))

    if (out_summary_prefix != ''):
        fp_out.close()
        fp_out_lt0bp.close()
        fp_out_gt5000bp.close()
Esempio n. 6
0
def test_cigars(samfile, fastaref):
    """Check whether each alignment's reported position maximizes matches.

    For every SAM line with a regular CIGAR, the number of matching
    bases against the reference is computed at the reported position and
    at positions shifted by +1 and -1; a line classifying the best
    offset (REGULAR / PLUS ONE / MINUS ONE / NONE) is printed to stdout.

    Parameters:
        samfile  - path to the SAM file with mappings
        fastaref - path to the FASTA reference

    Raises:
        Exception - if a SAM line references an unknown chromosome.
    """
    paramdict = {}
    report = EvalReport(ReportType.TEMP_REPORT)

    sys.stderr.write('\n(%s) Loading and processing FASTA reference ... ' %
                     datetime.now().time().isoformat())
    [chromname2seq, headers, seqs,
     quals] = load_and_process_reference(fastaref, paramdict, report)

    sys.stderr.write(
        '\n(%s) Loading and processing SAM file with mappings ... ' %
        datetime.now().time().isoformat())
    qnames_with_multiple_alignments = {}
    [sam_hash, sam_hash_num_lines, sam_hash_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(samfile,
                                       qnames_with_multiple_alignments)

    # Raw-string regex splitting a CIGAR into (length, operation) pairs;
    # hoisted out of the loop instead of being rebuilt per alignment.
    pattern = r'(\d+)(.)'

    def count_matches(extcigar):
        # Sum the lengths of match operations ('M' or '=') in an
        # extended CIGAR string; complain about unknown operations.
        nmatch = 0
        for op in re.findall(pattern, extcigar):
            if op[1] in ('M', '='):
                nmatch += int(op[0])
            elif op[1] in ('I', 'D', 'X', 'N', 'S', 'H', 'P'):
                pass
            else:
                sys.stderr.write(
                    '\nERROR: Invalid CIGAR string operation (%s)' % op[1])
        return nmatch

    sys.stdout.write('\nTYPE\tQNAME\tMAX numMatch\tLENGTH\tFLAG\n')
    # .items() works on both Python 2 and 3; '!=' replaces the
    # Python-2-only '<>' operator.
    for (samline_key, samline_list) in sam_hash.items():
        if samline_list[0].cigar != '*' and samline_list[0].cigar != '':
            for samline in samline_list:
                chromname = getChromName(samline.rname)
                if chromname not in chromname2seq:
                    raise Exception(
                        '\nERROR: Unknown chromosome name in SAM file! (chromname:"%s", samline.rname:"%s")'
                        % (chromname, samline.rname))
                chromidx = chromname2seq[chromname]
                length = samline.CalcReadLengthFromCigar()
                numMatch = numMatch1 = numMatch2 = 0
                flag = -1
                pos = samline.pos
                try:
                    flag = samline.flag

                    # Matches at the reported position.
                    numMatch = count_matches(
                        samline.CalcExtendedCIGAR(seqs[chromidx]))

                    # Matches at pos + 1.
                    samline.pos = pos + 1
                    numMatch1 = count_matches(
                        samline.CalcExtendedCIGAR(seqs[chromidx]))

                    # Matches at pos - 1.
                    samline.pos = pos - 1
                    numMatch2 = count_matches(
                        samline.CalcExtendedCIGAR(seqs[chromidx]))

                except Exception as Argument:
                    # Typo fix in the message: "querry" -> "query".
                    sys.stderr.write(
                        'ERROR: query/ref/pos/message = %s/%s/%d/%s \n' %
                        (samline.qname, samline.rname, samline.pos, Argument))
                finally:
                    # BUGFIX: restore the original mapping position; it
                    # was previously left at pos - 1 after the checks.
                    samline.pos = pos

                if (numMatch > numMatch1 and numMatch > numMatch2):
                    sys.stdout.write('REGULAR\t%s\t%d\t%d\t%d\n' %
                                     (samline.qname, numMatch, length, flag))
                elif (numMatch1 > numMatch and numMatch1 > numMatch2):
                    sys.stdout.write('PLUS ONE\t%s\t%d\t%d\t%d\n' %
                                     (samline.qname, numMatch1, length, flag))
                elif (numMatch2 > numMatch and numMatch2 > numMatch1):
                    sys.stdout.write('MINUS ONE\t%s\t%d\t%d\t%d\n' %
                                     (samline.qname, numMatch2, length, flag))
                else:
                    sys.stdout.write('NONE\t%s\t%d\t%d\t%d\n' %
                                     (samline.qname, numMatch, length, flag))
Esempio n. 7
0
def scara_analyze(scaffolds_file, reference_file, output_folder):
    """Map scaffolds against a reference with Minimap2 and analyze the result.

    Steps:
      0. Validate input paths; create the output folder if it is missing.
      1. Run Minimap2 (scaffolds vs. reference) unless the SAM file already exists.
      2. Load the SAM and record which scaffolds map to which reference
         sequences (and vice versa); write 'mapping_analysis.txt'.
      3. Generate a Gepard dot plot for every scaffold/reference pair.
      3.1 Generate one dot plot per reference against all scaffolds mapped to it.

    Parameters:
        scaffolds_file  - path to a FASTA/FASTQ file with scaffolds
        reference_file  - path to a FASTA/FASTQ file with the reference
        output_folder   - folder (relative to CWD) that receives all output

    Returns None. Progress and warnings go to stderr; external tool output is
    captured into log files inside the output folder.
    """
    sys.stderr.write('\nSTARTING SCAFFOLDING ANALYSIS SCRIPT')

    output_folder_path = os.path.join(os.getcwd(), output_folder)

    ### STEP 0. Checking paths and folders
    if not os.path.exists(scaffolds_file):
        sys.stderr.write('\nScaffolds file does not exist (%s)! Exiting ...' %
                         scaffolds_file)
        return
    elif not os.path.exists(reference_file):
        sys.stderr.write('\nReference file does not exist (%s)! Exiting ...' %
                         reference_file)
        return
    elif not os.path.exists(output_folder):
        sys.stderr.write(
            '\nOutput folder does not exist (%s)! Creating it ...' %
            output_folder)
        os.mkdir(output_folder_path)

    ### STEP 1. Running Minimap2
    sys.stderr.write('\nCALCULATING MAPPINGS BETWEEN SCAFFOLDS AND REFERENCE!')
    minimap2_output_file = os.path.join(output_folder_path,
                                        'scaffolds2reference.sam')
    if os.path.exists(minimap2_output_file):
        sys.stderr.write('\nMapping file already present! Skipping ...!')
    else:
        cmd = '%s %s %s %s > %s' % (MINIMAP2, default_MM2options,
                                    reference_file, scaffolds_file,
                                    minimap2_output_file)
        sys.stderr.write('\nRUNNING COMMAND: %s' % cmd)
        (status, output) = commands.getstatusoutput(cmd)
        logfile = os.path.join(output_folder_path, 'Minimap2_r2r.log')
        with open(logfile, 'w') as lfile:
            lfile.write(output)

    ### STEP 2. Load and analyze Minimap2 file
    # Loading SAM file into a dictionary
    # Keeping only SAM lines with regular CIGAR string, and sorting them according to position
    sys.stderr.write('\nANALYZING MAPPINGS!')
    qnames_with_multiple_alignments = {}
    [sam_hash, sam_hash_num_lines, sam_hash_num_unique_lines
     ] = utility_sam.HashSAMWithFilter(minimap2_output_file,
                                       qnames_with_multiple_alignments)

    # Load scaffolds
    [theaders, sseqs, squals] = read_fastq(scaffolds_file)
    # Cutting headers at first space.
    # BUGFIX: the original used theader[:theader.find(' ')], which returns -1
    # when there is no space and silently dropped the last character of the name.
    sheaders = []
    for theader in theaders:
        sheaders.append(theader.split(' ')[0])

    # Load reference
    [theaders, rseqs, rquals] = read_fastq(reference_file)
    # Cutting headers at first space (same fix as above)
    rheaders = []
    for theader in theaders:
        rheaders.append(theader.split(' ')[0])

    # Sets for O(1) membership tests inside the per-alignment loop below
    sheader_set = set(sheaders)
    rheader_set = set(rheaders)

    # Analyze SAM
    scaffold_mappings = {
    }  # A dictionary that for each scaffold that is mapped to a reference contains
    # a list of reference parts (chromosome) to which the scaffold is mapped
    reference_mappings = {
    }  # A dictionary that for each reference that is mapped to a scaffold contains
    # a list of scaffolds mapped to it
    for sheader in sheaders:
        scaffold_mappings[sheader] = []
    for rheader in rheaders:
        reference_mappings[rheader] = []
    for (qname, sam_lines) in sam_hash.iteritems():
        for samline in sam_lines:
            # Skip samlines with invalid CIGAR
            if samline.cigar == '*':
                continue
            # The query can be either a scaffold or a reference, depending on
            # the mapping direction; scaffolds are checked first.
            if qname in sheader_set:
                sname = qname
                rname = samline.rname
            elif qname in rheader_set:
                sname = samline.rname
                rname = qname
            else:
                sys.stderr.write(
                    '\nERROR: Invalid query name in mappings file (%s)!' %
                    qname)
                return

            smappings = scaffold_mappings[sname]
            if rname not in smappings:
                scaffold_mappings[sname].append(rname)

            rmappings = reference_mappings[rname]
            if sname not in rmappings:
                reference_mappings[rname].append(sname)

    # Print scaffold-reference mappings (both dictionaries)
    found_double_mappings = False
    found_zero_mappings = False
    mapping_analysis_file = os.path.join(output_folder_path,
                                         'mapping_analysis.txt')
    with open(mapping_analysis_file, 'w') as mafile:
        mafile.write('SCAFFOLD: REFERENCE LIST\n')
        for sname, rname_list in scaffold_mappings.iteritems():
            if len(rname_list) > 1:
                found_double_mappings = True
            if len(rname_list) == 0:
                found_zero_mappings = True
            mafile.write('%s: %s\n' % (sname, ', '.join(rname_list)))

        mafile.write('REFERENCE: SCAFFOLD LIST\n')
        for rname, sname_list in reference_mappings.iteritems():
            mafile.write('%s: %s\n' % (rname, ', '.join(sname_list)))

    if found_double_mappings:
        sys.stderr.write(
            '\nWARNING: Found scaffolds mapped to multiple references!')
    if found_zero_mappings:
        sys.stderr.write('\nWARNING: Found unmapped scaffolds!')

    ### STEP 3. Generate gepard dot plots for all mappings between scaffolds and references
    sys.stderr.write('\nGENERATING DOT PLOTS FOR SCAFFOLDS!')
    # Create separate fasta files for each scaffold
    # BUGFIX: guard mkdir calls so a re-run (supported by the Minimap2 step
    # above) does not crash with OSError on already-existing folders.
    scaffolds_folder = os.path.join(output_folder_path, 'scaffolds')
    if not os.path.exists(scaffolds_folder):
        os.mkdir(scaffolds_folder)
    for i in xrange(len(sheaders)):
        sheader = sheaders[i]
        sseq = sseqs[i]
        sfilename = os.path.join(scaffolds_folder, sheader + '.fasta')
        with open(sfilename, 'w') as sfile:
            sfile.write('>%s\n%s\n' % (sheader, sseq))

    # Create separate fasta file for each reference
    referencess_folder = os.path.join(output_folder_path, 'references')
    if not os.path.exists(referencess_folder):
        os.mkdir(referencess_folder)
    for i in xrange(len(rheaders)):
        rheader = rheaders[i]
        rseq = rseqs[i]
        rfilename = os.path.join(referencess_folder, rheader + '.fasta')
        with open(rfilename, 'w') as rfile:
            rfile.write('>%s\n%s\n' % (rheader, rseq))

    # Generate dot plots
    gepard_folder = os.path.join(output_folder_path, 'scaff_gepard')
    if not os.path.exists(gepard_folder):
        os.mkdir(gepard_folder)
    for sname, rname_list in scaffold_mappings.iteritems():
        for rname in rname_list:
            sfilename = os.path.join(scaffolds_folder, sname + '.fasta')
            if not os.path.exists(sfilename):
                sys.stderr.write('\nERROR: Scaffold fasta file not found: %s' %
                                 sfilename)
            rfilename = os.path.join(referencess_folder, rname + '.fasta')
            if not os.path.exists(rfilename):
                sys.stderr.write(
                    '\nERROR: Reference fasta file not found: %s' % rfilename)
            gepard_file = os.path.join(gepard_folder,
                                       '%s_%s.png' % (sname, rname))
            cmd = 'java -cp %s org.gepard.client.cmdline.CommandLine -seq1 %s -seq2 %s -matrix %s -outfile %s' \
                % (GEPARD_JAR, sfilename, rfilename, GEPARD_MATRIX, gepard_file)
            sys.stderr.write('\nRUNNING COMMAND: %s' % cmd)
            (status, output) = commands.getstatusoutput(cmd)

    ### STEP 3.1 Generate additional dotplots, one for each reference against all scaffolds mapped to it
    sys.stderr.write('\nGENERATING DOT PLOTS FOR REFERENCES!')
    gepard_folder2 = os.path.join(output_folder_path, 'ref_gepard')
    if not os.path.exists(gepard_folder2):
        os.mkdir(gepard_folder2)
    for rname, sname_list in reference_mappings.iteritems():
        if len(sname_list) > 0:
            sfilename = os.path.join(gepard_folder2,
                                     '%s_scaffolds.fasta' % rname)
            rfilename = os.path.join(referencess_folder, rname + '.fasta')
            # Collect every scaffold mapped to this reference into one fasta
            with open(sfilename, 'w') as sfile:
                for i in xrange(len(sheaders)):
                    sheader = sheaders[i]
                    sseq = sseqs[i]
                    if sheader in sname_list:
                        sfile.write('>%s\n%s\n' % (sheader, sseq))
            gepard_file2 = os.path.join(gepard_folder2,
                                        '%s_scaffolds.png' % rname)
            cmd = 'java -cp %s org.gepard.client.cmdline.CommandLine -seq1 %s -seq2 %s -matrix %s -outfile %s' \
                % (GEPARD_JAR, sfilename, rfilename, GEPARD_MATRIX, gepard_file2)
            sys.stderr.write('\nRUNNING COMMAND: %s' % cmd)
            (status, output) = commands.getstatusoutput(cmd)

    return