Beispiel #1
0
def loadNewAnnotations(newAnnotationsFile):
    """Load gene annotations from a plain-text "new annotations" file.

    A record starts with a line beginning with ``Name:``. The three lines
    that follow it are read positionally (they are not matched by label)
    and sliced at fixed character offsets: source (offset 9), strand
    (offset 8) and the supporting read count (offset 16). Lines are then
    consumed until one starting with ``Items:`` is found; it is expected to
    look like ``Items: [start, end] [start, end] ...``.

    Args:
        newAnnotationsFile: Path of the file to read.

    Returns:
        A list of ``Annotation_formats.GeneDescription`` objects, one for
        every record whose read count is at least ``NEW_ANNOTATION_MIN``.
        Each object's ``items`` holds ``Annotation_formats.GeneItem``
        instances with ``start`` and ``end`` set.

    Raises:
        SystemExit: (status 1) if the file ends before the ``Items:`` line
            of a record; a message is written to stderr first.
        ValueError: if a read count or an item coordinate is not an integer.
    """
    new_annotations = []

    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11 and
    # makes open() raise ValueError there.  Line endings are stripped
    # explicitly below, so '\r\n' input is handled on every interpreter.
    with open(newAnnotationsFile, 'r') as newann_file:
        # iter(readline, '') stops at EOF; empty lines still contain '\n'.
        # readline is used throughout because the header and item lines of a
        # record are pulled from the same handle inside the loop body.
        for line in iter(newann_file.readline, ''):
            if not line.startswith('Name:'):
                continue

            # Header: stripping the line ending (rather than slicing off the
            # last character) keeps the final character of a last line that
            # has no trailing newline.
            new_annotation = Annotation_formats.GeneDescription()
            new_annotation.seqname = line.rstrip('\r\n')[6:]
            line = newann_file.readline()
            new_annotation.source = line.rstrip('\r\n')[9:]
            line = newann_file.readline()
            new_annotation.strand = line.rstrip('\r\n')[8:]
            line = newann_file.readline()
            noreads = int(line.rstrip('\r\n')[16:])
            new_annotation.items = []

            # Skip forward to the 'Items:' line of this record
            while True:
                line = newann_file.readline()
                if line == '':
                    # Truncated record: report and terminate with a failure
                    # status (no debugger breakpoint, which would hang
                    # unattended runs).
                    sys.stderr.write('Error reading new annotations file %s!' % newAnnotationsFile)
                    sys.exit(1)
                if not line.startswith('Items:'):
                    continue

                # Drop the 'Items: ' prefix and the line ending, then the
                # outermost brackets so the items split cleanly on '] ['.
                body = line.rstrip('\r\n')[7:]
                body = body[1:-1]
                for element in body.split('] ['):
                    pos = element.find(',')
                    start = int(element[:pos])
                    end = int(element[pos + 2:])    # skip the ', ' separator
                    new_item = Annotation_formats.GeneItem()
                    new_item.start = start
                    new_item.end = end
                    new_annotation.items.append(new_item)
                break

            # Keep only annotations supported by enough reads
            if noreads >= NEW_ANNOTATION_MIN:
                new_annotations.append(new_annotation)

    return new_annotations
Beispiel #2
0
def processData(datafolder, resultfile, annotationfile, Array, SS_list,
                csv_path):
    """Score SAM mappings of simulated reads against annotations; write a CSV.

    For every group of SAM lines sharing a query name, the simulated read is
    traced back to its simulation folder (via ``simFolderDict``), its
    ``.ref`` file (whose first header names the annotation) and its ``.maf``
    file (which gives the read's original start and length). From these and
    the annotation's exons the expected partial alignments are derived and
    compared with the actual alignments. Counters are accumulated in
    ``Static`` objects under the keys "All", "A", "B", "C" and "D";
    ``part_cal.cal`` is handed the two category flags for every update.

    Args:
        datafolder: Base folder; joined with the simulation folder name.
        resultfile: SAM file handed to ``load_and_process_SAM``; also written
            into the CSV header row.
        annotationfile: File handed to
            ``Annotation_formats.Load_Annotation_From_File``.
        Array: Object supplying the per-category totals (``Total_reads``,
            ``Total_level2_reads``, ...) used as denominators in the report.
        SS_list: Text file with one name per line; a read whose reference
            header is listed gets the flag ``sigC`` (the original notes call
            this "single splicing", everything else "alternative splicing").
        csv_path: Path of the CSV report to write.

    Raises:
        Exception: malformed query name, unknown simulation folder key,
            missing ``.ref``/``.maf`` file, or query absent from the MAF file.
        KeyError: the reference header has no matching annotation.
        AssertionError: an expected alignment would start after its exon end.

    Notes:
        A percentage whose denominator is zero is reported as 0.0 instead of
        aborting the report with ZeroDivisionError.
    """

    def _percent(numerator, denominator):
        # Percentage rounded to 2 decimals; 0.0 for an empty category so one
        # zero total cannot discard the whole report.
        if denominator == 0:
            return 0.0
        return round(100 * numerator / float(denominator), 2)

    sys.stderr.write(
        '\n(%s) Loading and processing SAM file with mappings ... ' %
        datetime.now().time().isoformat())
    all_sam_lines = load_and_process_SAM(resultfile, BBMapFormat=True)

    # Reading annotation file
    annotations = Annotation_formats.Load_Annotation_From_File(annotationfile)

    # Index annotations by transcript name; the first occurrence of a name
    # wins and later duplicates are silently ignored.
    annotation_dict = {}
    for annotation in annotations:
        annotation_dict.setdefault(annotation.transcriptname, annotation)

    # One statistics accumulator per report category.
    # "A": with exon < 30, "B": exon > 30,
    # "C": single splicing, "D": alternative splicing
    key = ["All", "A", "B", "C", "D"]
    static_dict = {name: Static() for name in key}

    # A set gives O(1) membership tests; it is queried once per read.
    with open(SS_list, 'r') as f_ss:
        ss_names = {line.strip() for line in f_ss}

    # Allowing some shift in positions
    allowed_inacc = Annotation_formats.DEFAULT_ALLOWED_INACCURACY

    # All samlines in a list should have the same query name
    for samline_list in all_sam_lines:
        qname = samline_list[0].qname

        # Checking the SAM file if all samlines in a list have the same qname
        for samline in samline_list[1:]:
            if samline.qname != qname:
                sys.stderr.write(
                    '\nWARNING: two samlines in the same list with different query names (%s/%s)'
                    % (qname, samline.qname))

        # Everything before the first underscore is the simulation folder
        # key, everything after it is the simulated query name.
        pos = qname.find('_')
        if pos < 0:
            raise Exception('Invalid query name in results file (%s)!' % qname)

        simFolderKey = qname[:pos]
        if simFolderKey not in simFolderDict:
            raise Exception('Bad simulation folder short name (%s)!' %
                            simFolderKey)
        simFolder = simFolderDict[simFolderKey]
        simQName = qname[pos + 1:]

        simFileSuffix = 'SimG2_S'

        pos = simQName.find('_')
        pos2 = simQName.find('_part')
        if pos < 0:
            raise Exception(
                'Invalid simulated query name in results file (%s)!' %
                simQName)

        # BBMap separates a query into smaller parts it can manage and
        # extends the name with '_part_#', which has to be ignored.
        if pos2 != -1:
            simQName = simQName[:pos2]

        # The digits between the leading letter and the first underscore
        # select the simulation files.
        simRefNumber = int(simQName[1:pos])
        simFileName = simFileSuffix + '_%04d' % simRefNumber
        simRefFileName = simFileName + '.ref'
        simMafFileName = simFileName + '.maf'

        simFilePath = os.path.join(datafolder, simFolder)
        simRefFilePath = os.path.join(simFilePath, simRefFileName)
        simMafFilePath = os.path.join(simFilePath, simMafFileName)

        if not os.path.exists(simRefFilePath):
            raise Exception(
                'Reference file for simulated read %s does not exist!' % qname)
        if not os.path.exists(simMafFilePath):
            raise Exception(
                'Sequence alignment (MAF) for simulated read %s does not exist!'
                % qname)

        # The first header of the reference file names the annotation
        [headers, seqs, quals] = read_fastq(simRefFilePath)
        simGeneName = headers[0]
        annotation = annotation_dict[simGeneName]

        # Reading MAF file to get original position and length of the
        # simulated read. The query name is the second field of an 's' line;
        # the data of the last 'ref' line before the read is what counts.
        maf_startpos = maf_length = 0
        maf_reflen = 0
        # Reset for every read: otherwise a MAF file without 's' lines would
        # leave the name unbound (first read) or stale from the previous read.
        maf_qname = None
        # Plain 'r': the 'U' flag was removed in Python 3.11.
        with open(simMafFilePath, 'r') as maffile:
            for line in maffile:
                if line[0] == 's':
                    elements = line.split()
                    maf_qname = elements[1]
                    if maf_qname == 'ref':
                        maf_startpos = int(elements[2])
                        maf_length = int(elements[3])
                        # NOTE(review): source length is divided by 3 and
                        # multiplied back below - presumably the MAF
                        # reference is tripled; confirm with the simulator.
                        maf_reflen = int(int(elements[5]) / 3)
                    if maf_qname == simQName:
                        break

        if maf_qname != simQName:
            print("maf_qname = %s, simQName = %s" % (maf_qname, simQName))
            raise Exception('ERROR: could not find query %s in maf file %s' %
                            (qname, simMafFileName))

        # IMPORTANT: If the reads were generated from an annotation on reverse
        #            strand, expected partial alignments must be reversed
        if annotation.strand == Annotation_formats.GFF_STRANDRV:
            maf_startpos = maf_reflen * 3 - maf_length - maf_startpos
            if maf_startpos > maf_reflen * 2:
                maf_startpos = maf_startpos - maf_reflen * 2
            elif maf_startpos > maf_reflen:
                maf_startpos = maf_startpos - maf_reflen

        # 1. Find the exon containing the read start; maf_startpos becomes
        #    the offset inside that exon.
        exon_idx = 0
        start_beyond_annotation = False
        while annotation.items[exon_idx].getLength() <= maf_startpos:
            maf_startpos -= annotation.items[exon_idx].getLength()
            exon_idx += 1
            if exon_idx == len(annotation.items):
                start_beyond_annotation = True
                break
        if start_beyond_annotation:
            # Read starts past the last exon: it is not counted at all
            continue

        # 2. Fill up consecutive exons until the read length is used up
        maf_length = int(maf_length / 3)
        expected_partial_alignments = []
        while maf_length > 0:
            start = annotation.items[exon_idx].start + maf_startpos
            end = annotation.items[exon_idx].end
            assert start <= end

            # End already indicates the position after the last base, so no
            # +1 when calculating the length.
            length = end - start
            if length <= maf_length:
                expected_partial_alignments.append((start, end))
                maf_length -= length
                exon_idx += 1
                if exon_idx == len(annotation.items):
                    # Ran out of exons; drop the remainder of the read
                    maf_length = 0
            else:
                expected_partial_alignments.append((start, start + maf_length))
                maf_length = 0

            # Start position should only be considered for the first exon
            maf_startpos = 0

        # Category flags handed to part_cal.cal:
        # sigA - some inner expected part (first and last excluded) is
        #        shorter than 30 bases
        # sigC - the reference name is listed in the SS_list file
        sigA = any(part_end - part_start < 30
                   for part_start, part_end in expected_partial_alignments[1:-1])
        sigC = simGeneName in ss_names

        if DEBUG:
            print("exon in expected alignment---------------")
            for part_start, part_end in expected_partial_alignments:
                print("(%d, %d)" % (part_start, part_end))
            print("exon in real alignment-------------")

        numparts = len(expected_partial_alignments)
        # For each part of expected partial alignments (numbered from 1),
        # these maps count how many real partial alignments equal / hit it
        parteqmap = {(i + 1): 0 for i in range(numparts)}
        parthitmap = {(i + 1): 0 for i in range(numparts)}

        if getChromName(samline_list[0].rname) != getChromName(
                annotation.seqname):
            # Mapped to a different chromosome: counted as aligned, nothing
            # else is credited.
            static_dict["All"].Total_aligned_reads += 1
            part_cal.cal(static_dict, sigA, sigC, "Total_aligned_reads", 1)
        else:
            for samline in samline_list:
                # samline.pos is used as is (no 1-based correction)
                sl_startpos = samline.pos
                reflength = samline.CalcReferenceLengthFromCigar()
                readlength = samline.CalcReadLengthFromCigar()
                sl_endpos = sl_startpos + reflength

                if DEBUG:
                    print("(%d, %d)" % (sl_startpos, sl_endpos))

                # Comparing a samline to all expected partial alignments;
                # tmp_aln keeps the best overlap with any single part.
                tmp_aln = 0
                for part_no, (exp_start, exp_end) in enumerate(
                        expected_partial_alignments, 1):
                    if numparts > 2 and part_no == 1 and abs(
                            sl_endpos - exp_end) < allowed_inacc:
                        # First part: only its end has to match
                        parteqmap[part_no] += 1
                        parthitmap[part_no] += 1
                    elif numparts > 2 and part_no == numparts and abs(
                            sl_startpos - exp_start) < allowed_inacc:
                        # Last part: only its start has to match
                        parteqmap[part_no] += 1
                        parthitmap[part_no] += 1
                    elif interval_equals((sl_startpos, sl_endpos),
                                         (exp_start, exp_end),
                                         allowed_inacc):
                        parteqmap[part_no] += 1
                        parthitmap[part_no] += 1
                    elif interval_overlaps((sl_startpos, sl_endpos),
                                           (exp_start, exp_end), 5):
                        parthitmap[part_no] += 1

                    if interval_overlaps((sl_startpos, sl_endpos),
                                         (exp_start, exp_end), 5):
                        inside = basesInside(sl_startpos, sl_endpos,
                                             exp_start, exp_end)
                        if tmp_aln < inside:
                            tmp_aln = inside

                # Never credit more bases than the read contains
                if tmp_aln > readlength:
                    tmp_aln = readlength
                static_dict["All"].Total_aligned_bases += tmp_aln
                part_cal.cal(static_dict, sigA, sigC, "Total_aligned_bases",
                             tmp_aln)

            # Only parts matched by exactly one alignment count
            num_recover_exons = len([x for x in parteqmap.values() if x == 1])
            num_hit_exons = len([x for x in parthitmap.values() if x == 1])

            if num_hit_exons == numparts:
                static_dict["All"].Hit100 += 1
                part_cal.cal(static_dict, sigA, sigC, "Hit100", 1)
            if num_hit_exons >= int(0.8 * numparts):
                static_dict["All"].Hit80 += 1
                part_cal.cal(static_dict, sigA, sigC, "Hit80", 1)

            sam_l = len(samline_list)
            if num_recover_exons == numparts:
                static_dict["All"].ExR100 += 1
                part_cal.cal(static_dict, sigA, sigC, "ExR100", 1)
                if num_recover_exons == sam_l:
                    static_dict["All"].ExA100 += 1
                    part_cal.cal(static_dict, sigA, sigC, "ExA100", 1)
            if num_recover_exons >= int(0.8 * numparts):
                static_dict["All"].ExR80 += 1
                part_cal.cal(static_dict, sigA, sigC, "ExR80", 1)
                if num_recover_exons >= int(0.8 * sam_l):
                    static_dict["All"].ExA80 += 1
                    part_cal.cal(static_dict, sigA, sigC, "ExA80", 1)
            static_dict["All"].Total_aligned_exons += num_recover_exons
            part_cal.cal(static_dict, sigA, sigC, "Total_aligned_exons",
                         num_recover_exons)
            static_dict["All"].Total_aligned_reads += 1
            part_cal.cal(static_dict, sigA, sigC, "Total_aligned_reads", 1)

    # Copy the expected totals (report denominators) from the caller
    static_dict["All"].Total_reads = Array.Total_reads
    static_dict["All"].Total_bases = Array.Total_bases
    static_dict["All"].Total_expected_exons = Array.Total_expected_exons
    static_dict["A"].Total_reads = Array.Total_level2_reads
    static_dict["A"].Total_bases = Array.Total_level2_bases
    static_dict["A"].Total_expected_exons = Array.Total_level2_expected_exons
    static_dict["B"].Total_reads = Array.Total_level2_r_reads
    static_dict["B"].Total_bases = Array.Total_level2_r_bases
    static_dict["B"].Total_expected_exons = Array.Total_level2_r_expected_exons
    static_dict["C"].Total_reads = Array.Total_level3_SS_reads
    static_dict["C"].Total_bases = Array.Total_level3_SS_bases
    static_dict[
        "C"].Total_expected_exons = Array.Total_level3_SS_expected_exons
    static_dict["D"].Total_reads = Array.Total_level3_AS_reads
    static_dict["D"].Total_bases = Array.Total_level3_AS_bases
    static_dict[
        "D"].Total_expected_exons = Array.Total_level3_AS_expected_exons

    # Write the report: a header row, then five rows per category
    with open(csv_path, "w") as fw:
        csv_write = csv.writer(fw, dialect='excel')
        csv_write.writerow([" ", resultfile])
        for item in key:
            stats = static_dict[item]
            level = [
                item,
                str(stats.Total_reads) + ' reads/' +
                str(stats.Total_bases) + ' bases/' +
                str(stats.Total_expected_exons) + ' exons'
            ]
            row1 = [
                "Aligned",
                _percent(stats.Total_aligned_reads, stats.Total_reads)
            ]
            row2 = [
                "bases%",
                _percent(stats.Total_aligned_bases, stats.Total_bases)
            ]
            line = str(_percent(stats.ExA100, stats.Total_reads)) + '/' + str(
                _percent(stats.ExA80, stats.Total_reads))
            row4 = ["Read100/80%", line]
            row6 = [
                "Exons%",
                _percent(stats.Total_aligned_exons,
                         stats.Total_expected_exons)
            ]
            csv_write.writerow(level)
            csv_write.writerow(row1)
            csv_write.writerow(row2)
            csv_write.writerow(row4)
            csv_write.writerow(row6)
Beispiel #3
0
def processData(datafolder, resultfile, annotationfile):

    # Loading results SAM file
    report = EvalReport(ReportType.FASTA_REPORT)  # not needed
    paramdict = {}

    sys.stderr.write(
        '\n(%s) Loading and processing SAM file with mappings ... ' %
        datetime.now().time().isoformat())
    all_sam_lines = RNAseqEval.load_and_process_SAM(resultfile,
                                                    paramdict,
                                                    report,
                                                    BBMapFormat=True)

    # Reading annotation file
    annotations = Annotation_formats.Load_Annotation_From_File(annotationfile)

    s_num_multiexon_genes = 0

    # Hashing annotations according to name
    annotation_dict = {}
    for annotation in annotations:
        if annotation.genename in annotation_dict:
            sys.stderr.write(
                '\nWARNING: anotation with name %s already in the dictionary!'
                % annotation.genename)
        else:
            annotation_dict[annotation.genename] = annotation
        if len(annotation.items) > 1:
            s_num_multiexon_genes += 1

    # Statistical information for evaluating the qualitiy of mapping
    s_gene_hits = 0
    s_gene_misses = 0
    s_whole_alignment_hits = 0
    s_whole_alignment_misses = 0
    s_partial_alignment_hits = 0
    s_partial_alignment_misses = 0
    s_num_start_hits = 0
    s_num_end_hits = 0
    s_num_start_end_hits = 0

    s_num_fw_strand = 0
    s_num_rv_strand = 0

    s_num_split_alignment = 0
    s_num_oversplit_alignment = 0  # Alignments that have more parts than exons

    s_num_good_alignments = 0

    s_num_badchrom_alignments = 0

    s_maf_suspicious_alignments = 0
    s_maf_bad_alignments = 0
    s_maf_good_alignments = 0

    s_maf_split_reads = 0
    s_maf_good_split_alignments = 0
    s_maf_bad_split_alignments = 0

    s_maf_hit_all_parts = 0
    s_maf_hit_one_part = 0
    s_maf_eq_one_part = 0
    s_maf_multihit_parts = 0

    s_maf_split_hit_all_parts = 0
    s_maf_split_hit_one_part = 0
    s_maf_split_eq_one_part = 0

    s_num_potential_bad_strand = 0

    # allowed_inacc = Annotation_formats.DEFAULT_ALLOWED_INACCURACY       # Allowing some shift in positions
    # Setting allowed inaccuracy
    allowed_inacc = 5

    # All samlines in a list should have the same query name
    for samline_list in all_sam_lines:
        qname = samline_list[0].qname

        isSplitAlignment = False
        if len(samline_list) > 1:
            s_num_split_alignment += 1
            isSplitAlignment = True

        # Checking the SAM file if all samlines in a list have the same qname
        for samline in samline_list[1:]:
            if samline.qname != qname:
                sys.stderr.write(
                    '\nWARNING: two samlines in the same list with different query names (%s/%s)'
                    % (qname, samline.qname))

        # Look for the first underscore in query name
        # Everything before that is the simulation folder name
        # Everything after that is simulated query name
        pos = qname.find('_')
        if pos < 0:
            raise Exception('Invalid query name in results file (%s)!' % qname)

        simFolderKey = qname[:pos]
        if simFolderKey not in simFolderDict:
            # import pdb
            # pdb.set_trace()
            raise Exception('Bad simulation folder short name (%s)!' %
                            simFolderKey)
        simFolder = simFolderDict[simFolderKey]
        simQName = qname[pos + 1:]

        # Due to error in data preparation, have to make some extra processing
        if simQName[:6] == 'SimG2_':
            simQName = simQName[6:]


#        if simFolderKey == 'SimG1':
#            simFileSuffix = 'g1'
#        elif simFolderKey == 'SimG2':
#            simFileSuffix = 'g2'
#        elif simFolderKey == 'SimG3':
#            simFileSuffix = 'g3'
#        else:
#            simFileSuffix = 'sd'

        simFileSuffix = 'sd'

        pos = simQName.find('_')
        pos2 = simQName.find('_part')
        if pos < 0:
            raise Exception(
                'Invalid simulated query name in results file (%s)!' %
                simQName)

        simQLetter = simQName[0]  # Should always be S

        # BBMap separates a query into smaller parts he can manage
        # Extends query with '_part_#', which has to be ignored
        if pos2 <> -1:
            simQName = simQName[:pos2]

        simRefNumber = int(simQName[1:pos])
        simQNumber = int(simQName[pos + 1:])
        simFileName = simFileSuffix + '_%04d' % simRefNumber
        simRefFileName = simFileName + '.ref'
        simSeqFileName = simFileName + '.fastq'
        simMafFileName = simFileName + '.maf'

        simFilePath = os.path.join(datafolder, simFolder)
        simRefFilePath = os.path.join(simFilePath, simRefFileName)
        simSeqFilePath = os.path.join(simFilePath, simSeqFileName)
        simMafFilePath = os.path.join(simFilePath, simMafFileName)

        if not os.path.exists(simRefFilePath):
            # import pdb
            # pdb.set_trace()
            raise Exception(
                'Reference file for simulated read %s does not exist!' % qname)
        if not os.path.exists(simSeqFilePath):
            # import pdb
            # pdb.set_trace()
            raise Exception(
                'Sequence file for simulated read %s does not exist!' % qname)
        if not os.path.exists(simMafFilePath):
            # import pdb
            # pdb.set_trace()
            raise Exception(
                'Sequence alignment (MAF) for simulated read %s does not exist!'
                % qname)

        # Reading reference file
        [headers, seqs, quals] = read_fastq(simRefFilePath)
        simGeneName = headers[0]
        annotation = annotation_dict[
            simGeneName]  # Getting the correct annotation

        if len(samline_list) > len(annotation.items):
            # sys.stderr.write('\nWARNING: A number of partial alignments exceeds the number of exons for query %s! (%d / %d)' % (qname, len(samline_list), len(annotation.items)))
            s_num_oversplit_alignment += 1

        # Reading MAF file to get original position and length of the simulated read
        # Query name should be a second item
        maf_startpos = maf_length = 0
        i = 0
        with open(simMafFilePath, 'rU') as maffile:
            i += 1
            for line in maffile:
                if line[0] == 's':
                    elements = line.split()
                    maf_qname = elements[1]
                    if maf_qname == 'ref':  # Have to remember data for the last reference before the actual read
                        maf_startpos = int(elements[2])
                        maf_length = int(elements[3])
                    if maf_qname == simQName:
                        # maf_startpos = int(elements[2])
                        # maf_length = int(elements[3])
                        break

        if maf_qname != simQName:
            # import pdb
            # pdb.set_trace()
            raise Exception('ERROR: could not find query %s in maf file %s' %
                            (qname, simMafFileName))

        # Calculating expected partial alignmetns from MAF and annotations

        # Saving "maf_length" to be able to check it later
        t_maf_length = maf_length

        # 1. Calculating the index of the first exon
        # i - the index of exon currently being considered
        i = 0
        while annotation.items[i].getLength() < maf_startpos:
            maf_startpos -= annotation.items[i].getLength()
            i += 1

        # Calculating expected partial alignments by filling up exons using maf_length
        expected_partial_alignments = []
        while maf_length > 0:
            # try:
            #     start = annotation.items[i].start + maf_startpos
            #     end = annotation.items[i].end
            # except Exception:
            #     import pdb
            #     pdb.set_trace()
            #if not start < end:
            #    import pdb
            #    pdb.set_trace()
            start = annotation.items[i].start + maf_startpos
            end = annotation.items[i].end
            assert start <= end
            length = end - start + 1
            if length <= maf_length:
                expected_partial_alignments.append((start, end))
                maf_length -= length
                i += 1
            else:
                expected_partial_alignments.append((start, start + maf_length))
                maf_length = 0
                i += 1

            # Start position should only be considered for the first exon
            maf_startpos = 0

        numparts = len(expected_partial_alignments)
        # For each part of expected partial alignments, these maps will count
        # how many real partial alignments overlap or equal it
        parthitmap = {(i + 1): 0 for i in xrange(numparts)}
        parteqmap = {(i + 1): 0 for i in xrange(numparts)}

        isSplitRead = False
        if len(expected_partial_alignments) > 1:
            s_maf_split_reads += 1
            isSplitRead = True

        if RNAseqEval.getChromName(
                samline_list[0].rname) != RNAseqEval.getChromName(
                    annotation.seqname):
            # import pdb
            # pdb.set_trace()
            s_num_badchrom_alignments += 1
        else:
            if len(samline_list) != len(expected_partial_alignments):
                # sys.stderr.write('\nWARNING: suspicious number of alignments for query %s!' % qname)
                s_maf_suspicious_alignments += 1
            # import pdb
            # pdb.set_trace()

            good_alignment = True
            k = 0
            for samline in samline_list:
                # sl_startpos = samline.pos - 1   # SAM positions are 1-based
                sl_startpos = samline.pos
                reflength = samline.CalcReferenceLengthFromCigar()
                sl_endpos = sl_startpos + reflength

                # Comparing a samline to the corresponding expected partial alignment
                if k < len(expected_partial_alignments):
                    expected_alignement = expected_partial_alignments[k]
                    maf_startpos = expected_alignement[0]
                    maf_endpos = expected_alignement[1]
                    if abs(sl_startpos - maf_startpos) > allowed_inacc or abs(
                            sl_endpos - maf_endpos) > allowed_inacc:
                        good_alignment = False
                else:
                    good_alignment = False
                k += 1

                # Comparing a samline to all expected partial alignments
                for i in xrange(len(expected_partial_alignments)):
                    expected_alignement = expected_partial_alignments[i]
                    maf_startpos = expected_alignement[0]
                    maf_endpos = expected_alignement[1]

                    if interval_equals((sl_startpos, sl_endpos),
                                       (maf_startpos, maf_endpos),
                                       allowed_inacc):
                        parteqmap[i + 1] += 1
                    if interval_overlaps((sl_startpos, sl_endpos),
                                         (maf_startpos, maf_endpos),
                                         allowed_inacc):
                        parthitmap[i + 1] += 1

            if good_alignment:
                s_maf_good_alignments += 1
                if isSplitRead:
                    s_maf_good_split_alignments += 1
            else:
                # import pdb
                # pdb.set_trace()
                s_maf_bad_alignments += 1
                if isSplitRead:
                    s_maf_bad_split_alignments += 1
                # TODO: check which alignments are bad and why
                # If the choromosome is different its obviously a bad alignment
                if RNAseqEval.getChromName(
                        samline.rname) == RNAseqEval.getChromName(
                            annotation.seqname):
                    # import pdb
                    # pdb.set_trace()
                    pass
                else:
                    s_num_badchrom_alignments += 1

        # Analyzing parthitmap and parteqmap
        oneHit = False
        allHits = True
        oneEq = False
        multiHit = False
        for i in xrange(numparts):
            if parthitmap[i + 1] > 0:
                oneHit = True
            if parthitmap[i + 1] == 0:
                allHits = False
            if parthitmap[i + 1] > 1:
                multiHit = True
            if parteqmap[i + 1] > 0:
                oneEq = True

        if oneHit:
            s_maf_hit_one_part += 1
            if isSplitRead:
                s_maf_split_hit_one_part += 1
        if allHits:
            s_maf_hit_all_parts += 1
            if isSplitRead:
                s_maf_split_hit_all_parts += 1
                #import pdb
                #pdb.set_trace()
        if oneEq:
            s_maf_eq_one_part += 1
            if isSplitRead:
                s_maf_split_eq_one_part += 1
        if multiHit:
            s_maf_multihit_parts += 1

        num_start_hits = 0
        num_end_hits = 0
        num_hits = 0

        num_partial_alignements = len(samline_list)
        whole_alignment_hit = False
        for samline in samline_list:
            startpos = samline.pos - 1
            reflength = samline.CalcReferenceLengthFromCigar()
            endpos = startpos + reflength

            if samline.flag & 16 == 0:
                readstrand = Annotation_formats.GFF_STRANDFW
                s_num_fw_strand += 1
            else:
                readstrand = Annotation_formats.GFF_STRANDRV
                s_num_rv_strand += 1

            chromname = RNAseqEval.getChromName(samline.rname)

            if chromname == RNAseqEval.getChromName(
                    annotation.seqname
            ) and readstrand != annotation.strand and annotation.overlapsGene(
                    startpos, endpos):
                s_num_potential_bad_strand += 1

            if chromname == RNAseqEval.getChromName(
                    annotation.seqname) and annotation.overlapsGene(
                        startpos, endpos) and (not P_CHECK_STRAND or readstrand
                                               == annotation.strand):
                whole_alignment_hit = True
                s_partial_alignment_hits += 1
            else:
                s_partial_alignment_misses += 1

            # Checking how well partial alignments match exons
            startsItem = False
            endsItem = False
            for item in annotation.items:
                if item.overlapsItem(startpos, endpos):
                    num_hits += 1
                if item.startsItem(startpos, endpos):
                    num_start_hits += 1
                    startsItem = True
                if item.endsItem(startpos, endpos):
                    num_end_hits += 1
                    endsItem = True
                if startsItem and endsItem:
                    s_num_start_end_hits += 1

        s_num_start_hits += num_start_hits
        s_num_end_hits += num_end_hits

        # I'm allowing one start and one end not to match starts and ends of exons
        if (num_hits == num_partial_alignements) and (
                num_start_hits + num_end_hits >=
                2 * num_partial_alignements - 2):
            s_num_good_alignments += 1
        # else:
        #     if num_hits > 0:
        #         import pdb
        #         pdb.set_trace()

        if whole_alignment_hit:
            s_whole_alignment_hits += 1
        else:
            s_whole_alignment_misses += 1

    # Printing out results : NEW
    # Variables names matching RNA benchmark paper
    sys.stdout.write('\n\nAnalysis results:')
    sys.stdout.write('\nOriginal Samlines: %d' % report.num_alignments)
    sys.stdout.write(
        '\nUsable whole alignments (with valid CIGAR string): %d' %
        len(all_sam_lines))
    sys.stdout.write('\nAnnotations: %d' % len(annotation_dict))
    sys.stdout.write('\nMultiexon genes: %d' % s_num_multiexon_genes)

    sys.stdout.write('\nNumber of exon start hits: %d' % s_num_start_hits)
    sys.stdout.write('\nNumber of exon end hits: %d' % s_num_end_hits)
    sys.stdout.write('\nNumber of exon start and end hits: %d' %
                     s_num_start_end_hits)
    sys.stdout.write('\nNumber of good whole alignments: %d' %
                     s_num_good_alignments)

    sys.stdout.write('\nMAF: Correct alignment: %d' % s_maf_good_alignments)
    sys.stdout.write('\nMAF: Hit all parts: %d' % s_maf_hit_all_parts)
    sys.stdout.write('\nMAF: Hit at least one part: %d' % s_maf_hit_one_part)
    sys.stdout.write('\nMAF: Equals at least one part: %d' % s_maf_eq_one_part)

    sys.stdout.write('\nMAF: Number of split reads: %d' % s_maf_split_reads)
    sys.stdout.write('\nMAF: Correct alignment, SPLIT read: %d' %
                     s_maf_good_split_alignments)
    sys.stdout.write('\nMAF: Hit all parts, SPLIT read: %d' %
                     s_maf_split_hit_all_parts)
    sys.stdout.write('\nMAF: Hit at least one part, SPLIT read: %d' %
                     s_maf_split_hit_one_part)
    sys.stdout.write('\nMAF: Equals at least one part, SPLIT read: %d' %
                     s_maf_split_eq_one_part)

    sys.stdout.write('\nDone!\n')
Beispiel #4
0
def processData(datafolder, annotationfile, ss_list):
    """Accumulate expected read, base and exon totals for simulated data.

    For every simulation file pair ``sd_NNNN.ref`` / ``sd_NNNN.maf`` in
    ``datafolder`` the expected partial alignments of each simulated read are
    derived from the MAF ``ref`` lines and the matching annotation, and the
    totals are added to the overall, level2, level3 and level4 counters of a
    ``Report``.

    Args:
        datafolder: Folder holding the simulation files.
        annotationfile: Path handed to
            ``Annotation_formats.Load_Annotation_From_File``.
        ss_list: Path of a text file with one name per line; simulations
            whose gene name is listed are counted under the level3 ``SS``
            counters, all others under ``AS``.

    Returns:
        The populated ``Report`` instance.

    Raises:
        Exception: If the ``.ref`` or ``.maf`` file of a simulation is
            missing.
        KeyError: If the gene name read from a reference file has no
            annotation.
    """
    # Hash annotations by transcript name; the first one seen for a name wins.
    annotation_dict = {}
    for annotation in Annotation_formats.Load_Annotation_From_File(annotationfile):
        annotation_dict.setdefault(annotation.transcriptname, annotation)

    # Half of the folder entries is taken as the number of simulations
    # (assumes exactly two files per simulation -- TODO confirm).
    file_count = len(os.listdir(datafolder)) // 2

    # A set gives constant-time membership tests in the per-file check below.
    with open(ss_list, 'r') as f_ss:
        ss_names = set(line.strip() for line in f_ss)

    report = Report()

    simFileSuffix = 'sd'

    for file_idx in xrange(file_count):
        simFileName = simFileSuffix + '_%04d' % (file_idx + 1)
        simRefFilePath = os.path.join(datafolder, simFileName + '.ref')
        simMafFilePath = os.path.join(datafolder, simFileName + '.maf')

        # The messages used to reference an undefined name (qname), which
        # turned a missing file into a NameError; report the file name.
        if not os.path.exists(simRefFilePath):
            raise Exception(
                'Reference file for simulated read %s does not exist!'
                % simFileName)
        if not os.path.exists(simMafFilePath):
            raise Exception(
                'Sequence alignment (MAF) for simulated read %s does not exist!'
                % simFileName)

        # Reading reference file
        [headers, seqs, quals] = read_fastq(simRefFilePath)
        simGeneName = headers[0]
        if "transcript" in simGeneName:
            simGeneName = simGeneName.split(':')[1]
        annotation = annotation_dict[simGeneName]  # Getting the correct annotation

        # Per-file totals, used for the level3 counters after the file is read.
        ref_line_count = 0
        total_sim_bases = 0
        total_sim_exons = 0

        # Flags set by a 'ref' line and consumed by the read line after it,
        # so that the read's bases land in the same category as its exons.
        pending_level2 = False
        pending_level4_2_5 = False
        pending_level4_6_9 = False

        with open(simMafFilePath, 'rU') as maffile:
            for line in maffile:
                if line[0] != 's':
                    continue

                elements = line.split()
                if elements[1] == 'ref':  # sim ref
                    ref_line_count += 1
                    maf_startpos = int(elements[2])
                    maf_length = int(elements[3])
                    maf_reflen = int(elements[5])

                    # IMPORTANT: if the reads were generated from an
                    # annotation on the reverse strand, expected partial
                    # alignments must be reversed
                    if annotation.strand == Annotation_formats.GFF_STRANDRV:
                        maf_startpos = maf_reflen - maf_length - maf_startpos

                    # 1. Find the exon containing the start position; after
                    #    the loop maf_startpos is the offset inside that exon.
                    exon_idx = 0
                    while annotation.items[exon_idx].getLength() <= maf_startpos:
                        maf_startpos -= annotation.items[exon_idx].getLength()
                        exon_idx += 1

                    # 2. Fill up consecutive exons until maf_length is used up.
                    expected_partial_alignments = []
                    while maf_length > 0:
                        start = annotation.items[exon_idx].start + maf_startpos
                        end = annotation.items[exon_idx].end
                        assert start <= end

                        # End already indicates the position after the last
                        # base, so no +1 when calculating the length.
                        length = end - start
                        if length <= maf_length:
                            expected_partial_alignments.append((start, end))
                            maf_length -= length
                        else:
                            expected_partial_alignments.append(
                                (start, start + maf_length))
                            maf_length = 0
                        exon_idx += 1

                        # Start offset only applies to the first exon
                        maf_startpos = 0

                    num = len(expected_partial_alignments)
                    report.Total_expected_exons += num
                    total_sim_exons += num

                    # level2: an inner (neither first nor last) expected
                    # partial alignment shorter than 30
                    if any(part[1] - part[0] < 30
                           for part in expected_partial_alignments[1:-1]):
                        report.Total_level2_reads += 1
                        report.Total_level2_expected_exons += num
                        pending_level2 = True
                    # Deliberately not an 'else': the flag is tested on its
                    # own, exactly as the original code did.
                    if not pending_level2:
                        report.Total_level2_r_reads += 1
                        report.Total_level2_r_expected_exons += num

                    # level4: bucket by number of expected partial alignments
                    if num < 6:
                        report.Total_level4_2_5_reads += 1
                        report.Total_level4_2_5_expected_exons += num
                        pending_level4_2_5 = True
                    elif num < 10:
                        report.Total_level4_6_9_reads += 1
                        report.Total_level4_6_9_expected_exons += num
                        pending_level4_6_9 = True
                    else:
                        report.Total_level4_10_reads += 1
                        report.Total_level4_10_expected_exons += num
                else:  # sim read
                    sim_bases = int(elements[3])
                    report.Total_bases += sim_bases
                    total_sim_bases += sim_bases

                    # level2
                    if pending_level2:
                        report.Total_level2_bases += sim_bases
                        pending_level2 = False
                    else:
                        report.Total_level2_r_bases += sim_bases

                    # level4
                    if pending_level4_2_5:
                        report.Total_level4_2_5_bases += sim_bases
                        pending_level4_2_5 = False
                    elif pending_level4_6_9:
                        report.Total_level4_6_9_bases += sim_bases
                        pending_level4_6_9 = False
                    else:
                        report.Total_level4_10_bases += sim_bases

        # level3: the whole file counts as SS or AS depending on the gene name
        if simGeneName in ss_names:
            report.Total_level3_SS_reads += ref_line_count
            report.Total_level3_SS_bases += total_sim_bases
            report.Total_level3_SS_expected_exons += total_sim_exons
        else:
            report.Total_level3_AS_reads += ref_line_count
            report.Total_level3_AS_bases += total_sim_bases
            report.Total_level3_AS_expected_exons += total_sim_exons

    report.Total_reads = (report.Total_level3_SS_reads +
                          report.Total_level3_AS_reads)
    return report
Beispiel #5
0
def processData(datafolder, resultfile, annotationfile, paramdict):

    split_qnames = False
    filename = ''
    if '--split-qnames' in paramdict:
        split_qnames = True
        filename = paramdict['--split-qnames'][0]

    filename_correct = filename + '_correct.names'
    filename_hitall = filename + '_hitall.names'
    filename_hitone = filename + '_hitone.names'
    filename_bad = filename + '_incorrect.names'
    filename_unmapped = filename + '_unmapped.names'

    printMap = False
    filename_mapping = ''
    if '--print_mapping' in paramdict:
        filename_mapping = paramdict['--print_mapping'][0]
        printMap = True

    file_correct = None
    file_hitall = None
    file_hitone = None
    file_bad = None
    file_unmapped = None
    folder = os.getcwd()

    # If splittng qnames into files, have to open files first
    if split_qnames:
        file_correct = open(os.path.join(folder, filename_correct), 'w+')
        file_hitall = open(os.path.join(folder, filename_hitall), 'w+')
        file_hitone = open(os.path.join(folder, filename_hitone), 'w+')
        file_bad = open(os.path.join(folder, filename_bad), 'w+')

    # Loading results SAM file
    report = EvalReport(ReportType.FASTA_REPORT
                        )  # not really needed, used for unmapped query names
    # Have to preserve the paramdict
    # paramdict = {}

    sys.stderr.write(
        '\n(%s) Loading and processing SAM file with mappings ... ' %
        datetime.now().time().isoformat())
    all_sam_lines = RNAseqEval.load_and_process_SAM(resultfile,
                                                    paramdict,
                                                    report,
                                                    BBMapFormat=True)

    # Reading annotation file
    annotations = Annotation_formats.Load_Annotation_From_File(annotationfile)

    s_num_multiexon_genes = 0

    mapfile = None
    if printMap:
        mapfile = open(filename_mapping, 'w+')

    # Hashing annotations according to name
    annotation_dict = {}
    for annotation in annotations:
        if annotation.genename in annotation_dict:
            sys.stderr.write(
                '\nWARNING: anotation with name %s already in the dictionary!'
                % annotation.genename)
        else:
            annotation_dict[annotation.genename] = annotation
        if len(annotation.items) > 1:
            s_num_multiexon_genes += 1

    # Statistical information for evaluating the qualitiy of mapping
    s_gene_hits = 0
    s_gene_misses = 0
    s_whole_alignment_hits = 0
    s_whole_alignment_misses = 0
    s_partial_alignment_hits = 0
    s_partial_alignment_misses = 0
    s_num_start_hits = 0
    s_num_end_hits = 0
    s_num_start_end_hits = 0

    s_num_fw_strand = 0
    s_num_rv_strand = 0

    s_num_split_alignment = 0
    s_num_oversplit_alignment = 0  # Alignments that have more parts than exons

    s_num_good_alignments = 0

    s_num_badchrom_alignments = 0

    s_maf_suspicious_alignments = 0
    s_maf_bad_alignments = 0
    s_maf_good_alignments = 0

    s_maf_split_reads = 0
    s_maf_good_split_alignments = 0
    s_maf_bad_split_alignments = 0

    s_maf_hit_all_parts = 0
    s_maf_hit_one_part = 0
    s_maf_eq_one_part = 0
    s_maf_multihit_parts = 0

    s_maf_split_hit_all_parts = 0
    s_maf_split_hit_one_part = 0
    s_maf_split_eq_one_part = 0

    s_maf_miss_alignment = 0
    s_maf_too_many_alignments = 0

    s_num_potential_bad_strand = 0

    allowed_inacc = Annotation_formats.DEFAULT_ALLOWED_INACCURACY  # Allowing some shift in positions
    min_overlap = Annotation_formats.DEFAULT_MINIMUM_OVERLAP  # Minimum overlap that is considered

    # Setting allowed_inaccuracy from parameters
    if '--allowed_inacc' in paramdict:
        allowed_inacc = int(paramdict['--allowed_inacc'][0])
    elif '-ai' in paramdict:
        allowed_inacc = int(paramdict['-ai'][0])

    # Setting minimum overlap from parameters
    if '--allowed_inacc' in paramdict:
        min_overlap = int(paramdict['--allowed_inacc'][0])
    elif '-mo' in paramdict:
        min_overlap = int(paramdict['-mo'][0])

    # All samlines in a list should have the same query name
    for samline_list in all_sam_lines:
        qname = samline_list[0].qname

        isSplitAlignment = False
        if len(samline_list) > 1:
            s_num_split_alignment += 1
            isSplitAlignment = True

        # Checking the SAM file if all samlines in a list have the same qname
        for samline in samline_list[1:]:
            if samline.qname != qname:
                sys.stderr.write(
                    '\nWARNING: two samlines in the same list with different query names (%s/%s)'
                    % (qname, samline.qname))

        # Look for the first underscore in query name
        # Everything before that is the simulation folder name
        # Everything after that is simulated query name
        pos = qname.find('_')
        if pos < 0:
            raise Exception('Invalid query name in results file (%s)!' % qname)

        simFolderKey = qname[:pos]
        if simFolderKey not in simFolderDict:
            # import pdb
            # pdb.set_trace()
            raise Exception('Bad simulation folder short name (%s)!' %
                            simFolderKey)
        simFolder = simFolderDict[simFolderKey]
        simQName = qname[pos + 1:]

        # Due to error in data preparation, have to make some extra processing
        if simQName[:6] == 'SimG2_':
            simQName = simQName[6:]


#        if simFolderKey == 'SimG1':
#            simFileSuffix = 'g1'
#        elif simFolderKey == 'SimG2':
#            simFileSuffix = 'g2'
#        elif simFolderKey == 'SimG3':
#            simFileSuffix = 'g3'
#        else:
#            simFileSuffix = 'sd'

        simFileSuffix = 'sd'

        pos = simQName.find('_')
        pos2 = simQName.find('_part')
        if pos < 0:
            raise Exception(
                'Invalid simulated query name in results file (%s)!' %
                simQName)

        simQLetter = simQName[0]  # Should always be S

        # BBMap separates a query into smaller parts he can manage
        # Extends query with '_part_#', which has to be ignored
        if pos2 <> -1:
            simQName = simQName[:pos2]

        simRefNumber = int(simQName[1:pos])
        simQNumber = int(simQName[pos + 1:])
        simFileName = simFileSuffix + '_%04d' % simRefNumber
        simRefFileName = simFileName + '.ref'
        simSeqFileName = simFileName + '.fastq'
        simMafFileName = simFileName + '.maf'

        simFilePath = os.path.join(datafolder, simFolder)
        simRefFilePath = os.path.join(simFilePath, simRefFileName)
        simSeqFilePath = os.path.join(simFilePath, simSeqFileName)
        simMafFilePath = os.path.join(simFilePath, simMafFileName)

        if not os.path.exists(simRefFilePath):
            # import pdb
            # pdb.set_trace()
            raise Exception(
                'Reference file for simulated read %s does not exist!' % qname)
        if not os.path.exists(simSeqFilePath):
            # import pdb
            # pdb.set_trace()
            raise Exception(
                'Sequence file for simulated read %s does not exist!' % qname)
        if not os.path.exists(simMafFilePath):
            # import pdb
            # pdb.set_trace()
            raise Exception(
                'Sequence alignment (MAF) for simulated read %s does not exist!'
                % qname)

        # Reading reference file
        [headers, seqs, quals] = read_fastq(simRefFilePath)
        simGeneName = headers[0]
        annotation = annotation_dict[
            simGeneName]  # Getting the correct annotation

        if len(samline_list) > len(annotation.items):
            # sys.stderr.write('\nWARNING: A number of partial alignments exceeds the number of exons for query %s! (%d / %d)' % (qname, len(samline_list), len(annotation.items)))
            s_num_oversplit_alignment += 1

        # Reading MAF file to get original position and length of the simulated read
        # Query name should be a second item
        maf_startpos = maf_length = 0
        maf_strand = '0'
        maf_reflen = 0
        i = 0
        with open(simMafFilePath, 'rU') as maffile:
            i += 1
            for line in maffile:
                if line[0] == 's':
                    elements = line.split()
                    maf_qname = elements[1]
                    if maf_qname == 'ref':  # Have to remember data for the last reference before the actual read
                        maf_startpos = int(elements[2])
                        maf_length = int(elements[3])
                        maf_strand = elements[4]
                        maf_reflen = int(elements[5])
                    if maf_qname == simQName:
                        # maf_startpos = int(elements[2])
                        # maf_length = int(elements[3])
                        break

        if maf_qname != simQName:
            # import pdb
            # pdb.set_trace()
            raise Exception('ERROR: could not find query %s in maf file %s' %
                            (qname, simMafFileName))

        # IMPORTANT: If the reads were generated from an annotation on reverse strand
        #            expected partial alignments must be reversed
        if annotation.strand == Annotation_formats.GFF_STRANDRV:
            maf_startpos = maf_reflen - maf_length - maf_startpos

        # Saving "maf_length" and "maf_startpos" to be able to check it later
        t_maf_length = maf_length
        t_maf_startpos = maf_startpos

        # Calculating expected partial alignmetns from MAF and annotations

        # 1. Calculating the index of the first exon
        # i - the index of exon currently being considered
        i = 0
        while annotation.items[i].getLength() < maf_startpos:
            maf_startpos -= annotation.items[i].getLength()
            i += 1

        # Calculating expected partial alignments by filling up exons using maf_length
        expected_partial_alignments = []
        while maf_length > 0:
            start = annotation.items[i].start + maf_startpos
            end = annotation.items[i].end
            assert start <= end

            # OLD: length = end-start+1
            # KK: End is already indicating position after the last base, so adding one when callculating length is not correct
            length = end - start
            if length <= maf_length:
                expected_partial_alignments.append((start, end))
                maf_length -= length
                i += 1
            else:
                expected_partial_alignments.append((start, start + maf_length))
                maf_length = 0
                i += 1

            # Start position should only be considered for the first exon
            maf_startpos = 0

        # import pdb
        # pdb.set_trace()

        numparts = len(expected_partial_alignments)
        # For each part of expected partial alignments, these maps will count
        # how many real partial alignments overlap or equal it
        parthitmap = {(i + 1): 0 for i in xrange(numparts)}
        parteqmap = {(i + 1): 0 for i in xrange(numparts)}

        isSplitRead = False
        if len(expected_partial_alignments) > 1:
            s_maf_split_reads += 1
            isSplitRead = True

        oneHit = False
        allHits = False
        oneEq = False
        multiHit = False
        good_alignment = False
        has_miss_alignments = False

        if RNAseqEval.getChromName(
                samline_list[0].rname) != RNAseqEval.getChromName(
                    annotation.seqname):
            # import pdb
            # pdb.set_trace()
            s_num_badchrom_alignments += 1
        else:
            if len(samline_list) != len(expected_partial_alignments):
                # sys.stderr.write('\nWARNING: suspicious number of alignments for query %s!' % qname)
                s_maf_suspicious_alignments += 1
            # import pdb
            # pdb.set_trace()

            good_alignment = True
            k = 0
            for samline in samline_list:
                # sl_startpos = samline.pos - 1   # SAM positions are 1-based
                sl_startpos = samline.pos
                reflength = samline.CalcReferenceLengthFromCigar()
                sl_endpos = sl_startpos + reflength

                # Comparing a samline to the corresponding expected partial alignment
                if k < len(expected_partial_alignments):
                    expected_alignement = expected_partial_alignments[k]
                    maf_startpos = expected_alignement[0]
                    maf_endpos = expected_alignement[1]
                    if abs(sl_startpos - maf_startpos) > allowed_inacc or abs(
                            sl_endpos - maf_endpos) > allowed_inacc:
                        good_alignment = False
                else:
                    good_alignment = False
                k += 1

                # Comparing a samline to all expected partial alignments
                for i in xrange(len(expected_partial_alignments)):
                    expected_alignement = expected_partial_alignments[i]
                    maf_startpos = expected_alignement[0]
                    maf_endpos = expected_alignement[1]

                    if interval_equals((sl_startpos, sl_endpos),
                                       (maf_startpos, maf_endpos),
                                       allowed_inacc, min_overlap):
                        parteqmap[i + 1] += 1
                    if interval_overlaps((sl_startpos, sl_endpos),
                                         (maf_startpos, maf_endpos),
                                         allowed_inacc, min_overlap):
                        parthitmap[i + 1] += 1

            has_miss_alignments = False
            for expected_alignement in expected_partial_alignments:
                maf_startpos = expected_alignement[0]
                maf_endpos = expected_alignement[1]
                overlap = False
                for samline in samline_list:
                    sl_startpos = samline.pos
                    reflength = samline.CalcReferenceLengthFromCigar()
                    sl_endpos = sl_startpos + reflength
                    if interval_overlaps((sl_startpos, sl_endpos),
                                         (maf_startpos, maf_endpos),
                                         allowed_inacc, min_overlap):
                        overlap = True
                if not overlap:
                    has_miss_alignments = True
                    break

            if len(samline_list) < len(expected_partial_alignments):
                s_maf_too_many_alignments += 1

            # Testing the evaluation process
            # import pdb
            # pdb.set_trace()
            if len(samline_list) <> len(expected_partial_alignments):
                good_alignment = False

            if good_alignment:
                s_maf_good_alignments += 1

                # Writting qnames to files
                if split_qnames:
                    file_correct.write(samline_list[0].qname + '\n')

                if isSplitRead:
                    s_maf_good_split_alignments += 1
            else:
                # import pdb
                # pdb.set_trace()
                s_maf_bad_alignments += 1
                if isSplitRead:
                    s_maf_bad_split_alignments += 1
                # TODO: check which alignments are bad and why
                # If the choromosome is different its obviously a bad alignment
                if RNAseqEval.getChromName(
                        samline.rname) == RNAseqEval.getChromName(
                            annotation.seqname):
                    # import pdb
                    # pdb.set_trace()
                    pass
                else:
                    s_num_badchrom_alignments += 1

            # Analyzing parthitmap and parteqmap
            oneHit = False
            allHits = True
            oneEq = False
            multiHit = False
            for i in xrange(numparts):
                if parthitmap[i + 1] > 0:
                    oneHit = True
                if parthitmap[i + 1] == 0:
                    allHits = False
                if parthitmap[i + 1] > 1:
                    multiHit = True
                if parteqmap[i + 1] > 0:
                    oneEq = True

        if printMap:
            status = 'INCORRECT'
            if good_alignment:
                status = 'CORRECT'
            elif allHits:
                status = 'HITALL'
            elif oneHit:
                status = 'HITONE'
            mapfile.write('QNAME: %s, STATUS: %s\n\n' %
                          (samline_list[0].qname, status))
            mapfile.write('EXPECTED (%s, %s):\t' % (RNAseqEval.getChromName(
                annotation.seqname), annotation.strand))
            for epa in expected_partial_alignments:
                mapfile.write('(%d, %d)\t' % (epa[0], epa[1]))
            mapfile.write('\n')
            if samline_list[0].flag & 16 == 0:
                readstrand = Annotation_formats.GFF_STRANDFW
            else:
                readstrand = Annotation_formats.GFF_STRANDRV
            mapfile.write(
                'ACTUAL   (%s, %s):\t' %
                (RNAseqEval.getChromName(samline_list[0].rname), readstrand))
            for samline in samline_list:
                mapfile.write('(%d, %d)\t' %
                              (samline.pos, samline.pos +
                               samline.CalcReferenceLengthFromCigar()))
            mapfile.write('\n\n')

        if oneHit:
            s_maf_hit_one_part += 1
            if isSplitRead:
                s_maf_split_hit_one_part += 1

            # Writting qnames to files
            if split_qnames:
                file_hitone.write(samline_list[0].qname + '\n')

            if not allHits:
                if '--debug' in paramdict:
                    import pdb
                    pdb.set_trace()

            # Misses are calculated only for alignments that have at least one hit
            if has_miss_alignments:
                s_maf_miss_alignment += 1

        else:
            # Writting qnames to files
            if split_qnames:
                file_bad.write(samline_list[0].qname + '\n')

            # if '--debug' in paramdict:
            #     import pdb
            #     pdb.set_trace()

        if allHits:
            s_maf_hit_all_parts += 1
            if isSplitRead:
                s_maf_split_hit_all_parts += 1

            # Writting qnames to files
            if split_qnames:
                file_hitall.write(samline_list[0].qname + '\n')

        # Sanity check
        if '--debug' in paramdict and good_alignment and not allHits:
            import pdb
            pdb.set_trace()
            pass

        if oneEq:
            s_maf_eq_one_part += 1
            if isSplitRead:
                s_maf_split_eq_one_part += 1
        if multiHit:
            s_maf_multihit_parts += 1

        num_start_hits = 0
        num_end_hits = 0
        num_hits = 0

        num_partial_alignements = len(samline_list)
        whole_alignment_hit = False
        for samline in samline_list:
            startpos = samline.pos - 1
            reflength = samline.CalcReferenceLengthFromCigar()
            endpos = startpos + reflength

            if samline.flag & 16 == 0:
                readstrand = Annotation_formats.GFF_STRANDFW
                s_num_fw_strand += 1
            else:
                readstrand = Annotation_formats.GFF_STRANDRV
                s_num_rv_strand += 1

            chromname = RNAseqEval.getChromName(samline.rname)

            if chromname == RNAseqEval.getChromName(
                    annotation.seqname
            ) and readstrand != annotation.strand and annotation.overlapsGene(
                    startpos, endpos):
                s_num_potential_bad_strand += 1

            if chromname == RNAseqEval.getChromName(
                    annotation.seqname) and annotation.overlapsGene(
                        startpos, endpos) and (not P_CHECK_STRAND or readstrand
                                               == annotation.strand):
                whole_alignment_hit = True
                s_partial_alignment_hits += 1
            else:
                s_partial_alignment_misses += 1

            # Checking how well partial alignments match exons
            startsItem = False
            endsItem = False
            for item in annotation.items:
                if item.overlapsItem(startpos, endpos):
                    num_hits += 1
                if item.startsItem(startpos, endpos):
                    num_start_hits += 1
                    startsItem = True
                if item.endsItem(startpos, endpos):
                    num_end_hits += 1
                    endsItem = True
                if startsItem and endsItem:
                    s_num_start_end_hits += 1

        s_num_start_hits += num_start_hits
        s_num_end_hits += num_end_hits

        # I'm allowing one start and one end not to match starts and ends of exons
        if (num_hits == num_partial_alignements) and (
                num_start_hits + num_end_hits >=
                2 * num_partial_alignements - 2):
            s_num_good_alignments += 1
        # else:
        #     if num_hits > 0:
        #         import pdb
        #         pdb.set_trace()

        if whole_alignment_hit:
            s_whole_alignment_hits += 1
        else:
            s_whole_alignment_misses += 1

    if printMap:
        mapfile.close()

    # Writting unmapped query names to a file, if so specified
    if split_qnames:
        with open(filename_unmapped, 'w+') as file_unmapped:
            file_unmapped.write(report.get_unmapped_names())
            file_unmapped.close()

    # Printing out results : NEW
    # Variables names matching RNA benchmark paper
    sys.stdout.write('\n\nAnalysis results:')
    sys.stdout.write('\nOriginal Samlines: %d' % report.num_alignments)
    sys.stdout.write(
        '\nUsable whole alignments (with valid CIGAR string): %d' %
        len(all_sam_lines))
    sys.stdout.write('\nAnnotations: %d' % len(annotation_dict))
    sys.stdout.write('\nMultiexon genes: %d' % s_num_multiexon_genes)

    sys.stdout.write('\nNumber of exon start hits: %d' % s_num_start_hits)
    sys.stdout.write('\nNumber of exon end hits: %d' % s_num_end_hits)
    sys.stdout.write('\nNumber of exon start and end hits: %d' %
                     s_num_start_end_hits)
    sys.stdout.write('\nNumber of good whole alignments: %d' %
                     s_num_good_alignments)
    sys.stdout.write(
        '\nNumber of alignments mapped to an incorrect chromosome: %d' %
        s_num_badchrom_alignments)

    sys.stdout.write('\nMAF: Correct alignment: %d' % s_maf_good_alignments)
    sys.stdout.write('\nMAF: Hit all parts: %d' % s_maf_hit_all_parts)
    sys.stdout.write('\nMAF: Hit at least one part: %d' % s_maf_hit_one_part)
    sys.stdout.write('\nMAF: Equals at least one part: %d' % s_maf_eq_one_part)

    sys.stdout.write('\nMAF: Number of split reads: %d' % s_maf_split_reads)
    sys.stdout.write('\nMAF: Correct alignment, SPLIT read: %d' %
                     s_maf_good_split_alignments)
    sys.stdout.write('\nMAF: Hit all parts, SPLIT read: %d' %
                     s_maf_split_hit_all_parts)
    sys.stdout.write('\nMAF: Hit at least one part, SPLIT read: %d' %
                     s_maf_split_hit_one_part)
    sys.stdout.write('\nMAF: Equals at least one part, SPLIT read: %d' %
                     s_maf_split_eq_one_part)

    sys.stdout.write('\nMAF: Partial alignment that misses: %d' %
                     s_maf_miss_alignment)
    sys.stdout.write('\nMAF: More alignments than expected: %d' %
                     s_maf_too_many_alignments)
    sys.stdout.write('\nMAF: Multihit parts (fragmented) alignments: %d' %
                     s_maf_multihit_parts)

    sys.stdout.write('\nDone!\n')

    # Closing file with names
    if split_qnames:
        file_correct.close()
        file_hitall.close()
        file_hitone.close()
        file_bad.close()
Beispiel #6
0
def _group_overlapping_annotations(annotations):
    """Group consecutive annotations that overlap on one strand and chromosome.

    Annotations are visited in the order given.  An annotation joins the
    current group when it overlaps the group's running (start, end) span
    and has the same strand and seqname; otherwise the current group is
    closed and a new one is started with that annotation.

    Returns a list of groups (each a non-empty list of annotations).  An
    empty input yields an empty list.
    """
    groups = []
    current = []
    group_start = group_end = group_strand = group_chrom = None

    for annotation in annotations:
        if (current
                and annotation.overlapsGene(group_start, group_end)
                and group_strand == annotation.strand
                and group_chrom == annotation.seqname):
            current.append(annotation)
            # Widen the group's span so later annotations are compared
            # against everything collected so far.
            if annotation.start < group_start:
                group_start = annotation.start
            if annotation.end > group_end:
                group_end = annotation.end
        else:
            if current:
                groups.append(current)
            current = [annotation]
            group_start = annotation.start
            group_end = annotation.end
            group_strand = annotation.strand
            group_chrom = annotation.seqname

    # The last group is still open when the loop ends.
    if current:
        groups.append(current)
    return groups


def _genename_from_annotation_line(line, filetype):
    """Return the annotation name found on one BED/GTF line.

    For 'BED' the name is the fourth whitespace-separated column.  For
    'GTF' it is the value of the last ``transcript_id`` attribute in the
    ninth tab-separated column with its first and last character removed
    (the surrounding quotes), or 'Unknown' when there is no such attribute.

    Returns None for lines that carry no record: blank lines, lines
    starting with '#', and BED 'track' / 'browser' lines.
    """
    if not line.strip() or line.startswith('#'):
        return None

    if filetype == 'BED':
        if line.startswith('track') or line.startswith('browser'):
            return None
        return line.split()[3]

    genename = 'Unknown'
    attributes = line.split('\t')[8]
    for attribute in attributes.split(';'):
        parts = attribute.split()       # key and (quoted) value
        if len(parts) > 1 and parts[0] == 'transcript_id':
            genename = parts[1][1:-1]
    return genename


def split_alternate(annotations_file):
    """Split an annotation file into alternate- and single-splicing files.

    Annotations loaded from ``annotations_file`` are grouped by overlap
    (same strand and chromosome).  Names from groups with more than one
    annotation are treated as alternate splicing, names from singleton
    groups as single splicing.  The lines of the original file are then
    copied to ``<name>_AS<ext>`` or ``<name>_SS<ext>`` next to the input,
    according to the name found on each line.

    Module-level settings used:
      * ALTERNATE_SPLICINGS_TO_KEEP - when > 0, at most this many names are
        kept per alternate-splicing group.
      * KEEP_DUPLICATES - when false, a name is dropped after its record has
        been written, so later records with the same name are skipped.

    Comment, blank and BED header lines are not copied to either output.

    Raises:
        Exception: if the file extension is not .gtf, .gff or .bed.
    """
    filename, file_extension = os.path.splitext(annotations_file)
    processed_annotations_file_AS = filename + '_AS' + file_extension
    processed_annotations_file_SS = filename + '_SS' + file_extension

    extension = file_extension.lower()
    if extension in ('.gtf', '.gff'):
        filetype = 'GTF'
    elif extension == '.bed':
        filetype = 'BED'
    else:
        raise Exception('Invalid annotation file type: %s' % file_extension)

    annotations = Annotation_formats.Load_Annotation_From_File(annotations_file)
    grouped_annotations = _group_overlapping_annotations(annotations)

    # Names of annotations with alternate / single splicing.  A name is
    # recorded only the first time it is seen; seeing it again (in either
    # collection) only raises the duplicate warning.  Sets are used because
    # the collections are only ever tested for membership.
    as_annotations = set()
    ss_annotations = set()
    duplicate_genename = False

    for annotation_group in grouped_annotations:
        if len(annotation_group) > 1:
            kept = 0
            for annotation in annotation_group:
                if 0 < ALTERNATE_SPLICINGS_TO_KEEP <= kept:
                    break
                name = annotation.genename
                if name in ss_annotations or name in as_annotations:
                    duplicate_genename = True
                else:
                    as_annotations.add(name)
                    kept += 1
        else:
            name = annotation_group[0].genename
            if name in ss_annotations or name in as_annotations:
                duplicate_genename = True
            else:
                ss_annotations.add(name)

    if duplicate_genename:
        sys.stderr.write('\nWARNING: there were duplicate annotations!\n')

    # Copy the lines of the original file into the two output files.
    # old_genename detects a name change in GTF files (one annotation spans
    # several lines there), so a finished name can be retired when
    # duplicates are to be skipped.  Duplicate names are assumed not to
    # follow each other directly.
    old_genename = ''
    with open(processed_annotations_file_AS, 'w') as pafile_AS, \
            open(processed_annotations_file_SS, 'w') as pafile_SS, \
            open(annotations_file) as afile:
        for line in afile:
            genename = _genename_from_annotation_line(line, filetype)
            if genename is None:
                # Header/comment/blank line: it belongs to no annotation.
                continue

            is_AS = genename in as_annotations
            if is_AS:
                pafile_AS.write(line)
                if not KEEP_DUPLICATES and filetype == 'BED':
                    as_annotations.discard(genename)

            is_SS = genename in ss_annotations
            if is_SS:
                pafile_SS.write(line)
                if not KEEP_DUPLICATES and filetype == 'BED':
                    ss_annotations.discard(genename)

            if (not KEEP_DUPLICATES and filetype == 'GTF'
                    and old_genename != '' and old_genename != genename):
                as_annotations.discard(old_genename)
                ss_annotations.discard(old_genename)

            old_genename = genename

            if is_AS and is_SS:
                sys.stderr.write('\nERROR: genename found in both lists (single splices and alternate spliced)\n')
                sys.stderr.write(line)
Beispiel #7
0
def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``, rounded to 2 decimals."""
    return round(100 * part / float(whole), 2)


def _tally(static_dict, category_flags, field, amount):
    """Add ``amount`` to counter ``field`` for the current read.

    The "All" bucket is updated directly; the per-category buckets are
    updated by ``part_cal.cal`` using the read's (sigA, sigC, sigE, sigF)
    flags.
    """
    sigA, sigC, sigE, sigF = category_flags
    overall = static_dict["All"]
    setattr(overall, field, getattr(overall, field) + amount)
    part_cal.cal(static_dict, sigA, sigC, sigE, sigF, field, amount)


def _expected_partial_alignments(annotation, maf_startpos, maf_length):
    """Project a simulated read onto the exons (items) of ``annotation``.

    ``maf_startpos`` is the read's offset inside the concatenated exons and
    ``maf_length`` the number of reference bases it covers.  For an
    annotation on the reverse strand the offset is mirrored first.

    Returns a list of (start, end) tuples, one per exon the read touches.
    ``end`` is taken straight from the exon's ``end`` for fully covered
    exons; the length of a part is ``end - start``.
    """
    items = annotation.items
    maf_reflen = sum(item.getLength() for item in items)

    # IMPORTANT: if the reads were generated from an annotation on the
    # reverse strand, expected partial alignments must be reversed.
    if annotation.strand == Annotation_formats.GFF_STRANDRV:
        maf_startpos = maf_reflen - maf_length - maf_startpos

    # Skip the exons that lie entirely before the read start; what is left
    # of maf_startpos is the offset inside the first touched exon.
    i = 0
    while items[i].getLength() <= maf_startpos:
        maf_startpos -= items[i].getLength()
        i += 1

    # Fill up consecutive exons until maf_length is used up.
    expected = []
    while maf_length > 0:
        start = items[i].start + maf_startpos
        end = items[i].end
        assert start <= end

        # No "+ 1": per the original author's note, end already indicates
        # the position after the last base.
        length = end - start
        if length <= maf_length:
            expected.append((start, end))
            maf_length -= length
        else:
            expected.append((start, start + maf_length))
            maf_length = 0
        i += 1

        # The start offset only applies to the first exon.
        maf_startpos = 0
    return expected


def processData(resultfile, annotationfile, SS_list, TotalReport, csv_path):
    """Score SAM alignments of simulated reads and write a CSV summary.

    Every group of SAM lines from ``resultfile`` is matched against the
    annotation its query name refers to.  The query name is split on '_';
    field 0 is the annotation (transcript) name, field 1 the start offset
    and field 6 the aligned length.  From these the expected per-exon
    alignments are derived and compared with the actual SAM lines.

    Counters are kept per category in ``static_dict``:
      "All" every read; "A"/"B" with/without an inner expected part shorter
      than 30; "C"/"D" transcript listed / not listed in ``SS_list``;
      "E"/"F"/"G" fewer than 6, 6-9, 10 or more expected parts.

    Args:
        resultfile: SAM file with the mappings.
        annotationfile: annotation file understood by Annotation_formats.
        SS_list: path of a text file with one transcript name per line.
        TotalReport: object providing the Total_* reference counts.
        csv_path: path of the CSV file to write.

    Raises:
        KeyError: if a query name refers to an unknown annotation.
        ValueError, IndexError: if a query name does not have the expected
            '_'-separated layout.
    """
    sys.stderr.write(
        '\n(%s) Loading and processing SAM file with mappings ... ' %
        datetime.now().time().isoformat())
    all_sam_lines = load_and_process_SAM(resultfile, BBMapFormat=True)

    annotations = Annotation_formats.Load_Annotation_From_File(annotationfile)

    # Hash annotations by transcript name; the first one with a name wins.
    annotation_dict = {}
    for annotation in annotations:
        annotation_dict.setdefault(annotation.transcriptname, annotation)

    key = ["All", "A", "B", "C", "D", "E", "F", "G"]
    static_dict = {name: Static() for name in key}

    # A set, because membership is tested once per read.
    with open(SS_list, 'r') as f_ss:
        ss_names = set(line.strip() for line in f_ss)

    # Allowing some shift in positions
    allowed_inacc = Annotation_formats.DEFAULT_ALLOWED_INACCURACY

    # All samlines in a list should have the same query name
    for samline_list in all_sam_lines:
        qname = samline_list[0].qname

        for samline in samline_list[1:]:
            if samline.qname != qname:
                sys.stderr.write(
                    '\nWARNING: two samlines in the same list with different query names (%s/%s)'
                    % (qname, samline.qname))

        pos = qname.split('_')
        simGeneName = pos[0]
        maf_startpos = int(pos[1])
        # Fields 3, 5 and 7 are not used below, but they are still converted
        # so that a query name with an unexpected layout fails here instead
        # of being scored against the wrong numbers.
        int(pos[3])
        int(pos[5])
        maf_length = int(pos[6])
        int(pos[7])

        if "transcript" in simGeneName:
            simGeneName = simGeneName.split(':')[1]

        annotation = annotation_dict[simGeneName]
        expected_partial_alignments = _expected_partial_alignments(
            annotation, maf_startpos, maf_length)
        numparts = len(expected_partial_alignments)

        # Category flags of this read (see the docstring).
        sigA = any(end - start < 30
                   for start, end in expected_partial_alignments[1:-1])
        sigC = simGeneName in ss_names
        sigE = numparts < 6
        sigF = 6 <= numparts < 10
        category_flags = (sigA, sigC, sigE, sigF)

        if DEBUG:
            print("exon in expected alignment---------------")
            for start, end in expected_partial_alignments:
                print("(%d, %d)" % (start, end))
            print("exon in real alignment-------------")

        # A read mapped to another chromosome counts as aligned but gets no
        # base/exon credit.
        if getChromName(samline_list[0].rname) != getChromName(
                annotation.seqname):
            _tally(static_dict, category_flags, "Total_aligned_reads", 1)
            continue

        # For each expected part: how many real partial alignments equal it
        # and how many overlap it.
        part_eq_counts = [0] * numparts
        part_hit_counts = [0] * numparts
        # With more than two parts, the outer end of the first and last part
        # is not required to match; only the inner (junction) end is.
        relaxed_ends = numparts > 2
        last_index = numparts - 1

        for samline in samline_list:
            # samline.pos is used as is (no 1-based correction).
            sl_startpos = samline.pos
            reflength = samline.CalcReferenceLengthFromCigar()
            readlength = samline.CalcReadLengthFromCigar()
            sl_endpos = sl_startpos + reflength
            read_span = (sl_startpos, sl_endpos)

            if DEBUG:
                print("(%d, %d)" % (sl_startpos, sl_endpos))

            # Largest number of bases this samline shares with any part.
            tmp_aln = 0
            for i, exon_span in enumerate(expected_partial_alignments):
                exon_start, exon_end = exon_span

                equals = (
                    (relaxed_ends and i == 0
                     and abs(sl_endpos - exon_end) < allowed_inacc)
                    or (relaxed_ends and i == last_index
                        and abs(sl_startpos - exon_start) < allowed_inacc)
                    or interval_equals(read_span, exon_span, allowed_inacc))
                overlaps = interval_overlaps(read_span, exon_span, 5)

                if equals:
                    part_eq_counts[i] += 1
                if equals or overlaps:
                    part_hit_counts[i] += 1

                if overlaps:
                    covered = basesInside(sl_startpos, sl_endpos,
                                          exon_start, exon_end)
                    if tmp_aln < covered:
                        tmp_aln = covered

            if tmp_aln > readlength:
                tmp_aln = readlength
            _tally(static_dict, category_flags, "Total_aligned_bases",
                   tmp_aln)

        # Only parts matched by exactly one samline count.
        num_recover_exons = part_eq_counts.count(1)
        num_hit_exons = part_hit_counts.count(1)
        sam_l = len(samline_list)

        if num_hit_exons == numparts:
            _tally(static_dict, category_flags, "Hit100", 1)
        if num_hit_exons >= int(0.8 * numparts):
            _tally(static_dict, category_flags, "Hit80", 1)

        if num_recover_exons == numparts:
            _tally(static_dict, category_flags, "ExR100", 1)
            if num_recover_exons == sam_l:
                _tally(static_dict, category_flags, "ExA100", 1)
        if num_recover_exons >= int(0.8 * numparts):
            _tally(static_dict, category_flags, "ExR80", 1)
            if num_recover_exons >= int(0.8 * sam_l):
                _tally(static_dict, category_flags, "ExA80", 1)
        if num_recover_exons >= int(0.9 * numparts):
            _tally(static_dict, category_flags, "ExR90", 1)
            if num_recover_exons >= int(0.9 * sam_l):
                _tally(static_dict, category_flags, "ExA90", 1)

        _tally(static_dict, category_flags, "Total_aligned_exons",
               num_recover_exons)
        _tally(static_dict, category_flags, "Total_aligned_reads", 1)

    # Reference totals per category, taken from TotalReport.  Each value is
    # stored increased by one (presumably so the divisions below never see a
    # zero denominator -- confirm with the author of TotalReport).
    total_prefixes = [
        ("All", "Total"),
        ("A", "Total_level2"),
        ("B", "Total_level2_r"),
        ("C", "Total_level3_SS"),
        ("D", "Total_level3_AS"),
        ("E", "Total_level4_2_5"),
        ("F", "Total_level4_6_9"),
        ("G", "Total_level4_10"),
    ]
    for name, prefix in total_prefixes:
        stats = static_dict[name]
        stats.Total_reads = getattr(TotalReport, prefix + "_reads") + 1
        stats.Total_bases = getattr(TotalReport, prefix + "_bases") + 1
        stats.Total_expected_exons = getattr(
            TotalReport, prefix + "_expected_exons") + 1

    # The ExR100/90/80 and Hit100/80 counters are collected above but are
    # not part of the CSV output.
    with open(csv_path, "w") as fw:
        csv_write = csv.writer(fw, dialect='excel')
        csv_write.writerow([" ", resultfile])
        for item in key:
            stats = static_dict[item]
            level = [
                item,
                str(stats.Total_reads) + ' reads/' +
                str(stats.Total_bases) + ' bases/' +
                str(stats.Total_expected_exons) + ' exons'
            ]
            row1 = [
                "Aligned", stats.Total_aligned_reads,
                _percent(stats.Total_aligned_reads, stats.Total_reads)
            ]
            row2 = [
                "bases%", stats.Total_aligned_bases,
                _percent(stats.Total_aligned_bases, stats.Total_bases)
            ]
            line = (str(_percent(stats.ExA100, stats.Total_reads)) + '/' +
                    str(_percent(stats.ExA80, stats.Total_reads)))
            row4 = ["Read100/80%", stats.ExA100, stats.ExA80, line]
            row6 = [
                "Exons%", stats.Total_aligned_exons,
                _percent(stats.Total_aligned_exons,
                         stats.Total_expected_exons)
            ]
            csv_write.writerow(level)
            csv_write.writerow(row1)
            csv_write.writerow(row2)
            csv_write.writerow(row4)
            csv_write.writerow(row6)
Beispiel #8
0
def analyze(annotations_file):
    """Print groups of overlapping annotations and their transcript counts.

    Annotations loaded from ``annotations_file`` are merged into groups by
    coordinate overlap (``annotation.overlapsGene`` against the group's
    span); groups that ended up overlapping each other are then merged in a
    second pass.  The number of groups is written to stderr and a
    tab-separated table ``ID  START  END  TRCNT`` to stdout, ordered by ID
    (the start of the first group merged into each row).

    Note: strand and chromosome are not compared here, only coordinates.

    Raises:
        Exception: if the file extension is not .gtf, .gff or .bed.
    """
    file_extension = os.path.splitext(annotations_file)[1]
    if file_extension.lower() not in ('.gtf', '.gff', '.bed'):
        raise Exception('Invalid annotation file type: %s' % file_extension)

    annotations = Annotation_formats.Load_Annotation_From_File(annotations_file)

    # First pass: attach every annotation to the first group it overlaps,
    # widening that group; otherwise open a new group keyed by its start.
    # Values are (start, end, transcript count).
    annotation_groups = {}
    for annotation in annotations:
        for idgroup, (gene_start, gene_end, trcnt) in annotation_groups.items():
            if annotation.overlapsGene(gene_start, gene_end):
                annotation_groups[idgroup] = (
                    min(gene_start, annotation.start),
                    max(gene_end, annotation.end),
                    trcnt + 1)
                break
        else:
            annotation_groups[annotation.start] = (
                annotation.start, annotation.end, 1)

    # Second pass: a group may have grown into its neighbours, so walk the
    # groups in order of their start and merge the ones that overlap.
    new_groups = {}
    have_group = False
    groupid = group_start = group_end = trcnt = 0
    for _, (start, end, count) in sorted(annotation_groups.items(),
                                         key=lambda kv: kv[1][0]):
        if have_group and not (start > group_end or end < group_start):
            # Overlaps the current merged group: join it.
            group_start = min(group_start, start)
            group_end = max(group_end, end)
            trcnt += count
        else:
            # No overlap: store the finished group and start a new one.
            if have_group:
                new_groups[groupid] = (group_start, group_end, trcnt)
            groupid = group_start = start
            group_end = end
            trcnt = count
            have_group = True

    # Store the last open group; with no annotations there is none.
    if have_group:
        new_groups[groupid] = (group_start, group_end, trcnt)

    sys.stderr.write("\nWritting annotation groups (%d)\n" % len(new_groups))
    sys.stdout.write("ID\tSTART\tEND\tTRCNT\n")
    for idgroup in sorted(new_groups):
        group = new_groups[idgroup]
        sys.stdout.write("%d\t%d\t%d\t%d\n" % (idgroup, group[0], group[1], group[2]))
Beispiel #9
0
def processData(read_fastq, annotationfile, ss_list):
    """Accumulate read, base and expected-exon totals for simulated reads.

    The read file is scanned line by line. A line starting with '>' is a
    header describing one simulated read; every other line is treated as
    sequence data belonging to the most recent header.

    For each aligned header the expected partial alignments (one per exon
    the read covers) are derived from the matching annotation, and the read
    is counted in the following report categories:

    * level 2 - reads with an inner expected alignment shorter than 30
      (``Total_level2_*``) versus the rest (``Total_level2_r_*``);
    * level 3 - transcript name listed in ``ss_list`` (``Total_level3_SS_*``)
      versus not listed (``Total_level3_AS_*``);
    * level 4 - fewer than 6, 6 to 9, or 10 and more expected alignments.

    Args:
        read_fastq: Path of the simulated read file (headers start with '>').
        annotationfile: Path handed to
            ``Annotation_formats.Load_Annotation_From_File``.
        ss_list: Path of a text file with one transcript name per line.

    Returns:
        The populated ``Report`` instance.

    Raises:
        KeyError: A header names a transcript missing from the annotations.
        ValueError: A header field is not numeric, or sequence data appears
            before any aligned header.
        IndexError: A header has too few '_'-separated fields, or the read
            extends past the last annotation item.
    """
    report = Report()

    # Index annotations by transcript name; the first occurrence wins.
    annotation_dict = {}
    for annotation in Annotation_formats.Load_Annotation_From_File(
            annotationfile):
        annotation_dict.setdefault(annotation.transcriptname, annotation)

    # A set gives constant-time membership tests in the per-read loop.
    with open(ss_list, 'r') as ss_file:
        ss_names = set(name.strip() for name in ss_file)

    # Category of the most recent aligned header; None until one is seen.
    short_inner_exon = None
    in_ss_list = None
    exon_bucket = None

    unaligned = False
    with open(read_fastq, 'r') as fread:
        for line in fread:
            if line.startswith('>'):
                fields = line.split('_')
                sim_gene_name = fields[0].strip('>')
                if "transcript" in sim_gene_name:
                    sim_gene_name = sim_gene_name.split(':')[1]

                # The numeric fields that are not used below are still
                # converted so that a malformed header fails here.
                maf_startpos = int(fields[1])
                aln_sig = fields[2]
                _read_idx = int(fields[3])
                _l_clip = int(fields[5])
                maf_length = int(fields[6])
                _r_clip = int(fields[7])

                if aln_sig == "unaligned":
                    # Skip this header and the sequence line that follows it.
                    unaligned = True
                    continue

                annotation = annotation_dict[sim_gene_name]
                items = annotation.items

                # Reference length is the summed length of all items.
                maf_reflen = sum(item.getLength() for item in items)

                # IMPORTANT: If the reads were generated from an annotation
                #            on the reverse strand, the start position must
                #            be mirrored.
                if annotation.strand == Annotation_formats.GFF_STRANDRV:
                    maf_startpos = maf_reflen - maf_length - maf_startpos

                # 1. Find the item in which the read starts; afterwards
                #    maf_startpos is the offset inside that item.
                exon_idx = 0
                while items[exon_idx].getLength() <= maf_startpos:
                    maf_startpos -= items[exon_idx].getLength()
                    exon_idx += 1

                # 2. Fill consecutive items until maf_length is used up.
                expected_partial_alignments = []
                while maf_length > 0:
                    start = items[exon_idx].start + maf_startpos
                    end = items[exon_idx].end
                    assert start <= end

                    # End already points past the last base, so no +1 here.
                    length = end - start
                    if length <= maf_length:
                        expected_partial_alignments.append((start, end))
                        maf_length -= length
                    else:
                        expected_partial_alignments.append(
                            (start, start + maf_length))
                        maf_length = 0
                    exon_idx += 1

                    # The start offset only applies to the first item.
                    maf_startpos = 0

                num = len(expected_partial_alignments)
                report.Total_reads += 1
                report.Total_expected_exons += num

                # Level 2: any inner (neither first nor last) expected
                # alignment shorter than 30.
                short_inner_exon = any(
                    aln_end - aln_start < 30
                    for aln_start, aln_end in expected_partial_alignments[1:-1])
                if short_inner_exon:
                    report.Total_level2_reads += 1
                    report.Total_level2_expected_exons += num
                else:
                    report.Total_level2_r_reads += 1
                    report.Total_level2_r_expected_exons += num

                # Level 4: bucket by number of expected alignments.
                if num < 6:
                    exon_bucket = '2_5'
                    report.Total_level4_2_5_reads += 1
                    report.Total_level4_2_5_expected_exons += num
                elif num < 10:
                    exon_bucket = '6_9'
                    report.Total_level4_6_9_reads += 1
                    report.Total_level4_6_9_expected_exons += num
                else:
                    exon_bucket = '10'
                    report.Total_level4_10_reads += 1
                    report.Total_level4_10_expected_exons += num

                # Level 3: transcript listed in ss_list or not.
                in_ss_list = sim_gene_name in ss_names
                if in_ss_list:
                    report.Total_level3_SS_reads += 1
                    report.Total_level3_SS_expected_exons += num
                else:
                    report.Total_level3_AS_reads += 1
                    report.Total_level3_AS_expected_exons += num
            else:
                if unaligned:
                    unaligned = False
                    continue

                if short_inner_exon is None:
                    raise ValueError(
                        'Sequence data found before any aligned read header '
                        'in %s' % read_fastq)

                # NOTE(review): len(line) includes the trailing newline, so
                # each sequence line is counted one character too long when
                # the line is newline-terminated - confirm this is intended.
                sim_bases = len(line)
                report.Total_bases += sim_bases

                if short_inner_exon:
                    report.Total_level2_bases += sim_bases
                else:
                    report.Total_level2_r_bases += sim_bases

                if in_ss_list:
                    report.Total_level3_SS_bases += sim_bases
                else:
                    report.Total_level3_AS_bases += sim_bases

                if exon_bucket == '2_5':
                    report.Total_level4_2_5_bases += sim_bases
                elif exon_bucket == '6_9':
                    report.Total_level4_6_9_bases += sim_bases
                else:
                    report.Total_level4_10_bases += sim_bases

    return report
def processData(datafolder, annotationfile, ss_list, hq_read_file):
    """Accumulate read, base and expected-exon totals from simulated MAF files.

    ``datafolder`` is expected to hold pairs of files named
    ``SimG2_S_NNNN.ref`` and ``SimG2_S_NNNN.maf`` (NNNN counted from 0001);
    the number of pairs is taken as half the number of directory entries.
    For every pair, the first header of the ``.ref`` file selects the
    annotation, and each MAF record whose simulated read name (prefixed with
    ``SimG2_``) is listed in ``<hq_read_file>.txt`` is turned into expected
    partial alignments and counted in the report:

    * level 2 - reads with an inner expected alignment shorter than 30
      (``Total_level2_*``) versus the rest (``Total_level2_r_*``);
    * level 3 - transcript name listed in ``ss_list`` (``Total_level3_SS_*``)
      versus not listed (``Total_level3_AS_*``).

    Args:
        datafolder: Directory containing the ``.ref`` / ``.maf`` pairs.
        annotationfile: Path handed to
            ``Annotation_formats.Load_Annotation_From_File``.
        ss_list: Path of a text file with one transcript name per line.
        hq_read_file: Path prefix; ``<prefix>.txt`` lists the read names to
            keep and ``<prefix>_len.txt`` lists read lengths.

    Returns:
        The populated ``Report`` instance.

    Raises:
        Exception: A ``.ref`` or ``.maf`` file of an expected pair is missing.
        KeyError: The reference header is missing from the annotations.
        IndexError: More selected reads than entries in the length list, or a
            malformed MAF line.
    """
    # Index annotations by transcript name; the first occurrence wins.
    annotation_dict = {}
    for annotation in Annotation_formats.Load_Annotation_From_File(
            annotationfile):
        annotation_dict.setdefault(annotation.transcriptname, annotation)

    # Two directory entries (.ref and .maf) per simulated file.
    file_count = len(os.listdir(datafolder)) // 2

    # Sets give constant-time membership tests inside the per-record loop.
    with open(ss_list, 'r') as ss_file:
        ss_names = set(name.strip() for name in ss_file)

    with open(hq_read_file + ".txt", 'r') as names_file:
        hq_read_names = set(name.strip() for name in names_file)

    with open(hq_read_file + "_len.txt", 'r') as lengths_file:
        hq_read_lengths = [entry.strip() for entry in lengths_file]

    report = Report()

    sim_file_prefix = 'SimG2_S'

    for file_idx in range(file_count):
        sim_file_name = sim_file_prefix + '_%04d' % (file_idx + 1)
        ref_path = os.path.join(datafolder, sim_file_name + '.ref')
        maf_path = os.path.join(datafolder, sim_file_name + '.maf')

        if not os.path.exists(ref_path):
            raise Exception(
                'Reference file for simulated read %s does not exist!' %
                ref_path)
        if not os.path.exists(maf_path):
            raise Exception(
                'Sequence alignment (MAF) for simulated read %s does not exist!'
                % maf_path)

        # The first header of the reference file names the annotation.
        headers, _seqs, _quals = read_fastq(ref_path)
        sim_gene_name = headers[0]
        annotation = annotation_dict[sim_gene_name]
        items = annotation.items

        # NOTE(review): the length index restarts at 0 for every file, so
        # each file reads hq_read_lengths from its beginning - confirm that
        # this matches the layout of the *_len.txt file.
        length_idx = 0
        selected_reads = 0
        total_sim_bases = 0
        total_sim_exons = 0

        # Plain 'r' already gives universal newlines on Python 3; the old
        # 'rU' mode is rejected by Python 3.11 and later.
        with open(maf_path, 'r') as maffile:
            for line in maffile:
                # Only 's' lines describing the reference are of interest.
                if not line.startswith('s'):
                    continue
                if line.split()[1] != 'ref':
                    continue

                # The line after the reference line describes the read.
                line_sim = next(maffile, '')
                line_sim_name = "SimG2_" + line_sim.split()[1]
                if line_sim_name not in hq_read_names:
                    continue

                # NOTE(review): reads skipped below because they start past
                # the last item are still counted here, and therefore in the
                # level-3 read totals and Total_reads - confirm intent.
                selected_reads += 1
                elements = line.split()
                maf_startpos = int(elements[2])
                maf_length = int(elements[3])
                # Presumably the MAF reference spans three copies of the
                # transcript, hence the division by 3 - TODO confirm.
                maf_reflen = int(elements[5]) // 3

                # IMPORTANT: if the reads were generated from an annotation
                # on the reverse strand, the start position must be mirrored
                # and folded back into a single reference length.
                if annotation.strand == Annotation_formats.GFF_STRANDRV:
                    maf_startpos = maf_reflen * 3 - maf_length - maf_startpos
                    if maf_startpos > maf_reflen * 2:
                        maf_startpos -= maf_reflen * 2
                    elif maf_startpos > maf_reflen:
                        maf_startpos -= maf_reflen

                # 1. Find the item in which the read starts; afterwards
                #    maf_startpos is the offset inside that item.
                exon_idx = 0
                starts_past_last_item = False
                while items[exon_idx].getLength() <= maf_startpos:
                    maf_startpos -= items[exon_idx].getLength()
                    exon_idx += 1
                    if exon_idx == len(items):
                        starts_past_last_item = True
                        break
                if starts_past_last_item:
                    continue

                # 2. Fill consecutive items until maf_length is used up or
                #    the items run out.
                expected_partial_alignments = []
                maf_length = maf_length // 3
                while maf_length > 0:
                    start = items[exon_idx].start + maf_startpos
                    end = items[exon_idx].end
                    assert start <= end

                    # End already points past the last base, so no +1 here.
                    length = end - start
                    if length <= maf_length:
                        expected_partial_alignments.append((start, end))
                        maf_length -= length
                        exon_idx += 1
                        if exon_idx == len(items):
                            # No items left: drop the remaining length.
                            maf_length = 0
                    else:
                        expected_partial_alignments.append(
                            (start, start + maf_length))
                        maf_length = 0
                        exon_idx += 1

                    # The start offset only applies to the first item.
                    maf_startpos = 0

                num = len(expected_partial_alignments)
                report.Total_expected_exons += num
                total_sim_exons += num

                # Level 2: any inner (neither first nor last) expected
                # alignment shorter than 30.
                short_inner_exon = any(
                    aln_end - aln_start < 30
                    for aln_start, aln_end in expected_partial_alignments[1:-1])
                if short_inner_exon:
                    report.Total_level2_reads += 1
                    report.Total_level2_expected_exons += num
                else:
                    report.Total_level2_r_reads += 1
                    report.Total_level2_r_expected_exons += num

                # Read length comes from the length list, not the MAF file.
                sim_bases = int(hq_read_lengths[length_idx])
                length_idx += 1
                report.Total_bases += sim_bases
                total_sim_bases += sim_bases
                if short_inner_exon:
                    report.Total_level2_bases += sim_bases
                else:
                    report.Total_level2_r_bases += sim_bases

            # Level 3: the whole file counts as SS or AS depending on whether
            # its transcript is listed in ss_list.
            if sim_gene_name in ss_names:
                report.Total_level3_SS_reads += selected_reads
                report.Total_level3_SS_bases += total_sim_bases
                report.Total_level3_SS_expected_exons += total_sim_exons
            else:
                report.Total_level3_AS_reads += selected_reads
                report.Total_level3_AS_bases += total_sim_bases
                report.Total_level3_AS_expected_exons += total_sim_exons

    report.Total_reads = (report.Total_level3_SS_reads +
                          report.Total_level3_AS_reads)
    return report