Example #1
def test_find_amplicon():

    filename = os.path.join(dirname,
                            "../periscope/resources/artic_primers_V3.bed")
    primer_bed_object = read_bed_file(filename)
    # plain "r" for SAM input ("rb" is the mode for binary BAM)
    inbamfile = pysam.AlignmentFile("reads.sam", "r")
    for read in inbamfile:
        amplicon = find_amplicon(read, primer_bed_object)["right_amplicon"]
        assert amplicon == truth[read.query_name]["amplicon"]
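Both test snippets on this page depend on module-level names (dirname, truth, and the periscope helpers) that the examples do not show. A minimal sketch of what that preamble could look like; the fixture values here are assumptions for illustration, not part of the original tests:

# Hypothetical test-module preamble; fixture contents are assumptions.
# read_bed_file, find_amplicon, search_reads, check_start, classify_read and
# open_bed come from periscope's modules; their import lines are omitted here
# rather than guessed.
import os

import pysam

# directory of this test file, used to resolve the bundled resource paths
dirname = os.path.dirname(os.path.abspath(__file__))

# expected per-read results keyed by query name, curated to match reads.sam
truth = {
    "example_read_1": {"amplicon": 1, "class": "sgRNA"},
    "example_read_2": {"amplicon": 72, "class": "gRNA"},
}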
Example #2
def test_classify_read():
    inbamfile = pysam.AlignmentFile("reads.sam", "r")

    filename = os.path.join(dirname,
                            "../periscope/resources/artic_primers_V3.bed")
    primer_bed_object = read_bed_file(filename)

    filename = os.path.join(dirname, "../periscope/resources/orf_start.bed")
    bed_object = open_bed(filename)

    for read in inbamfile:
        print(read.query_name)
        search = 'AACCAACTTTCGATCTCTTGTAGATCTGTTCT'
        search_result = search_reads(read, search)
        amplicons = find_amplicon(read, primer_bed_object)
        orf = check_start(bed_object, read)
        # 50 is the alignment-score cutoff (args.score_cutoff in main)
        result = classify_read(read, search_result["align_score"], 50, orf,
                               amplicons)
        print(result)
        assert result == truth[read.query_name]["class"]
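The 32-mer assigned to search is the SARS-CoV-2 leader sequence; finding it near the 5' end of a read is the evidence used to call the read subgenomic. Purely as an illustration of the idea, here is one way such a search could score a read with a local alignment using Biopython; the actual search_reads implementation and its scoring scheme may differ:

# Illustrative sketch only; periscope's real search_reads may work differently.
from Bio import Align

LEADER = "AACCAACTTTCGATCTCTTGTAGATCTGTTCT"

def leader_align_score(read_seq, leader=LEADER):
    """Locally align the leader against the 5' end of a read sequence."""
    aligner = Align.PairwiseAligner()
    aligner.mode = "local"
    aligner.match_score = 2          # assumed scoring parameters
    aligner.mismatch_score = -2
    aligner.open_gap_score = -10
    aligner.extend_gap_score = -1
    # only the first bases of the read can contain the leader junction
    return aligner.score(leader, read_seq[:60])

With the cutoff of 50 used above, a perfect 32-base leader match under this assumed scoring (2 x 32 = 64) would pass, while short chance matches typically would not.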
Example #3
def main(args):

    # read input bam file
    inbamfile = pysam.AlignmentFile(args.bam, "rb")
    # get bam header so that we can use it for writing later
    bam_header = inbamfile.header.copy().to_dict()
    # open output bam with the header we just got
    outbamfile = pysam.AlignmentFile(args.output_prefix + "_periscope.bam",
                                     "wb",
                                     header=bam_header)

    # get mapped reads
    mapped_reads = get_mapped_reads(args.bam)

    # open the orfs bed file
    orf_bed_object = open_bed(args.orf_bed)
    # open the artic primer bed file
    primer_bed_object = read_bed_file(args.primer_bed)
    # set the output reads filename
    # outfile_reads = args.output_prefix + "_periscope_reads.tsv"
    # set the output counts file name

    # add headers to these files
    # file_reads = open(outfile_reads, "w")
    # file_reads.write("sample\tread_id\tposition\tread_length\torf\tscore\tclass\tamplicon\n")

    total_counts = setup_counts(primer_bed_object)
    # for every read let's decide if it's sgRNA or not
    print("Processing " + str(mapped_reads) + " reads", file=sys.stderr)
    for read in tqdm(inbamfile, total=mapped_reads):

        if read.seq is None:
            # print("%s read has no sequence" %
            #       (read.query_name), file=sys.stderr)
            continue
        if read.is_unmapped:
            # print("%s skipped as unmapped" %
            #       (read.query_name), file=sys.stderr)
            continue
        if read.is_supplementary:
            # print("%s skipped as supplementary" %
            #       (read.query_name), file=sys.stderr)
            continue

        # find the amplicon for the read

        amplicons = find_amplicon(read, primer_bed_object)

        total_counts[amplicons["right_amplicon"]]["total_reads"] += 1


        # we are searching for the leader sequence
        search = 'AACCAACTTTCGATCTCTTGTAGATCTGTTCT'

        # search for the sequence
        result = search_reads(read, search)

        # add orf location to result
        result["read_orf"] = check_start(orf_bed_object, read)

        # classify read based on prior information
        read_class = classify_read(read, result["align_score"],
                                   args.score_cutoff, result["read_orf"],
                                   amplicons)

        # store the attributes we have calculated with the read as tags
        read.set_tag('XS', result["align_score"])        # leader alignment score
        read.set_tag('XA', amplicons["right_amplicon"])  # assigned amplicon
        read.set_tag('XC', read_class)                   # read classification
        read.set_tag('XO', result["read_orf"])           # ORF assignment


        # ok now add this info to a dictionary for later processing
        if "sgRNA" in read_class:
            if result["read_orf"] is None:
                result["read_orf"] = "novel_"+str(read.pos)

            if result["read_orf"] not in total_counts[amplicons["right_amplicon"]][read_class]:
                total_counts[amplicons["right_amplicon"]][read_class][result["read_orf"]] = []
            total_counts[amplicons["right_amplicon"]][read_class][result["read_orf"]].append(read)
        else:
            total_counts[amplicons["right_amplicon"]][read_class].append(read)

        # write the annotated read to a bam file
        outbamfile.write(read)

    outbamfile.close()
    pysam.index(args.output_prefix + "_periscope.bam")


    # re-open the ORF bed file for the normalisation step
    orf_bed_object = open_bed(args.orf_bed)

    # go through each amplicon and do normalisations
    outfile_amplicons = args.output_prefix + "_periscope_amplicons.csv"
    total_counts, orf_bed_object = calculate_normalised_counts(
        mapped_reads, total_counts, outfile_amplicons, orf_bed_object)
    # summarise result into ORFs
    result = summarised_counts_per_orf(total_counts, orf_bed_object)
    # output summarised counts
    outfile_counts = args.output_prefix + "_periscope_counts.csv"
    outfile_counts_novel = args.output_prefix + "_periscope_novel_counts.csv"
    output_summarised_counts(mapped_reads, result, outfile_counts,
                             outfile_counts_novel)
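main() is only ever handed an args namespace; the attribute accesses above (args.bam, args.output_prefix, args.orf_bed, args.primer_bed, args.score_cutoff) imply a parser along these lines. A sketch, with flag spellings and defaults assumed rather than taken from periscope:

# Hypothetical driver; option names and defaults are assumptions inferred
# from the attributes main() reads.
import argparse

def get_args():
    parser = argparse.ArgumentParser(description="periscope sgRNA detection")
    parser.add_argument("--bam", required=True,
                        help="input BAM of reads mapped to the reference")
    parser.add_argument("--output-prefix", dest="output_prefix", required=True,
                        help="prefix for all output files")
    parser.add_argument("--orf-bed", dest="orf_bed", required=True,
                        help="BED file of canonical ORF start positions")
    parser.add_argument("--primer-bed", dest="primer_bed", required=True,
                        help="ARTIC primer scheme BED file")
    parser.add_argument("--score-cutoff", dest="score_cutoff", type=int,
                        default=50, help="leader alignment score cutoff")
    return parser.parse_args()

if __name__ == "__main__":
    main(get_args())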
Example #4
def main(args):

    # read input bam file
    inbamfile = pysam.AlignmentFile(args.bam, "rb")

    # get bam header so that we can use it for writing later
    bam_header = inbamfile.header.copy().to_dict()
    # open output bam with the header we just got
    outbamfile = pysam.AlignmentFile(args.output_prefix + "_periscope.bam",
                                     "wb",
                                     header=bam_header)

    # get mapped reads
    mapped_reads = get_mapped_reads(args.bam)

    # open the orfs bed file
    orf_bed_object = open_bed(args.orf_bed)

    orf_coverage = {}
    # get coverage for each orf
    logger.warning("getting coverage at canonical ORF sites")
    for row in orf_bed_object:

        median = get_coverage(row.start, row.end, inbamfile)

        orf_coverage[row.name] = median
    logger.warning("getting coverage at canonical ORF sites....DONE")

    # read input bam file again
    inbamfile = pysam.AlignmentFile(args.bam, "rb")

    # open the artic primer bed file
    primer_bed_object = read_bed_file(args.primer_bed)
    # set the output reads filename
    # outfile_reads = args.output_prefix + "_periscope_reads.tsv"
    # set the output counts file name

    # add headers to these files
    # file_reads = open(outfile_reads, "w")
    # file_reads.write("sample\tread_id\tposition\tread_length\torf\tscore\tclass\tamplicon\n")

    total_counts = setup_counts(primer_bed_object)
    # for every read let's decide if it's sgRNA or not
    logger.warning("Processing " + str(mapped_reads) + " reads")
    result = {}
    count = 0
    orfs = {}
    reads = {}
    for read in tqdm(inbamfile, total=mapped_reads):

        if read.seq is None:
            # print("%s read has no sequence" %
            #       (read.query_name), file=sys.stderr)
            continue
        if read.is_unmapped:
            # print("%s skipped as unmapped" %
            #       (read.query_name), file=sys.stderr)
            continue
        if read.is_supplementary:
            # print("%s skipped as supplementary" %
            #       (read.query_name), file=sys.stderr)
            continue
        if read.is_secondary:
            # print("%s skipped as secondary" %
            #       (read.query_name), file=sys.stderr)
            continue
        # print("------")
        # print(read.query_name)
        # # print(read.is_read1)
        # # print(read.get_tags())
        # print(read.cigar)
        # search the read's soft-clipped bases for leader sequence evidence
        # (function name is spelled this way in the periscope source)
        leader_search_result = extact_soft_clipped_bases(read)
        if read.query_name not in reads:
            reads[read.query_name] = []
        orf = None
        for row in orf_bed_object:
            # see if read falls within ORF start location
            if row.end >= read.pos >= row.start:
                orf = row.name
        if orf is None:
            if leader_search_result:
                orf = "novel_" + str(read.pos)

        reads[read.query_name].append(
            ClassifiedRead(sgRNA=leader_search_result, orf=orf, read=read))

    # now we have all the reads classified, deal with pairs
    logger.warning("Processing " + str(mapped_reads) + " reads....DONE")
    logger.info("dealing with read pairs")
    for id, pair in reads.items():

        # use the left-hand read's class and ORF for the whole pair; the right
        # read sometimes shows apparent subgenomic evidence that is likely a
        # false positive

        left_read = min(pair, key=lambda x: x.pos)

        right_read = max(pair, key=lambda x: x.pos)

        read_class = left_read.sgRNA
        orf = left_read.orf

        left_read_object = left_read.read
        right_read_object = right_read.read

        left_read_object.set_tag('XO', orf)
        right_read_object.set_tag('XO', orf)

        if read_class:
            left_read_object.set_tag('XC', 'sgRNA')
            right_read_object.set_tag('XC', 'sgRNA')
        else:
            left_read_object.set_tag('XC', 'gRNA')
            right_read_object.set_tag('XC', 'gRNA')

        outbamfile.write(left_read_object)
        outbamfile.write(right_read_object)

        if orf is None:
            continue

        if not read_class:
            continue

        if orf not in orfs:
            orfs[orf] = [left_read_object]
        else:
            orfs[orf].append(left_read_object)

    logger.info("dealing with read pairs....DONE")

    outbamfile.close()
    pysam.sort("-o", args.output_prefix + "_periscope_sorted.bam",
               args.output_prefix + "_periscope.bam")
    pysam.index(args.output_prefix + "_periscope_sorted.bam")

    novel_count = 0
    canonical = open(args.output_prefix + "_periscope_counts.csv", "w")
    canonical.write(",".join([
        "sample", "mapped_reads", "orf", "sgRNA_count", "coverage", "sgRPTL",
        "sgRPHT"
    ]) + "\n")

    novel = open(args.output_prefix + "_periscope_novel_counts.csv", "w")
    novel.write(",".join([
        "sample", "mapped_reads", "orf", "sgRNA_count", "coverage", "sgRPTL",
        "sgRPHT"
    ]) + "\n")

    logger.info("summarising results")

    for orf in orfs:
        # normalise the sgRNA read count by total mapped reads
        sgRPHT = len(orfs[orf]) / (mapped_reads / 10000)
        if "novel" not in orf:
            # canonical ORFs: normalise by local coverage at the ORF (sgRPTL)
            sgRPTL = len(orfs[orf]) / (orf_coverage[orf] / 1000)
            canonical.write(args.sample + "," + str(mapped_reads) + "," + orf +
                            "," + str(len(orfs[orf])) + "," +
                            str(orf_coverage[orf]) + "," + str(sgRPTL) + "," +
                            str(sgRPHT) + "\n")
        else:
            # novel sites: take coverage in a 40-base window around the site
            position = int(orf.split("_")[1])
            coverage = get_coverage(position - 20, position + 20, inbamfile)
            sgRPTL = len(orfs[orf]) / (coverage / 1000)
            novel.write(args.sample + "," + str(mapped_reads) + "," + orf +
                        "," + str(len(orfs[orf])) + "," + str(coverage) + "," +
                        str(sgRPTL) + "," + str(sgRPHT) + "\n")
            novel_count += len(orfs[orf])

    canonical.close()
    novel.close()

    logger.info("summarising results....DONE")

    amplicons = open(args.output_prefix + "_periscope_amplicons.csv", "w")
    amplicons.write("not used yet")
    amplicons.close()
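Two helpers this paired-end main() leans on are not shown on this page. From the call sites above, ClassifiedRead is constructed with sgRNA, orf and read keywords and must expose a pos attribute used to order mates, and get_coverage(start, end, bam) returns a median depth. Sketches consistent with that usage; the real periscope definitions may differ:

# Sketches inferred from usage above; not the actual periscope definitions.
from statistics import median

class ClassifiedRead:
    """Pairs a pysam read with its leader evidence and ORF assignment."""

    def __init__(self, sgRNA, orf, read):
        self.sgRNA = sgRNA   # True if leader evidence was found
        self.orf = orf       # ORF name, "novel_<pos>", or None
        self.read = read     # the underlying pysam AlignedSegment

    @property
    def pos(self):
        # mates are ordered by the mapping position of the underlying read
        return self.read.pos

def get_coverage(start, end, bam, contig="MN908947.3"):
    """Median per-base depth over [start, end); contig name is an assumption."""
    # count_coverage returns four arrays (A, C, G, T counts per position)
    counts = bam.count_coverage(contig, start, end)
    depth = [sum(base[i] for base in counts) for i in range(end - start)]
    return median(depth)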