def test_find_amplicon():
    """Check the right-hand amplicon assigned to every read in ``reads.sam``.

    Each read's ``find_amplicon(...)["right_amplicon"]`` value is compared with
    the ``"amplicon"`` entry recorded for that read name in the module-level
    ``truth`` mapping.
    """
    primer_bed_path = os.path.join(
        dirname, "../periscope/resources/artic_primers_V3.bed")
    primer_bed_object = read_bed_file(primer_bed_path)

    # The context manager closes the alignment file even when an assertion
    # fails part-way through the loop.
    # NOTE(review): "reads.sam" is resolved against the current working
    # directory and opened with mode "rb" - confirm both are intended.
    with pysam.AlignmentFile("reads.sam", "rb") as inbamfile:
        for read in inbamfile:
            amplicon = find_amplicon(read, primer_bed_object)["right_amplicon"]
            assert amplicon == truth[read.query_name]["amplicon"]
def test_classify_read():
    """Check the class assigned to every read in ``reads.sam``.

    For each read the leader sequence is searched for, the amplicon and ORF
    are looked up, and ``classify_read`` is called with a score cutoff of 50.
    The result is compared with the ``"class"`` entry recorded for that read
    name in the module-level ``truth`` mapping.
    """
    primer_bed_path = os.path.join(
        dirname, "../periscope/resources/artic_primers_V3.bed")
    primer_bed_object = read_bed_file(primer_bed_path)

    orf_bed_path = os.path.join(dirname, "../periscope/resources/orf_start.bed")
    bed_object = open_bed(orf_bed_path)

    # Leader sequence searched for in every read; it never changes, so it is
    # defined once outside the loop.
    search = 'AACCAACTTTCGATCTCTTGTAGATCTGTTCT'

    # The context manager closes the alignment file even when an assertion
    # fails part-way through the loop.
    with pysam.AlignmentFile("reads.sam", "rb") as inbamfile:
        for read in inbamfile:
            print(read.query_name)
            search_result = search_reads(read, search)
            amplicons = find_amplicon(read, primer_bed_object)
            orf = check_start(bed_object, read)
            result = classify_read(
                read, search_result["align_score"], 50, orf, amplicons)
            print(result)
            assert result == truth[read.query_name]["class"]
def main(args):
    """Annotate every usable read of ``args.bam`` and summarise sgRNA counts.

    Reads without a sequence, unmapped reads and supplementary alignments are
    skipped.  Every remaining read is assigned an amplicon, searched for the
    leader sequence, classified, tagged and written to
    ``<output_prefix>_periscope.bam`` (which is indexed afterwards).  The
    per-amplicon tallies are then normalised and summarised per ORF by the
    project helpers, which write the ``_periscope_amplicons.csv``,
    ``_periscope_counts.csv`` and ``_periscope_novel_counts.csv`` outputs.

    Tags set on each written read:
        XS - ``align_score`` returned by ``search_reads``
        XA - ``right_amplicon`` returned by ``find_amplicon``
        XC - class returned by ``classify_read``
        XO - ORF returned by ``check_start`` (may be ``None``)

    Args:
        args: parsed command-line namespace; the attributes read here are
            ``bam``, ``output_prefix``, ``orf_bed``, ``primer_bed`` and
            ``score_cutoff``.
    """
    out_bam_path = args.output_prefix + "_periscope.bam"

    # we are searching for the leader sequence
    search = 'AACCAACTTTCGATCTCTTGTAGATCTGTTCT'

    # get mapped reads
    mapped_reads = get_mapped_reads(args.bam)
    # open the orfs bed file
    orf_bed_object = open_bed(args.orf_bed)
    # open the artic primer bed file
    primer_bed_object = read_bed_file(args.primer_bed)

    total_counts = setup_counts(primer_bed_object)

    # Both alignment files are context-managed so that they are closed (and
    # the output flushed) even if processing a read raises.  The output file
    # reuses a copy of the input header.
    with pysam.AlignmentFile(args.bam, "rb") as inbamfile, \
            pysam.AlignmentFile(
                out_bam_path, "wb",
                header=inbamfile.header.copy().to_dict()) as outbamfile:

        # for every read let's decide if it's sgRNA or not
        print("Processing " + str(mapped_reads) + " reads", file=sys.stderr)
        for read in tqdm(inbamfile, total=mapped_reads):
            if read.seq is None or read.is_unmapped or read.is_supplementary:
                continue

            # find the amplicon for the read
            amplicons = find_amplicon(read, primer_bed_object)
            amplicon_counts = total_counts[amplicons["right_amplicon"]]
            amplicon_counts["total_reads"] += 1

            # search for the leader sequence
            result = search_reads(read, search)
            # ORF start location the read falls in, if any
            read_orf = check_start(orf_bed_object, read)

            # classify read based on prior information
            read_class = classify_read(
                read, result["align_score"], args.score_cutoff,
                read_orf, amplicons)

            # store the attributes we have calculated with the read as tags
            read.set_tag('XS', result["align_score"])
            read.set_tag('XA', amplicons["right_amplicon"])
            read.set_tag('XC', read_class)
            read.set_tag('XO', read_orf)

            # ok now add this info to a dictionary for later processing
            if "sgRNA" in read_class:
                # sgRNA classes are keyed by ORF; a read with no known ORF is
                # filed under a "novel_<position>" key instead.
                if read_orf is None:
                    read_orf = "novel_" + str(read.pos)
                amplicon_counts[read_class].setdefault(read_orf, []).append(read)
            else:
                amplicon_counts[read_class].append(read)

            # write the annotated read to a bam file
            outbamfile.write(read)

    pysam.index(out_bam_path)

    # define ORF bed object again before the summary steps
    orf_bed_object = open_bed(args.orf_bed)

    # go through each amplicon and do normalisations
    outfile_amplicons = args.output_prefix + "_periscope_amplicons.csv"
    total_counts, orf_bed_object = calculate_normalised_counts(
        mapped_reads, total_counts, outfile_amplicons, orf_bed_object)

    # summarise result into ORFs
    result = summarised_counts_per_orf(total_counts, orf_bed_object)

    # output summarised counts
    outfile_counts = args.output_prefix + "_periscope_counts.csv"
    outfile_counts_novel = args.output_prefix + "_periscope_novel_counts.csv"
    output_summarised_counts(
        mapped_reads, result, outfile_counts, outfile_counts_novel)
def main(args):
    """Classify read pairs of ``args.bam`` as sgRNA/gRNA and write count tables.

    Steps:
      1. Compute the coverage of every row in the ORF BED file.
      2. Classify each primary, mapped read with
         ``extact_soft_clipped_bases`` and assign it the ORF whose start
         window contains the read position, or ``novel_<position>`` when it
         has leader evidence but matches no ORF.
      3. Group reads by query name.  The left-most read of each group decides
         the class and ORF for the whole group; every read of the group is
         tagged (``XO`` = ORF, ``XC`` = ``sgRNA``/``gRNA``) and written to
         ``<output_prefix>_periscope.bam``, which is then sorted into
         ``<output_prefix>_periscope_sorted.bam`` and indexed.
      4. Write ``<output_prefix>_periscope_counts.csv`` (canonical ORFs) and
         ``<output_prefix>_periscope_novel_counts.csv`` (novel sites) with
         ``sgRPHT = count / (mapped_reads / 10000)`` and
         ``sgRPTL = count / (coverage / 1000)``.
      5. Write a placeholder ``<output_prefix>_periscope_amplicons.csv``.

    Args:
        args: parsed command-line namespace; the attributes read here are
            ``bam``, ``output_prefix``, ``orf_bed``, ``primer_bed`` and
            ``sample``.
    """
    unsorted_bam_path = args.output_prefix + "_periscope.bam"
    sorted_bam_path = args.output_prefix + "_periscope_sorted.bam"

    # get mapped reads
    mapped_reads = get_mapped_reads(args.bam)
    # open the orfs bed file
    orf_bed_object = open_bed(args.orf_bed)

    # First pass over the input: keep a copy of the header for the output
    # file and get the coverage for each canonical ORF.
    orf_coverage = {}
    with pysam.AlignmentFile(args.bam, "rb") as inbamfile:
        bam_header = inbamfile.header.copy().to_dict()
        logger.warning("getting coverage at canonical ORF sites")
        for row in orf_bed_object:
            orf_coverage[row.name] = get_coverage(row.start, row.end, inbamfile)
        logger.warning("getting coverage at canonical ORF sites....DONE")

    # The primer BED file is still parsed and the counts structure still
    # built, as before, although nothing below consumes the result.
    # NOTE(review): confirm whether these two calls can be dropped.
    primer_bed_object = read_bed_file(args.primer_bed)
    setup_counts(primer_bed_object)

    # for every read let's decide if it's sgRNA or not
    logger.warning("Processing " + str(mapped_reads) + " reads")
    reads = {}
    # read input bam file again, from the start
    with pysam.AlignmentFile(args.bam, "rb") as inbamfile:
        for read in tqdm(inbamfile, total=mapped_reads):
            if (read.seq is None or read.is_unmapped
                    or read.is_supplementary or read.is_secondary):
                continue

            leader_search_result = extact_soft_clipped_bases(read)

            orf = None
            for row in orf_bed_object:
                # see if read falls within ORF start location; when several
                # rows match, the last one wins
                if row.end >= read.pos >= row.start:
                    orf = row.name
            # Explicit comparison with True is kept because the helper's
            # return type is not visible from here.
            if orf is None and leader_search_result == True:
                orf = "novel_" + str(read.pos)

            reads.setdefault(read.query_name, []).append(
                ClassifiedRead(sgRNA=leader_search_result, orf=orf, read=read))
    logger.warning("Processing " + str(mapped_reads) + " reads....DONE")

    # now we have all the reads classified, deal with pairs
    logger.info("dealing with read pairs")
    orfs = {}
    with pysam.AlignmentFile(
            unsorted_bam_path, "wb", header=bam_header) as outbamfile:
        for pair in reads.values():
            # The class and ORF of the left hand read are used for the whole
            # pair - sometimes the right read looks like it has subgenomic
            # evidence, and those are likely false positives.
            #
            # Sorting (rather than separate min()/max() calls) ensures every
            # read is tagged and written exactly once, including when only
            # one mate survived filtering or both mates share a position.
            ordered = sorted(pair, key=lambda x: x.pos)
            left_read = ordered[0]
            read_class = left_read.sgRNA
            orf = left_read.orf
            class_tag = 'sgRNA' if read_class == True else 'gRNA'

            for member in ordered:
                member.read.set_tag('XO', orf)
                member.read.set_tag('XC', class_tag)
                outbamfile.write(member.read)

            if orf is None or read_class == False:
                continue
            # only the left read is counted, so a pair counts once
            orfs.setdefault(orf, []).append(left_read.read)
    logger.info("dealing with read pairs....DONE")

    pysam.sort("-o", sorted_bam_path, unsorted_bam_path)
    pysam.index(sorted_bam_path)

    header = ",".join([
        "sample", "mapped_reads", "orf", "sgRNA_count", "coverage", "sgRPTL",
        "sgRPHT\n"
    ])
    counts_path = args.output_prefix + "_periscope_counts.csv"
    novel_counts_path = args.output_prefix + "_periscope_novel_counts.csv"

    # A fresh handle on the input is used for the coverage of novel sites.
    with open(counts_path, "w") as canonical, \
            open(novel_counts_path, "w") as novel, \
            pysam.AlignmentFile(args.bam, "rb") as inbamfile:
        canonical.write(header)
        novel.write(header)

        logger.info("summarising results")
        for orf, orf_reads in orfs.items():
            sgRNA_count = len(orf_reads)
            sgRPHT = sgRNA_count / (mapped_reads / 10000)

            if "novel" not in orf:
                coverage = orf_coverage[orf]
                destination = canonical
            else:
                # novel ORF names are "novel_<position>"; coverage is taken
                # from a window of 20 bases either side of that position
                position = int(orf.split("_")[1])
                coverage = get_coverage(position - 20, position + 20, inbamfile)
                destination = novel

            sgRPTL = sgRNA_count / (coverage / 1000)
            destination.write(",".join([
                args.sample, str(mapped_reads), orf, str(sgRNA_count),
                str(coverage), str(sgRPTL), str(sgRPHT)
            ]) + "\n")
    logger.info("summarising results....DONE")

    with open(args.output_prefix + "_periscope_amplicons.csv", "w") as amplicons:
        amplicons.write("not used yet")