def test_compressor_dsrc():
    """Test dsrc codecs to gz and bz2"""
    # Create a temporary directory and chdir into it:
    tempdir = tempfile.TemporaryDirectory()
    filename = sequana_data("test.fastq.gz")
    shutil.copy(filename, tempdir.name)
    cwd = os.path.abspath(os.curdir)
    os.chdir(tempdir.name)

    # We convert gz -> dsrc -> bz2 -> dsrc -> gz and must end up with the
    # same data. However, since the compression is not deterministic, we
    # compare the content of the uncompressed files (input and output)
    # rather than the compressed files themselves.
    try:
        compressor.main([
            prog, "--source", "fastq.gz", "--target", "fastq.dsrc", "--quiet"
        ])
        compressor.main([
            prog, "--source", "fastq.dsrc", "--target", "fastq.bz2", "--quiet"
        ])
        compressor.main([
            prog, "--source", "fastq.bz2", "--target", "fastq.dsrc", "--quiet"
        ])
        compressor.main([
            prog, "--source", "fastq.dsrc", "--target", "fastq.gz", "--quiet"
        ])
    except Exception as err:
        raise Exception(err)
    finally:
        os.chdir(cwd)

    f1 = FastQ(filename)
    f2 = FastQ(tempdir.name + os.sep + os.path.basename(filename))
    assert f1 == f2
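# Illustrative sketch (not part of the original tests): why the tests above
# compare the *uncompressed* content rather than the compressed files.
# gzip output embeds metadata such as a timestamp, so compressing identical
# data twice may differ byte-for-byte even though the payload is the same.
import gzip

data = b"@read1\nACGT\n+\nIIII\n"
a = gzip.compress(data, mtime=1)
b = gzip.compress(data, mtime=2)
assert a != b                                              # compressed bytes differ
assert gzip.decompress(a) == gzip.decompress(b) == data    # content is identical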
def is_synchronised(self):
    from sequana import FastQ
    N = 0
    for a, b in zip(FastQ(self.fq1), FastQ(self.fq2)):
        a = a['identifier'].decode()
        b = b['identifier'].decode()
        # keep only the read name (drop any description after the first space)
        a = a.split()[0]
        b = b.split()[0]
        # strip the paired-end suffix (/1 or /2) if present
        if a.endswith("/1"):
            id1 = a.rsplit("/1")[0]
        elif a.endswith("/2"):
            id1 = a.rsplit("/2")[0]
        else:
            id1 = a
        if b.endswith("/1"):
            id2 = b.rsplit("/1")[0]
        elif b.endswith("/2"):
            id2 = b.rsplit("/2")[0]
        else:
            id2 = b
        if id1 != id2:
            print("%s differs from %s" % (id1, id2))
            print(a)
            print(b)
            return False
        N += 1
    print(N)
    return True
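# Illustrative sketch (not part of the original code): how the paired-end
# suffix stripping above behaves on typical identifiers. Self-contained,
# no sequana dependency; the identifiers are made-up examples.
def _strip_pair_suffix(name):
    name = name.split()[0]
    for suffix in ("/1", "/2"):
        if name.endswith(suffix):
            return name.rsplit(suffix)[0]
    return name

assert _strip_pair_suffix("@HISEQ:42:ABC/1 extra") == "@HISEQ:42:ABC"
assert _strip_pair_suffix("@HISEQ:42:ABC/2") == "@HISEQ:42:ABC"
assert _strip_pair_suffix("@HISEQ:42:ABC") == "@HISEQ:42:ABC"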
def main(args=None):
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # if --help or no options are provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    reference = options.reference

    if options.file1 and options.file2:
        fastq = "%s %s" % (options.file1, options.file2)
    elif options.file1 and not options.file2:
        fastq = "%s" % (options.file1)
    elif options.file1 is None:
        raise ValueError("--file1 must be used")

    from sequana import FastQ
    from sequana import FastA

    # theoretical depth of coverage: total number of sequenced bases divided
    # by the length of the (first) reference sequence
    S = 0
    for this in FastQ(options.file1):
        S += len(this['sequence'])
    if options.file2:
        for this in FastQ(options.file2):
            S += len(this['sequence'])
    ref = FastA(options.reference)
    coverage = float(S) / len(ref.sequences[0])
    print('Theoretical Depth of Coverage : %s' % coverage)

    params = {"reference": reference, "fastq": fastq, "thread": options.thread}

    # indexing
    shellcmd("bwa index %(reference)s " % params)
    shellcmd("samtools faidx %(reference)s " % params)

    # mapping. -M marks shorter split reads as secondary; it is not
    # compulsory but recommended
    cmd = "bwa mem -M "
    if options.pacbio:
        cmd += "-x pacbio "
    cmd += r" -t %(thread)s -R @RG\\tID:1\\tSM:1\\tPL:illumina -T 30 %(reference)s %(fastq)s "

    # samtools view options: S (ignore input format), h (include header),
    # b (BAM output)
    if options.sambamba is False:
        cmd += "| samtools view -Sbh | "
        # sorting BAM
        cmd += "samtools sort -@ %(thread)s -o %(reference)s.sorted.bam -"
        shellcmd(cmd % params)
    else:
        # FIXME use sambamba for the view as well
        cmd += "| samtools view -Sbu - | sambamba sort /dev/stdin -o %(reference)s.sorted.bam -t %(thread)s --tmpdir=./tmp "
        shellcmd(cmd % params)
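# Illustrative sketch (assumption, not in the original script): how the
# "%(...)s" placeholders above expand. Values are made up; the read-group
# option is omitted for brevity.
params = {"reference": "ref.fa", "fastq": "r1.fastq r2.fastq", "thread": 4}
cmd = "bwa mem -M -t %(thread)s -T 30 %(reference)s %(fastq)s" % params
print(cmd)  # bwa mem -M -t 4 -T 30 ref.fa r1.fastq r2.fastq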
def test_fastq_unzipped():
    for thisdata in [data, datagz]:
        # instantiation
        f = fastq.FastQ(thisdata)
        assert f.data_format == "Illumina_1.8+"

        # count lines. Run it twice because we want to make sure re-running
        # count_lines (decompression with zlib) works when run again.
        assert f.count_lines() == 1000
        assert f.count_lines() == 1000
        assert f.count_reads() == 250
        assert f.count_reads() == 250

        # extract head of the file into an unzipped file
        ft = TempFile()
        f.extract_head(100, ft.name)
        fcheck = fastq.FastQ(ft.name)
        assert fcheck.count_lines() == 100
        ft.delete()

        # extract head of the file and zip the output
        ft = TempFile(suffix=".gz")
        f.extract_head(100, ft.name)
        fcheck = fastq.FastQ(ft.name)
        assert fcheck.count_lines() == 100
        ft.delete()

        with FastQ(thisdata) as ff:
            assert len(ff) == 250

        with TempFile() as fh:
            selection = f.select_random_reads(10, fh.name)
            f.select_random_reads(selection, fh.name)
def test_filter():
    f = fastq.FastQ(data)

    # keeps all
    with TempFile() as fh:
        f.filter(min_bp=80, max_bp=120, output_filename=fh.name,
                 progressbar=False)
        assert len(f) == 250
        ff = FastQ(fh.name)
        assert len(ff) == 250

    # keeps nothing
    with TempFile() as fh:
        f.filter(min_bp=80, max_bp=90, output_filename=fh.name)
        assert len(f) == 250
        ff = FastQ(fh.name)
        assert len(ff) == 0
def test_compressor_running():
    # Here we test gz -> bz2 -> gz -> dsrc -> gz, recursively or not.
    # Get a fastq.gz in a temporary directory and process it there.
    tempdir = tempfile.TemporaryDirectory()
    filename = sequana_data("test.fastq.gz")
    shutil.copy(filename, tempdir.name)
    cwd = os.path.abspath(os.curdir)
    os.chdir(tempdir.name)

    # We convert gz -> bz2 -> gz and must end up with the same data.
    # However, since the compression is not deterministic, we compare the
    # content of the uncompressed files (input and output).
    try:
        # seems to fail on travis with a subprocess issue
        # https://travis-ci.org/sequana/sequana/builds/162466158
        compressor.main(
            [prog, "--source", "fastq.gz", "--target", "fastq.bz2", "--quiet"])
        compressor.main([
            prog, "--source", "fastq.bz2", "--target", "fastq.gz",
            "--recursive", "--quiet"
        ])
        compressor.main([
            prog, "--source", "fastq.gz", "--target", "fastq.dsrc",
            "--recursive", "--quiet"
        ])
        compressor.main([
            prog, "--source", "fastq.dsrc", "--target", "fastq.gz", "--quiet"
        ])
    except Exception as err:
        raise Exception(err)
    finally:
        os.chdir(cwd)

    f1 = FastQ(filename)
    f2 = FastQ(tempdir.name + os.sep + os.path.basename(filename))
    assert f1 == f2
def test_bam(tmpdir):
    datatest = sequana_data("test.bam", "testing")
    s = BAM(datatest)
    assert len(s) == 1000
    assert s.is_sorted is True

    df = s.get_df_concordance()
    assert s.is_paired is True
    assert int(df.length.sum()) == 67938
    assert int(df.M.sum()) == 67788

    df = s.get_df()

    # call this here before other computations on purpose
    with TempFile(suffix=".json") as fh:
        s.bam_analysis_to_json(fh.name)

    assert s.get_read_names()
    s.get_mapped_read_length()
    s.get_stats()
    s.get_stats_full()
    s.get_samtools_stats_as_df()

    with TempFile() as fh:
        s.to_fastq(fh.name)
        from sequana import FastQ
        ff = FastQ(fh.name)
        assert len(ff) == len(s)

    # plotting
    with TempFile(suffix='.png') as fh:
        s.plot_bar_flags(filename=fh.name, logy=True)
        s.plot_bar_flags(filename=fh.name)

    with TempFile(suffix='.png') as fh:
        s.plot_bar_mapq(filename=fh.name)

    s.get_gc_content()
    s.get_length_count()
    s.plot_gc_content()
    s.boxplot_qualities()
    s.boxplot_qualities(max_sample=50)

    try:
        s.plot_gc_content(bins=[1, 2, 10])
        assert False
    except Exception:
        assert True
def test_bam(tmpdir):
    s = BAM(datatest)
    assert len(s) == 1000
    assert s.is_sorted is True

    assert len(list(s.iter_unmapped_reads())) == 2
    s.reset()
    assert len(list(s.iter_mapped_reads())) == 998
    s.reset()

    # call this here before other computations on purpose
    with TempFile(suffix=".json") as fh:
        s.bam_analysis_to_json(fh.name)

    assert s.get_read_names()
    s.get_mapped_read_length()
    s.get_stats()
    s.get_full_stats_as_df()

    with TempFile(suffix='.png') as fh:
        s.plot_bar_flags(filename=fh.name, logy=True)
        s.plot_bar_flags(filename=fh.name)

    with TempFile(suffix='.png') as fh:
        s.plot_bar_mapq(filename=fh.name)

    with TempFile() as fh:
        s.to_fastq(fh.name)
        from sequana import FastQ
        ff = FastQ(fh.name)
        assert len(ff) == len(s)

    s.get_gc_content()
    s.get_length_count()
    s.plot_gc_content()

    try:
        s.plot_gc_content(bins=[1, 2, 10])
        assert False
    except Exception:
        assert True
def __init__(self, infile, references, outdir, mapper, threads=4):
    self.infile = infile
    self.references = references
    self.outdir = outdir
    self.threads = threads

    if os.path.exists(outdir):
        logger.info("using {} for output".format(outdir))
    else:
        os.mkdir(outdir)

    # this may be used later on for other mappers or methodologies
    if mapper == "minimap2":
        self.mapper_cmd = "minimap2 -x map-pb -t {} {} {} -a > {}"
    elif mapper == "bwa":
        self.mapper_cmd = "bwa mem -M -t {} {} {} > {}"

    f = FastQ(self.infile)
    self.L = len(f)
    logger.info("Found {} reads in input FastQ file\n\n".format(self.L))
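# Illustrative sketch (assumption, not in the original code): filling the
# mapper_cmd template defined above. Given the minimap2/bwa command lines,
# the positional slots are presumably threads, reference, input reads and
# output file, in that order. Filenames are made up.
mapper_cmd = "minimap2 -x map-pb -t {} {} {} -a > {}"
cmd = mapper_cmd.format(4, "reference.fa", "reads.fastq", "mapped.sam")
print(cmd)  # minimap2 -x map-pb -t 4 reference.fa reads.fastq -a > mapped.sam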
class IsoSeqQC(object):
    """
    Use get_isoseq_files on smrtlink to get the proper files

        iso = IsoSeqQC()
        iso.hist_read_length_consensus_isoform()  # histo CCS
        iso.stats()  # "CCS" key is equivalent to summary metrics in CCS report

    todo: get CCS passes histogram. Where to get the info of passes?
    """
    def __init__(self, directory=".", prefix=""):
        self.prefix = prefix
        self.directory = directory
        self.sample_name = "undefined"

        # low quality isoforms
        filename = "all.polished_lq.fastq"
        self.lq_isoforms = self.get_file(filename)
        if self.lq_isoforms:
            logger.info("Reading {}".format(filename))
            self.lq_sequence = FastQ(self.lq_isoforms)

        # high quality isoforms
        filename = "all.polished_hq.fastq"
        self.hq_isoforms = self.get_file(filename)
        if self.hq_isoforms:
            logger.info("Reading {}".format(filename))
            self.hq_sequence = FastQ(self.hq_isoforms)

        # general info
        filename = "file.csv"
        self.csv = self.get_file(filename)
        if self.csv:
            logger.info("Reading {}".format(filename))
            self.data = pd.read_csv(self.csv)

        # CCS fasta sequence
        # self.ccs = self.get_file("-ccs.tar.gz")
        filename = "ccs.fasta"
        self.ccs = self.get_file(filename, noprefix=True)
        if self.ccs:
            logger.info("Reading {}".format(filename))
            self.ccs = FastA(self.ccs)

    def get_file(self, tag, noprefix=False):
        if noprefix:
            filenames = glob.glob(self.directory + os.sep + tag)
        else:
            filenames = glob.glob(self.directory + os.sep + self.prefix + tag)

        if len(filenames) == 1:
            return filenames[0]
        elif len(filenames) > 1:
            print("Found several files ending in %s" % tag)
        else:
            print("No files matching %s" % tag)
        return None

    def stats(self):
        results = {}
        if self.data is not None:
            logger.info("Reading strand")
            results['strand'] = {
                "+": sum(self.data.strand == "+"),
                "-": sum(self.data.strand == "-"),
                "?": sum(self.data.strand.isnull())
            }

            results['classification'] = {
                "total_ccs_reads": len(self.data),
                "five_prime_reads": int(self.data.fiveseen.sum()),
                "three_prime_reads": int(self.data.threeseen.sum()),
                "chimera": int(self.data.chimera.sum()),
                "polyA_reads": int(self.data.polyAseen.sum()),
            }

        if self.lq_isoforms:
            logger.info("Reading LQ isoforms")
            results['lq_isoform'] = self.lq_sequence.stats()  # number of polished LQ isoforms

        if self.hq_isoforms:
            logger.info("Reading HQ isoforms")
            results['hq_isoform'] = self.hq_sequence.stats()  # number of polished HQ isoforms

        if self.ccs:
            seq = [len(read.sequence) for read in self.ccs]
            results["CCS"] = {
                "mean_length": pylab.mean(seq),
                "number_ccs_bases": sum(seq),
                "number_ccs_reads": len(seq)
            }

        self.idents_v = []
        self.full_v = []
        self.non_full_v = []
        self.isoform_lengths = []
        for read in self.lq_sequence:
            ident, full, non_full, length = read['identifier'].decode().split(";")
            self.idents_v.append(ident)
            self.full_v.append(int(full.split("=")[1]))
            self.non_full_v.append(int(non_full.split("=")[1]))
            self.isoform_lengths.append(int(length.split("=")[1]))

        return results

    def to_summary(self, filename="sequana_summary_isoseq.json", data=None):
        """Save statistics into a JSON file

        :param filename: output JSON filename
        :param data: dictionary to save. If not provided, use :meth:`stats`
        """
        from sequana.summary import Summary
        if data is None:
            data = self.stats()
        Summary("isoseq", self.sample_name, data=data).to_json(filename)

    def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
            align="left", fontsize=16, edgecolor="k", **kwargs):
        """ mode can be all, lq, hq """
        pylab.clf()

        L1 = [len(read['sequence']) for read in self.lq_sequence]
        L2 = [len(read['sequence']) for read in self.hq_sequence]
        if mode == "all":
            L = L1 + L2
        elif mode == "lq":
            L = L1
        else:
            L = L2

        Y, X, _ = pylab.hist(L, bins=bins, rwidth=rwidth, align=align,
                             ec=edgecolor, **kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        ax_twin = pylab.gca().twinx()
        shift = (X[1] - X[0]) / 2
        ax_twin.plot(X[0:-1] - shift, len(L) - pylab.cumsum(Y), "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)
        pylab.title("Read length of consensus isoform reads")

    def hist_average_quality(self, fontsize=16, bins=None):
        """ bins is from 0 to 94 """
        hq_qv = [pylab.mean([ord(X) - 33 for X in read['quality'].decode()])
                 for read in self.hq_sequence]
        lq_qv = [pylab.mean([ord(X) - 33 for X in read['quality'].decode()])
                 for read in self.lq_sequence]

        if bins is None:
            bins = range(0, 94)
        Y1, X = np.histogram(hq_qv, bins=bins)
        Y2, X = np.histogram(lq_qv, bins=bins)
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])
        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        N = np.sum(Y1 + Y2)
        ax.plot(X, [N] + list(N - np.cumsum(Y1 + Y2)), "k")
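# Illustrative sketch (not part of the original code): the per-read average
# QV computed in hist_average_quality above. Phred scores are stored as
# ASCII characters offset by 33, so ord(char) - 33 recovers the score.
quality_string = "IIIHHG?5+"          # made-up quality string
scores = [ord(c) - 33 for c in quality_string]
print(scores)                         # [40, 40, 40, 39, 39, 38, 30, 20, 10]
mean_qv = sum(scores) / len(scores)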
class IsoSeq(object):
    """
    ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/
    """
    def __init__(self, directory=".", prefix="job-*"):
        self.prefix = prefix
        self.directory = directory

        # low quality isoforms
        self.lq_isoforms = self.get_file("lq_isoforms.fastq")
        if self.lq_isoforms:
            self.lq_sequence = FastQ(self.lq_isoforms)

        # high quality isoforms
        self.hq_isoforms = self.get_file("hq_isoforms.fastq")
        if self.hq_isoforms:
            self.hq_sequence = FastQ(self.hq_isoforms)

        # general info
        self.csv = self.get_file("-file.csv")
        if self.csv:
            self.data = pd.read_csv(self.csv)

        # CCS fasta sequence
        # self.ccs = self.get_file("-ccs.tar.gz")
        self.ccs = self.get_file("ccs.fasta", noprefix=True)
        if self.ccs:
            self.ccs = FastA(self.ccs)

    def get_file(self, tag, noprefix=False):
        if noprefix:
            filenames = glob.glob(self.directory + os.sep + tag)
        else:
            filenames = glob.glob(self.directory + os.sep + self.prefix + tag)

        if len(filenames) == 1:
            return filenames[0]
        elif len(filenames) > 1:
            print("Found several files ending in %s" % tag)
        else:
            print("No files matching %s" % tag)
        return None

    def stats(self):
        results = {}
        if self.data is not None:
            print("Reading strand")
            results['strand'] = {
                "+": sum(self.data.strand == "+"),
                "-": sum(self.data.strand == "-"),
                "?": sum(self.data.strand.isnull())
            }

            results['classification'] = {
                "total_ccs_reads": len(self.data),
                "five_prime_reads": sum(self.data.fiveseen),
                "three_prime_reads": sum(self.data.threeseen),
                "polyA_reads": sum(self.data.polyAseen),
            }

        if self.lq_isoforms:
            print("Reading LQ isoforms")
            results['lq_isoform'] = self.lq_sequence.stats()  # number of polished LQ isoforms

        if self.hq_isoforms:
            print("Reading HQ isoforms")
            results['hq_isoform'] = self.hq_sequence.stats()  # number of polished HQ isoforms

        if self.ccs:
            seq = [len(read.sequence) for read in self.ccs]
            results["CCS"] = {
                "mean_length": pylab.mean(seq),
                "number_ccs_bases": sum(seq),
                "number_ccs_reads": len(seq)
            }

        self.idents_v = []
        self.full_v = []
        self.non_full_v = []
        self.isoform_lengths = []
        for read in self.lq_sequence:
            ident, full, non_full, length = read['identifier'].decode().split(";")
            self.idents_v.append(ident)
            self.full_v.append(int(full.split("=")[1]))
            self.non_full_v.append(int(non_full.split("=")[1]))
            self.isoform_lengths.append(int(length.split("=")[1]))

        return results

    def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
            align="left", fontsize=16, edgecolor="k", **kwargs):
        """ mode can be all, lq, hq """
        pylab.clf()

        L1 = [len(read['sequence']) for read in self.lq_sequence]
        L2 = [len(read['sequence']) for read in self.hq_sequence]
        if mode == "all":
            L = L1 + L2
        elif mode == "lq":
            L = L1
        else:
            L = L2

        Y, X, _ = pylab.hist(L, bins=bins, rwidth=rwidth, align=align,
                             ec=edgecolor, **kwargs)
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        ax_twin = pylab.gca().twinx()
        shift = (X[1] - X[0]) / 2
        ax_twin.plot(X[1:] - shift, len(L) - pylab.cumsum(Y), "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)
        pylab.title("Read length of consensus isoform reads")

    def hist_average_quality(self, fontsize=16):
        hq_qv = [
            pylab.mean([phred.ascii_to_quality(X) for X in read['quality'].decode()])
            for read in self.hq_sequence
        ]
        lq_qv = [
            pylab.mean([phred.ascii_to_quality(X) for X in read['quality'].decode()])
            for read in self.lq_sequence
        ]

        Y1, X = numpy.histogram(hq_qv, bins=range(0, 94))
        Y2, X = numpy.histogram(lq_qv, bins=range(0, 94))
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)
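# Hedged usage sketch (assumption: the directory name is hypothetical):
# a typical interaction with the IsoSeq class defined above.
iso = IsoSeq(directory="smrtlink_output", prefix="job-*")
results = iso.stats()                              # strand/classification/CCS metrics
iso.hist_read_length_consensus_isoform(mode="hq")  # HQ isoform length histogram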