def test_invalid_spreadsheet_doesnt_exist(self):
    '''Extracting samples from a spreadsheet path that does not exist must raise.'''
    missing_spreadsheet = os.path.join(data_dir, 'file_which_doesnt_exist')
    parser = SpreadsheetParser(missing_spreadsheet)

    # The parser is built outside the assertion on purpose: only the call to
    # extract_samples() is expected to raise.
    self.assertRaises(Exception, parser.extract_samples)
def test_spreadsheet_with_one_set_of_files(self):
    '''A spreadsheet describing one forward/reverse pair yields exactly one sample.'''
    spreadsheet = os.path.join(data_dir, 'spreadsheet_with_one_set_of_files.csv')
    parser = SpreadsheetParser(spreadsheet)
    samples = parser.extract_samples()

    self.assertEqual(len(samples), 1)

    # Both mates of the pair must be reported with the expected paths.
    only_sample = samples[0]
    self.assertEqual(
        only_sample.forward_file,
        'plasmidtron/tests/data/spreadsheetparser/sampleA_1.fastq.gz')
    only_sample = samples[0]
    self.assertEqual(
        only_sample.reverse_file,
        'plasmidtron/tests/data/spreadsheetparser/sampleA_2.fastq.gz')
def test_spreadsheet_with_a_non_paired_file(self):
    '''A spreadsheet listing a single, unpaired file still yields one sample.'''
    spreadsheet = os.path.join(data_dir, 'spreadsheet_with_a_non_paired_file.csv')
    parser = SpreadsheetParser(spreadsheet, False)
    samples = parser.extract_samples()

    self.assertEqual(len(samples), 1)

    # The lone file is exposed as the forward file, and the sample's
    # basename is expected to be 'sampleA'.
    only_sample = samples[0]
    self.assertEqual(
        only_sample.forward_file,
        'plasmidtron/tests/data/spreadsheetparser/sampleA.fasta')
    only_sample = samples[0]
    self.assertEqual(only_sample.basename, 'sampleA')
def run(self):
    '''Run the pipeline from spreadsheet parsing through to cleanup.

    Steps, in order: create the output directory, read the trait and
    nontrait sample spreadsheets, build kmer databases, build the database
    of kmers found in the traits but not the nontraits, filter the trait
    samples against it, assemble the trait samples, optionally plot kmers,
    write the methods summary, and clean up.

    os.makedirs raises if the output directory already exists.
    '''
    self.logger.warning('Using KMC syntax version %s', self.kmc_major_version)
    os.makedirs(self.output_directory)

    # Read both sample sets up front; every later stage consumes them.
    trait_parser = SpreadsheetParser(self.file_of_traits, self.verbose)
    trait_samples = trait_parser.extract_samples()
    nontrait_parser = SpreadsheetParser(self.file_of_nontraits, self.verbose)
    nontrait_samples = nontrait_parser.extract_samples()

    self.logger.warning('Generating kmer databases for all samples')
    kmc_samples = self.generate_kmer_databases(trait_samples, nontrait_samples)

    self.logger.warning("Generating a database of kmers which are in the traits but not in the nontraits set")
    kmc_complex = KmcComplex(
        self.output_directory,
        self.threads,
        self.min_kmers_threshold,
        trait_samples,
        nontrait_samples,
        self.action,
        self.verbose)
    kmc_complex.run()

    kmc_filters = self.filter_data_against_kmers(
        trait_samples, kmc_complex.result_database())

    self.logger.warning('Assembling all of the trait samples')
    spades_assemblies = self.assemble_samples(trait_samples, self.keep_files)

    if self.kmer_plot:
        # Only assemblies whose filtered output exists on disk are plotted.
        spades_assembly_files = []
        for assembly in spades_assemblies:
            if os.path.exists(assembly.filtered_spades_assembly_file()):
                spades_assembly_files.append(
                    assembly.filtered_spades_assembly_file())

        plot_kmers = PlotKmers(
            spades_assembly_files,
            self.output_directory,
            self.threads,
            self.kmer,
            self.max_kmers_threshold,
            self.verbose,
            self.plot_filename)
        plot_kmers.generate_plot()

    # The methods summary and the cleanup run whether or not a plot was made.
    summary_path = os.path.join(self.output_directory, 'methods_summary.txt')
    method_file = Methods(
        summary_path,
        trait_samples,
        nontrait_samples,
        self.min_kmers_threshold,
        self.min_contig_len,
        self.start_time,
        self.spades_exec,
        self.verbose)
    method_file.create_file()

    self.cleanup(kmc_samples, kmc_complex, kmc_filters)
def run(self):
    '''Run the two-pass assembly pipeline over the trait samples.

    Kmer databases are built for every sample, then a database of kmers
    found in the traits but not the nontraits. Each trait sample is
    filtered against that database and assembled. The assembly's own kmers
    are then used to pull reads from the original files for a second
    assembly. Finally the methods summary is written and intermediate
    objects are handed to cleanup().

    os.makedirs raises if the output directory already exists.
    '''
    os.makedirs(self.output_directory)

    trait_parser = SpreadsheetParser(self.file_of_trait_fastqs)
    trait_samples = trait_parser.extract_samples()
    nontrait_parser = SpreadsheetParser(self.file_of_nontrait_fastqs)
    nontrait_samples = nontrait_parser.extract_samples()

    self.logger.info("Generating a kmer database for each sample")
    kmc_samples = []
    for sample_group in (trait_samples, nontrait_samples):
        for sample in sample_group:
            sample_db = Kmc(
                self.output_directory,
                sample,
                self.threads,
                self.kmer,
                self.min_kmers_threshold,
                self.max_kmers_threshold)
            sample_db.run()
            kmc_samples.append(sample_db)

    self.logger.info(
        "Generating a database of kmers which are in the traits but not in the nontraits set"
    )
    kmc_complex = KmcComplex(
        self.output_directory,
        self.threads,
        self.min_kmers_threshold,
        trait_samples,
        nontrait_samples,
        self.action)
    kmc_complex.run()

    # First filtering pass: trait reads against the trait-only kmer database.
    kmc_filters = []
    for sample in trait_samples:
        trait_filter = KmcFilter(
            sample,
            self.output_directory,
            self.threads,
            kmc_complex.result_database())
        trait_filter.filter_fastq_file_against_kmers()
        kmc_filters.append(trait_filter)

    kmc_fastas = []
    spades_assemblies = []
    for sample in trait_samples:
        self.logger.info("First assembly with reads only matching kmers")
        first_pass = SpadesAssembly(
            sample,
            self.output_directory,
            self.threads,
            self.kmer,
            self.spades_exec,
            self.min_contig_len,
            True,
            self.min_spades_contig_coverage,
            False)
        first_pass.run()

        # A filtered assembly file no larger than min_contig_len (size in
        # bytes) is treated as having nothing worth rescaffolding.
        first_pass_size = os.path.getsize(
            first_pass.filtered_spades_assembly_file())
        if first_pass_size <= self.min_contig_len:
            self.logger.info(
                "Theres not enough data in the first assembly after filtering, so skipping the rest of the steps for this sample."
            )
            continue

        self.logger.info("Rescaffold 1st assembly with all reads")
        # Scaffolding uses all of the original reads to join up the small
        # contigs. Start by collecting every kmer present in the filtered
        # first-pass assembly.
        self.logger.info("Extract kmers from assembly")
        assembly_kmers = KmcFasta(
            self.output_directory,
            first_pass.filtered_spades_assembly_file(),
            self.threads,
            self.kmer,
            1,
            self.max_kmers_threshold)
        assembly_kmers.run()
        kmc_fastas.append(assembly_kmers)

        # Second filtering pass: reads from the original files that match
        # the kmers seen in the assembly.
        self.logger.info(
            "Pull out reads from original fastq files matching assembly kmers"
        )
        assembly_filter = KmcFilter(
            sample,
            self.output_directory,
            self.threads,
            assembly_kmers.output_database_name())
        assembly_filter.filter_fastq_file_against_kmers()
        kmc_filters.append(assembly_filter)

        # The first-pass assembly is cleaned up unless running verbosely.
        if not self.verbose:
            first_pass.cleanup()

        self.logger.info("Reassemble with SPAdes")
        final_spades_assembly = SpadesAssembly(
            sample,
            self.output_directory,
            self.threads,
            self.kmer,
            self.spades_exec,
            self.min_contig_len,
            False,
            self.min_spades_contig_coverage,
            True)
        final_spades_assembly.run()
        spades_assemblies.append(final_spades_assembly)
        print(final_spades_assembly.filtered_spades_assembly_file() + "\n")

    summary_path = os.path.join(self.output_directory, 'methods_summary.txt')
    method_file = Methods(
        summary_path,
        trait_samples,
        nontrait_samples,
        self.min_kmers_threshold,
        self.min_contig_len,
        self.start_time,
        self.spades_exec)
    method_file.create_file()

    self.cleanup(kmc_samples, kmc_fastas, kmc_complex, kmc_filters,
                 spades_assemblies)