Example #1
0
    def test_invalid_spreadsheet_doesnt_exist(self):
        '''extract_samples should raise for the 'file_which_doesnt_exist' path'''
        # Building the parser happens outside the assertion on purpose: only
        # the extraction step is expected to raise.
        missing_path = os.path.join(data_dir, 'file_which_doesnt_exist')
        parser = SpreadsheetParser(missing_path)

        with self.assertRaises(Exception):
            parser.extract_samples()
Example #2
0
 def test_spreadsheet_with_one_set_of_files(self):
     '''Parsing the one-set spreadsheet yields one sample with the expected forward and reverse files'''
     spreadsheet = os.path.join(data_dir,
                                'spreadsheet_with_one_set_of_files.csv')
     parser = SpreadsheetParser(spreadsheet)
     parsed = parser.extract_samples()

     # Exactly one sample is expected back.
     self.assertEqual(len(parsed), 1)

     # The sample's two file attributes must match these paths.
     expected_forward = 'plasmidtron/tests/data/spreadsheetparser/sampleA_1.fastq.gz'
     expected_reverse = 'plasmidtron/tests/data/spreadsheetparser/sampleA_2.fastq.gz'
     self.assertEqual(parsed[0].forward_file, expected_forward)
     self.assertEqual(parsed[0].reverse_file, expected_reverse)
Example #3
0
    def test_spreadsheet_with_a_non_paired_file(self):
        '''Parsing the non-paired spreadsheet yields one sample with the expected file and basename'''
        spreadsheet = os.path.join(data_dir,
                                   'spreadsheet_with_a_non_paired_file.csv')
        # The second positional argument is passed as False, as in the
        # original call; its meaning is defined by SpreadsheetParser.
        parser = SpreadsheetParser(spreadsheet, False)
        parsed = parser.extract_samples()

        self.assertEqual(len(parsed), 1)

        expected_file = 'plasmidtron/tests/data/spreadsheetparser/sampleA.fasta'
        self.assertEqual(parsed[0].forward_file, expected_file)
        self.assertEqual(parsed[0].basename, 'sampleA')
Example #4
0
	def run(self):
		'''Drive the whole analysis from spreadsheets to the methods summary.

		Steps, in order: log the KMC syntax version, create the output
		directory, parse the trait and nontrait spreadsheets, build the
		per-sample kmer databases, run KmcComplex, filter the trait samples
		against its result database, assemble the trait samples, optionally
		plot kmers, write 'methods_summary.txt', then clean up.

		os.makedirs is called without exist_ok, so it raises if the output
		directory already exists.
		'''
		self.logger.warning('Using KMC syntax version %s', self.kmc_major_version)
		os.makedirs(self.output_directory)

		traits = SpreadsheetParser(self.file_of_traits, self.verbose).extract_samples()
		nontraits = SpreadsheetParser(self.file_of_nontraits, self.verbose).extract_samples()

		self.logger.warning('Generating kmer databases for all samples')
		sample_databases = self.generate_kmer_databases(traits, nontraits)

		self.logger.warning("Generating a database of kmers which are in the traits but not in the nontraits set")
		trait_only_kmers = KmcComplex(
			self.output_directory,
			self.threads,
			self.min_kmers_threshold,
			traits,
			nontraits,
			self.action,
			self.verbose)
		trait_only_kmers.run()

		read_filters = self.filter_data_against_kmers(traits, trait_only_kmers.result_database())

		self.logger.warning('Assembling all of the trait samples')
		assemblies = self.assemble_samples(traits, self.keep_files)

		if self.kmer_plot:
			# Only assemblies whose filtered output file is present on disk
			# are handed to the plotter.
			assembly_files = []
			for assembly in assemblies:
				if os.path.exists(assembly.filtered_spades_assembly_file()):
					assembly_files.append(assembly.filtered_spades_assembly_file())
			plotter = PlotKmers(
				assembly_files,
				self.output_directory,
				self.threads,
				self.kmer,
				self.max_kmers_threshold,
				self.verbose,
				self.plot_filename)
			plotter.generate_plot()

		summary_path = os.path.join(self.output_directory, 'methods_summary.txt')
		summary = Methods(
			summary_path,
			traits,
			nontraits,
			self.min_kmers_threshold,
			self.min_contig_len,
			self.start_time,
			self.spades_exec,
			self.verbose)
		summary.create_file()

		self.cleanup(sample_databases, trait_only_kmers, read_filters)
Example #5
0
    def run(self):
        '''Run the full pipeline inline, from spreadsheets to the methods summary.

        Steps, in order: create the output directory, parse the trait and
        nontrait spreadsheets, build a Kmc database per sample, run
        KmcComplex, filter each trait sample against its result database,
        then assemble each trait sample twice (a first assembly, and a
        reassembly after re-filtering against the first assembly's kmers).
        Finally write 'methods_summary.txt' and clean up.

        os.makedirs is called without exist_ok, so it raises if the output
        directory already exists.
        '''
        os.makedirs(self.output_directory)
        trait_set = SpreadsheetParser(self.file_of_trait_fastqs).extract_samples()
        nontrait_set = SpreadsheetParser(self.file_of_nontrait_fastqs).extract_samples()

        self.logger.info("Generating a kmer database for each sample")
        sample_dbs = []
        for group in (trait_set, nontrait_set):
            for member in group:
                sample_db = Kmc(
                    self.output_directory,
                    member,
                    self.threads,
                    self.kmer,
                    self.min_kmers_threshold,
                    self.max_kmers_threshold)
                sample_db.run()
                sample_dbs.append(sample_db)

        self.logger.info(
            "Generating a database of kmers which are in the traits but not in the nontraits set"
        )
        complex_db = KmcComplex(
            self.output_directory,
            self.threads,
            self.min_kmers_threshold,
            trait_set,
            nontrait_set,
            self.action)
        complex_db.run()

        # First filtering pass: every trait sample against the KmcComplex
        # result database.
        read_filters = []
        for member in trait_set:
            trait_filter = KmcFilter(
                member,
                self.output_directory,
                self.threads,
                complex_db.result_database())
            trait_filter.filter_fastq_file_against_kmers()
            read_filters.append(trait_filter)

        assembly_dbs = []
        final_assemblies = []
        for member in trait_set:
            self.logger.info("First assembly with reads only matching kmers")
            first_pass = SpadesAssembly(
                member,
                self.output_directory,
                self.threads,
                self.kmer,
                self.spades_exec,
                self.min_contig_len,
                True,
                self.min_spades_contig_coverage,
                False)
            first_pass.run()

            # The filtered assembly's size on disk (bytes) is compared with
            # min_contig_len; a file at or below that size ends this sample.
            first_pass_size = os.path.getsize(
                first_pass.filtered_spades_assembly_file())
            if first_pass_size <= self.min_contig_len:
                self.logger.info(
                    "Theres not enough data in the first assembly after filtering, so skipping the rest of the steps for this sample."
                )
                continue

            self.logger.info("Rescaffold 1st assembly with all reads")
            # Scaffolding step: use the original reads to join up the small
            # contigs. Start by extracting the kmers of the filtered assembly.
            self.logger.info("Extract kmers from assembly")
            assembly_db = KmcFasta(
                self.output_directory,
                first_pass.filtered_spades_assembly_file(),
                self.threads,
                self.kmer,
                1,
                self.max_kmers_threshold)
            assembly_db.run()
            assembly_dbs.append(assembly_db)

            # Second filtering pass: keep reads matching the assembly's kmers.
            self.logger.info(
                "Pull out reads from original fastq files matching assembly kmers"
            )
            assembly_filter = KmcFilter(
                member,
                self.output_directory,
                self.threads,
                assembly_db.output_database_name())
            assembly_filter.filter_fastq_file_against_kmers()
            read_filters.append(assembly_filter)

            # The first assembly is only cleaned up when not running verbosely.
            if not self.verbose:
                first_pass.cleanup()

            self.logger.info("Reassemble with SPAdes")
            reassembly = SpadesAssembly(
                member,
                self.output_directory,
                self.threads,
                self.kmer,
                self.spades_exec,
                self.min_contig_len,
                False,
                self.min_spades_contig_coverage,
                True)
            reassembly.run()
            final_assemblies.append(reassembly)
            print(reassembly.filtered_spades_assembly_file() + "\n")

        summary_path = os.path.join(self.output_directory, 'methods_summary.txt')
        summary = Methods(
            summary_path,
            trait_set,
            nontrait_set,
            self.min_kmers_threshold,
            self.min_contig_len,
            self.start_time,
            self.spades_exec)
        summary.create_file()

        self.cleanup(sample_dbs, assembly_dbs, complex_db, read_filters,
                     final_assemblies)