Esempio n. 1
0
    def test(self):
        # End-to-end check: fetch the kallisto example data, build an index
        # from the transcripts, quantify the paired read files against it, and
        # verify that the expected artifacts exist on disk.
        transcripts_path = os.path.join(self.temp_dir, 'transcripts.fasta.gz')
        reads_paths = [
            os.path.join(self.temp_dir, 'reads_1.fastq.gz'),
            os.path.join(self.temp_dir, 'reads_2.fastq.gz'),
        ]

        # (source URL, local destination) pairs, fetched in this order
        downloads = (
            ('https://github.com/pachterlab/kallisto/raw/master/test/transcripts.fasta.gz',
             transcripts_path),
            ('https://github.com/pachterlab/kallisto/raw/master/test/reads_1.fastq.gz',
             reads_paths[0]),
            ('https://github.com/pachterlab/kallisto/raw/master/test/reads_2.fastq.gz',
             reads_paths[1]),
        )
        for url, destination in downloads:
            urllib.request.urlretrieve(url, destination)

        # build the index and make sure the index file was written
        idx_path = os.path.join(self.temp_dir, 'index.idx')
        rna_seq_util.Kallisto().index([transcripts_path], index_filename=idx_path)
        self.assertTrue(os.path.isfile(idx_path))

        # quantify the reads and make sure every expected output file exists
        out_dir = os.path.join(self.temp_dir, 'out')
        rna_seq_util.Kallisto().quant(reads_paths, index_filename=idx_path, output_dirname=out_dir)
        self.assertTrue(os.path.isdir(out_dir))
        for basename in ('abundance.tsv', 'abundance.h5', 'run_info.json'):
            self.assertTrue(os.path.isfile(os.path.join(out_dir, basename)))
Esempio n. 2
0
    def test_error(self):
        # Running `index` on a FASTA file that does not exist must raise in
        # both modes. What differs is the captured output: with verbose=True
        # both stdout and stderr are expected to be non-empty, with
        # verbose=False both are expected to be empty.
        scenarios = (
            (True, self.assertNotEqual),
            (False, self.assertEqual),
        )
        for verbose_flag, compare in scenarios:
            with capturer.CaptureOutput(merged=False, relay=False) as output:
                with self.assertRaises(Exception):
                    rna_seq_util.Kallisto()._run('index', ['__undefined__.fasta'], verbose=verbose_flag)
                # inspect the streams while the capture context is still open
                compare(output.stdout.get_text(), '')
                compare(output.stderr.get_text(), '')
Esempio n. 3
0
def process_fastq(experiment_name, sample_name, strain_name, num_fastq_files,
                  read_type, output_directory, temp_directory):
    """Quantify one sample with kallisto and pickle its abundance table.

    The FASTQ inputs are read from
    ``<temp_directory>/FASTQ_Files/<experiment>__<sample>__<n>.fastq.gz`` for
    ``n`` in ``range(num_fastq_files)``, and the index from
    ``<output_directory>/kallisto_index_files/<strain_name>.idx``. kallisto
    writes to ``<output_directory>/<experiment>/<sample>/output``. The resulting
    ``abundance.tsv`` is then loaded, indexed by ``target_id``, extended with a
    column holding each row's ``tpm`` divided by the ``tpm`` total, and pickled
    to ``<output_directory>/<experiment>/<sample>/<sample>_abundances_binary``.

    Args:
        experiment_name (str): experiment name; used in directory and FASTQ file names
        sample_name (str): sample name; used in directory and file names
        strain_name (str): base name of the kallisto index file
        num_fastq_files (int): number of FASTQ files belonging to the sample
        read_type (str): ``"single"`` or ``"paired"``
        output_directory (str): root directory for the index files and results
        temp_directory (str): directory that contains ``FASTQ_Files``

    Raises:
        ValueError: if ``read_type`` is neither ``"single"`` nor ``"paired"``
    """
    # Validate up front: an unknown read type used to skip quantification
    # silently and then fail on (or, worse, read a stale) abundance.tsv.
    if read_type == "single":
        # Single-end runs pass a fixed fragment length and standard deviation.
        quant_options = {
            'single_end_reads': True,
            'fragment_length': 180,
            'fragment_length_std': 20,
        }
    elif read_type == "paired":
        quant_options = {}
    else:
        raise ValueError(
            "Unsupported read_type {!r}; expected 'single' or 'paired'".format(read_type))

    # One race-free call creates both the experiment and the sample directory,
    # even when several samples are processed at the same time.
    sample_dirname = "{}/{}/{}".format(output_directory, experiment_name, sample_name)
    os.makedirs(sample_dirname, exist_ok=True)

    fastq_filenames = [
        "{}/FASTQ_Files/{}__{}__{}.fastq.gz".format(
            temp_directory, experiment_name, sample_name, num)
        for num in range(num_fastq_files)
    ]
    index_filename = '{}/kallisto_index_files/{}.idx'.format(
        output_directory, strain_name)
    output_dirname = '{}/output'.format(sample_dirname)

    rna_seq_util.Kallisto().quant(fastq_filenames,
                                  index_filename=index_filename,
                                  output_dirname=output_dirname,
                                  **quant_options)

    abundances = pd.read_csv('{}/abundance.tsv'.format(output_dirname),
                             sep='\t').set_index("target_id")
    tpm = abundances['tpm']
    # NOTE(review): the normalized values are stored in a column named
    # 'target_id' (the same name as the index). The name is kept because
    # readers of the pickle may rely on it -- confirm whether another name was
    # intended. `.values` keeps the assignment positional.
    abundances['target_id'] = (tpm / tpm.sum()).values
    abundances.to_pickle("{}/{}_abundances_binary".format(
        sample_dirname, sample_name))