Beispiel #1
0
def abundance_filter_sample(
        demux_path_str: SingleLanePerSampleSingleEndFastqDirFmt,
        sample_id: str,
        new_demux_path_str: SingleLanePerSampleSingleEndFastqDirFmt):
    """Abundance-filter one sample and write the surviving reads.

    Sequences of ``sample_id`` are counted, a threshold is derived with
    ``abundance_filter_threshold_Wang_et_al`` and only sequences whose count
    is strictly greater than that threshold are written to a gzipped fastq
    file inside the new demux directory.

    Parameters
    ----------
    demux_path_str
        Handed straight to ``SingleLanePerSampleSingleEndFastqDirFmt`` as the
        location of the input directory (opened with ``mode='r'``).
    sample_id : str
        Identifier of the sample to process.
    new_demux_path_str
        Handed straight to ``SingleLanePerSampleSingleEndFastqDirFmt`` as the
        location of the output directory.

    Returns
    -------
    tuple
        ``(path_of_output_fastq_gz_as_str, stats_dict)``. ``stats_dict`` has
        the keys ``sample-id``, ``n_input_seqs``, ``threshold``,
        ``n_seqs_kept`` and ``n_seqs_unique_kept``. When no sequence is kept
        the output file is not created, but its would-be path is still
        returned.
    """
    print("Commencing abundance-filtering for sample {}".format(sample_id))

    demux = SingleLanePerSampleSingleEndFastqDirFmt(demux_path_str, mode='r')
    new_demux = SingleLanePerSampleSingleEndFastqDirFmt(new_demux_path_str,
                                                        mode='r')

    counts_series = sample_counts_series(demux, sample_id)
    threshold = abundance_filter_threshold_Wang_et_al(counts_series)
    abundance_filtered_series = counts_series[counts_series > threshold].copy()

    stats_dict = {
        'sample-id': sample_id,
        'n_input_seqs': counts_series.sum(),
        'threshold': threshold,
        'n_seqs_kept': abundance_filtered_series.sum(),
        'n_seqs_unique_kept': len(abundance_filtered_series.index),
    }

    demux_metadata_view = demux.metadata.view(YamlFormat)
    with open(str(demux_metadata_view)) as demux_metadata_fh:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and is unsafe on older releases.
        demux_metadata_dict = yaml.safe_load(demux_metadata_fh)
    phred_offset = demux_metadata_dict['phred-offset']

    new_fastqgz_path = new_demux.sequences.path_maker(sample_id=sample_id,
                                                      barcode_id=1,
                                                      lane_number=1,
                                                      read_number=1)

    # skip writing into the gzipped file below if sequences = 0
    if stats_dict['n_seqs_kept'] == 0:
        return str(new_fastqgz_path), stats_dict

    with gzip.open(str(new_fastqgz_path), mode='w') as writer:
        for seq in return_fastq_seqs_for_sample(demux, sample_id):
            if str(seq) in abundance_filtered_series.index:
                seq_strIO = StringIO()
                skbio.io.write(seq,
                               format='fastq',
                               phred_offset=phred_offset,
                               into=seq_strIO)

                # getvalue() returns the whole buffer; read() would start at
                # the current (end-of-buffer) position and yield nothing.
                writer.write(seq_strIO.getvalue().encode("UTF-8"))

    return str(new_fastqgz_path), stats_dict
Beispiel #2
0
    def test_slanepsample_single_end_fastq_dir_fmt_validate_positive(self):
        """validate() must not raise for a well-formed single-end directory."""
        for data_file in ('single_end_data/MANIFEST', 'metadata.yml',
                          'Human-Kneecap_S1_L001_R1_001.fastq.gz'):
            shutil.copy(self.get_data_path(data_file), self.temp_dir.name)

        dir_fmt = SingleLanePerSampleSingleEndFastqDirFmt(
            self.temp_dir.name, mode='r')
        dir_fmt.validate()
Beispiel #3
0
    def test_slanepsample_single_end_fastq_dir_fmt_validate_positive(self):
        """A manifest, metadata file and one fastq.gz validate cleanly."""
        fixture_names = ('single_end_data/MANIFEST', 'metadata.yml',
                         'Human-Kneecap_S1_L001_R1_001.fastq.gz')
        destination = self.temp_dir.name
        for fixture_name in fixture_names:
            shutil.copy(self.get_data_path(fixture_name), destination)

        # No assertion needed: the test passes when validate() does not raise.
        SingleLanePerSampleSingleEndFastqDirFmt(
            self.temp_dir.name, mode='r').validate()
    def test_slanepsample_single_end_fastq_dir_fmt_validate_negative(self):
        """validate() must raise ValueError when the fastq file is bogus."""
        for data_file in ('single_end_data/MANIFEST', 'metadata.yml',
                          'not-fastq.fastq.gz'):
            shutil.copy(self.get_data_path(data_file), self.temp_dir.name)

        dir_fmt = SingleLanePerSampleSingleEndFastqDirFmt(
            self.temp_dir.name, mode='r')

        with self.assertRaisesRegex(ValueError, 'SingleLanePerSampleSingle'):
            dir_fmt.validate()
Beispiel #5
0
    def test_slanepsample_single_end_fastq_dir_fmt_validate_negative(self):
        """A directory containing 'not-fastq.fastq.gz' fails validation."""
        fixture_names = ('single_end_data/MANIFEST', 'metadata.yml',
                         'not-fastq.fastq.gz')
        destination = self.temp_dir.name
        for fixture_name in fixture_names:
            shutil.copy(self.get_data_path(fixture_name), destination)

        dir_fmt = SingleLanePerSampleSingleEndFastqDirFmt(self.temp_dir.name,
                                                          mode='r')

        with self.assertRaisesRegex(ValueError, 'SingleLanePerSampleSingle'):
            dir_fmt.validate()
Beispiel #6
0
    def test_slanepsample_single_end_fastq_dir_fmt_validate_bad_paired(self):
        """Paired-end content in a single-end directory fails validation."""
        for data_file in (
                'paired_end_data/MANIFEST',
                'metadata.yml',
                'Human-Kneecap_S1_L001_R1_001.fastq.gz',
                'paired_end_data/Human-Kneecap_S1_L001_R2_001.fastq.gz'):
            shutil.copy(self.get_data_path(data_file), self.temp_dir.name)

        dir_fmt = SingleLanePerSampleSingleEndFastqDirFmt(self.temp_dir.name,
                                                          mode='r')

        with self.assertRaisesRegex(ValidationError, 'Forward and reverse'):
            dir_fmt.validate()
Beispiel #7
0
    def test_slanepsample_single_end_fastq_dir_fmt_validate_bad_paired(self):
        """validate() raises ValidationError when an R2 file is present."""
        fixture_names = (
            'paired_end_data/MANIFEST',
            'metadata.yml',
            'Human-Kneecap_S1_L001_R1_001.fastq.gz',
            'paired_end_data/Human-Kneecap_S1_L001_R2_001.fastq.gz',
        )
        destination = self.temp_dir.name
        for fixture_name in fixture_names:
            shutil.copy(self.get_data_path(fixture_name), destination)

        dir_fmt = SingleLanePerSampleSingleEndFastqDirFmt(
            self.temp_dir.name, mode='r')

        with self.assertRaisesRegex(ValidationError, 'Forward and reverse'):
            dir_fmt.validate()
def test_write_metadata():
    """The first line of the written metadata.yml is the phred offset."""
    results = SingleLanePerSampleSingleEndFastqDirFmt()
    _itsxpress._write_metadata(results)

    metadata_fp = os.path.join(str(results.path), "metadata.yml")
    with open(metadata_fp, "rt") as handle:
        first_line = handle.readline().replace("\n", "")

    eq_("{phred-offset: 33}", first_line)
Beispiel #9
0
    def setUp(self):
        """Load the single/paired-end fixtures and their filter metadata."""
        super().setUp()

        # --- single-end fixtures ---
        single_dir = self.get_data_path('filter_samples_single_end/dir_fmt')
        data_single = SingleLanePerSampleSingleEndFastqDirFmt(single_dir,
                                                              mode='r')
        self.sample_single = _PlotQualView(data_single, False)
        self.manifest_single = data_single.manifest.view(pd.DataFrame)

        all_fp = self.get_data_path(
            'filter_samples_single_end/filter_all.tsv')
        self.md_single_all = Metadata.load(all_fp)
        subset_fp = self.get_data_path(
            'filter_samples_single_end/filter_subset.tsv')
        self.md_single_subset = Metadata.load(subset_fp)
        none_fp = self.get_data_path(
            'filter_samples_single_end/filter_none.tsv')
        self.md_single_none = Metadata.load(none_fp)

        # --- paired-end fixtures ---
        paired_dir = self.get_data_path('filter_samples_paired_end/dir_fmt')
        data_paired = SingleLanePerSamplePairedEndFastqDirFmt(paired_dir,
                                                              mode='r')
        self.sample_paired = _PlotQualView(data_paired, True)
        self.manifest_paired = data_paired.manifest.view(pd.DataFrame)

        # NOTE(review): the paired metadata is read from the *single-end*
        # fixture directory - presumably the same files are shared on
        # purpose; confirm.
        all_fp = self.get_data_path(
            'filter_samples_single_end/filter_all.tsv')
        self.md_paired_all = Metadata.load(all_fp)
        subset_fp = self.get_data_path(
            'filter_samples_single_end/filter_subset.tsv')
        self.md_paired_subset = Metadata.load(subset_fp)
        none_fp = self.get_data_path(
            'filter_samples_single_end/filter_none.tsv')
        self.md_paired_none = Metadata.load(none_fp)
Beispiel #10
0
    def test_underscore_samples(self):
        """denoise_single output matches the stored underscore fixtures."""
        self.demux_seqs = SingleLanePerSampleSingleEndFastqDirFmt(
            self.get_data_path('underscore_samples'), 'r')

        table_fp = self.get_data_path('expected/underscore-samples.tsv')
        with open(table_fp) as fh:
            exp_table = biom.Table.from_tsv(fh, None, None, lambda x: x)

        fasta_fp = self.get_data_path('expected/underscore-samples.fasta')
        exp_rep_seqs = list(
            skbio.io.read(fasta_fp, 'fasta', constructor=skbio.DNA))
        for expected_seq in exp_rep_seqs:
            del expected_seq.metadata['description']

        stats_fp = self.get_data_path('expected/underscore-samples-stats.tsv')
        exp_md = qiime2.Metadata.load(stats_fp)

        # Historical NOTE: default used to be `pooled`, so the data still
        # expects that. Since this is only testing underscores, it shouldn't
        # matter much and serves as a regression test to boot.
        table, rep_seqs, md = denoise_single(
            self.demux_seqs, 100, chimera_method='pooled')

        self.assertEqual(table, exp_table)
        self.assertEqual(_sort_seqs(rep_seqs), _sort_seqs(exp_rep_seqs))
        self.assertEqual(md, exp_md)
Beispiel #11
0
def abundance_filter_pool(
        sequences: SingleLanePerSampleSingleEndFastqDirFmt, threads: int
) -> (SingleLanePerSampleSingleEndFastqDirFmt, pd.DataFrame):
    """Abundance-filter every sample using a pool of worker processes.

    Each sample is handed to ``abundance_filter_sample`` through
    ``Pool.starmap`` with ``threads`` worker processes. The per-sample stats
    dictionaries are collected into a DataFrame restricted to
    ``STATS_COLUMNS``.

    Returns
    -------
    tuple
        ``(finalize_result(sequences, result, stats_df), stats_df)``.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()

    # Workers receive plain path strings rather than the format objects.
    source_dir, target_dir = str(sequences), str(result)
    job_args = [(source_dir, sample_id, target_dir)
                for sample_id in return_sample_ids(sequences)]

    with Pool(processes=threads) as pool:
        per_sample = pool.starmap(abundance_filter_sample, job_args)

    # Only the stats dict of each (fastq_path, stats) pair is needed here.
    stats_df = pd.DataFrame([stats for _fastq_path, stats in per_sample])
    stats_df = stats_df[STATS_COLUMNS]

    return finalize_result(sequences, result, stats_df), stats_df
Beispiel #12
0
 def setUp(self):
     """Load the demultiplexed reads and the reference artifact fixtures."""
     super().setUp()

     seqs_dir = self.get_data_path('sample_seqs_other')
     self.demux_seqs = SingleLanePerSampleSingleEndFastqDirFmt(seqs_dir, 'r')

     reference_fp = self.get_data_path('../../assets/test_reference.qza')
     self.ref_ar = Artifact.load(reference_fp)
     self.ref = self.ref_ar.view(DNAFASTAFormat)
Beispiel #13
0
    def test_integer_ids(self):
        """Integer-like sample ids come back as strings in table and stats."""
        expected_ids = {'100', '101', '103', '104'}
        int_seqs = SingleLanePerSampleSingleEndFastqDirFmt(
            self.get_data_path('sample_seqs_integers'), 'r')

        obs_tab, _, stats = denoise_16S(int_seqs, 100, sample_stats=True)

        observed_table_ids = set(obs_tab.ids())
        observed_stats_ids = set(stats.index.values)
        self.assertEqual(observed_table_ids, expected_ids)
        self.assertEqual(observed_stats_ids, expected_ids)
Beispiel #14
0
    def setUp(self):
        """Build the multiplexed input and empty output directory formats."""
        super().setUp()

        self.fastq_fp = self.get_data_path('forward.fastq.gz')
        self.fastq = FastqGzFormat(self.fastq_fp, mode='r')

        # Multiplexed input holding the forward reads fixture.
        self.seqs_dir_fmt = MultiplexedSingleEndBarcodeInSequenceDirFmt()
        self.seqs_dir_fmt.file.write_data(self.fastq, FastqGzFormat)

        sample_ids = ['sample_a', 'sample_b']
        self.barcode_series = pd.Series(['A', 'G'], index=sample_ids)

        # Empty containers for the tests to fill.
        self.per_sample_dir_fmt = SingleLanePerSampleSingleEndFastqDirFmt()
        self.untrimmed_dir_fmt = MultiplexedSingleEndBarcodeInSequenceDirFmt()
Beispiel #15
0
    def test_underscore_samples(self):
        """denoise_single reproduces the expected table and sequences."""
        self.demux_seqs = SingleLanePerSampleSingleEndFastqDirFmt(
            self.get_data_path('underscore_samples'), 'r')

        table_fp = self.get_data_path('expected/underscore-samples.tsv')
        with open(table_fp) as fh:
            exp_table = biom.Table.from_tsv(fh, None, None, lambda x: x)

        fasta_fp = self.get_data_path('expected/underscore-samples.fasta')
        exp_rep_seqs = list(
            skbio.io.read(fasta_fp, 'fasta', constructor=skbio.DNA))
        # The description metadata is removed before comparing sequences.
        for expected_seq in exp_rep_seqs:
            del expected_seq.metadata['description']

        table, rep_seqs = denoise_single(self.demux_seqs, 100)

        self.assertEqual(table, exp_table)
        self.assertEqual(_sort_seqs(rep_seqs), _sort_seqs(exp_rep_seqs))
Beispiel #16
0
    def test_single_sample_multiple_files(self):
        """summarize copes with several files sharing one sample id."""
        # Note, this case came up on the QIIME 2 Forum, due to a user running
        # `summarize` with demuxed sequences that all had the same Sample ID
        # in the internal MANIFEST file. With versions of seaborn greater than
        # 0.8.0 this is no longer a problem, but on prior versions, the user
        # would see the following error:
        # TypeError: len() of unsized object
        fixture_names = (
            'sample1_S0_L001_R1_001.fastq.gz',
            'sample2_S0_L001_R1_001.fastq.gz',
            'sample3_S0_L001_R1_001.fastq.gz',
            'MANIFEST',
            'metadata.yml',
        )
        for fixture_name in fixture_names:
            source_fp = self.get_data_path(
                'single_sample_multiple_files/%s' % fixture_name)
            shutil.copy(source_fp, self.temp_dir.name)

        demux_data = SingleLanePerSampleSingleEndFastqDirFmt(
            self.temp_dir.name, mode='r')

        # (output file name, whether it must also be non-empty)
        expected_outputs = (
            ('overview.html', True),
            ('per-sample-fastq-counts.csv', True),
            ('demultiplex-summary.pdf', False),
            ('demultiplex-summary.png', False),
        )

        with tempfile.TemporaryDirectory() as output_dir:
            # TODO: Remove _PlotQualView wrapper
            view = _PlotQualView(demux_data, paired=False)
            result = summarize(output_dir, view, n=1)
            self.assertTrue(result is None)

            for out_name, must_have_content in expected_outputs:
                out_fp = os.path.join(output_dir, out_name)
                self.assertTrue(os.path.exists(out_fp))
                if must_have_content:
                    self.assertTrue(os.path.getsize(out_fp) > 0)

            index_fp = os.path.join(output_dir, 'overview.html')
            with open(index_fp, 'r') as fh:
                html = fh.read()
                self.assertIn('<td>Minimum:</td><td>1</td>', html)
                self.assertIn('<td>Maximum:</td><td>1</td>', html)
Beispiel #17
0
def abundance_filter_single_thread(
    sequences: SingleLanePerSampleSingleEndFastqDirFmt
) -> (SingleLanePerSampleSingleEndFastqDirFmt, pd.DataFrame):
    """Abundance-filter every sample sequentially in the current process.

    ``abundance_filter_sample`` is called once per sample id; the stats
    dictionaries it returns are gathered into a DataFrame restricted to
    ``STATS_COLUMNS``.

    Returns
    -------
    tuple
        ``(finalize_result(sequences, result, stats_df), stats_df)``.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()

    collected_stats = []
    for sample_id in return_sample_ids(sequences):
        # The returned fastq path is not needed here, only the stats.
        _fastq_path, sample_stats = abundance_filter_sample(
            demux_path_str=str(sequences),
            sample_id=sample_id,
            new_demux_path_str=str(result))
        collected_stats.append(sample_stats)

    stats_df = pd.DataFrame(collected_stats)[STATS_COLUMNS]

    return finalize_result(sequences, result, stats_df), stats_df
Beispiel #18
0
    def setUp(self):
        """Open the single-end fixture and create an empty output format."""
        super().setUp()

        single_end_dir = self.get_data_path('single-end')
        self.demux_seqs = SingleLanePerSampleSingleEndFastqDirFmt(
            single_end_dir, mode='r')
        self.trimmed_seqs = CasavaOneEightSingleLanePerSampleDirFmt()
Beispiel #19
0
# Paired-end fixture opened read-only from TEST_FILE_PAF (defined earlier in
# the module).
TEST_DATA_PAF = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_PAF, "r")
# Test info 4: data directory under test_data/out, opened read-only as a
# paired-end directory format.
TEST_FILE_OUT = os.path.join(TEST_DIR, "test_data", "out",
                             "d9955749-00d5-44ae-a628-4b2da43000e1", "data")
TEST_DATA_OUT = SingleLanePerSamplePairedEndFastqDirFmt(TEST_FILE_OUT, "r")
# Test info 5: data directory under test_data/singleOut.
# NOTE(review): despite the "single" name this is opened with the
# paired-end format - confirm that is intended.
TEST_FILE_SINGLEOUT = os.path.join(TEST_DIR, "test_data", "singleOut",
                                   "75aea4f5-f10e-421e-91d2-feda9fe7b2e1",
                                   "data")
TEST_DATA_SINGLEOUT = SingleLanePerSamplePairedEndFastqDirFmt(
    TEST_FILE_SINGLEOUT, "r")
# Test info 6: data directory under test_data/singleIn, opened read-only as
# a single-end directory format.
TEST_FILE_SINGLEIN = os.path.join(TEST_DIR, "test_data", "singleIn",
                                  "cfd0e65b-05fb-4329-9618-15ecd0aec9b3",
                                  "data")
TEST_DATA_SINGLEIN = SingleLanePerSampleSingleEndFastqDirFmt(
    TEST_FILE_SINGLEIN, "r")
# Test artifact1: semantic type string for paired-end reads.
ARTIFACT_TYPE_P = "SampleData[PairedEndSequencesWithQuality]"
# Test artifact2: semantic type string for single-end reads.
ARTIFACT_TYPE_S = "SampleData[SequencesWithQuality]"


def test_taxa_prefix_to_taxa():
    """The prefix "A" is expected to expand to "Alveolata"."""
    resolved_taxa = _itsxpress._taxa_prefix_to_taxa(taxa_prefix="A")
    eq_(resolved_taxa, "Alveolata")


class SetFastqsAndCheckTests(unittest.TestCase):
    def test_single(self):
        samples = TEST_DATA_SINGLEIN.manifest.view(pd.DataFrame)
        for sample in samples.itertuples():
Beispiel #20
0
 def setUp(self):
     """Open the single-end sample sequences fixture read-only."""
     super().setUp()
     single_dir = self.get_data_path('sample_seqs_single')
     self.demux_seqs = SingleLanePerSampleSingleEndFastqDirFmt(single_dir, 'r')
Beispiel #21
0
 def setUp(self):
     """Open the shared single-end fixture used by this suite."""
     super().setUp()
     # Reusing the single-end reads for this test suite
     single_dir = self.get_data_path('sample_seqs_single')
     self.demux_seqs = SingleLanePerSampleSingleEndFastqDirFmt(single_dir, 'r')
def _join_pairs_w_command_output(
    demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
    truncqual: int = _jp_defaults['truncqual'],
    minlen: int = _jp_defaults['minlen'],
    maxns: int = _jp_defaults['maxns'],
    allowmergestagger: bool = _jp_defaults['allowmergestagger'],
    minovlen: int = _jp_defaults['minovlen'],
    maxdiffs: int = _jp_defaults['maxdiffs'],
    minmergelen: int = _jp_defaults['minmergelen'],
    maxmergelen: int = _jp_defaults['maxmergelen'],
    maxee: float = _jp_defaults['maxee'],
    qmin: int = _jp_defaults['qmin'],
    qminout: int = _jp_defaults['qminout'],
    qmax: int = _jp_defaults['qmax'],
    qmaxout: int = _jp_defaults['qmaxout'],
) -> (List[str], SingleLanePerSampleSingleEndFastqDirFmt):
    """Merge every sample's read pair with ``vsearch --fastq_mergepairs``.

    For each sample in the input manifest a ``vsearch`` command is built and
    run, its uncompressed output is gzipped, and a line is appended to the
    output manifest. The input phred offset is copied to the result metadata.

    Parameters
    ----------
    demultiplexed_seqs
        Paired-end input; its manifest and metadata files are read from disk.
    minlen, minovlen, maxdiffs, qmin, qminout, qmax, qmaxout
        Always forwarded to the ``--fastq_<name>`` option of ``vsearch``.
    truncqual, maxns, minmergelen, maxmergelen, maxee
        Forwarded to ``--fastq_<name>`` only when not ``None``.
    allowmergestagger : bool
        Adds ``--fastq_allowmergestagger`` when true.

    Returns
    -------
    tuple
        ``(cmd, result)`` where ``cmd`` is the ``vsearch`` argument list of
        the last sample processed (an empty list if the manifest had no
        samples) and ``result`` is the populated single-end directory format.
    """
    # this function exists only to simplify unit testing

    result = SingleLanePerSampleSingleEndFastqDirFmt()

    manifest = pd.read_csv(os.path.join(str(demultiplexed_seqs),
                                        demultiplexed_seqs.manifest.pathspec),
                           header=0,
                           comment='#')
    manifest.filename = manifest.filename.apply(
        lambda x: os.path.join(str(demultiplexed_seqs), x))

    metadata_fp = os.path.join(str(demultiplexed_seqs),
                               demultiplexed_seqs.metadata.pathspec)
    # safe_load + context manager: yaml.load() without a Loader fails on
    # PyYAML >= 6, and the bare open() never closed the file.
    with open(metadata_fp) as metadata_fh:
        phred_offset = yaml.safe_load(metadata_fh)['phred-offset']

    id_to_fps = manifest.pivot(index='sample-id',
                               columns='direction',
                               values='filename')

    # Stays empty when there is nothing to merge, so the return statement
    # never hits an unbound name.
    cmd = []

    output_manifest = FastqManifestFormat()
    output_manifest_fh = output_manifest.open()
    try:
        output_manifest_fh.write('sample-id,filename,direction\n')
        output_manifest_fh.write('# direction is not meaningful in this file '
                                 'as these\n')
        output_manifest_fh.write('# data may be derived from forward, '
                                 'reverse, or \n')
        output_manifest_fh.write('# joined reads\n')

        for i, (sample_id, (fwd_fp, rev_fp)) in enumerate(
                id_to_fps.iterrows()):
            # The barcode id, lane number and read number are not relevant
            # here. We might ultimately want to use a dir format other than
            # SingleLanePerSampleSingleEndFastqDirFmt which doesn't care
            # about this information. Similarly, the direction of the read
            # isn't relevant here anymore.
            path = result.sequences.path_maker(sample_id=sample_id,
                                               barcode_id=i,
                                               lane_number=1,
                                               read_number=1)
            # Drop only a trailing '.gz'; str.strip('.gz') would remove any
            # run of '.', 'g' or 'z' characters from BOTH ends of the path.
            path_str = str(path)
            if path_str.endswith('.gz'):
                uncompressed_path = path_str[:-len('.gz')]
            else:
                uncompressed_path = path_str

            cmd = [
                'vsearch',
                '--fastq_mergepairs',
                fwd_fp,
                '--reverse',
                rev_fp,
                '--fastqout',
                uncompressed_path,
                '--fastq_ascii',
                str(phred_offset),
                '--fastq_minlen',
                str(minlen),
                '--fastq_minovlen',
                str(minovlen),
                '--fastq_maxdiffs',
                str(maxdiffs),
                '--fastq_qmin',
                str(qmin),
                '--fastq_qminout',
                str(qminout),
                '--fastq_qmax',
                str(qmax),
                '--fastq_qmaxout',
                str(qmaxout),
            ]
            if truncqual is not None:
                cmd += ['--fastq_truncqual', str(truncqual)]
            if maxns is not None:
                cmd += ['--fastq_maxns', str(maxns)]
            if minmergelen is not None:
                cmd += ['--fastq_minmergelen', str(minmergelen)]
            if maxmergelen is not None:
                cmd += ['--fastq_maxmergelen', str(maxmergelen)]
            if maxee is not None:
                cmd += ['--fastq_maxee', str(maxee)]
            if allowmergestagger:
                cmd.append('--fastq_allowmergestagger')
            run_command(cmd)
            run_command(['gzip', uncompressed_path])
            output_manifest_fh.write('%s,%s,%s\n' %
                                     (sample_id, path.name, 'forward'))
    finally:
        # Close the manifest even when a vsearch/gzip invocation fails.
        output_manifest_fh.close()

    result.manifest.write_data(output_manifest, FastqManifestFormat)

    metadata = YamlFormat()
    metadata.path.write_text(yaml.dump({'phred-offset': phred_offset}))
    result.metadata.write_data(metadata, YamlFormat)

    return cmd, result
Beispiel #23
0
def main(per_sample_sequences: _SingleLanePerSampleFastqDirFmt, threads: int,
         taxa: str, region: str, paired: bool, cluster_id: float):
    """The main communication between the plugin and the ITSxpress program.

    Every sample is dereplicated or clustered, searched with HMMER for the
    requested ITS region, trimmed, and written into a fresh result directory
    together with a manifest and metadata file.

    Args:
        per_sample_sequences: The demultiplexed input reads.
        threads (int): The number of threads to use.
        taxa (str): Taxa prefix; expanded with ``_taxa_prefix_to_taxa``.
        region (str): The region to be used for the search.
        paired (bool): When true, paired trimmed reads are written into a
            SingleLanePerSamplePairedEndFastqDirFmt; otherwise single trimmed
            reads are written into a SingleLanePerSampleSingleEndFastqDirFmt.
        cluster_id (float): The percent identity for clustering reads. Values
            within a relative tolerance of 1e-05 of 1 use exact dereplication.

    Returns:
        The populated result directory format (paired-end or single-end,
        depending on ``paired``).

    Raises:
        ValueError: The hmmsearch step failed with ModuleNotFoundError,
            FileNotFoundError or NotADirectoryError; the original exception
            is attached as the cause.

    """
    # Finding the artifact type.
    artifact_type = _view_artifact_type(
        per_sample_sequence=per_sample_sequences)
    # Setting the taxa
    taxa = _taxa_prefix_to_taxa(taxa)
    # Writing the manifest for the output qza
    manifest = FastqManifestFormat()
    manifest_fn = manifest.open()
    try:
        manifest_fn.write('sample-id,filename,direction\n')
        # Getting the sequences from the manifest
        sequences, single_end = _fastq_id_maker(
            per_sample_sequences=per_sample_sequences,
            artifact_type=artifact_type)
        barcode = 0
        # Creating result dir
        if paired:
            results = SingleLanePerSamplePairedEndFastqDirFmt()
        else:
            results = SingleLanePerSampleSingleEndFastqDirFmt()

        # Running the for loop for each sample
        for sequence in sequences:
            # writing fastqs and their attributes and checking the files
            sequence_id, sobj = _set_fastqs_and_check(
                per_sample_sequences=per_sample_sequences,
                artifact_type=artifact_type,
                sequence=sequence,
                single_end=single_end,
                threads=threads)

            # Deduplicate
            if math.isclose(cluster_id, 1, rel_tol=1e-05):
                sobj.deduplicate(threads=threads)
            else:
                sobj.cluster(threads=threads, cluster_id=cluster_id)
            try:
                # HMMSearch for ITS regions
                hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs",
                                       taxa_dict[taxa])
                sobj._search(hmmfile=hmmfile, threads=threads)
            except (ModuleNotFoundError, FileNotFoundError,
                    NotADirectoryError) as err:
                # Chain the cause so the underlying failure stays visible.
                raise ValueError(
                    "hmmsearch was not found, make sure HMMER3 is installed and executable"
                ) from err

            # Parse HMMseach output.
            its_pos = itsxpress.ItsPosition(domtable=sobj.dom_file,
                                            region=region)
            # Create deduplication object.
            dedup_obj = itsxpress.Dedup(uc_file=sobj.uc_file,
                                        rep_file=sobj.rep_file,
                                        seq_file=sobj.seq_file,
                                        fastq=sobj.r1,
                                        fastq2=sobj.fastq2)

            path_forward = results.sequences.path_maker(
                sample_id=sequence_id,
                barcode_id=barcode,
                lane_number=1,
                read_number=1)
            path_reverse = results.sequences.path_maker(
                sample_id=sequence_id,
                barcode_id=barcode,
                lane_number=1,
                read_number=2)

            # NOTE(review): only the forward file is listed in the manifest,
            # even when `paired` is true and a reverse file is written below
            # - confirm whether a "reverse" manifest line is required.
            manifest_fn.write("{},{},forward\n".format(sequence_id,
                                                       path_forward.name))
            # Create trimmed sequences.
            if paired:
                dedup_obj.create_paired_trimmed_seqs(str(path_forward),
                                                     str(path_reverse),
                                                     gzipped=True,
                                                     itspos=its_pos)
            else:
                dedup_obj.create_trimmed_seqs(str(path_forward),
                                              gzipped=True,
                                              itspos=its_pos)
            # Deleting the temp files.
            shutil.rmtree(sobj.tempdir)
            # Adding one to the barcode
            barcode += 1
    finally:
        # Close the manifest handle even if a sample fails part-way through.
        manifest_fn.close()
    # Writing out the results.
    _write_metadata(results=results)
    results.manifest.write_data(manifest, FastqManifestFormat)
    return results
Beispiel #24
0
 def test_integer_ids_with_underscores(self):
     """denoise_16S must reject the underscore-containing integer ids."""
     fixture_dir = self.get_data_path('sample_seqs_integers_underscore')
     bad_seqs = SingleLanePerSampleSingleEndFastqDirFmt(fixture_dir, 'r')

     with self.assertRaisesRegex(ValueError, 'Deblur cannot.*100_100.'):
         denoise_16S(bad_seqs, 100)
Beispiel #25
0
def emp_single(
    seqs: BarcodeSequenceFastqIterator,
    barcodes: qiime2.MetadataCategory,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False
) -> SingleLanePerSampleSingleEndFastqDirFmt:
    """Demultiplex single-end reads into one gzipped fastq per sample.

    Each (barcode record, sequence record) pair from ``seqs`` is assigned to
    a sample by looking up the barcode read, truncated to the barcode length,
    in the map built by ``_make_barcode_map``. Reads whose barcode is not in
    the map are skipped.

    Raises
    ------
    ValueError
        If no read could be assigned to any sample.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    for header_line in (
            'sample-id,filename,direction\n',
            '# direction is not meaningful in this file as these\n',
            '# data may be derived from forward, reverse, or \n',
            '# joined reads\n'):
        manifest_fh.write(header_line)

    # sample id -> gzip handle (possibly closed, see the re-open step below)
    handles_by_sample = {}

    for barcode_record, sequence_record in seqs:
        observed_barcode = barcode_record[1]
        if rev_comp_barcodes:
            observed_barcode = str(
                skbio.DNA(observed_barcode).reverse_complement())
        observed_barcode = observed_barcode[:barcode_len]

        try:
            sample_id = barcode_map[observed_barcode]
        except KeyError:
            # TODO: this should ultimately be logged, but we don't currently
            # have support for that.
            continue

        if sample_id not in handles_by_sample:
            # The barcode id, lane number and read number are not relevant
            # here. We might ultimately want to use a dir format other than
            # SingleLanePerSampleSingleEndFastqDirFmt which doesn't care
            # about this information. Similarly, the direction of the read
            # isn't relevant here anymore.
            out_path = result.sequences.path_maker(
                sample_id=sample_id,
                barcode_id=len(handles_by_sample) + 1,
                lane_number=1,
                read_number=1)
            _maintain_open_fh_count(handles_by_sample)
            handles_by_sample[sample_id] = gzip.open(str(out_path), mode='a')
            manifest_fh.write(
                '%s,%s,%s\n' % (sample_id, out_path.name, 'forward'))

        # A handle found closed is re-opened in append mode before writing.
        if handles_by_sample[sample_id].closed:
            _maintain_open_fh_count(handles_by_sample)
            previous_name = handles_by_sample[sample_id].name
            handles_by_sample[sample_id] = gzip.open(previous_name, mode='a')

        record_bytes = ('\n'.join(sequence_record) + '\n').encode('utf-8')
        handles_by_sample[sample_id].write(record_bytes)

    if not handles_by_sample:
        raise ValueError('No sequences were mapped to samples. Check that '
                         'your barcodes are in the correct orientation (see '
                         'the rev_comp_barcodes and/or '
                         'rev_comp_mapping_barcodes options).')

    for sample_handle in handles_by_sample.values():
        sample_handle.close()

    manifest_fh.close()
    result.manifest.write_data(manifest, FastqManifestFormat)

    _write_metadata_yaml(result)

    return result
Beispiel #26
0
def q_score(demux: SingleLanePerSampleSingleEndFastqDirFmt,
            min_quality: int = _default_params['min_quality'],
            quality_window: int = _default_params['quality_window'],
            min_length_fraction:
            float = _default_params['min_length_fraction'],
            max_ambiguous: int = _default_params['max_ambiguous']) \
                  -> (SingleLanePerSampleSingleEndFastqDirFmt,
                      pd.DataFrame):
    """Quality-filter demultiplexed reads and report per-sample statistics.

    Each read is truncated at the start of the first run of more than
    ``quality_window`` consecutive positions flagged as below
    ``min_quality``. A truncated read is dropped when the ratio of truncated
    to original length (rounded to 3 decimals) is ``<=``
    ``min_length_fraction``. Any read with more than ``max_ambiguous`` ``N``
    characters is dropped as well.

    Parameters
    ----------
    demux
        Input reads; the phred offset and manifest are read from it.
    min_quality : int
        Minimum acceptable quality score.
    quality_window : int
        Longest tolerated run of low-quality positions.
    min_length_fraction : float
        Truncated reads at or below this fraction of their original length
        are discarded.
    max_ambiguous : int
        Maximum number of ``N`` characters tolerated in a read.

    Returns
    -------
    tuple
        ``(result, stats)``: the filtered directory format and a DataFrame
        indexed by ``sample-id`` with read counts per filtering outcome.
        Samples with no retained reads get no output file and no manifest
        entry.

    Raises
    ------
    ValueError
        If every processed sample ends up with zero retained reads.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    manifest_fh.write('sample-id,filename,direction\n')
    manifest_fh.write('# direction is not meaningful in this file as these\n')
    manifest_fh.write('# data may be derived from forward, reverse, or \n')
    manifest_fh.write('# joined reads\n')

    log_records_truncated_counts = {}
    log_records_max_ambig_counts = {}
    log_records_tooshort_counts = {}
    log_records_totalread_counts = {}
    log_records_totalkept_counts = {}

    metadata_fh = demux.metadata.view(YamlFormat).open()
    try:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and is unsafe on older releases.
        phred_offset = yaml.safe_load(metadata_fh)['phred-offset']
    finally:
        metadata_fh.close()

    demux_manifest = demux.manifest.view(demux.manifest.format)
    demux_manifest_fh = demux_manifest.open()
    try:
        demux_manifest = pd.read_csv(demux_manifest_fh, dtype=str)
    finally:
        demux_manifest_fh.close()
    demux_manifest.set_index('filename', inplace=True)

    iterator = demux.sequences.iter_views(FastqGzFormat)
    for bc_id, (fname, fp) in enumerate(iterator):
        sample_id = demux_manifest.loc[str(fname)]['sample-id']

        log_records_truncated_counts[sample_id] = 0
        log_records_max_ambig_counts[sample_id] = 0
        log_records_tooshort_counts[sample_id] = 0
        log_records_totalread_counts[sample_id] = 0
        log_records_totalkept_counts[sample_id] = 0

        # per q2-demux, barcode ID, lane number and read number are not
        # relevant here
        path = result.sequences.path_maker(sample_id=sample_id,
                                           barcode_id=bc_id,
                                           lane_number=1,
                                           read_number=1)

        # we do not open a writer by default in the event that all sequences
        # for a sample are filtered out; an empty fastq file is not a valid
        # fastq file.
        writer = None
        for sequence_record in _read_fastq_seqs(str(fp), phred_offset):
            log_records_totalread_counts[sample_id] += 1

            # determine the length of the runs below quality threshold
            # NOTE: QIIME 1.x used <= the quality threshold and the parameter
            #   -q was interpreted as the maximum unacceptable PHRED score. In
            #   QIIME 2.x, we're now interpreting this as the minimum
            #   acceptable score.
            qual_below_threshold = sequence_record[4] < min_quality
            run_starts, run_lengths = _runs_of_ones(qual_below_threshold)
            bad_windows = np.argwhere(run_lengths > quality_window)

            # if there is a run of sufficient size, truncate it
            if bad_windows.size > 0:
                log_records_truncated_counts[sample_id] += 1

                full_length = len(sequence_record[1])
                sequence_record = _truncate(sequence_record,
                                            run_starts[bad_windows[0]][0])
                trunc_length = len(sequence_record[1])

                # do not keep the read if it is too short following truncation
                if round(trunc_length / full_length, 3) <= min_length_fraction:
                    log_records_tooshort_counts[sample_id] += 1
                    continue

            # do not keep the read if there are too many ambiguous bases
            if sequence_record[1].count('N') > max_ambiguous:
                log_records_max_ambig_counts[sample_id] += 1
                continue

            fastq_lines = '\n'.join(sequence_record[:4]) + '\n'
            fastq_lines = fastq_lines.encode('utf-8')

            if writer is None:
                writer = gzip.open(str(path), mode='w')
            writer.write(fastq_lines)

            log_records_totalkept_counts[sample_id] += 1

        if writer is not None:
            manifest_fh.write('%s,%s,%s\n' % (sample_id, path.name, 'forward'))
            writer.close()

    # Close before the check below so the handle is released even when the
    # "everything was filtered" error is raised.
    manifest_fh.close()

    if set(log_records_totalkept_counts.values()) == {0, }:
        raise ValueError("All sequences from all samples were filtered out. "
                         "The parameter choices may be too stringent for the "
                         "data.")

    result.manifest.write_data(manifest, FastqManifestFormat)

    metadata = YamlFormat()
    metadata.path.write_text(yaml.dump({'phred-offset': phred_offset}))
    result.metadata.write_data(metadata, YamlFormat)

    columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
               'reads-truncated',
               'reads-too-short-after-truncation',
               'reads-exceeding-maximum-ambiguous-bases']
    stats = [[id_, log_records_totalread_counts[id_],
              log_records_totalkept_counts[id_],
              log_records_truncated_counts[id_],
              log_records_tooshort_counts[id_],
              log_records_max_ambig_counts[id_]]
             for id_ in sorted(log_records_truncated_counts)]

    stats = pd.DataFrame(stats, columns=columns).set_index('sample-id')

    return result, stats
Beispiel #27
0
def emp_single(
    seqs: BarcodeSequenceFastqIterator,
    barcodes: qiime2.CategoricalMetadataColumn,
    golay_error_correction: bool = True,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False,
    ignore_description_mismatch: bool = False
) -> (SingleLanePerSampleSingleEndFastqDirFmt, ErrorCorrectionDetailsFmt):
    """Demultiplex barcoded single-end reads into one FASTQ file per sample.

    Every (barcode record, sequence record) pair yielded by ``seqs`` is
    logged to the error-correction details. Reads whose (optionally
    Golay-decoded) barcode maps to a sample id are appended to that sample's
    gzipped FASTQ file, and each sample gets one row in the output manifest.

    Args:
        seqs: Iterable yielding ``(barcode_record, sequence_record)`` pairs.
            Index 0 of a barcode record is copied into the details row and
            index 1 is used as the barcode read.
        barcodes: Metadata column passed to ``_make_barcode_map`` to build
            the barcode -> sample-id lookup and the barcode length.
        golay_error_correction: If True, decode each barcode with
            ``GolayDecoder`` before looking it up.
        rev_comp_barcodes: If True, reverse-complement each barcode read
            before truncating it to the barcode length.
        rev_comp_mapping_barcodes: Forwarded to ``_make_barcode_map``.
        ignore_description_mismatch: Stored on ``seqs`` before iterating.

    Returns:
        A ``(result, ec_details_fmt)`` tuple: the per-sample FASTQ directory
        format and the error-correction details format.

    Raises:
        ValueError: If no read could be assigned to any sample.
    """
    seqs.ignore_description_mismatch = ignore_description_mismatch
    result = SingleLanePerSampleSingleEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)

    # The decoder is only needed (and only used) when correction is enabled.
    decoder = GolayDecoder() if golay_error_correction else None

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    per_sample_fastqs = {}

    # Everything that writes through the open handles lives inside the try
    # block so the handles are released even when an error is raised
    # (including the "no sequences mapped" ValueError below).
    try:
        manifest_fh.write('sample-id,filename,direction\n')
        manifest_fh.write(
            '# direction is not meaningful in this file as these\n')
        manifest_fh.write('# data may be derived from forward, reverse, or \n')
        manifest_fh.write('# joined reads\n')

        ec_details_fmt = ErrorCorrectionDetailsFmt()
        ec_details = ECDetails(ec_details_fmt)

        for i, (barcode_record, sequence_record) in enumerate(seqs, start=1):
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            raw_barcode_read = barcode_read[:barcode_len]

            if golay_error_correction:
                # A three bit filter is implicitly used by the decoder. See
                # Hamady and Knight 2009 Genome Research for the
                # justification:
                #
                # https://genome.cshlp.org/content/19/7/1141.full
                #
                # Specifically that "...Golay codes of 12 bases can correct
                # all triple-bit errors and detect all quadruple-bit errors."
                barcode_read, ecc_errors = decoder.decode(raw_barcode_read)
                golay_stats = [barcode_read, ecc_errors]
            else:
                barcode_read = raw_barcode_read
                golay_stats = [None, None]

            sample_id = barcode_map.get(barcode_read)

            # Every read is logged, including those that match no sample
            # (sample_id is None for those).
            record = [
                f'record-{i}',
                sample_id,
                barcode_record[0],
                raw_barcode_read,
            ]
            ec_details.write(record + golay_stats)

            if sample_id is None:
                continue

            if sample_id not in per_sample_fastqs:
                # The barcode id, lane number and read number are not
                # relevant here. We might ultimately want to use a dir format
                # other than SingleLanePerSampleSingleEndFastqDirFmt which
                # doesn't care about this information. Similarly, the
                # direction of the read isn't relevant here anymore.
                barcode_id = len(per_sample_fastqs) + 1
                path = result.sequences.path_maker(sample_id=sample_id,
                                                   barcode_id=barcode_id,
                                                   lane_number=1,
                                                   read_number=1)
                # NOTE(review): presumably closes some handles to stay under
                # an open-file limit - confirm against the helper.
                _maintain_open_fh_count(per_sample_fastqs)
                per_sample_fastqs[sample_id] = gzip.open(str(path), mode='a')
                manifest_fh.write(
                    '%s,%s,%s\n' % (sample_id, path.name, 'forward'))

            if per_sample_fastqs[sample_id].closed:
                # Re-open in append mode so earlier reads are preserved.
                _maintain_open_fh_count(per_sample_fastqs)
                per_sample_fastqs[sample_id] = gzip.open(
                    per_sample_fastqs[sample_id].name, mode='a')

            fastq_lines = '\n'.join(sequence_record) + '\n'
            fastq_lines = fastq_lines.encode('utf-8')
            per_sample_fastqs[sample_id].write(fastq_lines)

        if not per_sample_fastqs:
            raise ValueError('No sequences were mapped to samples. Check that '
                             'your barcodes are in the correct orientation '
                             '(see the rev_comp_barcodes and/or '
                             'rev_comp_mapping_barcodes options). If barcodes '
                             'are NOT Golay format set golay_error_correction '
                             'to False.')
    finally:
        for fh in per_sample_fastqs.values():
            fh.close()
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)

    _write_metadata_yaml(result)

    return result, ec_details_fmt
Beispiel #28
0
def join_pairs(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
               threads: int = 1) -> SingleLanePerSampleSingleEndFastqDirFmt:
    """Join each sample's forward and reverse reads with the ``pear`` tool.

    The input manifest is pivoted into one row per sample id. For every
    sample ``pear`` is run with the sample id as output prefix, its
    ``<sample>.assembled.fastq`` output is renamed to the result path and
    gzipped, and the discarded/unassembled outputs are removed.

    Args:
        demultiplexed_seqs: Paired-end per-sample FASTQ directory whose
            manifest and metadata files are read.
        threads: Value passed to ``pear --threads``.

    Returns:
        A single-end per-sample FASTQ directory holding the joined reads,
        with a manifest and the input's ``phred-offset`` metadata.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()
    input_dir = str(demultiplexed_seqs)

    manifest = pd.read_csv(os.path.join(input_dir,
                                        demultiplexed_seqs.manifest.pathspec),
                           header=0,
                           comment='#')

    # Manifest filenames are relative to the input directory.
    manifest.filename = manifest.filename.apply(
        lambda x: os.path.join(input_dir, x))

    metadata_fp = os.path.join(input_dir,
                               demultiplexed_seqs.metadata.pathspec)
    # Use a context manager so the metadata file handle is always closed.
    # NOTE(review): yaml.load without an explicit Loader is deprecated in
    # PyYAML and can construct arbitrary objects; consider yaml.safe_load.
    with open(metadata_fp) as metadata_fh:
        phred_offset = yaml.load(metadata_fh)['phred-offset']

    id_to_fps = manifest.pivot(index='sample-id',
                               columns='direction',
                               values='filename')

    output_manifest = FastqManifestFormat()
    output_manifest_fh = output_manifest.open()
    try:
        output_manifest_fh.write('sample-id,filename,direction\n')
        output_manifest_fh.write('# direction is not meaningful in this file '
                                 'as these\n')
        output_manifest_fh.write('# data may be derived from forward, '
                                 'reverse, or \n')
        output_manifest_fh.write('# joined reads\n')

        # The (fwd_fp, rev_fp) unpacking relies on the pivoted columns being
        # exactly 'forward' and 'reverse', in that order.
        for i, (sample_id, (fwd_fp, rev_fp)) in enumerate(
                id_to_fps.iterrows()):
            # The barcode id, lane number and read number are not relevant
            # here. We might ultimately want to use a dir format other than
            # SingleLanePerSampleSingleEndFastqDirFmt which doesn't care
            # about this information. Similarly, the direction of the read
            # isn't relevant here anymore.
            path = result.sequences.path_maker(sample_id=sample_id,
                                               barcode_id=i,
                                               lane_number=1,
                                               read_number=1)

            # Drop only the trailing '.gz' suffix. str.strip('.gz') would
            # remove any leading/trailing '.', 'g' or 'z' characters instead.
            path_str = str(path)
            if path_str.endswith('.gz'):
                uncompressed_path = path_str[:-len('.gz')]
            else:
                uncompressed_path = path_str

            parent_pth = Path(path).parent

            # pear appends its own suffixes to this output prefix.
            sample_id_path = str(parent_pth / sample_id)

            assembled_pth = parent_pth / "{}.assembled.fastq".format(
                sample_id)
            discarded_pth = parent_pth / "{}.discarded.fastq".format(
                sample_id)
            unassembled_fwd_pth = (
                parent_pth / "{}.unassembled.forward.fastq".format(sample_id))
            unassembled_rev_pth = (
                parent_pth / "{}.unassembled.reverse.fastq".format(sample_id))

            cmd = [
                'pear', '-f', fwd_fp, '-r', rev_fp, '-o', sample_id_path,
                '--threads',
                str(threads)
            ]

            run_command(cmd)

            # Move the joined reads to the expected name, then compress them.
            assembled_pth.rename(Path(uncompressed_path))
            run_command(['gzip', uncompressed_path])

            # Best-effort removal of the outputs we do not keep. Only
            # filesystem errors are ignored, so interrupts still propagate.
            for f_pth in (discarded_pth, unassembled_fwd_pth,
                          unassembled_rev_pth):
                try:
                    os.remove(str(f_pth))
                except OSError:
                    pass

            output_manifest_fh.write('%s,%s,%s\n' %
                                     (sample_id, Path(path).name, 'forward'))
    finally:
        output_manifest_fh.close()

    result.manifest.write_data(output_manifest, FastqManifestFormat)

    metadata = YamlFormat()
    metadata.path.write_text(yaml.dump({'phred-offset': phred_offset}))
    result.metadata.write_data(metadata, YamlFormat)

    return result
Beispiel #29
0
def main(per_sample_sequences, threads, taxa, region):
    """The main communication between the plugin and the ITSxpress program.

    Each distinct entry returned by ``_fastq_id_maker`` is checked, merged
    if paired, deduplicated, searched with hmmsearch and trimmed into one
    gzipped FASTQ file in the result directory.

    Args:

        per_sample_sequences (SingleLanePerSampleSingleEndFastqDirFmt): The
        SingleLanePerSampleSingleEndFastqDirFmt type of the input.
        threads (int) : The number of threads to use.
        taxa (str): The taxa to be used for the search.
        region (str) : The region to be used for the search.

    Returns:

        (SingleLanePerSampleSingleEndFastqDirFmt): The
        SingleLanePerSampleSingleEndFastqDirFmt type of the output.

    Raises:

        ValueError: BBTools error or fastq format issue.
        ValueError: BBmerge error.
        ValueError: hmmsearch error.

    """
    # Setting a temp folder. tempfile.tempdir is None until the tempfile
    # module has resolved it, so ask for the resolved directory explicitly.
    dirt = tempfile.gettempdir()
    # Setting the current dir (note: this changes the process-wide cwd).
    os.chdir(str(per_sample_sequences.path))
    # Finding the artifact type.
    artifactType = _view_artifact_type()
    is_paired_artifact = (
        "SampleData[PairedEndSequencesWithQuality]" in artifactType)
    # Setting the taxa
    taxa = _taxa_prefix_to_taxa(taxa)
    # Writing the manifest for the output qza
    manifest = FastqManifestFormat()
    manifest_fn = manifest.open()
    manifest_fn.write('sample-id,filename,direction\n')
    sequences, singleEnd = _fastq_id_maker(per_sample_sequences, artifactType)
    # De-duplicate while keeping the input order, so the barcode id given to
    # each sample (and thus its output filename) is the same on every run.
    # Iterating a set would make that order arbitrary.
    unique_sequences = list(dict.fromkeys(sequences))
    # Creating result dir
    results = SingleLanePerSampleSingleEndFastqDirFmt()
    # Running the for loop for each sample
    for barcode, sequence in enumerate(unique_sequences):

        # Setting the fastq files and if singleEnd is used.
        fastq = os.path.join(str(per_sample_sequences.path), str(sequence[0]))
        if is_paired_artifact:
            fastq2 = os.path.join(str(per_sample_sequences.path),
                                  str(sequence[1]))
        else:
            fastq2 = sequence[1]
        sequenceID = sequence[2]
        # Running the main ITSxpress program.
        try:
            itsx._check_fastqs(fastq, fastq2)
            # Parse input types
            paired_end, interleaved = itsx._is_paired(fastq, fastq2, singleEnd)
        except Exception as err:
            # Chain the cause so the underlying failure stays visible, and
            # let KeyboardInterrupt/SystemExit propagate untouched.
            raise ValueError("There is a problem with the fastq file(s) you selected or\n"
                             "BBtools was not found. check that the BBtools reformat.sh package is executable.") from err
        # Create SeqSample objects and merge if needed.
        try:
            if paired_end and interleaved:
                sobj = itsx.SeqSamplePairedInterleaved(fastq=fastq, tempdir=dirt)
                sobj._merge_reads(threads=threads)

            elif paired_end and not interleaved:
                sobj = itsx.SeqSamplePairedNotInterleaved(fastq=fastq, fastq2=fastq2, tempdir=dirt)
                sobj._merge_reads(threads=threads)

            elif not paired_end and not interleaved:
                sobj = itsx.SeqSampleNotPaired(fastq=fastq, tempdir=dirt)

        except Exception as err:
            raise ValueError("BBmerge was not found. check that the BBmerge reformat.sh package is executible") from err

        # Deduplicate
        sobj._deduplicate(threads=threads)
        try:
            # HMMSearch for ITS regions
            hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs", taxa_dict[taxa])
            sobj._search(hmmfile=hmmfile, threads=threads)
        except Exception as err:
            raise ValueError("hmmsearch was not found, make sure HMMER3 is installed and executible") from err

        # Parse HMMsearch output.
        its_pos = itsx.ItsPosition(domtable=sobj.dom_file, region=region)
        # Create deduplication object.
        dedup_obj = itsx.Dedup(uc_file=sobj.uc_file, rep_file=sobj.rep_file, seq_file=sobj.seq_file)
        pathForward = results.sequences.path_maker(sample_id=sequenceID,
                                                   barcode_id=barcode,
                                                   lane_number=1,
                                                   read_number=1)

        manifest_fn.write("{},{},forward\n".format(sequenceID, pathForward.name))
        # Create trimmed sequences.
        dedup_obj.create_trimmed_seqs(str(pathForward), gzipped=True, itspos=its_pos)
        # Deleting the temp files.
        itsx.shutil.rmtree(sobj.tempdir)
    # Writing out the results.
    manifest_fn.close()
    _write_metadata(results)
    results.manifest.write_data(manifest, FastqManifestFormat)
    return results
Beispiel #30
0
def main(fastq, fastq2, singleEnd, threads, taxa, region):
    """The main communication between the plugin and the ITSxpress program.

    Args:

        fastq (str) : The first fastq location.
        fastq2 (str) : The second fastq location.
        singleEnd (bool) : boolean for if singleEnd is used or not.
        threads (int) : The number of threads to use.
        taxa (str): The taxa to be used for the search.
        region (str) : The region to be used for the search.

    Returns:

        (SingleLanePerSampleSingleEndFastqDirFmt): The
        SingleLanePerSampleSingleEndFastqDirFmt type of the output.

    Raises:

        ValueError: BBTools error or fastq format issue.
        ValueError: BBmerge error.
        ValueError: hmmsearch error.

    """
    # Parent directory handed to the ITSxpress sample objects as `tempdir`.
    dirt = "/tmp"
    try:
        itsx._check_fastqs(fastq, fastq2)
        # Parse input types
        paired_end, interleaved = itsx._is_paired(fastq, fastq2, singleEnd)
    except Exception as err:
        # Chain the cause so the underlying failure stays visible, and let
        # KeyboardInterrupt/SystemExit propagate untouched.
        raise ValueError("There is a problem with the fastq file(s) you selected or\n"
                         "BBtools was not found. check that the BBtools reformat.sh package is executable.") from err

    # Create SeqSample objects and merge if needed.
    try:
        if paired_end and interleaved:
            sobj = itsx.SeqSamplePairedInterleaved(fastq=fastq, tempdir=dirt)
            sobj._merge_reads(threads=threads)

        elif paired_end and not interleaved:
            sobj = itsx.SeqSamplePairedNotInterleaved(fastq=fastq, fastq2=fastq2, tempdir=dirt)
            # Paired reads must be merged before deduplication.
            sobj._merge_reads(threads=threads)

        elif not paired_end and not interleaved:
            # Single-end input gets its own sample type. Previously this
            # assignment sat inside the paired branch, overwriting the
            # paired object and leaving single-end input without `sobj`.
            sobj = itsx.SeqSampleNotPaired(fastq=fastq, tempdir=dirt)

    except Exception as err:
        raise ValueError("BBmerge was not found. check that the BBmerge reformat.sh package is executible") from err

    # Deduplicate
    sobj._deduplicate(threads=threads)

    try:
        # HMMSearch for ITS regions
        hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs", taxa_dict[taxa])
        sobj._search(hmmfile=hmmfile, threads=threads)
    except Exception as err:
        raise ValueError("hmmsearch was not found, make sure HMMER3 is installed and executible") from err

    # Parse HMMsearch output.
    its_pos = itsx.ItsPosition(domtable=sobj.dom_file, region=region)
    # Create deduplication object.
    dedup_obj = itsx.Dedup(uc_file=sobj.uc_file, rep_file=sobj.rep_file, seq_file=sobj.seq_file)

    results = SingleLanePerSampleSingleEndFastqDirFmt()

    path = results.sequences.path_maker(sample_id="seq",
                                        barcode_id=1,
                                        lane_number=1,
                                        read_number=1)
    # Writing the manifest for the output qza. The row uses the bare file
    # name, the 'forward' direction and a trailing newline, matching the
    # other manifest writers in this file.
    manifest = FastqManifestFormat()
    manifest_fn = manifest.open()
    manifest_fn.write('sample-id,filename,direction\n')
    manifest_fn.write("seq,{},forward\n".format(path.name))
    manifest_fn.close()

    # Create trimmed sequences.
    dedup_obj.create_trimmed_seqs(str(path), gzipped=True, itspos=its_pos)

    # Writing out the results.
    _write_metadata(results)
    results.manifest.write_data(manifest, FastqManifestFormat)

    # Deleting the temp files.
    itsx.shutil.rmtree(sobj.tempdir)

    return results