Esempio n. 1
0
def finalize_result(
        original_sequences: SingleLanePerSampleSingleEndFastqDirFmt,
        result_sequences: SingleLanePerSampleSingleEndFastqDirFmt,
        stats_df: pd.DataFrame) -> SingleLanePerSampleSingleEndFastqDirFmt:
    """Write the manifest and metadata files of a filtered result.

    Args:
        original_sequences: The input directory format; its per-sample
            fastq file names and its metadata are copied into the result.
        result_sequences: The directory format receiving the manifest and
            metadata. It is modified in place and returned.
        stats_df: Per-sample statistics with at least the columns
            ``'sample-id'`` and ``'n_seqs_kept'``.

    Returns:
        ``result_sequences``, after its manifest and metadata were written.

    Raises:
        ValueError: If no sample has ``n_seqs_kept`` greater than zero.
    """
    # Samples left with zero sequences after filtering are excluded, since
    # a fastq file with zero sequences is invalid.
    kept_rows = stats_df[stats_df['n_seqs_kept'] > 0]
    sample_ids_to_include = list(kept_rows['sample-id'])

    if not sample_ids_to_include:
        raise ValueError(
            "All sequences from all samples were filtered out through abundance-filtering."
        )

    # Manifest: one row per retained sample, pointing at the file name that
    # sample has in the original sequences.
    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    try:
        manifest_fh.write('sample-id,filename,direction\n')
        manifest_fh.write(
            '# direction is not meaningful in this file as these\n')
        manifest_fh.write(
            '# data may be derived from forward, reverse, or \n')
        manifest_fh.write('# joined reads\n')

        for sample_id in sample_ids_to_include:
            path = return_fastqgz_path_for_sample(original_sequences,
                                                  sample_id=sample_id)
            manifest_fh.write('{sample_id},{filename},{direction}\n'.format(
                sample_id=sample_id, filename=path.name,
                direction='forward'))
    finally:
        # Close the handle even when a sample lookup fails.
        manifest_fh.close()
    result_sequences.manifest.write_data(manifest, FastqManifestFormat)

    # Metadata: copied unchanged from the original sequences. safe_load is
    # used because yaml.load without an explicit Loader is unsafe and is
    # rejected by recent PyYAML releases.
    demux_metadata_view = original_sequences.metadata.view(YamlFormat)
    with open(str(demux_metadata_view)) as demux_metadata_fh:
        demux_metadata_dict = yaml.safe_load(demux_metadata_fh)
    metadata = YamlFormat()
    metadata.path.write_text(yaml.dump(demux_metadata_dict))
    result_sequences.metadata.write_data(metadata, YamlFormat)

    return result_sequences
Esempio n. 2
0
 def test_validate_negative(self):
     """Each malformed manifest fixture must fail validation.

     The raised ValidationError is expected to mention
     'FastqManifestFormat'.
     """
     bad_manifests = (
         'no-data-MANIFEST',
         'not-MANIFEST',
         'relative_manifests/jagged-MANIFEST',
     )
     for name in bad_manifests:
         manifest_fp = self.get_data_path(name)
         with self.assertRaisesRegex(ValidationError,
                                     'FastqManifestFormat'):
             fmt = FastqManifestFormat(manifest_fp, mode='r')
             fmt.validate()
Esempio n. 3
0
 def test_validate_positive(self):
     """Each well-formed relative manifest fixture validates cleanly."""
     good_manifests = ('single-MANIFEST', 'paired-MANIFEST', 'long-MANIFEST')
     for name in good_manifests:
         manifest_fp = self.get_data_path('relative_manifests/' + name)
         fmt = FastqManifestFormat(manifest_fp, mode='r')
         fmt.validate()
Esempio n. 4
0
def emp_paired(
    seqs: BarcodePairedSequenceFastqIterator,
    barcodes: qiime2.MetadataCategory,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False
) -> SingleLanePerSamplePairedEndFastqDirFmt:
    """Demultiplex paired-end reads into per-sample gzipped fastq files.

    Args:
        seqs: Iterable of ``(barcode_record, forward_record,
            reverse_record)`` triples. Index 1 of the barcode record is
            used as the barcode read; the forward and reverse records are
            newline-joined and appended to the sample's files.
        barcodes: Passed to ``_make_barcode_map`` to obtain the barcode to
            sample-id mapping and the barcode length.
        rev_comp_barcodes: Reverse complement each barcode read before it
            is looked up.
        rev_comp_mapping_barcodes: Forwarded to ``_make_barcode_map``.

    Returns:
        The directory format holding the per-sample files, the manifest
        and the metadata written by ``_write_metadata_yaml``.

    Raises:
        ValueError: If no read could be assigned to any sample.
    """
    result = SingleLanePerSamplePairedEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    per_sample_fastqs = {}

    try:
        manifest_fh.write('sample-id,filename,direction\n')

        for barcode_record, forward_record, reverse_record in seqs:
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            barcode_read = barcode_read[:barcode_len]

            try:
                sample_id = barcode_map[barcode_read]
            except KeyError:
                # TODO: this should ultimately be logged, but we don't
                # currently have support for that.
                continue

            if sample_id not in per_sample_fastqs:
                barcode_id = len(per_sample_fastqs) + 1
                fwd_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=1)
                rev_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=2)

                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                per_sample_fastqs[sample_id] = (
                    gzip.open(str(fwd_path), mode='a'),
                    gzip.open(str(rev_path), mode='a'))
                manifest_fh.write('%s,%s,%s\n' %
                                  (sample_id, fwd_path.name, 'forward'))
                manifest_fh.write('%s,%s,%s\n' %
                                  (sample_id, rev_path.name, 'reverse'))

            # NOTE(review): _maintain_open_fh_count presumably closes some
            # handles to bound the number of open files, which is why a
            # known sample's handles may need reopening here -- confirm.
            if per_sample_fastqs[sample_id][0].closed:
                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                fwd, rev = per_sample_fastqs[sample_id]
                per_sample_fastqs[sample_id] = (
                    gzip.open(fwd.name, mode='a'),
                    gzip.open(rev.name, mode='a'))

            fwd, rev = per_sample_fastqs[sample_id]
            fwd.write(('\n'.join(forward_record) + '\n').encode('utf-8'))
            rev.write(('\n'.join(reverse_record) + '\n').encode('utf-8'))

        if not per_sample_fastqs:
            raise ValueError(
                'No sequences were mapped to samples. Check that '
                'your barcodes are in the correct orientation (see '
                'the rev_comp_barcodes and/or '
                'rev_comp_mapping_barcodes options).')
    finally:
        # Release every handle on success and on failure alike; closing an
        # already closed gzip file is harmless.
        for fwd, rev in per_sample_fastqs.values():
            fwd.close()
            rev.close()
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)

    _write_metadata_yaml(result)

    return result
Esempio n. 5
0
def emp_single(
    seqs: BarcodeSequenceFastqIterator,
    barcodes: qiime2.MetadataCategory,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False
) -> SingleLanePerSampleSingleEndFastqDirFmt:
    """Demultiplex single-end reads into per-sample gzipped fastq files.

    Args:
        seqs: Iterable of ``(barcode_record, sequence_record)`` pairs.
            Index 1 of the barcode record is used as the barcode read; the
            sequence record is newline-joined and appended to the sample's
            file.
        barcodes: Passed to ``_make_barcode_map`` to obtain the barcode to
            sample-id mapping and the barcode length.
        rev_comp_barcodes: Reverse complement each barcode read before it
            is looked up.
        rev_comp_mapping_barcodes: Forwarded to ``_make_barcode_map``.

    Returns:
        The directory format holding the per-sample files, the manifest
        and the metadata written by ``_write_metadata_yaml``.

    Raises:
        ValueError: If no read could be assigned to any sample.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    per_sample_fastqs = {}

    try:
        manifest_fh.write('sample-id,filename,direction\n')
        manifest_fh.write(
            '# direction is not meaningful in this file as these\n')
        manifest_fh.write(
            '# data may be derived from forward, reverse, or \n')
        manifest_fh.write('# joined reads\n')

        for barcode_record, sequence_record in seqs:
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            barcode_read = barcode_read[:barcode_len]

            try:
                sample_id = barcode_map[barcode_read]
            except KeyError:
                # TODO: this should ultimately be logged, but we don't
                # currently have support for that.
                continue

            if sample_id not in per_sample_fastqs:
                # The barcode id, lane number and read number are not
                # relevant here. We might ultimately want to use a dir
                # format other than SingleLanePerSampleSingleEndFastqDirFmt
                # which doesn't care about this information. Similarly, the
                # direction of the read isn't relevant here anymore.
                barcode_id = len(per_sample_fastqs) + 1
                path = result.sequences.path_maker(sample_id=sample_id,
                                                   barcode_id=barcode_id,
                                                   lane_number=1,
                                                   read_number=1)
                _maintain_open_fh_count(per_sample_fastqs)
                per_sample_fastqs[sample_id] = gzip.open(str(path),
                                                         mode='a')
                manifest_fh.write('%s,%s,%s\n' %
                                  (sample_id, path.name, 'forward'))

            # NOTE(review): _maintain_open_fh_count presumably closes some
            # handles to bound the number of open files, which is why a
            # known sample's handle may need reopening here -- confirm.
            if per_sample_fastqs[sample_id].closed:
                _maintain_open_fh_count(per_sample_fastqs)
                per_sample_fastqs[sample_id] = gzip.open(
                    per_sample_fastqs[sample_id].name, mode='a')

            fastq_lines = '\n'.join(sequence_record) + '\n'
            per_sample_fastqs[sample_id].write(fastq_lines.encode('utf-8'))

        if not per_sample_fastqs:
            raise ValueError(
                'No sequences were mapped to samples. Check that '
                'your barcodes are in the correct orientation (see '
                'the rev_comp_barcodes and/or '
                'rev_comp_mapping_barcodes options).')
    finally:
        # Release every handle on success and on failure alike; closing an
        # already closed gzip file is harmless.
        for fh in per_sample_fastqs.values():
            fh.close()
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)

    _write_metadata_yaml(result)

    return result
Esempio n. 6
0
def q_score(demux: SingleLanePerSampleSingleEndFastqDirFmt,
            min_quality: int = _default_params['min_quality'],
            quality_window: int = _default_params['quality_window'],
            min_length_fraction:
            float = _default_params['min_length_fraction'],
            max_ambiguous: int = _default_params['max_ambiguous']) \
                  -> (SingleLanePerSampleSingleEndFastqDirFmt,
                      pd.DataFrame):
    """Quality-filter demultiplexed reads and report per-sample counts.

    Args:
        demux: The demultiplexed input; its metadata supplies
            ``'phred-offset'`` and its manifest maps file names to
            sample ids.
        min_quality: Positions whose quality is strictly below this value
            count as below threshold.
        quality_window: A read is truncated at the start of its first run
            of below-threshold positions that is longer than this value.
        min_length_fraction: A truncated read is dropped when the ratio of
            truncated to full length, rounded to three decimals, is less
            than or equal to this value.
        max_ambiguous: A read is dropped when it contains more than this
            many ``'N'`` characters.

    Returns:
        A pair of the filtered directory format and a DataFrame indexed by
        ``'sample-id'`` holding the read counters of every sample.

    Raises:
        ValueError: If every sample retained zero reads.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()

    # Read the inputs with explicitly closed handles. safe_load is used
    # because yaml.load without an explicit Loader is unsafe and is
    # rejected by recent PyYAML releases.
    metadata_fh = demux.metadata.view(YamlFormat).open()
    try:
        phred_offset = yaml.safe_load(metadata_fh)['phred-offset']
    finally:
        metadata_fh.close()

    demux_manifest_fh = demux.manifest.view(demux.manifest.format).open()
    try:
        demux_manifest = pd.read_csv(demux_manifest_fh, dtype=str)
    finally:
        demux_manifest_fh.close()
    demux_manifest.set_index('filename', inplace=True)

    log_records_truncated_counts = {}
    log_records_max_ambig_counts = {}
    log_records_tooshort_counts = {}
    log_records_totalread_counts = {}
    log_records_totalkept_counts = {}

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    try:
        manifest_fh.write('sample-id,filename,direction\n')
        manifest_fh.write(
            '# direction is not meaningful in this file as these\n')
        manifest_fh.write(
            '# data may be derived from forward, reverse, or \n')
        manifest_fh.write('# joined reads\n')

        iterator = demux.sequences.iter_views(FastqGzFormat)
        for bc_id, (fname, fp) in enumerate(iterator):
            sample_id = demux_manifest.loc[str(fname)]['sample-id']

            log_records_truncated_counts[sample_id] = 0
            log_records_max_ambig_counts[sample_id] = 0
            log_records_tooshort_counts[sample_id] = 0
            log_records_totalread_counts[sample_id] = 0
            log_records_totalkept_counts[sample_id] = 0

            # per q2-demux, barcode ID, lane number and read number are not
            # relevant here
            path = result.sequences.path_maker(sample_id=sample_id,
                                               barcode_id=bc_id,
                                               lane_number=1,
                                               read_number=1)

            # we do not open a writer by default in the event that all
            # sequences for a sample are filtered out; an empty fastq file
            # is not a valid fastq file.
            writer = None
            try:
                for sequence_record in _read_fastq_seqs(str(fp),
                                                        phred_offset):
                    log_records_totalread_counts[sample_id] += 1

                    # determine the length of the runs below quality
                    # threshold
                    # NOTE: QIIME 1.x used <= the quality threshold and the
                    #   parameter -q was interpreted as the maximum
                    #   unacceptable PHRED score. In QIIME 2.x, we're now
                    #   interpreting this as the minimum acceptable score.
                    # NOTE(review): index 4 of the record is presumably an
                    #   array of decoded quality scores -- confirm against
                    #   _read_fastq_seqs.
                    qual_below_threshold = sequence_record[4] < min_quality
                    run_starts, run_lengths = _runs_of_ones(
                        qual_below_threshold)
                    bad_windows = np.argwhere(run_lengths > quality_window)

                    # if there is a run of sufficient size, truncate it
                    if bad_windows.size > 0:
                        log_records_truncated_counts[sample_id] += 1

                        full_length = len(sequence_record[1])
                        sequence_record = _truncate(
                            sequence_record, run_starts[bad_windows[0]][0])
                        trunc_length = len(sequence_record[1])

                        # do not keep the read if it is too short following
                        # truncation
                        if round(trunc_length / full_length, 3) <= \
                                min_length_fraction:
                            log_records_tooshort_counts[sample_id] += 1
                            continue

                    # do not keep the read if there are too many ambiguous
                    # bases
                    if sequence_record[1].count('N') > max_ambiguous:
                        log_records_max_ambig_counts[sample_id] += 1
                        continue

                    fastq_lines = '\n'.join(sequence_record[:4]) + '\n'
                    fastq_lines = fastq_lines.encode('utf-8')

                    if writer is None:
                        writer = gzip.open(str(path), mode='w')
                    writer.write(fastq_lines)

                    log_records_totalkept_counts[sample_id] += 1
            finally:
                # Close the sample's output even if reading or filtering
                # failed part-way through.
                if writer is not None:
                    writer.close()

            # Only samples that kept at least one read get a manifest row.
            if writer is not None:
                manifest_fh.write('%s,%s,%s\n' %
                                  (sample_id, path.name, 'forward'))

        if set(log_records_totalkept_counts.values()) == {0}:
            raise ValueError(
                "All sequences from all samples were filtered out. "
                "The parameter choices may be too stringent for the "
                "data.")
    finally:
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)

    metadata = YamlFormat()
    metadata.path.write_text(yaml.dump({'phred-offset': phred_offset}))
    result.metadata.write_data(metadata, YamlFormat)

    columns = ['sample-id', 'total-input-reads', 'total-retained-reads',
               'reads-truncated',
               'reads-too-short-after-truncation',
               'reads-exceeding-maximum-ambiguous-bases']
    stats = [[id_,
              log_records_totalread_counts[id_],
              log_records_totalkept_counts[id_],
              log_records_truncated_counts[id_],
              log_records_tooshort_counts[id_],
              log_records_max_ambig_counts[id_]]
             for id_ in sorted(log_records_truncated_counts)]

    stats = pd.DataFrame(stats, columns=columns).set_index('sample-id')

    return result, stats
Esempio n. 7
0
def emp_paired(
    seqs: BarcodePairedSequenceFastqIterator,
    barcodes: qiime2.CategoricalMetadataColumn,
    golay_error_correction: bool = True,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False,
    ignore_description_mismatch: bool = False
) -> (SingleLanePerSamplePairedEndFastqDirFmt, ErrorCorrectionDetailsFmt):
    """Demultiplex paired-end reads, optionally Golay-correcting barcodes.

    Args:
        seqs: Iterable of ``(barcode_record, forward_record,
            reverse_record)`` triples. Index 1 of the barcode record is
            used as the barcode read; the forward and reverse records are
            newline-joined and appended to the sample's files.
        barcodes: Passed to ``_make_barcode_map`` to obtain the barcode to
            sample-id mapping and the barcode length.
        golay_error_correction: Decode each barcode read with
            ``GolayDecoder`` before it is looked up.
        rev_comp_barcodes: Reverse complement each barcode read before it
            is decoded and looked up.
        rev_comp_mapping_barcodes: Forwarded to ``_make_barcode_map``.
        ignore_description_mismatch: Stored on ``seqs`` as its
            ``ignore_description_mismatch`` attribute.

    Returns:
        A pair of the per-sample directory format and the error correction
        details format, which receives one row per input record.

    Raises:
        ValueError: If no read could be assigned to any sample.
    """
    seqs.ignore_description_mismatch = ignore_description_mismatch
    result = SingleLanePerSamplePairedEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)

    if golay_error_correction:
        decoder = GolayDecoder()

    ec_details_fmt = ErrorCorrectionDetailsFmt()
    ec_details = ECDetails(ec_details_fmt)

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    per_sample_fastqs = {}

    try:
        manifest_fh.write('sample-id,filename,direction\n')

        for i, (barcode_record, forward_record,
                reverse_record) in enumerate(seqs, start=1):
            barcode_read = barcode_record[1]
            if rev_comp_barcodes:
                barcode_read = str(
                    skbio.DNA(barcode_read).reverse_complement())
            raw_barcode_read = barcode_read[:barcode_len]

            if golay_error_correction:
                # A three bit filter is implicitly used by the decoder. See
                # Hamady and Knight 2009 Genome Research for the
                # justification:
                #
                # https://genome.cshlp.org/content/19/7/1141.full
                #
                # Specifically that "...Golay codes of 12 bases can correct
                # all triple-bit errors and detect all quadruple-bit
                # errors."
                barcode_read, ecc_errors = decoder.decode(raw_barcode_read)
                golay_stats = [barcode_read, ecc_errors]
            else:
                barcode_read = raw_barcode_read
                golay_stats = [None, None]

            sample_id = barcode_map.get(barcode_read)

            # Every record is logged, including those that map to no
            # sample (sample_id is None for them).
            details_row = [
                f'record-{i}',
                sample_id,
                barcode_record[0],
                raw_barcode_read,
            ]
            ec_details.write(details_row + golay_stats)

            if sample_id is None:
                continue

            if sample_id not in per_sample_fastqs:
                barcode_id = len(per_sample_fastqs) + 1
                fwd_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=1)
                rev_path = result.sequences.path_maker(
                    sample_id=sample_id, barcode_id=barcode_id,
                    lane_number=1, read_number=2)

                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                per_sample_fastqs[sample_id] = (
                    gzip.open(str(fwd_path), mode='a'),
                    gzip.open(str(rev_path), mode='a'))
                manifest_fh.write('%s,%s,%s\n' %
                                  (sample_id, fwd_path.name, 'forward'))
                manifest_fh.write('%s,%s,%s\n' %
                                  (sample_id, rev_path.name, 'reverse'))

            # NOTE(review): _maintain_open_fh_count presumably closes some
            # handles to bound the number of open files, which is why a
            # known sample's handles may need reopening here -- confirm.
            if per_sample_fastqs[sample_id][0].closed:
                _maintain_open_fh_count(per_sample_fastqs, paired=True)
                fwd, rev = per_sample_fastqs[sample_id]
                per_sample_fastqs[sample_id] = (
                    gzip.open(fwd.name, mode='a'),
                    gzip.open(rev.name, mode='a'))

            fwd, rev = per_sample_fastqs[sample_id]
            fwd.write(('\n'.join(forward_record) + '\n').encode('utf-8'))
            rev.write(('\n'.join(reverse_record) + '\n').encode('utf-8'))

        if not per_sample_fastqs:
            raise ValueError(
                'No sequences were mapped to samples. Check that '
                'your barcodes are in the correct orientation (see '
                'the rev_comp_barcodes and/or '
                'rev_comp_mapping_barcodes options). If barcodes are '
                'NOT Golay format set golay_error_correction '
                'to False.')
    finally:
        # Release every handle on success and on failure alike; closing an
        # already closed gzip file is harmless.
        for fwd, rev in per_sample_fastqs.values():
            fwd.close()
            rev.close()
        manifest_fh.close()

    result.manifest.write_data(manifest, FastqManifestFormat)

    _write_metadata_yaml(result)

    return result, ec_details_fmt
Esempio n. 8
0
    def test_fastq_manifest_format_validate_positive(self):
        """The single-end MANIFEST fixture validates without raising."""
        manifest_fp = self.get_data_path('single_end_data/MANIFEST')
        FastqManifestFormat(manifest_fp, mode='r').validate()
def _join_pairs_w_command_output(
    demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
    truncqual: int = _jp_defaults['truncqual'],
    minlen: int = _jp_defaults['minlen'],
    maxns: int = _jp_defaults['maxns'],
    allowmergestagger: bool = _jp_defaults['allowmergestagger'],
    minovlen: int = _jp_defaults['minovlen'],
    maxdiffs: int = _jp_defaults['maxdiffs'],
    minmergelen: int = _jp_defaults['minmergelen'],
    maxmergelen: int = _jp_defaults['maxmergelen'],
    maxee: float = _jp_defaults['maxee'],
    qmin: int = _jp_defaults['qmin'],
    qminout: int = _jp_defaults['qminout'],
    qmax: int = _jp_defaults['qmax'],
    qmaxout: int = _jp_defaults['qmaxout'],
) -> (List[str], SingleLanePerSampleSingleEndFastqDirFmt):
    """Join read pairs per sample with ``vsearch --fastq_mergepairs``.

    This function exists only to simplify unit testing: besides the result
    it returns the last vsearch command that was run.

    Args:
        demultiplexed_seqs: Paired-end input whose manifest lists a
            forward and a reverse file for each sample.
        truncqual, maxns, minmergelen, maxmergelen, maxee: Passed to the
            matching ``--fastq_*`` option only when not ``None``.
        minlen, minovlen, maxdiffs, qmin, qminout, qmax, qmaxout: Always
            passed to the matching ``--fastq_*`` option.
        allowmergestagger: Adds ``--fastq_allowmergestagger`` when true.

    Returns:
        A pair of the command issued for the last sample (an empty list
        when the manifest contains no samples) and the directory format
        holding the joined, gzipped reads.
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()
    input_dir = str(demultiplexed_seqs)

    manifest = pd.read_csv(
        os.path.join(input_dir, demultiplexed_seqs.manifest.pathspec),
        header=0,
        comment='#')
    manifest.filename = manifest.filename.apply(
        lambda x: os.path.join(input_dir, x))

    # safe_load inside a context manager: yaml.load without an explicit
    # Loader is unsafe and rejected by recent PyYAML releases, and the
    # bare open() call leaked the file handle.
    metadata_fp = os.path.join(input_dir,
                               demultiplexed_seqs.metadata.pathspec)
    with open(metadata_fp) as metadata_fh:
        phred_offset = yaml.safe_load(metadata_fh)['phred-offset']

    id_to_fps = manifest.pivot(index='sample-id',
                               columns='direction',
                               values='filename')

    # Defined up front so the return statement is valid even when the
    # manifest contains no samples.
    cmd = []

    output_manifest = FastqManifestFormat()
    output_manifest_fh = output_manifest.open()
    try:
        output_manifest_fh.write('sample-id,filename,direction\n')
        output_manifest_fh.write('# direction is not meaningful in this '
                                 'file as these\n')
        output_manifest_fh.write('# data may be derived from forward, '
                                 'reverse, or \n')
        output_manifest_fh.write('# joined reads\n')

        # The row unpacking relies on the pivoted columns being ordered
        # forward, then reverse.
        for i, (sample_id, (fwd_fp, rev_fp)) in \
                enumerate(id_to_fps.iterrows()):
            # The barcode id, lane number and read number are not relevant
            # here. We might ultimately want to use a dir format other than
            # SingleLanePerSampleSingleEndFastqDirFmt which doesn't care
            # about this information. Similarly, the direction of the read
            # isn't relevant here anymore.
            path = result.sequences.path_maker(sample_id=sample_id,
                                               barcode_id=i,
                                               lane_number=1,
                                               read_number=1)

            # vsearch writes uncompressed output, which gzip then turns
            # into `path`. Only the trailing '.gz' suffix is removed;
            # str.strip('.gz') would strip any of '.', 'g', 'z' from both
            # ends of the path.
            compressed_path = str(path)
            if compressed_path.endswith('.gz'):
                uncompressed_path = compressed_path[:-len('.gz')]
            else:
                uncompressed_path = compressed_path

            cmd = [
                'vsearch',
                '--fastq_mergepairs', fwd_fp,
                '--reverse', rev_fp,
                '--fastqout', uncompressed_path,
                '--fastq_ascii', str(phred_offset),
                '--fastq_minlen', str(minlen),
                '--fastq_minovlen', str(minovlen),
                '--fastq_maxdiffs', str(maxdiffs),
                '--fastq_qmin', str(qmin),
                '--fastq_qminout', str(qminout),
                '--fastq_qmax', str(qmax),
                '--fastq_qmaxout', str(qmaxout),
            ]
            if truncqual is not None:
                cmd += ['--fastq_truncqual', str(truncqual)]
            if maxns is not None:
                cmd += ['--fastq_maxns', str(maxns)]
            if minmergelen is not None:
                cmd += ['--fastq_minmergelen', str(minmergelen)]
            if maxmergelen is not None:
                cmd += ['--fastq_maxmergelen', str(maxmergelen)]
            if maxee is not None:
                cmd += ['--fastq_maxee', str(maxee)]
            if allowmergestagger:
                cmd.append('--fastq_allowmergestagger')
            run_command(cmd)
            run_command(['gzip', uncompressed_path])
            output_manifest_fh.write('%s,%s,%s\n' %
                                     (sample_id, path.name, 'forward'))
    finally:
        # Close the manifest even when vsearch or gzip fails.
        output_manifest_fh.close()

    result.manifest.write_data(output_manifest, FastqManifestFormat)

    metadata = YamlFormat()
    metadata.path.write_text(yaml.dump({'phred-offset': phred_offset}))
    result.metadata.write_data(metadata, YamlFormat)

    return cmd, result
Esempio n. 10
0
def main(per_sample_sequences: _SingleLanePerSampleFastqDirFmt, threads: int,
         taxa: str, region: str, paired: bool, cluster_id: float):
    """The main communication between the plugin and the ITSxpress program.

    Args:
        per_sample_sequences (_SingleLanePerSampleFastqDirFmt): The
            per-sample fastq directory to trim.
        threads (int): The number of threads to use.
        taxa (str): The taxa to be used for the search.
        region (str): The region to be used for the search.
        paired (bool): Write paired-end output (forward and reverse files
            per sample) instead of single-end output.
        cluster_id (float): The percent identity for clustering reads, set
            to 1 for exact dereplication.

    Returns:
        SingleLanePerSamplePairedEndFastqDirFmt when ``paired`` is true,
        otherwise SingleLanePerSampleSingleEndFastqDirFmt, holding the
        trimmed reads of every sample.

    Raises:
        ValueError: hmmsearch could not be run.

    """
    # Finding the artifact type.
    artifact_type = _view_artifact_type(
        per_sample_sequence=per_sample_sequences)
    # Setting the taxa
    taxa = _taxa_prefix_to_taxa(taxa)
    # Writing the manifest for the output qza
    manifest = FastqManifestFormat()
    manifest_fn = manifest.open()
    try:
        manifest_fn.write('sample-id,filename,direction\n')
        # Getting the sequences from the manifest
        sequences, single_end = _fastq_id_maker(
            per_sample_sequences=per_sample_sequences,
            artifact_type=artifact_type)
        # Creating result dir
        if paired:
            results = SingleLanePerSamplePairedEndFastqDirFmt()
        else:
            results = SingleLanePerSampleSingleEndFastqDirFmt()

        # Running the for loop for each sample; the running index doubles
        # as the barcode id of the output file names.
        for barcode, sequence in enumerate(sequences):
            # writing fastqs and their attributes and checking the files
            sequence_id, sobj = _set_fastqs_and_check(
                per_sample_sequences=per_sample_sequences,
                artifact_type=artifact_type,
                sequence=sequence,
                single_end=single_end,
                threads=threads)

            # Exact dereplication when cluster_id is (numerically) 1,
            # clustering at the requested identity otherwise.
            if math.isclose(cluster_id, 1, rel_tol=1e-05):
                sobj.deduplicate(threads=threads)
            else:
                sobj.cluster(threads=threads, cluster_id=cluster_id)

            # HMMSearch for ITS regions
            hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs",
                                   taxa_dict[taxa])
            try:
                sobj._search(hmmfile=hmmfile, threads=threads)
            except (ModuleNotFoundError, FileNotFoundError,
                    NotADirectoryError) as err:
                # Keep the original error attached for debugging.
                raise ValueError(
                    "hmmsearch was not found, make sure HMMER3 is installed and executable"
                ) from err

            # Parse HMMseach output.
            its_pos = itsxpress.ItsPosition(domtable=sobj.dom_file,
                                            region=region)
            # Create deduplication object.
            dedup_obj = itsxpress.Dedup(uc_file=sobj.uc_file,
                                        rep_file=sobj.rep_file,
                                        seq_file=sobj.seq_file,
                                        fastq=sobj.r1,
                                        fastq2=sobj.fastq2)

            path_forward = results.sequences.path_maker(
                sample_id=sequence_id, barcode_id=barcode,
                lane_number=1, read_number=1)
            path_reverse = results.sequences.path_maker(
                sample_id=sequence_id, barcode_id=barcode,
                lane_number=1, read_number=2)

            manifest_fn.write("{},{},forward\n".format(sequence_id,
                                                       path_forward.name))
            # Create trimmed sequences.
            if paired:
                # The reverse file is written below, so the manifest has
                # to list it as well.
                manifest_fn.write("{},{},reverse\n".format(
                    sequence_id, path_reverse.name))
                dedup_obj.create_paired_trimmed_seqs(str(path_forward),
                                                     str(path_reverse),
                                                     gzipped=True,
                                                     itspos=its_pos)
            else:
                dedup_obj.create_trimmed_seqs(str(path_forward),
                                              gzipped=True,
                                              itspos=its_pos)
            # Deleting the temp files.
            shutil.rmtree(sobj.tempdir)
    finally:
        # Close the manifest even when a sample fails.
        manifest_fn.close()
    # Writing out the results.
    _write_metadata(results=results)
    results.manifest.write_data(manifest, FastqManifestFormat)
    return results
Esempio n. 11
0
def main(fastq, fastq2, singleEnd, threads, taxa, region):
    """The main communication between the plugin and the ITSxpress program.

    Args:

        fastq (str) : The first fastq location.
        fastq2 (str) : The second fastq location.
        singleEnd (bool) : boolean for if singleEnd is used or not.
        threads (int) : The number of threads to use.
        taxa (str): The taxa to be used for the search.
        region (str) : The region to be used for the search.


    Returns:

        (SingleLanePerSampleSingleEndFastqDirFmt): The SingleLanePerSampleSingleEndFastqDirFmt type
        of the output.


    Raises:

        ValueError: The fastq file(s) could not be checked (BBTools error
            or fastq format issue), the reads could not be merged (BBmerge
            error), or hmmsearch could not be run. The underlying error is
            attached as ``__cause__``.

    """
    dirt = "/tmp"
    try:
        itsx._check_fastqs(fastq, fastq2)
        # Parse input types
        paired_end, interleaved = itsx._is_paired(fastq, fastq2, singleEnd)
    except Exception as err:
        raise ValueError("There is a problem with the fastq file(s) you selected or\n"
                         "BBtools was not found. check that the BBtools reformat.sh package is executable.") from err

    # Create SeqSample objects and merge if needed.
    try:
        if paired_end and interleaved:
            sobj = itsx.SeqSamplePairedInterleaved(fastq=fastq, tempdir=dirt)
            sobj._merge_reads(threads=threads)
        elif paired_end:
            # Two separate files: merge them like the interleaved case
            # instead of discarding the paired sample object.
            sobj = itsx.SeqSamplePairedNotInterleaved(fastq=fastq, fastq2=fastq2, tempdir=dirt)
            sobj._merge_reads(threads=threads)
        else:
            # Single-end input needs no merging.
            sobj = itsx.SeqSampleNotPaired(fastq=fastq, tempdir=dirt)
    except Exception as err:
        raise ValueError("BBmerge was not found. check that the BBmerge reformat.sh package is executible") from err

    # Deduplicate
    sobj._deduplicate(threads=threads)

    try:
        # HMMSearch for ITS regions
        hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs", taxa_dict[taxa])
        sobj._search(hmmfile=hmmfile, threads=threads)
    except Exception as err:
        raise ValueError("hmmsearch was not found, make sure HMMER3 is installed and executible") from err

    # Parse HMMseach output.
    its_pos = itsx.ItsPosition(domtable=sobj.dom_file, region=region)
    # Create deduplication object.
    dedup_obj = itsx.Dedup(uc_file=sobj.uc_file, rep_file=sobj.rep_file, seq_file=sobj.seq_file)

    results = SingleLanePerSampleSingleEndFastqDirFmt()

    path = results.sequences.path_maker(sample_id="seq",
                                        barcode_id=1,
                                        lane_number=1,
                                        read_number=1)
    # Writing the manifest for the output qza. The row holds the file name
    # (not the full temporary path), ends with a newline, and uses the
    # 'forward' direction since the file is read number 1.
    manifest = FastqManifestFormat()
    manifest_fn = manifest.open()
    try:
        manifest_fn.write('sample-id,filename,direction\n')
        manifest_fn.write("seq,{},forward\n".format(path.name))
    finally:
        manifest_fn.close()

    # Create trimmed sequences.
    dedup_obj.create_trimmed_seqs(str(path), gzipped=True, itspos=its_pos)

    # Writing out the results.
    _write_metadata(results)

    results.manifest.write_data(manifest, FastqManifestFormat)

    # Deleting the temp files.
    itsx.shutil.rmtree(sobj.tempdir)

    return results
Esempio n. 12
0
def join_pairs(demultiplexed_seqs: SingleLanePerSamplePairedEndFastqDirFmt,
               threads: int = 1) -> SingleLanePerSampleSingleEndFastqDirFmt:
    """Join the forward and reverse reads of every sample with PEAR.

    For each sample listed in the input manifest, ``pear`` is run on the
    forward/reverse file pair, the ``*.assembled.fastq`` output is renamed
    to the path produced by ``result.sequences.path_maker`` (minus ``.gz``)
    and gzipped, and the remaining PEAR outputs (discarded and unassembled
    reads) are deleted.

    Args:
        demultiplexed_seqs: Paired-end, per-sample directory format holding
            the input reads, a manifest and a metadata file.
        threads: Value passed to PEAR's ``--threads`` option.

    Returns:
        A single-end directory format containing one joined, gzipped FASTQ
        per sample, a manifest (every entry labelled ``forward``) and a
        metadata file carrying the input's ``phred-offset``.

    Raises:
        Whatever ``run_command`` raises when ``pear`` or ``gzip`` fails
        (``run_command`` is defined elsewhere in this file).
    """
    result = SingleLanePerSampleSingleEndFastqDirFmt()
    input_dir = str(demultiplexed_seqs)

    manifest = pd.read_csv(os.path.join(input_dir,
                                        demultiplexed_seqs.manifest.pathspec),
                           header=0,
                           comment='#')

    # Manifest filenames are relative to the input directory.
    manifest.filename = manifest.filename.apply(
        lambda x: os.path.join(input_dir, x))

    # safe_load: the metadata only needs plain scalars, and yaml.load()
    # without an explicit Loader is rejected by PyYAML >= 6. The context
    # manager makes sure the handle is closed.
    metadata_fp = os.path.join(input_dir,
                               demultiplexed_seqs.metadata.pathspec)
    with open(metadata_fp) as metadata_fh:
        phred_offset = yaml.safe_load(metadata_fh)['phred-offset']

    # One row per sample; the direction columns are unpacked below as
    # (forward, reverse).
    id_to_fps = manifest.pivot(index='sample-id',
                               columns='direction',
                               values='filename')

    output_manifest = FastqManifestFormat()
    # The context manager closes the manifest even if PEAR/gzip fails.
    with output_manifest.open() as output_manifest_fh:
        output_manifest_fh.write('sample-id,filename,direction\n')
        output_manifest_fh.write('# direction is not meaningful in this file '
                                 'as these\n')
        output_manifest_fh.write('# data may be derived from forward, '
                                 'reverse, or \n')
        output_manifest_fh.write('# joined reads\n')

        for i, (sample_id, (fwd_fp, rev_fp)) in enumerate(
                id_to_fps.iterrows()):
            # The barcode id, lane number and read number are not relevant
            # here. We might ultimately want to use a dir format other than
            # SingleLanePerSampleSingleEndFastqDirFmt which doesn't care
            # about this information. Similarly, the direction of the read
            # isn't relevant here anymore.
            path = result.sequences.path_maker(sample_id=sample_id,
                                               barcode_id=i,
                                               lane_number=1,
                                               read_number=1)

            # Drop only a trailing ".gz". str.strip('.gz') would remove any
            # run of '.', 'g' and 'z' characters from BOTH ends of the path.
            path_str = str(path)
            if path_str.endswith('.gz'):
                uncompressed_path = path_str[:-len('.gz')]
            else:
                uncompressed_path = path_str

            parent_pth = Path(path).parent

            # PEAR uses this as the prefix of all of its output files.
            sample_id_path = str(parent_pth / sample_id)

            assembled_pth = parent_pth / "{}.assembled.fastq".format(
                sample_id)
            discarded_pth = parent_pth / "{}.discarded.fastq".format(
                sample_id)
            unassembled_fwd_pth = (
                parent_pth / "{}.unassembled.forward.fastq".format(sample_id))
            unassembled_rev_pth = (
                parent_pth / "{}.unassembled.reverse.fastq".format(sample_id))

            cmd = [
                'pear', '-f', fwd_fp, '-r', rev_fp, '-o', sample_id_path,
                '--threads',
                str(threads)
            ]

            run_command(cmd)

            assembled_pth.rename(Path(uncompressed_path))
            run_command(['gzip', uncompressed_path])

            # Best-effort removal of the PEAR outputs we do not keep. Only
            # filesystem errors are ignored, so interrupts still propagate.
            for f_pth in (discarded_pth, unassembled_fwd_pth,
                          unassembled_rev_pth):
                try:
                    os.remove(str(f_pth))
                except OSError:
                    pass

            output_manifest_fh.write('%s,%s,%s\n' %
                                     (sample_id, Path(path).name, 'forward'))

    result.manifest.write_data(output_manifest, FastqManifestFormat)

    metadata = YamlFormat()
    metadata.path.write_text(yaml.dump({'phred-offset': phred_offset}))
    result.metadata.write_data(metadata, YamlFormat)

    return result
Esempio n. 13
0
    def test_fastq_manifest_format_validate_negative(self):
        # A file that is not a manifest must be rejected by validate().
        manifest_fmt = FastqManifestFormat(
            self.get_data_path('not-MANIFEST'), mode='r')

        # The raised ValueError has to mention the format by name.
        with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
            manifest_fmt.validate()
Esempio n. 14
0
    def test_fastq_manifest_format_validate_positive(self):
        # The single-end fixture manifest must validate without raising.
        manifest_fmt = FastqManifestFormat(
            self.get_data_path('single_end_data/MANIFEST'), mode='r')

        manifest_fmt.validate()
Esempio n. 15
0
def emp_single(
    seqs: BarcodeSequenceFastqIterator,
    barcodes: qiime2.CategoricalMetadataColumn,
    golay_error_correction: bool = True,
    rev_comp_barcodes: bool = False,
    rev_comp_mapping_barcodes: bool = False,
    ignore_description_mismatch: bool = False
) -> (SingleLanePerSampleSingleEndFastqDirFmt, ErrorCorrectionDetailsFmt):
    """Demultiplex single-end reads into one gzipped FASTQ file per sample.

    Every (barcode record, sequence record) pair yielded by ``seqs`` is
    assigned to a sample by looking its barcode read up in the map built
    from ``barcodes``. Matched sequence records are appended to that
    sample's output file; unmatched ones are skipped. A detail row is
    written for every record, matched or not.

    Args:
        seqs: Iterator yielding (barcode record, sequence record) pairs.
            Its ``ignore_description_mismatch`` attribute is overwritten
            with the argument of the same name.
        barcodes: Metadata column handed to ``_make_barcode_map`` to build
            the barcode -> sample-id lookup.
        golay_error_correction: If True, each barcode read is passed
            through ``GolayDecoder.decode`` before the lookup.
        rev_comp_barcodes: If True, each barcode read is reverse
            complemented before it is truncated and looked up.
        rev_comp_mapping_barcodes: Forwarded to ``_make_barcode_map``.
        ignore_description_mismatch: Stored on ``seqs`` (see above).

    Returns:
        A tuple ``(result, ec_details_fmt)``: the per-sample directory
        format and the error-correction details format.

    Raises:
        ValueError: If no record could be assigned to any sample.
    """
    seqs.ignore_description_mismatch = ignore_description_mismatch
    result = SingleLanePerSampleSingleEndFastqDirFmt()
    barcode_map, barcode_len = _make_barcode_map(barcodes,
                                                 rev_comp_mapping_barcodes)

    # The decoder is only needed (and only defined) when correction is on.
    if golay_error_correction:
        decoder = GolayDecoder()

    manifest = FastqManifestFormat()
    manifest_fh = manifest.open()
    manifest_fh.write('sample-id,filename,direction\n')
    manifest_fh.write('# direction is not meaningful in this file as these\n')
    manifest_fh.write('# data may be derived from forward, reverse, or \n')
    manifest_fh.write('# joined reads\n')

    # sample-id -> gzip file handle of that sample's output FASTQ.
    per_sample_fastqs = {}

    ec_details_fmt = ErrorCorrectionDetailsFmt()
    ec_details = ECDetails(ec_details_fmt)

    # Records are numbered from 1 for the 'record-<i>' detail identifier.
    for i, (barcode_record, sequence_record) in enumerate(seqs, start=1):
        barcode_read = barcode_record[1]
        if rev_comp_barcodes:
            barcode_read = str(skbio.DNA(barcode_read).reverse_complement())
        # Only the first barcode_len characters take part in the lookup.
        raw_barcode_read = barcode_read[:barcode_len]

        if golay_error_correction:
            # A three bit filter is implicitly used by the decoder. See Hamady
            # and Knight 2009 Genome Research for the justification:
            #
            # https://genome.cshlp.org/content/19/7/1141.full
            #
            # Specifically that "...Golay codes of 12 bases can correct all
            # triple-bit errors and detect all quadruple-bit errors."
            barcode_read, ecc_errors = decoder.decode(raw_barcode_read)
            golay_stats = [barcode_read, ecc_errors]
        else:
            barcode_read = raw_barcode_read
            golay_stats = [None, None]

        # None when the (possibly corrected) barcode is not in the map.
        sample_id = barcode_map.get(barcode_read)

        record = [
            f'record-{i}',
            sample_id,
            barcode_record[0],
            raw_barcode_read,
        ]
        # Written before the skip below, so unmatched records are logged too.
        ec_details.write(record + golay_stats)

        if sample_id is None:
            continue

        if sample_id not in per_sample_fastqs:
            # The barcode id, lane number and read number are not relevant
            # here. We might ultimately want to use a dir format other than
            # SingleLanePerSampleSingleEndFastqDirFmt which doesn't care
            # about this information. Similarly, the direction of the read
            # isn't relevant here anymore.
            barcode_id = len(per_sample_fastqs) + 1
            path = result.sequences.path_maker(sample_id=sample_id,
                                               barcode_id=barcode_id,
                                               lane_number=1,
                                               read_number=1)
            # NOTE(review): _maintain_open_fh_count is defined elsewhere;
            # presumably it closes some handles to stay under an open-file
            # limit -- confirm against its definition.
            _maintain_open_fh_count(per_sample_fastqs)
            per_sample_fastqs[sample_id] = gzip.open(str(path), mode='a')
            manifest_fh.write('%s,%s,%s\n' % (sample_id, path.name, 'forward'))

        # A handle found closed here is reopened in append mode so earlier
        # records of this sample are kept.
        if per_sample_fastqs[sample_id].closed:
            _maintain_open_fh_count(per_sample_fastqs)
            per_sample_fastqs[sample_id] = gzip.open(
                per_sample_fastqs[sample_id].name, mode='a')

        # The gzip handles are binary, so the joined record is encoded first.
        fastq_lines = '\n'.join(sequence_record) + '\n'
        fastq_lines = fastq_lines.encode('utf-8')
        per_sample_fastqs[sample_id].write(fastq_lines)

    if len(per_sample_fastqs) == 0:
        raise ValueError('No sequences were mapped to samples. Check that '
                         'your barcodes are in the correct orientation (see '
                         'the rev_comp_barcodes and/or '
                         'rev_comp_mapping_barcodes options). If barcodes are '
                         'NOT Golay format set golay_error_correction '
                         'to False.')

    for fh in per_sample_fastqs.values():
        fh.close()

    manifest_fh.close()
    result.manifest.write_data(manifest, FastqManifestFormat)

    _write_metadata_yaml(result)

    return result, ec_details_fmt
Esempio n. 16
0
    def test_fastq_manifest_format_validate_negative(self):
        # Validation of a non-manifest file is expected to fail.
        bad_manifest = FastqManifestFormat(
            self.get_data_path('not-MANIFEST'), mode='r')

        # The ValueError message must name the format that rejected the file.
        with self.assertRaisesRegex(ValueError, 'FastqManifestFormat'):
            bad_manifest.validate()
Esempio n. 17
0
def main(per_sample_sequences, threads, taxa, region):
    """The main communication between the plugin and the ITSxpress program.

    Every sample returned by ``_fastq_id_maker`` is merged (if paired),
    deduplicated, searched with hmmsearch and trimmed to the requested
    region; the trimmed reads are written into a new single-end directory
    format together with a manifest and metadata.

    Args:

        per_sample_sequences: The per-sample FASTQ directory format of the
        input. Paired-end input is detected from the artifact type string
        returned by ``_view_artifact_type``.
        threads (int) : The number of threads to use.
        taxa (str): The taxa prefix to be used for the search.
        region (str) : The region to be used for the search.

    Returns:

        (SingleLanePerSampleSingleEndFastqDirFmt): The SingleLanePerSampleSingleEndFastqDirFmt type
        of the output.

    Raises:

        ValueError: BBTools error or fastq format issue.
        ValueError: BBmerge error.
        ValueError: hmmsearch error.
        ValueError: the input was reported as interleaved but not paired.

    """
    # Setting a temp folder. tempfile.tempdir may be None; it is passed on
    # unchanged to the ITSxpress sample classes.
    dirt = tempfile.tempdir
    # Setting the current dir (process-wide side effect kept from the
    # original; NOTE(review): presumably needed by the helpers below).
    input_dir = str(per_sample_sequences.path)
    os.chdir(input_dir)
    # Finding the artifact type.
    artifactType = _view_artifact_type()
    is_paired_artifact = (
        "SampleData[PairedEndSequencesWithQuality]" in artifactType)
    # Setting the taxa
    taxa = _taxa_prefix_to_taxa(taxa)
    sequences, singleEnd = _fastq_id_maker(per_sample_sequences, artifactType)
    # De-duplicate, then sort: iterating the bare set would hand out barcode
    # ids (and therefore output filenames) in a different order on each run.
    sequenceList = sorted(
        set(sequences),
        key=lambda entry: tuple(str(field) for field in entry))
    # Creating result dir
    results = SingleLanePerSampleSingleEndFastqDirFmt()
    # Writing the manifest for the output qza. The context manager closes
    # the handle even when a sample fails.
    manifest = FastqManifestFormat()
    with manifest.open() as manifest_fn:
        manifest_fn.write('sample-id,filename,direction\n')
        # Running the for loop for each sample
        for barcode, sequence in enumerate(sequenceList):
            # Setting the fastq files and if singleEnd is used.
            fastq = os.path.join(input_dir, str(sequence[0]))
            if is_paired_artifact:
                fastq2 = os.path.join(input_dir, str(sequence[1]))
            else:
                fastq2 = sequence[1]
            sequenceID = sequence[2]
            pathForward = results.sequences.path_maker(sample_id=sequenceID,
                                                       barcode_id=barcode,
                                                       lane_number=1,
                                                       read_number=1)
            # Running the main ITSxpress program.
            _trim_single_sample(fastq=fastq,
                                fastq2=fastq2,
                                single_end=singleEnd,
                                out_path=pathForward,
                                threads=threads,
                                taxa=taxa,
                                region=region,
                                tempdir=dirt)
            manifest_fn.write(
                "{},{},forward\n".format(sequenceID, pathForward.name))
    # Writing out the results.
    _write_metadata(results)
    results.manifest.write_data(manifest, FastqManifestFormat)
    return results


def _trim_single_sample(fastq, fastq2, single_end, out_path, threads, taxa,
                        region, tempdir):
    """Run the ITSxpress pipeline for one sample and write it to out_path.

    Raises ValueError (chained to the underlying error) when the FASTQ
    check, the read merging or the hmmsearch step fails. The sample's
    temporary directory is removed whether or not the sample succeeds.
    """
    try:
        itsx._check_fastqs(fastq, fastq2)
        # Parse input types
        paired_end, interleaved = itsx._is_paired(fastq, fastq2, single_end)
    except Exception as err:
        raise ValueError("There is a problem with the fastq file(s) you selected or\n"
                         "BBtools was not found. check that the BBtools reformat.sh package is executable.") from err

    # No sample class handles this combination; previously it left `sobj`
    # unbound or silently reused the previous sample's object.
    if interleaved and not paired_end:
        raise ValueError("Input was detected as interleaved but not "
                         "paired-end, which is not supported.")

    sobj = None
    try:
        # Create SeqSample objects and merge if needed.
        try:
            if paired_end and interleaved:
                sobj = itsx.SeqSamplePairedInterleaved(fastq=fastq, tempdir=tempdir)
                sobj._merge_reads(threads=threads)
            elif paired_end:
                sobj = itsx.SeqSamplePairedNotInterleaved(fastq=fastq, fastq2=fastq2, tempdir=tempdir)
                sobj._merge_reads(threads=threads)
            else:
                sobj = itsx.SeqSampleNotPaired(fastq=fastq, tempdir=tempdir)
        except Exception as err:
            raise ValueError("BBmerge was not found. check that the BBmerge reformat.sh package is executible") from err

        # Deduplicate
        sobj._deduplicate(threads=threads)
        try:
            # HMMSearch for ITS regions
            hmmfile = os.path.join(ROOT_DIR, "ITSx_db", "HMMs", taxa_dict[taxa])
            sobj._search(hmmfile=hmmfile, threads=threads)
        except Exception as err:
            raise ValueError("hmmsearch was not found, make sure HMMER3 is installed and executible") from err

        # Parse HMMsearch output.
        its_pos = itsx.ItsPosition(domtable=sobj.dom_file, region=region)
        # Create deduplication object.
        dedup_obj = itsx.Dedup(uc_file=sobj.uc_file, rep_file=sobj.rep_file, seq_file=sobj.seq_file)
        # Create trimmed sequences.
        dedup_obj.create_trimmed_seqs(str(out_path), gzipped=True, itspos=its_pos)
    finally:
        # Deleting the temp files, also when the sample failed part-way.
        if sobj is not None:
            itsx.shutil.rmtree(sobj.tempdir)