Exemple #1
0
def splitmapper(input,
                index,
                output=None,
                mismatches=0.04,
                splice_consensus=extended_splice_consensus,
                filter=default_filter,
                refinement_step_size=2,
                min_split_size=15,
                matches_threshold=100,
                strata_after_first=1,
                mismatch_alphabet="ACGT",
                quality=33,
                trim=None,
                filter_splitmaps=True,
                post_validate=True,
                threads=1,
                extra=None):
    """Run the GEM split mapper on ``input`` and return the mapping result.

    The mapper command line is assembled from the arguments below and
    executed through ``utils.run_tools``; the result is wrapped by
    ``_prepare_output``. With ``post_validate`` enabled, the wrapped result
    is additionally handed to ``validate`` together with the caller's
    ``output``, and the value returned by ``validate`` is returned.

    input -- reads to map; a gt.InputFile that reports a raw sequence
             stream is passed to the mapper by file name (``-i``)
    index -- GEM index, normalised by _prepare_index_parameter
    output -- output target; forwarded to the tool pipeline, or to
              validate() when post_validate is set
    mismatches -- value of the mapper's ``-m`` option
    splice_consensus -- normalised by _prepare_splice_consensus_parameter
                        and passed as ``-c`` unless the result is None
    filter -- value of ``-f``; skipped when None
    refinement_step_size -- value of ``--refinement-step-size``
    min_split_size -- value of ``--min-split-size``
    matches_threshold -- value of ``--matches-threshold``
    strata_after_first -- value of ``-s``
    mismatch_alphabet -- value of ``--mismatch-alphabet``
    quality -- quality setting; if None and input is a
               files.ReadIterator, the iterator's quality is used
    trim -- None, or a list/tuple of size 2 that is given to
            gemfilter.trim and adds a gem-2-gem step to the pipeline
    filter_splitmaps -- append the awk filter step to the pipeline
    post_validate -- run validate() on the mapper result
    threads -- value of ``-T``; the gem-2-gem step gets a count derived
               from threads / 2 (at least 1)
    extra -- additional parameters merged in by _extend_parameters

    Raises ValueError if trim is given but is not a list or tuple of
    size 2.
    """

    ## normalise index, quality and splice consensus
    index = _prepare_index_parameter(index, gem_suffix=True)
    if quality is None and isinstance(input, files.ReadIterator):
        # no explicit quality: take it over from the read iterator
        quality = input.quality
    quality = _prepare_quality_parameter(quality)
    consensus = _prepare_splice_consensus_parameter(splice_consensus)

    ## assemble the mapper command as flag/value pairs
    mapper_cmd = [executables['gem-rna-mapper']]
    for flag, value in (
            ('-I', index),
            ('-q', quality),
            ('-m', str(mismatches)),
            ('--min-split-size', str(min_split_size)),
            ('--refinement-step-size', str(refinement_step_size)),
            ('--matches-threshold', str(matches_threshold)),
            ('-s', str(strata_after_first)),
            ('--mismatch-alphabet', mismatch_alphabet),
            ('-T', str(threads)),
    ):
        mapper_cmd.extend((flag, value))
    half_threads = int(round(max(1, threads / 2)))

    if filter is not None:
        mapper_cmd.extend(("-f", filter))
    if consensus is not None:
        mapper_cmd.extend(("-c", consensus))

    ## extend with additional parameters
    _extend_parameters(mapper_cmd, extra)

    trim_cmd = [executables['gem-2-gem'], '-c', '-T', str(half_threads)]
    trimming = trim is not None
    if trimming:
        ## only a two element list/tuple is accepted
        if not (isinstance(trim, (list, tuple)) and len(trim) == 2):
            raise ValueError("Trim parameter has to be a list or a tuple of size 2")
        input = gemfilter.trim(input, trim[0], trim[1], append_label=True)

    ## mapper first, then the optional filter and trim steps
    pipeline = [mapper_cmd]
    if filter_splitmaps:
        pipeline.append(__awk_filter)
    if trimming:
        pipeline.append(trim_cmd)

    ## with post validation the pipeline itself runs with output=None and
    ## the caller's output target goes to validate() instead
    requested_output = output
    if post_validate:
        output = None

    if isinstance(input, gt.InputFile) and input.raw_sequence_stream():
        # the mapper reads the file itself; mapper_cmd is already part of
        # pipeline, so the in-place extension is picked up by run_tools
        mapper_cmd.extend(("-i", input.filename))
        input = None

    ## run the mapper
    process = utils.run_tools(pipeline, input=input, output=output, name="GEM-Split-Mapper", raw=False)
    splitmap_out = _prepare_output(process, output=output, quality=quality)

    if not post_validate:
        return splitmap_out
    return validate(splitmap_out, index, requested_output, threads=threads)
Exemple #2
0
def mapper(input, index, output=None,
           mismatches=0.04,
           delta=0,
           quality=33,
           quality_threshold=26,
           max_decoded_matches=20,
           min_decoded_strata=1,
           min_matched_bases=0.80,
           max_big_indel_length=15,
           max_edit_distance=0.20,
           mismatch_alphabet="ACGT",
           trim=None,
           unique_mapping=False,
           threads=1,
           extra=None,
           key_file=None,
           force_min_decoded_strata=False,
           compress=False
           ):
    """Start the GEM mapper on the given input.

    The mapper command line is assembled from the arguments below,
    optionally followed by an awk filter, a gem-2-gem step, a
    transcriptome-to-genome conversion and a compressor. The resulting
    pipeline is executed through ``utils.run_tools`` and its result is
    wrapped by ``_prepare_output``, which is what this function returns.

    input -- A ReadIterator with the input; a gt.InputFile that reports
             a raw sequence stream is passed to the mapper by file name (-i)
    output -- output file name
    index -- valid GEM2 index
    mismatches - number or % mismatches, default=0.04
    delta -- strata after best <number> (default=0)
    quality -- quality setting, normalised by _prepare_quality_parameter
               (default 33)
    quality_threshold <number> -- (default=26, that is e<=2e-3)
    max_edit_distance -- max edit distance, 0.20 per default; only passed
                         to the mapper (-e) when greater than 0
    max_decoded_matches -- maximum decoded matches, defaults to 20
    min_decoded_strata -- strata that are decoded fully (ignoring max decoded matches), defaults to 1
    min_matched_bases -- minimum number (or %) of matched bases, defaults to 0.80
    max_big_indel_length -- value of --max-big-indel-length, defaults to 15
    mismatch_alphabet -- value of --mismatch-alphabet, defaults to "ACGT"
    trim -- tuple or list that specifies left and right trimmings
    unique_mapping -- pass --unique-mapping and append the awk filter step
    threads -- value of -T; the conversion and compression steps use
               max(1, threads // 2)
    extra -- list of additional parameters added to gem mapper call
    key_file -- if given, the output is piped through the
                transcriptome-2-genome tool with this key file
    force_min_decoded_strata -- currently unused (see note in the body)
    compress -- append a compressor step and make sure the output name
                ends in ".gz"; switched off when output is None

    Raises ValueError if trim is given but is not a list or tuple of
    size 2.
    """

    ## prepare inputs
    index = _prepare_index_parameter(index)
    quality = _prepare_quality_parameter(quality, input)

    # NOTE: the automatic bump of min-decoded-strata to delta + 1 is
    # disabled, so force_min_decoded_strata is accepted but never read.
    if compress and output is None:
        logging.warning("Disabeling stream compression")
        compress = False

    if compress and not output.endswith(".gz"):
        output += ".gz"

    ## prepare the input
    pa = [executables['gem-mapper'], '-I', index,
          '-q', quality,
          '-m', str(mismatches),
          '-s', str(delta),
          '--max-decoded-matches', str(max_decoded_matches),
          '--min-decoded-strata', str(min_decoded_strata),
          '--min-matched-bases', str(min_matched_bases),
          '--gem-quality-threshold', str(quality_threshold),
          '--max-big-indel-length', str(max_big_indel_length),
          '--mismatch-alphabet', mismatch_alphabet,
          '-T', str(threads)
    ]

    if unique_mapping:
        pa.append("--unique-mapping")

    if max_edit_distance > 0:
        pa.extend(["-e", str(max_edit_distance)])

    ## extend with additional parameters
    _extend_parameters(pa, extra)

    trim_c = [executables['gem-2-gem'], '-c', '-T', str(threads)]
    if trim is not None:
        ## check type
        if not isinstance(trim, (list, tuple)) or len(trim) != 2:
            raise ValueError("Trim parameter has to be a list or a tuple of size 2")
        input = gemfilter.trim(input, trim[0], trim[1], append_label=True)

    # workaround for GT-32 - filter away the !
    # build list of tools
    tools = [pa]
    if unique_mapping:
        tools.append(__awk_filter)

    if trim is not None:
        tools.append(trim_c)

    # convert to genome coordinates if mapping to transcriptome
    if key_file is not None:
        # Floor division keeps the thread count an integer under Python 3
        # too; true division would put e.g. "2.0" on the command line.
        convert_to_genome = [executables['transcriptome-2-genome'], key_file,
                             str(max(1, threads // 2))]
        tools.append(convert_to_genome)

    if compress:
        # same reasoning: hand the compressor an integer thread count
        gzip = _compressor(threads=max(1, threads // 2))
        tools.append(gzip)

    raw = False
    if isinstance(input, gt.InputFile) and input.raw_sequence_stream():
        # Let the mapper read the file itself; pa is already part of tools,
        # so the in-place append is picked up by run_tools.
        pa.append("-i")
        pa.append(input.filename)
        input = None

    ## run the mapper
    process = utils.run_tools(tools, input=input, output=output, name="GEM-Mapper", raw=raw)
    return _prepare_output(process, output=output, quality=quality)
Exemple #3
0
def mapper(input,
           index,
           output=None,
           mismatches=0.04,
           delta=0,
           quality=33,
           quality_threshold=26,
           max_decoded_matches=20,
           min_decoded_strata=1,
           min_matched_bases=0.80,
           max_big_indel_length=15,
           max_edit_distance=0.20,
           mismatch_alphabet="ACGT",
           trim=None,
           unique_mapping=False,
           threads=1,
           extra=None,
           key_file=None,
           force_min_decoded_strata=False,
           compress=False):
    """Start the GEM mapper on the given input.

    The mapper command line is assembled from the arguments below,
    optionally followed by an awk filter, a gem-2-gem step, a
    transcriptome-to-genome conversion and a compressor. The resulting
    pipeline is executed through ``utils.run_tools`` and its result is
    wrapped by ``_prepare_output``, which is what this function returns.

    input -- A ReadIterator with the input; a gt.InputFile that reports
             a raw sequence stream is passed to the mapper by file name (-i)
    output -- output file name
    index -- valid GEM2 index
    mismatches - number or % mismatches, default=0.04
    delta -- strata after best <number> (default=0)
    quality -- quality setting, normalised by _prepare_quality_parameter
               (default 33)
    quality_threshold <number> -- (default=26, that is e<=2e-3)
    max_edit_distance -- max edit distance, 0.20 per default; only passed
                         to the mapper (-e) when greater than 0
    max_decoded_matches -- maximum decoded matches, defaults to 20
    min_decoded_strata -- strata that are decoded fully (ignoring max decoded matches), defaults to 1
    min_matched_bases -- minimum number (or %) of matched bases, defaults to 0.80
    max_big_indel_length -- value of --max-big-indel-length, defaults to 15
    mismatch_alphabet -- value of --mismatch-alphabet, defaults to "ACGT"
    trim -- tuple or list that specifies left and right trimmings
    unique_mapping -- pass --unique-mapping and append the awk filter step
    threads -- value of -T; the conversion and compression steps use
               max(1, threads // 2)
    extra -- list of additional parameters added to gem mapper call
    key_file -- if given, the output is piped through
                ``gem-rna-tools transcriptome-2-genome -k <key_file>``
    force_min_decoded_strata -- currently unused (see note in the body)
    compress -- append a compressor step and make sure the output name
                ends in ".gz"; switched off when output is None

    Raises ValueError if trim is given but is not a list or tuple of
    size 2.
    """

    ## prepare inputs
    index = _prepare_index_parameter(index)
    quality = _prepare_quality_parameter(quality, input)

    # NOTE: the automatic bump of min-decoded-strata to delta + 1 is
    # disabled, so force_min_decoded_strata is accepted but never read.
    if compress and output is None:
        logging.warning("Disabeling stream compression")
        compress = False

    if compress and not output.endswith(".gz"):
        output += ".gz"

    ## prepare the input
    pa = [
        executables['gem-mapper'], '-I', index, '-q', quality, '-m',
        str(mismatches), '-s',
        str(delta), '--max-decoded-matches',
        str(max_decoded_matches), '--min-decoded-strata',
        str(min_decoded_strata), '--min-matched-bases',
        str(min_matched_bases), '--gem-quality-threshold',
        str(quality_threshold), '--max-big-indel-length',
        str(max_big_indel_length), '--mismatch-alphabet', mismatch_alphabet,
        '-T',
        str(threads)
    ]

    if unique_mapping:
        pa.append("--unique-mapping")

    if max_edit_distance > 0:
        pa.extend(["-e", str(max_edit_distance)])

    ## extend with additional parameters
    _extend_parameters(pa, extra)

    trim_c = [executables['gem-2-gem'], '-c', '-T', str(threads)]
    if trim is not None:
        ## check type
        if not isinstance(trim, (list, tuple)) or len(trim) != 2:
            raise ValueError(
                "Trim parameter has to be a list or a tuple of size 2")
        input = gemfilter.trim(input, trim[0], trim[1], append_label=True)

    # workaround for GT-32 - filter away the !
    # build list of tools
    tools = [pa]
    if unique_mapping:
        tools.append(__awk_filter)

    if trim is not None:
        tools.append(trim_c)

    # convert to genome coordinates if mapping to transcriptome
    if key_file is not None:
        # Floor division keeps the thread count an integer under Python 3
        # too; true division would put e.g. "2.0" on the command line.
        convert_to_genome = [
            executables['gem-rna-tools'], 'transcriptome-2-genome', '-k',
            key_file, '--threads',
            str(max(1, threads // 2))
        ]
        tools.append(convert_to_genome)

    if compress:
        # same reasoning: hand the compressor an integer thread count
        gzip = _compressor(threads=max(1, threads // 2))
        tools.append(gzip)

    raw = False
    if isinstance(input, gt.InputFile) and input.raw_sequence_stream():
        # Let the mapper read the file itself; pa is already part of tools,
        # so the in-place append is picked up by run_tools.
        pa.append("-i")
        pa.append(input.filename)
        input = None

    ## run the mapper
    process = utils.run_tools(tools,
                              input=input,
                              output=output,
                              name="GEM-Mapper",
                              raw=raw)
    return _prepare_output(process, output=output, quality=quality)
Exemple #4
0
def splitmapper(input,
                index,
                output=None,
                mismatches=0.04,
                splice_consensus=extended_splice_consensus,
                filter=default_filter,
                refinement_step_size=2,
                min_split_size=15,
                matches_threshold=100,
                strata_after_first=1,
                mismatch_alphabet="ACGT",
                quality=33,
                trim=None,
                filter_splitmaps=True,
                post_validate=True,
                threads=1,
                extra=None):
    """Run ``gem-rna-tools split-mapper`` on ``input`` and return the result.

    The command line is assembled from the arguments below and executed
    through ``utils.run_tools``; the result is wrapped by
    ``_prepare_output``. With ``post_validate`` enabled, the wrapped result
    is additionally handed to ``validate`` together with the caller's
    ``output``, and the value returned by ``validate`` is returned.

    input -- reads to map; a gt.InputFile that reports a raw sequence
             stream is passed to the mapper by file name (``-i``)
    index -- GEM index, normalised by _prepare_index_parameter
    output -- output target; forwarded to the tool pipeline, or to
              validate() when post_validate is set
    mismatches -- value of the mapper's ``-m`` option
    splice_consensus -- normalised by _prepare_splice_consensus_parameter
                        and passed as ``-c`` unless the result is None
    filter -- value of ``-f``; skipped when None
    refinement_step_size -- value of ``--refinement-step-size``
    min_split_size -- value of ``--min-split-size``
    matches_threshold -- value of ``--matches-threshold``
    strata_after_first -- value of ``-s``
    mismatch_alphabet -- value of ``--mismatch-alphabet``
    quality -- quality setting; if None and input is a
               files.ReadIterator, the iterator's quality is used
    trim -- None, or a list/tuple of size 2 that is given to
            gemfilter.trim and adds a gem-2-gem step to the pipeline
    filter_splitmaps -- append the awk filter step to the pipeline
    post_validate -- run validate() on the mapper result
    threads -- value of ``-T``; the gem-2-gem step gets a count derived
               from threads / 2 (at least 1)
    extra -- additional parameters merged in by _extend_parameters

    Raises ValueError if trim is given but is not a list or tuple of
    size 2.
    """

    ## normalise index, quality and splice consensus
    index = _prepare_index_parameter(index, gem_suffix=True)
    if quality is None and isinstance(input, files.ReadIterator):
        # no explicit quality: take it over from the read iterator
        quality = input.quality
    quality = _prepare_quality_parameter(quality)
    consensus = _prepare_splice_consensus_parameter(splice_consensus)

    ## assemble the split-mapper command as flag/value pairs
    mapper_cmd = [executables['gem-rna-tools'], 'split-mapper']
    for flag, value in (
            ('-I', index),
            ('-q', quality),
            ('-m', str(mismatches)),
            ('--min-split-size', str(min_split_size)),
            ('--refinement-step-size', str(refinement_step_size)),
            ('--matches-threshold', str(matches_threshold)),
            ('-s', str(strata_after_first)),
            ('--mismatch-alphabet', mismatch_alphabet),
            ('-T', str(threads)),
    ):
        mapper_cmd.extend((flag, value))
    half_threads = int(round(max(1, threads / 2)))

    if filter is not None:
        mapper_cmd.extend(("-f", filter))
    if consensus is not None:
        mapper_cmd.extend(("-c", consensus))

    ## extend with additional parameters
    _extend_parameters(mapper_cmd, extra)

    trim_cmd = [executables['gem-2-gem'], '-c', '-T', str(half_threads)]
    trimming = trim is not None
    if trimming:
        ## only a two element list/tuple is accepted
        if not (isinstance(trim, (list, tuple)) and len(trim) == 2):
            raise ValueError(
                "Trim parameter has to be a list or a tuple of size 2")
        input = gemfilter.trim(input, trim[0], trim[1], append_label=True)

    ## mapper first, then the optional filter and trim steps
    pipeline = [mapper_cmd]
    if filter_splitmaps:
        pipeline.append(__awk_filter)
    if trimming:
        pipeline.append(trim_cmd)

    ## with post validation the pipeline itself runs with output=None and
    ## the caller's output target goes to validate() instead
    requested_output = output
    if post_validate:
        output = None

    if isinstance(input, gt.InputFile) and input.raw_sequence_stream():
        # the mapper reads the file itself; mapper_cmd is already part of
        # pipeline, so the in-place extension is picked up by run_tools
        mapper_cmd.extend(("-i", input.filename))
        input = None

    ## run the mapper
    process = utils.run_tools(pipeline,
                              input=input,
                              output=output,
                              name="GEM-Split-Mapper",
                              raw=False)
    splitmap_out = _prepare_output(process, output=output, quality=quality)

    if not post_validate:
        return splitmap_out
    return validate(splitmap_out, index, requested_output, threads=threads)
Exemple #5
0
def test_fastq_trim_both():
    """Trim the reads_1.fastq test reads with both trim arguments set to 10
    and check the summed length of the resulting reads."""
    trimmed_reads = filter.trim(files.open(testfiles["reads_1.fastq"]), 10, 10)
    total_length = 0
    for read in trimmed_reads:
        total_length += read.length
    # report the actual total when the expectation does not hold
    assert total_length == 550000, total_length