def splitmapper(input,
                index,
                output=None,
                mismatches=0.04,
                splice_consensus=extended_splice_consensus,
                filter=default_filter,
                refinement_step_size=2,
                min_split_size=15,
                matches_threshold=100,
                strata_after_first=1,
                mismatch_alphabet="ACGT",
                quality=33,
                trim=None,
                filter_splitmaps=True,
                post_validate=True,
                threads=1,
                extra=None):
    """Run the GEM split mapper on the given input.

    The 'gem-rna-mapper' command line is assembled from the arguments,
    optionally followed by the split-map awk filter and a 'gem-2-gem'
    step (only when trimming), and the resulting pipeline is started
    through utils.run_tools. With post_validate set, the pipeline is run
    without an output target and its result is handed to validate()
    together with the requested output.

    input -- the reads to map; a gt.InputFile that reports a raw
             sequence stream is passed to the mapper by file name (-i)
    index -- GEM index, resolved by _prepare_index_parameter
    output -- output target; only forwarded to validate() when
              post_validate is set
    mismatches -- value for -m
    splice_consensus -- resolved by _prepare_splice_consensus_parameter
                        and passed with -c unless the result is None
    filter -- value for -f, skipped when None
    refinement_step_size -- value for --refinement-step-size
    min_split_size -- value for --min-split-size
    matches_threshold -- value for --matches-threshold
    strata_after_first -- value for -s
    mismatch_alphabet -- value for --mismatch-alphabet
    quality -- quality setting; when None and the input is a
               files.ReadIterator, the quality of the input is used
    trim -- list or tuple of size 2 handed to gemfilter.trim
    filter_splitmaps -- append the awk filter to the pipeline
    post_validate -- run validate() on the split mapper result
    threads -- value for -T; the gem-2-gem step gets
               int(round(max(1, threads / 2)))
    extra -- additional parameters, added by _extend_parameters

    Raises ValueError if trim is given but is not a list or tuple of
    size 2.
    """
    ## resolve index, quality and splice consensus
    index = _prepare_index_parameter(index, gem_suffix=True)
    if quality is None and isinstance(input, files.ReadIterator):
        quality = input.quality
    quality = _prepare_quality_parameter(quality)
    consensus = _prepare_splice_consensus_parameter(splice_consensus)

    ## split mapper command line
    mapper_cmd = [
        executables['gem-rna-mapper'],
        '-I', index,
        '-q', quality,
        '-m', str(mismatches),
        '--min-split-size', str(min_split_size),
        '--refinement-step-size', str(refinement_step_size),
        '--matches-threshold', str(matches_threshold),
        '-s', str(strata_after_first),
        '--mismatch-alphabet', mismatch_alphabet,
        '-T', str(threads)
    ]
    helper_threads = int(round(max(1, threads / 2)))

    if filter is not None:
        mapper_cmd.extend(["-f", filter])
    if consensus is not None:
        mapper_cmd.extend(["-c", consensus])

    ## user supplied parameters go after the generated ones
    _extend_parameters(mapper_cmd, extra)

    trim_cmd = [executables['gem-2-gem'], '-c', '-T', str(helper_threads)]
    trimming = trim is not None
    if trimming:
        ## trim must be a (left, right) style pair
        if not (isinstance(trim, (list, tuple)) and len(trim) == 2):
            raise ValueError("Trim parameter has to be a list or a tuple of size 2")
        input = gemfilter.trim(input, trim[0], trim[1], append_label=True)

    ## assemble the pipeline
    pipeline = [mapper_cmd]
    if filter_splitmaps:
        pipeline.append(__awk_filter)
    if trimming:
        pipeline.append(trim_cmd)

    ## with post validation the pipeline gets no output target, the
    ## requested output is passed to validate() instead
    pipeline_output = None if post_validate else output

    raw = False
    if isinstance(input, gt.InputFile) and input.raw_sequence_stream():
        ## let the mapper read the file itself (in-place extension, so
        ## the pipeline sees the additional arguments)
        mapper_cmd += ["-i", input.filename]
        input = None

    ## run the mapper
    process = utils.run_tools(pipeline, input=input, output=pipeline_output,
                              name="GEM-Split-Mapper", raw=raw)
    splitmap_out = _prepare_output(process, output=pipeline_output, quality=quality)

    if not post_validate:
        return splitmap_out
    return validate(splitmap_out, index, output, threads=threads)
def mapper(input, index, output=None,
           mismatches=0.04,
           delta=0,
           quality=33,
           quality_threshold=26,
           max_decoded_matches=20,
           min_decoded_strata=1,
           min_matched_bases=0.80,
           max_big_indel_length=15,
           max_edit_distance=0.20,
           mismatch_alphabet="ACGT",
           trim=None,
           unique_mapping=False,
           threads=1,
           extra=None,
           key_file=None,
           force_min_decoded_strata=False,
           compress=False
           ):
    """Start the GEM mapper on the given input.

    The 'gem-mapper' command line is assembled from the arguments and
    run through utils.run_tools, optionally followed by the awk filter
    (unique mapping), a 'gem-2-gem' step (trimming), the
    transcriptome-to-genome conversion (key_file) and a compressor
    (compress).

    input -- the reads to map; a gt.InputFile that reports a raw
             sequence stream is passed to the mapper by file name (-i)
    output -- output file name; with compress set, ".gz" is appended
              unless the name already ends with it
    index -- GEM index, resolved by _prepare_index_parameter
    mismatches -- number or % mismatches, default=0.04 (-m)
    delta -- strata after best <number> (default=0) (-s)
    quality -- quality setting, resolved by _prepare_quality_parameter
               together with the input (default=33)
    quality_threshold -- value for --gem-quality-threshold (default=26)
    max_edit_distance -- max edit distance, 0.20 per default; only
                         passed (-e) when greater than 0
    max_decoded_matches -- maximum decoded matches, defaults to 20
    min_decoded_strata -- value for --min-decoded-strata, defaults to 1
    min_matched_bases -- minimum number (or %) of matched bases,
                         defaults to 0.80
    max_big_indel_length -- value for --max-big-indel-length
    mismatch_alphabet -- value for --mismatch-alphabet
    trim -- tuple or list of size 2 handed to gemfilter.trim
    unique_mapping -- pass --unique-mapping and append the awk filter
    threads -- value for -T; the conversion and compression steps get
               max(1, threads // 2)
    extra -- list of additional parameters added to gem mapper call
    key_file -- if given, the 'transcriptome-2-genome' executable is
                appended to the pipeline with this key file
    force_min_decoded_strata -- currently unused (see note below)
    compress -- compress the output; disabled with a warning when no
                output is given

    Returns the result of _prepare_output for the started process.
    Raises ValueError if trim is given but is not a list or tuple of
    size 2.
    """
    ## prepare inputs
    index = _prepare_index_parameter(index)
    quality = _prepare_quality_parameter(quality, input)

    # NOTE: the automatic adjustment of min-decoded-strata to delta + 1
    # is disabled, so force_min_decoded_strata has no effect here.

    if compress and output is None:
        logging.warning("Disabeling stream compression")
        compress = False

    if compress and not output.endswith(".gz"):
        output += ".gz"

    ## prepare the input
    pa = [executables['gem-mapper'],
          '-I', index,
          '-q', quality,
          '-m', str(mismatches),
          '-s', str(delta),
          '--max-decoded-matches', str(max_decoded_matches),
          '--min-decoded-strata', str(min_decoded_strata),
          '--min-matched-bases', str(min_matched_bases),
          '--gem-quality-threshold', str(quality_threshold),
          '--max-big-indel-length', str(max_big_indel_length),
          '--mismatch-alphabet', mismatch_alphabet,
          '-T', str(threads)
          ]

    if unique_mapping:
        pa.append("--unique-mapping")

    if max_edit_distance > 0:
        pa.append("-e")
        pa.append(str(max_edit_distance))

    ## extend with additional parameters
    _extend_parameters(pa, extra)

    trim_c = [executables['gem-2-gem'], '-c', '-T', str(threads)]
    if trim is not None:
        ## check type
        if not isinstance(trim, (list, tuple)) or len(trim) != 2:
            raise ValueError("Trim parameter has to be a list or a tuple of size 2")
        input = gemfilter.trim(input, trim[0], trim[1], append_label=True)

    # workaround for GT-32 - filter away the !
    # build list of tools
    tools = [pa]
    if unique_mapping:
        tools.append(__awk_filter)

    if trim is not None:
        tools.append(trim_c)

    # convert to genome coordinates if mapping to transcriptome
    if key_file is not None:
        # floor division keeps the thread count an integer on Python 3
        # ("1" instead of "1.5"); identical to "/" for ints on Python 2
        convert_to_genome = [executables['transcriptome-2-genome'],
                             key_file,
                             str(max(1, threads // 2))]
        tools.append(convert_to_genome)

    if compress:
        # same integer thread count rule as for the conversion step
        gzip = _compressor(threads=max(1, threads // 2))
        tools.append(gzip)

    raw = False
    if isinstance(input, gt.InputFile) and input.raw_sequence_stream():
        # let the mapper read the file itself instead of streaming it
        pa.append("-i")
        pa.append(input.filename)
        input = None

    ## run the mapper
    process = utils.run_tools(tools, input=input, output=output,
                              name="GEM-Mapper", raw=raw)
    return _prepare_output(process, output=output, quality=quality)
def mapper(input,
           index,
           output=None,
           mismatches=0.04,
           delta=0,
           quality=33,
           quality_threshold=26,
           max_decoded_matches=20,
           min_decoded_strata=1,
           min_matched_bases=0.80,
           max_big_indel_length=15,
           max_edit_distance=0.20,
           mismatch_alphabet="ACGT",
           trim=None,
           unique_mapping=False,
           threads=1,
           extra=None,
           key_file=None,
           force_min_decoded_strata=False,
           compress=False):
    """Start the GEM mapper on the given input.

    The 'gem-mapper' command line is assembled from the arguments and
    run through utils.run_tools, optionally followed by the awk filter
    (unique mapping), a 'gem-2-gem' step (trimming), the
    'gem-rna-tools transcriptome-2-genome' conversion (key_file) and a
    compressor (compress).

    input -- the reads to map; a gt.InputFile that reports a raw
             sequence stream is passed to the mapper by file name (-i)
    output -- output file name; with compress set, ".gz" is appended
              unless the name already ends with it
    index -- GEM index, resolved by _prepare_index_parameter
    mismatches -- number or % mismatches, default=0.04 (-m)
    delta -- strata after best <number> (default=0) (-s)
    quality -- quality setting, resolved by _prepare_quality_parameter
               together with the input (default=33)
    quality_threshold -- value for --gem-quality-threshold (default=26)
    max_edit_distance -- max edit distance, 0.20 per default; only
                         passed (-e) when greater than 0
    max_decoded_matches -- maximum decoded matches, defaults to 20
    min_decoded_strata -- value for --min-decoded-strata, defaults to 1
    min_matched_bases -- minimum number (or %) of matched bases,
                         defaults to 0.80
    max_big_indel_length -- value for --max-big-indel-length
    mismatch_alphabet -- value for --mismatch-alphabet
    trim -- tuple or list of size 2 handed to gemfilter.trim
    unique_mapping -- pass --unique-mapping and append the awk filter
    threads -- value for -T; the conversion and compression steps get
               max(1, threads // 2)
    extra -- list of additional parameters added to gem mapper call
    key_file -- if given, 'gem-rna-tools transcriptome-2-genome' is
                appended to the pipeline with this key file (-k)
    force_min_decoded_strata -- currently unused (see note below)
    compress -- compress the output; disabled with a warning when no
                output is given

    Returns the result of _prepare_output for the started process.
    Raises ValueError if trim is given but is not a list or tuple of
    size 2.
    """
    ## prepare inputs
    index = _prepare_index_parameter(index)
    quality = _prepare_quality_parameter(quality, input)

    # NOTE: the automatic adjustment of min-decoded-strata to delta + 1
    # is disabled, so force_min_decoded_strata has no effect here.

    if compress and output is None:
        logging.warning("Disabeling stream compression")
        compress = False

    if compress and not output.endswith(".gz"):
        output += ".gz"

    ## prepare the input
    pa = [
        executables['gem-mapper'],
        '-I', index,
        '-q', quality,
        '-m', str(mismatches),
        '-s', str(delta),
        '--max-decoded-matches', str(max_decoded_matches),
        '--min-decoded-strata', str(min_decoded_strata),
        '--min-matched-bases', str(min_matched_bases),
        '--gem-quality-threshold', str(quality_threshold),
        '--max-big-indel-length', str(max_big_indel_length),
        '--mismatch-alphabet', mismatch_alphabet,
        '-T', str(threads)
    ]

    if unique_mapping:
        pa.append("--unique-mapping")

    if max_edit_distance > 0:
        pa.append("-e")
        pa.append(str(max_edit_distance))

    ## extend with additional parameters
    _extend_parameters(pa, extra)

    trim_c = [executables['gem-2-gem'], '-c', '-T', str(threads)]
    if trim is not None:
        ## check type
        if not isinstance(trim, (list, tuple)) or len(trim) != 2:
            raise ValueError(
                "Trim parameter has to be a list or a tuple of size 2")
        input = gemfilter.trim(input, trim[0], trim[1], append_label=True)

    # workaround for GT-32 - filter away the !
    # build list of tools
    tools = [pa]
    if unique_mapping:
        tools.append(__awk_filter)

    if trim is not None:
        tools.append(trim_c)

    # convert to genome coordinates if mapping to transcriptome
    if key_file is not None:
        # floor division keeps the thread count an integer on Python 3
        # ("1" instead of "1.5"); identical to "/" for ints on Python 2
        convert_to_genome = [
            executables['gem-rna-tools'],
            'transcriptome-2-genome',
            '-k', key_file,
            '--threads', str(max(1, threads // 2))
        ]
        tools.append(convert_to_genome)

    if compress:
        # same integer thread count rule as for the conversion step
        gzip = _compressor(threads=max(1, threads // 2))
        tools.append(gzip)

    raw = False
    if isinstance(input, gt.InputFile) and input.raw_sequence_stream():
        # let the mapper read the file itself instead of streaming it
        pa.append("-i")
        pa.append(input.filename)
        input = None

    ## run the mapper
    process = utils.run_tools(tools, input=input, output=output,
                              name="GEM-Mapper", raw=raw)
    return _prepare_output(process, output=output, quality=quality)
def splitmapper(input,
                index,
                output=None,
                mismatches=0.04,
                splice_consensus=extended_splice_consensus,
                filter=default_filter,
                refinement_step_size=2,
                min_split_size=15,
                matches_threshold=100,
                strata_after_first=1,
                mismatch_alphabet="ACGT",
                quality=33,
                trim=None,
                filter_splitmaps=True,
                post_validate=True,
                threads=1,
                extra=None):
    """Run the GEM split mapper on the given input.

    The 'gem-rna-tools split-mapper' command line is assembled from the
    arguments, optionally followed by the split-map awk filter and a
    'gem-2-gem' step (only when trimming), and the resulting pipeline is
    started through utils.run_tools. With post_validate set, the
    pipeline is run without an output target and its result is handed
    to validate() together with the requested output.

    input -- the reads to map; a gt.InputFile that reports a raw
             sequence stream is passed to the mapper by file name (-i)
    index -- GEM index, resolved by _prepare_index_parameter
    output -- output target; only forwarded to validate() when
              post_validate is set
    mismatches -- value for -m
    splice_consensus -- resolved by _prepare_splice_consensus_parameter
                        and passed with -c unless the result is None
    filter -- value for -f, skipped when None
    refinement_step_size -- value for --refinement-step-size
    min_split_size -- value for --min-split-size
    matches_threshold -- value for --matches-threshold
    strata_after_first -- value for -s
    mismatch_alphabet -- value for --mismatch-alphabet
    quality -- quality setting; when None and the input is a
               files.ReadIterator, the quality of the input is used
    trim -- list or tuple of size 2 handed to gemfilter.trim
    filter_splitmaps -- append the awk filter to the pipeline
    post_validate -- run validate() on the split mapper result
    threads -- value for -T; the gem-2-gem step gets
               int(round(max(1, threads / 2)))
    extra -- additional parameters, added by _extend_parameters

    Raises ValueError if trim is given but is not a list or tuple of
    size 2.
    """
    ## resolve index, quality and splice consensus
    index = _prepare_index_parameter(index, gem_suffix=True)
    if quality is None and isinstance(input, files.ReadIterator):
        quality = input.quality
    quality = _prepare_quality_parameter(quality)
    consensus = _prepare_splice_consensus_parameter(splice_consensus)

    ## split mapper command line
    mapper_cmd = [
        executables['gem-rna-tools'],
        'split-mapper',
        '-I', index,
        '-q', quality,
        '-m', str(mismatches),
        '--min-split-size', str(min_split_size),
        '--refinement-step-size', str(refinement_step_size),
        '--matches-threshold', str(matches_threshold),
        '-s', str(strata_after_first),
        '--mismatch-alphabet', mismatch_alphabet,
        '-T', str(threads)
    ]
    helper_threads = int(round(max(1, threads / 2)))

    if filter is not None:
        mapper_cmd.extend(["-f", filter])
    if consensus is not None:
        mapper_cmd.extend(["-c", consensus])

    ## user supplied parameters go after the generated ones
    _extend_parameters(mapper_cmd, extra)

    trim_cmd = [executables['gem-2-gem'], '-c', '-T', str(helper_threads)]
    trimming = trim is not None
    if trimming:
        ## trim must be a (left, right) style pair
        if not (isinstance(trim, (list, tuple)) and len(trim) == 2):
            raise ValueError(
                "Trim parameter has to be a list or a tuple of size 2")
        input = gemfilter.trim(input, trim[0], trim[1], append_label=True)

    ## assemble the pipeline
    pipeline = [mapper_cmd]
    if filter_splitmaps:
        pipeline.append(__awk_filter)
    if trimming:
        pipeline.append(trim_cmd)

    ## with post validation the pipeline gets no output target, the
    ## requested output is passed to validate() instead
    pipeline_output = None if post_validate else output

    raw = False
    if isinstance(input, gt.InputFile) and input.raw_sequence_stream():
        ## let the mapper read the file itself (in-place extension, so
        ## the pipeline sees the additional arguments)
        mapper_cmd += ["-i", input.filename]
        input = None

    ## run the mapper
    process = utils.run_tools(pipeline, input=input, output=pipeline_output,
                              name="GEM-Split-Mapper", raw=raw)
    splitmap_out = _prepare_output(process, output=pipeline_output, quality=quality)

    if not post_validate:
        return splitmap_out
    return validate(splitmap_out, index, output, threads=threads)
def test_fastq_trim_both():
    # Open the reads_1.fastq test file and run it through filter.trim
    # with 10 and 10 as trim arguments.
    reads = files.open(testfiles["reads_1.fastq"])
    trimmed_reads = filter.trim(reads, 10, 10)

    # Add up the length reported by every read that comes out.
    sum_length = 0
    for read in trimmed_reads:
        sum_length += read.length

    # The total is reported as the assertion message on failure.
    assert sum_length == 550000, sum_length