Example #1
0
def action(arguments):
    """
    Trim the alignment as specified by the parsed command-line arguments.
    """
    # Fall back to sniffing the format from the handle when none was given.
    in_fmt = (arguments.source_format or
              fileformat.from_handle(arguments.source_file))
    out_fmt = (arguments.output_format or
               fileformat.from_handle(arguments.output_file))

    gapped = Alphabet.Gapped(Alphabet.single_letter_alphabet)

    def read_alignment():
        # Parse the source handle from its current position.
        return SeqIO.parse(arguments.source_file, in_fmt, alphabet=gapped)

    with arguments.source_file:
        # First pass: find where the primers sit in the alignment.
        fwd, rev = locate_primers(
            read_alignment(), arguments.forward_primer,
            arguments.reverse_primer, arguments.reverse_complement,
            arguments.max_hamming_distance)
        forward_start, forward_end = fwd
        reverse_start, reverse_end = rev

        # Turn the primer locations into a half-open slice [start, end).
        if arguments.include_primers:
            start, end = forward_start, reverse_end + 1
        else:
            start, end = forward_end + 1, reverse_start

        # Second pass: the parse iterator is spent, so rewind and re-parse.
        arguments.source_file.seek(0)
        trimmed = _ACTIONS[arguments.prune_action](read_alignment(),
                                                   start, end)

        with arguments.output_file:
            SeqIO.write(trimmed, arguments.output_file, out_fmt)
Example #2
0
def action(arguments):
    """
    Trim the alignment as specified.
    """
    # Pick explicit formats when given; otherwise sniff them from the
    # handles.
    source_format = arguments.source_format
    if not source_format:
        source_format = fileformat.from_handle(arguments.source_file)
    output_format = arguments.output_format
    if not output_format:
        output_format = fileformat.from_handle(arguments.output_file)

    with arguments.source_file:
        # Parse once to locate the primers.
        records = SeqIO.parse(
            arguments.source_file, source_format,
            alphabet=Alphabet.Gapped(Alphabet.single_letter_alphabet))
        locations = locate_primers(
            records, arguments.forward_primer, arguments.reverse_primer,
            arguments.reverse_complement, arguments.max_hamming_distance)
        (fwd_start, fwd_end), (rev_start, rev_end) = locations

        # Half-open slice boundaries, optionally keeping the primers.
        if arguments.include_primers:
            start, end = fwd_start, rev_end + 1
        else:
            start, end = fwd_end + 1, rev_start

        # The first parse exhausted the iterator; rewind and parse again.
        arguments.source_file.seek(0)
        records = SeqIO.parse(
            arguments.source_file, source_format,
            alphabet=Alphabet.Gapped(Alphabet.single_letter_alphabet))

        # Apply the selected prune action and write the result out.
        prune = _ACTIONS[arguments.prune_action]
        trimmed = prune(records, start, end)
        with arguments.output_file:
            SeqIO.write(trimmed, arguments.output_file, output_format)
Example #3
0
def action(arguments):
    """
    Run: map the protein alignment onto the nucleotide sequences and
    write the result (presumably a codon alignment — see AlignmentMapper).
    """
    # Ignore SIGPIPE so piping into tools such as `head` exits cleanly.
    common.exit_on_sigpipe()
    logging.basicConfig()

    def parse(handle):
        # Sniff the format from the handle, then parse it.
        return SeqIO.parse(handle, fileformat.from_handle(handle))

    prot = parse(arguments.protein_align)
    nucl = parse(arguments.nucl_align)

    mapper = AlignmentMapper(TRANSLATION_TABLES[arguments.translation_table],
                             arguments.fail_action)

    SeqIO.write(mapper.map_all(prot, nucl), arguments.out_file,
                fileformat.from_filename(arguments.out_file.name))
Example #4
0
def action(arguments):
    """
    Run the alignment-mapping action.
    """
    # Ignore SIGPIPE, for head support
    common.exit_on_sigpipe()
    logging.basicConfig()

    protein_handle = arguments.protein_align
    nucl_handle = arguments.nucl_align

    # Both input formats are sniffed directly from the handles.
    prot_sequences = SeqIO.parse(protein_handle,
                                 fileformat.from_handle(protein_handle))
    nucl_sequences = SeqIO.parse(nucl_handle,
                                 fileformat.from_handle(nucl_handle))

    table = TRANSLATION_TABLES[arguments.translation_table]
    instance = AlignmentMapper(table, arguments.fail_action)

    # The output format is derived from the output file's name.
    out_format = fileformat.from_filename(arguments.out_file.name)
    SeqIO.write(instance.map_all(prot_sequences, nucl_sequences),
                arguments.out_file, out_format)
Example #5
0
def action(arguments):
    """
    Write the ID (or full description) of each input sequence, one per
    line, to the output file.
    """
    common.exit_on_sigpipe()

    # Determine file format for input, sniffing from the handle when not
    # specified explicitly.
    source_format = (arguments.input_format
                     or fileformat.from_handle(arguments.sequence_file))

    with arguments.sequence_file:
        sequences = SeqIO.parse(arguments.sequence_file, source_format)
        if arguments.include_description:
            ids = (sequence.description for sequence in sequences)
        else:
            ids = (sequence.id for sequence in sequences)
        with arguments.output_file:
            for i in ids:
                # BUG FIX: was the Python 2 statement
                # ``print >> arguments.output_file, i``, which is a syntax
                # error under Python 3; use the print() function instead.
                print(i, file=arguments.output_file)
Example #6
0
def action(arguments):
    """
    Emit the identifier (or full description) of every input sequence,
    one per line, to the output file.
    """
    common.exit_on_sigpipe()

    # Use the explicit input format if given, otherwise sniff the handle.
    fmt = (arguments.input_format or
           fileformat.from_handle(arguments.sequence_file))

    with arguments.sequence_file:
        records = SeqIO.parse(arguments.sequence_file, fmt)
        # Choose which attribute to report once, up front.
        attr = 'description' if arguments.include_description else 'id'
        labels = (getattr(record, attr) for record in records)
        with arguments.output_file:
            for label in labels:
                print(label, file=arguments.output_file)
Example #7
0
def summarize_sequence_file(source_file, file_type=None):
    """
    Summarizes a sequence file, returning a tuple containing the name,
    whether the file is an alignment ('TRUE'/'FALSE'), minimum sequence
    length, maximum sequence length, average length, and the number of
    sequences.
    """
    count = 0
    running_mean = None
    shortest = sys.maxsize  # seed so the first length always wins
    longest = 0
    aligned = True

    # Stream the records, updating all statistics in one pass.
    with common.FileType('rt')(source_file) as handle:
        if not file_type:
            file_type = fileformat.from_handle(handle)
        for record in SeqIO.parse(handle, file_type):
            count += 1
            length = len(record)

            # A length differing from any earlier one means the file
            # cannot be an alignment.
            if longest and length != longest:
                aligned = False

            longest = max(longest, length)
            shortest = min(shortest, length)

            # Streaming mean: fold each new length into the average.
            if count == 1:
                running_mean = float(length)
            else:
                running_mean += (length - running_mean) / count

    # An empty file has no lengths at all.
    if running_mean is None:
        shortest = longest = running_mean = 0
    # Zero or one sequence is never reported as an alignment.
    if count <= 1:
        aligned = False

    return (source_file, str(aligned).upper(), shortest, longest,
            running_mean, count)
Example #8
0
def summarize_sequence_file(source_file, file_type=None):
    """
    Summarizes a sequence file, returning a tuple containing the name,
    whether the file is an alignment ('TRUE'/'FALSE'), minimum sequence
    length, maximum sequence length, average length, number of sequences.
    """
    is_alignment = True
    avg_length = None
    # BUG FIX: ``sys.maxint`` was removed in Python 3; ``sys.maxsize`` is
    # the portable seed for a running minimum.
    min_length = sys.maxsize
    max_length = 0
    sequence_count = 0

    # Get an iterator and analyze the data.
    # NOTE(review): opened in text mode ('rt', was 'rb') for consistency
    # with the sibling implementation of this function; presumably the
    # parsers expect a text handle under Python 3 — confirm against
    # common.FileType / fileformat.from_handle.
    with common.FileType('rt')(source_file) as fp:
        if not file_type:
            file_type = fileformat.from_handle(fp)
        for record in SeqIO.parse(fp, file_type):
            sequence_count += 1
            sequence_length = len(record)
            if max_length != 0:
                # If even one sequence is not the same length as the others,
                # we don't consider this an alignment.
                if sequence_length != max_length:
                    is_alignment = False

            # Track the minimum and maximum lengths seen so far.
            if sequence_length > max_length:
                max_length = sequence_length
            if sequence_length < min_length:
                min_length = sequence_length

            # Streaming average: fold each new length into the mean.
            if sequence_count == 1:
                avg_length = float(sequence_length)
            else:
                avg_length = avg_length + (
                    (sequence_length - avg_length) / sequence_count)

    # Handle an empty file:
    if avg_length is None:
        min_length = max_length = avg_length = 0
    # Zero or one sequence is never reported as an alignment.
    if sequence_count <= 1:
        is_alignment = False

    return (source_file, str(is_alignment).upper(), min_length, max_length,
            avg_length, sequence_count)
Example #9
0
def transform_file(source_file, destination_file, arguments):
    """
    Stream sequences from source_file through the sort, transform,
    deduplication and apply-function steps selected in ``arguments``,
    writing the result to destination_file.
    """
    # Determine input/output file types, sniffing from the handle when no
    # explicit format was given.
    source_file_type = (arguments.input_format or from_handle(source_file))

    destination_file_type = (arguments.output_format or
            from_handle(destination_file))

    # Get an iterator over the records: either pre-sorted or raw.
    sorters = {'length': transform.sort_length,
               'name': transform.sort_name,}
    directions = {'asc': 1, 'desc': 0}
    if arguments.sort:
        # Sorted iterator; --sort values look like 'length-asc', 'name-desc'.
        key, direction = arguments.sort.split('-')
        records = sorters[key](source_file=source_file,
                source_file_type=source_file_type,
                direction=directions[direction])
    else:
        # Unsorted iterator.
        records = SeqIO.parse(source_file, source_file_type,
                alphabet=ALPHABETS.get(arguments.alphabet))


    #########################################
    # Apply generator functions to iterator.#
    #########################################

    # Apply all the transform functions in transforms
    if arguments.transforms:

        # Special case handling for --cut and --relative-to.
        # NOTE: this mutates arguments.transforms in place, replacing the
        # multi_cut/multi_mask partial with its *_relative counterpart
        # (same position, same keywords, plus the reference record id).
        if arguments.cut_relative:
            for o, n in ((transform.multi_cut_sequences,
                          transform.cut_sequences_relative),
                         (transform.multi_mask_sequences,
                          transform.mask_sequences_relative)):
                # Find the partial wrapping function `o`, if present;
                # otherwise leave this pair alone.
                try:
                    f = next(f for f in arguments.transforms
                             if f.func == o)
                except StopIteration:
                    continue
                i = arguments.transforms.index(f)
                arguments.transforms.pop(i)
                arguments.transforms.insert(i,
                        functools.partial(n,
                            record_id=arguments.cut_relative, **f.keywords))

        # Chain each transform around the record iterator, in order.
        for function in arguments.transforms:
            records = function(records)

    # None means "deduplicate with default settings" here, so it is
    # deliberately treated the same as a truthy value.
    if (arguments.deduplicate_sequences or
            arguments.deduplicate_sequences is None):
        records = transform.deduplicate_sequences(
            records, arguments.deduplicate_sequences)

    # Apply all the partial functions
    if arguments.apply_function:
        for apply_function in arguments.apply_function:
            records = apply_function(records)

    # Only the fasta format is supported, as SeqIO.write does not have a 'wrap'
    # parameter.
    if (arguments.line_wrap is not None and destination_file_type == 'fasta'):
        logging.info("Attempting to write fasta with %d line breaks.",
                arguments.line_wrap)

        with destination_file:
            writer = FastaIO.FastaWriter(
                destination_file, wrap=arguments.line_wrap)
            writer.write_file(records)
    else:
        # Mogrify requires writing all changes to a temporary file by default,
        # but convert uses a destination file instead if one was specified. Get
        # sequences from an iterator that has generator functions wrapping it.
        # After creation, it is then copied back over the original file if all
        # tasks finish up without an exception being thrown.  This avoids
        # loading the entire sequence file up into memory.
        logging.info("Applying transformations, writing to %s",
                destination_file)
        SeqIO.write(records, destination_file, destination_file_type)
Example #10
0
def action(arguments):
    """
    Given parsed arguments, filter input files.

    Builds a chain of record filters from the command-line options,
    streams the input sequences through it, writes the survivors to the
    output file, and finally emits a tab-delimited per-filter report.
    """
    # A window mean-quality threshold is meaningless without a window size.
    if arguments.quality_window_mean_qual and not arguments.quality_window:
        raise ValueError("--quality-window-mean-qual specified without "
                "--quality-window")

    # Barcode matching below relies on the optional Bio.trie/Bio.triefind
    # modules, imported elsewhere in this file.
    if trie is None or triefind is None:
        raise ValueError('Missing Bio.trie and/or Bio.triefind modules. Cannot continue')

    filters = []
    input_type = fileformat.from_handle(arguments.sequence_file)
    output_type = fileformat.from_handle(arguments.output_file)
    with arguments.sequence_file as fp:
        if arguments.input_qual:
            # Pair the FASTA input with a separate quality file.
            sequences = QualityIO.PairedFastaQualIterator(fp,
                    arguments.input_qual)
        else:
            sequences = SeqIO.parse(fp, input_type)

        listener = RecordEventListener()
        if arguments.details_out:
            # Optional per-record details report.
            rh = RecordReportHandler(arguments.details_out, arguments.argv,
                    arguments.details_comment)
            rh.register_with(listener)

        # Track read sequences
        sequences = listener.iterable_hook('read', sequences)

        # Add filters, one per requested option.
        # Mean-quality filtering only applies when quality scores exist,
        # i.e. fastq input.
        if arguments.min_mean_quality and input_type == 'fastq':
            qfilter = QualityScoreFilter(arguments.min_mean_quality)
            filters.append(qfilter)
        if arguments.max_length:
            max_length_filter = MaxLengthFilter(arguments.max_length)
            filters.append(max_length_filter)
        if arguments.min_length:
            min_length_filter = MinLengthFilter(arguments.min_length)
            filters.append(min_length_filter)
        # 'is not None' so an explicit 0 still activates these filters.
        if arguments.max_ambiguous is not None:
            max_ambig_filter = MaxAmbiguousFilter(arguments.max_ambiguous)
            filters.append(max_ambig_filter)
        if arguments.pct_ambiguous is not None:
            pct_ambig_filter = PctAmbiguousFilter(arguments.pct_ambiguous)
            filters.append(pct_ambig_filter)
        if arguments.ambiguous_action:
            ambiguous_filter = AmbiguousBaseFilter(
                    arguments.ambiguous_action)
            filters.append(ambiguous_filter)
        if arguments.quality_window:
            # The window filter falls back to the overall mean-quality
            # threshold when no window-specific one was given, and is
            # inserted at the front so it runs before all other filters.
            min_qual = arguments.quality_window_mean_qual or \
                    arguments.min_mean_quality
            window_filter = WindowQualityScoreFilter(arguments.quality_window,
                    min_qual)
            filters.insert(0, window_filter)

        if arguments.barcode_file:
            with arguments.barcode_file:
                tr = parse_barcode_file(arguments.barcode_file,
                        arguments.primer, arguments.barcode_header)
            f = PrimerBarcodeFilter(tr)
            filters.append(f)

            if arguments.map_out:
                # Record a (sequence id, sample) row whenever a barcode
                # is recognized.
                barcode_writer = csv.writer(arguments.map_out,
                        quoting=getattr(csv, arguments.quoting),
                        lineterminator='\n')
                def barcode_handler(record, sample, barcode=None):
                    barcode_writer.writerow((record.id, sample))
                listener.register_handler('found_barcode', barcode_handler)
        # Wrap the record stream in each filter, in order.
        for f in filters:
            f.listener = listener
            sequences = f.filter_records(sequences)

        # Track sequences which passed all filters
        sequences = listener.iterable_hook('write', sequences)

        with arguments.output_file:
            SeqIO.write(sequences, arguments.output_file, output_type)

    # One report row per filter.
    rpt_rows = (f.report_dict() for f in filters)

    # Write report
    with arguments.report_out as fp:
        writer = csv.DictWriter(fp, BaseFilter.report_fields,
                lineterminator='\n', delimiter='\t')
        writer.writeheader()
        writer.writerows(rpt_rows)
Example #11
0
def action(arguments):
    """
    Given parsed arguments, filter input files.

    Builds a chain of record filters from the command-line options,
    streams the input sequences through it, writes the survivors to the
    output file, and finally emits a tab-delimited per-filter report.
    """
    # A window mean-quality threshold is meaningless without a window size.
    if arguments.quality_window_mean_qual and not arguments.quality_window:
        raise ValueError("--quality-window-mean-qual specified without "
                         "--quality-window")

    # Barcode matching below relies on the optional Bio.trie/Bio.triefind
    # modules, imported elsewhere in this file.
    if trie is None or triefind is None:
        raise ValueError(
            'Missing Bio.trie and/or Bio.triefind modules. Cannot continue')

    filters = []
    input_type = fileformat.from_handle(arguments.sequence_file)
    output_type = fileformat.from_handle(arguments.output_file)
    with arguments.sequence_file as fp:
        if arguments.input_qual:
            # Pair the FASTA input with a separate quality file.
            sequences = QualityIO.PairedFastaQualIterator(
                fp, arguments.input_qual)
        else:
            sequences = SeqIO.parse(fp, input_type)

        listener = RecordEventListener()
        if arguments.details_out:
            # Optional per-record details report.
            rh = RecordReportHandler(arguments.details_out, arguments.argv,
                                     arguments.details_comment)
            rh.register_with(listener)

        # Track read sequences
        sequences = listener.iterable_hook('read', sequences)

        # Add filters, one per requested option.
        # Mean-quality filtering only applies when quality scores exist,
        # i.e. fastq input.
        if arguments.min_mean_quality and input_type == 'fastq':
            qfilter = QualityScoreFilter(arguments.min_mean_quality)
            filters.append(qfilter)
        if arguments.max_length:
            max_length_filter = MaxLengthFilter(arguments.max_length)
            filters.append(max_length_filter)
        if arguments.min_length:
            min_length_filter = MinLengthFilter(arguments.min_length)
            filters.append(min_length_filter)
        # 'is not None' so an explicit 0 still activates these filters.
        if arguments.max_ambiguous is not None:
            max_ambig_filter = MaxAmbiguousFilter(arguments.max_ambiguous)
            filters.append(max_ambig_filter)
        if arguments.pct_ambiguous is not None:
            pct_ambig_filter = PctAmbiguousFilter(arguments.pct_ambiguous)
            filters.append(pct_ambig_filter)
        if arguments.ambiguous_action:
            ambiguous_filter = AmbiguousBaseFilter(arguments.ambiguous_action)
            filters.append(ambiguous_filter)
        if arguments.quality_window:
            # The window filter falls back to the overall mean-quality
            # threshold when no window-specific one was given, and is
            # inserted at the front so it runs before all other filters.
            min_qual = (arguments.quality_window_mean_qual or
                        arguments.min_mean_quality)
            window_filter = WindowQualityScoreFilter(arguments.quality_window,
                                                     min_qual)
            filters.insert(0, window_filter)

        if arguments.barcode_file:
            with arguments.barcode_file:
                tr = parse_barcode_file(arguments.barcode_file,
                                        arguments.primer,
                                        arguments.barcode_header)
            f = PrimerBarcodeFilter(tr)
            filters.append(f)

            if arguments.map_out:
                # Record a (sequence id, sample) row whenever a barcode
                # is recognized.
                barcode_writer = csv.writer(
                    arguments.map_out,
                    quoting=getattr(csv, arguments.quoting),
                    lineterminator='\n')

                def barcode_handler(record, sample, barcode=None):
                    barcode_writer.writerow((record.id, sample))

                listener.register_handler('found_barcode', barcode_handler)
        # Wrap the record stream in each filter, in order.
        for f in filters:
            f.listener = listener
            sequences = f.filter_records(sequences)

        # Track sequences which passed all filters
        sequences = listener.iterable_hook('write', sequences)

        with arguments.output_file:
            SeqIO.write(sequences, arguments.output_file, output_type)

    # One report row per filter.
    rpt_rows = (f.report_dict() for f in filters)

    # Write report
    with arguments.report_out as fp:
        writer = csv.DictWriter(
            fp, BaseFilter.report_fields, lineterminator='\n', delimiter='\t')
        writer.writeheader()
        writer.writerows(rpt_rows)