Beispiel #1
0
#! /usr/bin/env python

import sys
import os
import re
import warnings

from Bio import Entrez

from seqsift.utils import dataio
from seqsift.utils import VALID_DATA_TYPES
from seqsift.utils.messaging import get_logger

# Module-level logger obtained from the project's messaging helper.
_LOG = get_logger(__name__)
# Ignore UserWarning messages issued from modules whose name matches the
# regular expression '.*Entrez.*'.
warnings.filterwarnings(action="ignore",
                        category=UserWarning,
                        module=r'.*Entrez.*')

# Matches an all-digit string (optionally padded with whitespace) that starts
# with at least one zero; group 1 captures the run of leading zeros.
# NOTE(review): `(\d)+` repeats the capture group, so group 2 only retains the
# final digit of the remainder -- confirm callers do not expect all digits.
LEADING_ZEROS = re.compile(r'^\s*([0]+)(\d)+\s*$')

# GI number: a run of digits, optionally surrounded by whitespace.
GI = r'\s*(\d+)\s*'
GI_PATTERN = re.compile(r'^' + GI + r'$')

# Accession number: 1-5 ASCII letters (group 1) followed by 5-9 digits
# (group 2), optionally surrounded by whitespace.
# The letter class is `[a-zA-Z]`; the previous `[a-zA-z]` range also spanned
# the punctuation between 'Z' and 'a' ('[', '\\', ']', '^', '_', '`'), so
# strings such as 'A_12345' were wrongly accepted as accessions.
ACC = r'\s*([a-zA-Z]{1,5})(\d{5,9})\s*'
ACC_PATTERN = re.compile(r'^' + ACC + r'$')


def parse_accession_numbers(string):
    acc_range = re.compile(r'^' + ACC + r'-\s*([a-zA-z]{0,5})(\d{5,9})\s*$')

    acc_list = string.strip().split(',')
    accs = set()
    for acc_str in acc_list:
Beispiel #2
0
        fetch_gb_seqs)
from seqsift.utils.messaging import get_logger

# Program metadata: script name, author, version, description (the module
# docstring), copyright notice and license blurb.
_program_info = dict(
    name=os.path.basename(__file__),
    author='Jamie Oaks',
    version='Version 0.1.0',
    description=__doc__,
    copyright='Copyright (C) 2012 Jamie Oaks.',
    license=(
        'This is free software distributed under the GNU General Public License '
        'in the hope that it will be useful, but WITHOUT ANY WARRANTY. '
        'You are free to change and redistribute it in accord with the GPL. '
        'See the GNU General Public License for more details.'),
)

# Module-level logger; 'INFO' is passed as the second argument (presumably
# the logging level -- confirm against get_logger's signature).
_LOG = get_logger(__name__, 'INFO')

# Maps file-extension strings to the format name used for them.
EXTENSIONS = dict(
    fas='fasta',
    fasta='fasta',
    gb='gb',
    genbank='gb',
)

def digest_seq(recognition_seq,
               seq_record,
               out_dir,
               append_dict,
               extra_length=0,
               min_length=0,
               max_length=None,
               include_overhang=True,):
    if max_length:
Beispiel #3
0
    'author':
    'Jamie Oaks',
    'version':
    'Version 0.1.0',
    'description':
    __doc__,
    'copyright':
    'Copyright (C) 2012 Jamie Oaks.',
    'license':
    ('This is free software distributed under the GNU General Public '
     'License in the hope that it will be useful, but WITHOUT ANY '
     'WARRANTY. You are free to change and redistribute it in accord with '
     'the GPL. See the GNU General Public License for more details.'),
}

# Module-level logger; 'INFO' is passed as the second argument (presumably
# the logging level -- confirm against get_logger's signature).
_LOG = get_logger(__name__, 'INFO')

# Maps file-extension strings to the format name used for them.
EXTENSIONS = dict(
    fas='fasta',
    fasta='fasta',
    gb='gb',
    genbank='gb',
)


def digest_seq(
    recognition_seq,
    seq_record,
    out_dir,
    append_dict,
    extra_length=0,
Beispiel #4
0
import unittest
import types
import itertools

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

from seqsift.seqops import seqstats
from seqsift.utils import dataio, alphabets
from seqsift.utils import functions, errors
from seqsift.test.support import package_paths
from seqsift.test.support.extended_test_case import SeqSiftTestCase
from seqsift.utils.messaging import get_logger

# Module-level logger obtained from the project's messaging helper.
_LOG = get_logger(__name__)

class GetDuplicateIdsTestCase(unittest.TestCase):
    def test_two_dups(self):
        """Two records sharing the id 'a' yield the duplicate list ['a']."""
        records = [SeqRecord(Seq(residues), id='a')
                   for residues in ('A--CGT', 'G--CGT')]
        self.assertEqual(seqstats.get_duplicate_ids(records), ['a'])

    def test_three_dups(self):
        seqs = [
                SeqRecord(Seq('A--CGT'), id='a'),
                SeqRecord(Seq('G--CGT'), id='a'),
                SeqRecord(Seq('C--CGT'), id='a'),
Beispiel #5
0
def main_cli():
    """Command-line entry point: sample sequences and write them to stdout.

    Parses the command line, sets the logging level through the
    ``LOGGING_LEVEL_ENV_VAR`` environment variable, seeds ``GLOBAL_RNG``,
    then passes the sequences read from the input files through
    ``functions.sample_iter`` (with ``sample_size`` set to ``--num-samples``)
    and writes the result to standard output in the input format.

    Exits with status 1, after logging an error and printing the help text
    to standard error, when the input format is neither supplied with
    ``--format`` nor determined from the first input file.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    # The help texts for the positional argument and `-n` previously described
    # splitting into output files; this tool samples and writes to stdout.
    parser.add_argument(
        'input_files',
        metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help='Input sequence file(s) from which to sample sequences.')
    parser.add_argument(
        '-n', '--num-samples',
        type=int,
        required=True,
        help='The number of sequences to sample.')
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(
                  ', '.join(FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d', '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help=('Random number seed.'))
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    # Imported here (not at module top) so the environment variable is set
    # before the logger is created.
    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # --debug is checked last, so it wins when both flags are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import dataio, GLOBAL_RNG, functions

    ##########################################################################
    ## handle args

    # Only draw a random seed when none was supplied; testing `is None`
    # (rather than truthiness) keeps an explicit `--seed 0` reproducible.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    # Logged at WARNING level, so the seed is still reported with --quiet.
    log.warning('Seed: {0}'.format(args.seed))

    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None and writes to stdout by default; pass the
        # stream explicitly so the help accompanies the error on stderr.
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter(
        args.input_files,
        format=args.input_format,
        data_type=args.data_type)
    samples = functions.sample_iter(
        iterable=seqs,
        sample_size=args.num_samples)

    SeqIO.write(samples, handle=sys.stdout, format=args.input_format)
Beispiel #6
0
def main_cli():
    """Command-line entry point: split input sequences across output files.

    Parses the command line, sets the logging level through the
    ``LOGGING_LEVEL_ENV_VAR`` environment variable, and writes the input
    sequences to output files holding at most ``--num-seqs-per-file``
    sequences each.  Formats that ``FILE_FORMATS.is_sequential`` accepts are
    handed to ``dataio.write_seqs_to_files``; other formats are written batch
    by batch with ``SeqIO.write`` to files named
    ``<prefix>_<4-digit batch number><extension>``.

    Exits with status 1 when the input format cannot be determined, or when
    an output file already exists and ``--force`` was not given.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'input_files',
        metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help=('Input sequence file(s) to be output into files with '
              '`-n` sequences per file.'))
    # NOTE(review): the option is required, so `default` can never take
    # effect; decide whether it should be optional or the default dropped.
    parser.add_argument(
        '-n', '--num-seqs-per-file',
        type=int,
        required=True,
        default=4000000,
        help=('The maximum number of sequences to put in each output '
              'file.'))
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(
                  ', '.join(FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d', '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_argument(
        '-c', '--compress',
        action='store_true',
        help='Compress (gzip) output files.')
    parser.add_argument(
        '-o', '--output-dir',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the input file.'))
    parser.add_argument(
        '-p', '--prefix',
        action='store',
        type=str,
        help=('Prefix to use at beginning of output files. The default '
              'is to use the first input file name.'))
    parser.add_argument(
        '--log-frequency',
        type=argparse_utils.arg_is_nonnegative_int,
        default=100000,
        help=('The frequency at which to log progress. Default is to log '
              'every 100000 sequences.'))
    parser.add_argument(
        '--force',
        action='store_true',
        help=('Overwrite files if they already exist.'))
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    # Imported here (not at module top) so the environment variable is set
    # before the logger is created.
    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # --debug is checked last, so it wins when both flags are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import dataio, errors
    from seqsift.utils.fileio import OpenFile

    ##########################################################################
    ## handle args

    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None and writes to stdout by default; pass the
        # stream explicitly so the help accompanies the error on stderr.
        parser.print_help(sys.stderr)
        sys.exit(1)

    # Default prefix: first input path without its extension.  With
    # --output-dir only the prefix's base name is kept and re-rooted there.
    if not args.prefix:
        args.prefix = os.path.splitext(args.input_files[0])[0]
    if args.output_dir:
        args.prefix = os.path.join(args.output_dir,
                                   os.path.basename(args.prefix))

    out_ext = FILE_FORMATS.get_ext(args.input_format,
                                   compressed=args.compress)

    compresslevel = 9 if args.compress else None

    # handle sequential formats on the fly
    if FILE_FORMATS.is_sequential(args.input_format):
        seq_iter = dataio.get_seq_iter(
            file_objs=args.input_files,
            format=args.input_format,
            data_type=args.data_type)

        try:
            dataio.write_seqs_to_files(
                seq_iter,
                max_num_seqs_per_file=args.num_seqs_per_file,
                format=args.input_format,
                compresslevel=compresslevel,
                prefix=args.prefix,
                force=args.force)
        except errors.PathExistsError as e:
            log.error('ERROR:\n'
                      'Output files already exist! You can specify a different\n'
                      'prefix or use the `--force` option to overwrite the\n'
                      'existing files. Here is the stack trace:\n\n{0}\n'.format(
                          e))
            sys.exit(1)

    # use SeqIO for non-sequential formats
    else:
        batch_iter = dataio.get_seq_batch_iter_from_files(
            file_objs=args.input_files,
            number_per_batch=args.num_seqs_per_file,
            format=args.input_format,
            data_type=args.data_type)

        for batch_idx, seq_iter in enumerate(batch_iter):
            out_path = '{0}_{1:0>4}{2}'.format(args.prefix, batch_idx + 1,
                                               out_ext)
            if os.path.exists(out_path) and (not args.force):
                log.error('ERROR:\n'
                          'Output files already exist! You can specify a '
                          'different\nprefix or use the `--force` option to '
                          'overwrite the\nexisting files.')
                sys.exit(1)
            out = OpenFile(out_path, mode='w', compresslevel=compresslevel)
            # Close the handle even if writing fails part-way through.
            try:
                SeqIO.write(seq_iter,
                            handle=out,
                            format=args.input_format)
            finally:
                out.close()
Beispiel #7
0
def main():
    """Command-line entry point: convert, filter and re-orient sequences.

    Parses ``<SEQ_INPUT_FILE> [<SEQ_OUTPUT_FILE>]`` plus options with
    ``optparse``, sets the logging level through the
    ``LOGGING_LEVEL_ENV_VAR`` environment variable, reads the input
    sequences, applies whichever filter and reverse-complement options were
    selected (in the order they appear below), and writes the result with
    ``SeqIO.write`` to the output path, or to standard output when no output
    path is given.

    Exits with status 1, after logging an error and printing the help text
    to standard error, when the argument count is wrong, when the input or
    output format cannot be determined, or when a reverse-complement option
    is combined with a data type other than DNA or RNA.
    """
    description = '{name} {version}'.format(**_program_info)
    usage = ("\n  %prog [options] <SEQ_INPUT_FILE> [<SEQ_OUTPUT_FILE>]")
    parser = OptionParser(usage=usage,
                          description=description,
                          version=_program_info['version'],
                          add_help_option=True)
    format_opts = OptionGroup(
        parser, 'Format Options',
        'These options designate file formats and data type.')
    format_opts.add_option(
        '-f',
        '--from',
        dest='from_format',
        type='string',
        help=('The format of the input sequence file. Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the input file. However, if provided, '
              'this option will always take precedence over the file '
              'extension.'.format(', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option(
        '-t',
        '--to',
        dest='to_format',
        type='string',
        help=('The desired format of the output sequence file. Valid '
              'options include: {0}. By default, if an output file path '
              'is provided, the format is guessed based on the extension '
              'of this file. However, this option will always take '
              'precedence over the file extension. Either this option or '
              'an output file path with an extension is required; if '
              'neither are provided the program will exit with an '
              'error.'.format(', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option(
        '-d',
        '--data-type',
        dest='data_type',
        type='string',
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_option_group(format_opts)

    filter_opts = OptionGroup(
        parser, 'Filter Options',
        'These options allow filtering of data by columns or sequences.')
    filter_opts.add_option(
        '--remove-duplicates',
        dest='remove_duplicates',
        default=False,
        action='store_true',
        help=('Remove duplicate sequences (i.e., sequences with the same '
              'ID and sequence). If a duplicate ID is found associated '
              'with a different sequence, the program will exit with an '
              'error.'))
    filter_opts.add_option(
        '-x',
        '--ids-to-exclude',
        dest='ids_to_exclude',
        type='string',
        help=('Comma-delimited list of the ids of sequences to exclude.'))
    filter_opts.add_option(
        '--remove-missing-columns',
        dest='remove_missing_columns',
        default=False,
        action='store_true',
        help=("Remove aligned columns with missing data. Characters to be "
              "considered missing can be specified with the "
              "--missing-characters option; the default is '?-'. "
              "The proportion of rows that must contain these characters "
              "for a row to be removed can be specified with the "
              "--missing-column-proportion option; the default is 1.0. "
              "Note, this option is only relevant to aligned sequences, "
              "and will result in an error if the input sequences are not "
              "aligned."))
    filter_opts.add_option(
        '--missing-column-proportion',
        dest='missing_column_proportion',
        type='float',
        default=1.0,
        help=('The proportion of rows that must contain '
              '--missing-characters for a column to be removed. '
              'This option is only relevant in combination with the '
              '--remove-missing-columns option.'))
    filter_opts.add_option(
        '--remove-missing-sequences',
        dest='remove_missing_sequences',
        default=False,
        action='store_true',
        help=("Remove sequences with missing data. Characters to be "
              "considered missing can be specified with the "
              "--missing-characters option; the default is '?-'. "
              "The proportion of the sites that must contain these "
              "characters for a sequence to be removed can be specified "
              "with the --missing-sequence-proportion option; the default "
              "is 1.0."))
    filter_opts.add_option(
        '--missing-sequence-proportion',
        dest='missing_sequence_proportion',
        type='float',
        default=1.0,
        help=('The proportion of sites that must contain '
              '--missing-characters for a sequence to be removed. '
              'This option is only relevant in combination with the '
              '--remove-missing-sequences option.'))
    filter_opts.add_option(
        '--missing-characters',
        dest='missing_characters',
        type='str',
        default='?-',
        help=("Characters to be considered missing and be used in "
              "evaluating columns/sequences to remove with the "
              "--remove-missing-columns and --remove-missing-sequences "
              "options. The default is '?-'."))
    filter_opts.add_option('--remove-constant-columns',
                           dest='remove_constant_columns',
                           default=False,
                           action='store_true',
                           help=("Remove aligned columns with no variation."))
    parser.add_option_group(filter_opts)

    rev_comp_opts = OptionGroup(
        parser, 'Reverse Complement Options',
        'These options are for reverse complementing sequences.')
    rev_comp_opts.add_option(
        '--rev-comp',
        dest='rev_comp',
        default=False,
        action='store_true',
        help=("Reverse complement all sequences. This option overrides "
              "all other reverse-complement options."))
    rev_comp_opts.add_option(
        '--fix-rev-comp-by',
        dest='fix_rev_comp_by',
        type='choice',
        choices=['first', 'read'],
        help=("Try to correct reverse complement errors. "
              "Options include 'first' and 'read'. If 'first' is "
              "specified, sequences are returned in their orientation "
              "that minimizes distance from the first sequence. "
              "If 'read' is used, sequences are returned in their "
              "orientation that has the longest read frame "
              "(see 'Translation Options' for controlling translation "
              "of reading frames)."))
    parser.add_option_group(rev_comp_opts)

    translation_opts = OptionGroup(
        parser, 'Translation Options',
        ('These options control translation from nucleotide to amino acid '
         'sequences.'))
    # optparse compares the raw command-line string against `choices`, so the
    # choices must be strings; with integer choices every `--table N` given on
    # the command line was rejected as an invalid choice.  The selected value
    # is converted back to int where it is used below.
    table_choices = [
        str(table_id)
        for table_id in (list(range(1, 7)) + list(range(9, 17))
                         + list(range(21, 26)))
    ]
    translation_opts.add_option(
        '--table',
        type='choice',
        choices=table_choices,
        default='1',
        help=('The translation table to use for any options associated '
              'with translating nucleotide sequences to amino acids. '
              'Option should be the integer that corresponds to the '
              'desired translation table according to NCBI '
              '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
              'The default is 1 (the "standard" code).'))
    translation_opts.add_option(
        '--allow-partial',
        default=False,
        action='store_true',
        help=('Allow partial reading frames at the beginning (no start '
              'codon) and end (no stop codon) of sequences.'))
    translation_opts.add_option(
        '--read-after-stop',
        default=False,
        action='store_true',
        help=('A new reading frame begins immediately after a stop codon. '
              'The default is to start reading frame at next start codon '
              'after a stop codon. This option might be useful for exons.'))
    parser.add_option_group(translation_opts)

    distance_opts = OptionGroup(
        parser, 'Distance Options',
        ('These options control how distances between sequences are '
         'calculated.'))
    distance_opts.add_option(
        '-g',
        '--count-gaps',
        default=False,
        action='store_true',
        help=('Count gaps when calculating pairwise sequence distances. '
              'The default is to calculate (number of differences '
              'ignoring gaps / number of aligned sites ignoring sites '
              'with gaps) for each pairwise comparison. When this option '
              'is used, the distance is (number of differences including '
              'gap differences / total number of aligned sites).'))
    parser.add_option_group(distance_opts)

    messaging_opts = OptionGroup(
        parser, 'Messaging Options',
        ('These options control verbosity of messaging.'))
    messaging_opts.add_option('--quiet',
                              action='store_true',
                              help='Run without verbose messaging.')
    messaging_opts.add_option('--debug',
                              action='store_true',
                              help='Run in debugging mode.')
    parser.add_option_group(messaging_opts)

    (options, args) = parser.parse_args()

    ##########################################################################
    ## set up logging

    # Imported here (not at module top) so the environment variable is set
    # before the logger is created.
    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # --debug is checked last, so it wins when both flags are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if options.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if options.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.seqops import seqmod, seqfilter
    from seqsift.utils import dataio

    ##########################################################################
    ## handle args

    # print_help() returns None and writes to stdout by default, so the
    # stream is passed explicitly wherever help accompanies an error.
    if len(args) < 1:
        log.error("Too few arguments. Expecting at least 1 argument:\n"
                  "the path to the input file.")
        parser.print_help(sys.stderr)
        sys.exit(1)
    if len(args) > 2:
        log.error("Too many arguments. Expecting at most 2 arguments:\n"
                  "The path to the input file (required), and the path to\n"
                  "output file (optional; defaults to standard output).")
        parser.print_help(sys.stderr)
        sys.exit(1)

    in_file_path = args[0]
    # Without an explicit output path the sequences go to standard output.
    out_file_path = args[1] if len(args) == 2 else sys.stdout

    # An explicit --from/--to always takes precedence over the extension.
    if options.from_format:
        in_format = options.from_format
    else:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file_path)
    if not in_format:
        log.error("Could not determine format of input file.\n"
                  "You must either provide the format of the input file\n"
                  "using the '--from' option or have a recognized\n"
                  "file extension on the input file. Here are the supported\n"
                  "file extensions:\n{0}".format(str(FILE_FORMATS)))
        parser.print_help(sys.stderr)
        sys.exit(1)

    if options.to_format:
        out_format = options.to_format
    else:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file_path)
    if not out_format:
        log.error("Could not determine format of output file.\n"
                  "You must either provide the format of the output file\n"
                  "using the '--to' option or have a recognized\n"
                  "file extension on the output file. Here are the supported\n"
                  "file extensions:\n{0}".format(str(FILE_FORMATS)))
        parser.print_help(sys.stderr)
        sys.exit(1)

    data_type = options.data_type

    # NOTE: a former "plain conversion" shortcut was keyed on the parsed
    # options dict being empty.  optparse stores an entry for every option
    # destination, so that branch could never run; it was removed and all
    # input goes through the pipeline below, exactly as before.

    if ((options.rev_comp or options.fix_rev_comp_by)
            and (data_type.lower() not in ['dna', 'rna'])):
        log.error("You have selected an option for reverse complementing\n"
                  "sequences but the data type is not DNA or RNA.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter([in_file_path],
                               format=in_format,
                               data_type=data_type)

    if options.ids_to_exclude:
        to_exclude = [n.strip() for n in options.ids_to_exclude.split(',')]
        seqs = seqfilter.id_filter(seqs, to_exclude)

    if options.remove_duplicates:
        seqs = seqfilter.duplicate_id_filter(seqs)

    if options.remove_missing_sequences:
        seqs = seqfilter.row_filter(
            seqs,
            character_list=list(options.missing_characters),
            max_frequency=options.missing_sequence_proportion)

    if options.remove_missing_columns:
        seqs = seqfilter.column_filter(
            seqs,
            character_list=list(options.missing_characters),
            max_frequency=options.missing_column_proportion)

    if options.remove_constant_columns:
        seqs = seqfilter.constant_column_filter(seqs)

    # --rev-comp overrides --fix-rev-comp-by.
    if options.rev_comp:
        log.info('Reverse complementing all sequences...')
        seqs = seqmod.reverse_complement(seqs)
    elif options.fix_rev_comp_by == 'first':
        log.info('Reverse complementing to match first sequence...')
        seqs = seqmod.reverse_complement_to_first_seq(
            seqs,
            per_site=True,
            aligned=False,
            ignore_gaps=(not options.count_gaps),
            alphabet=None,
            aligner_tools=['muscle', 'mafft'],
            log_frequency=100)
    elif options.fix_rev_comp_by == 'read':
        log.info('Reverse complementing to longest reading frame...')
        seqs = seqmod.reverse_complement_to_longest_reading_frame(
            seqs,
            gap_characters=['-'],
            # Parsed as a string choice; pass the integer table id on.
            table=int(options.table),
            allow_partial=options.allow_partial,
            require_start_after_stop=(not options.read_after_stop),
            log_frequency=100)

    SeqIO.write(seqs, handle=out_file_path, format=out_format)
Beispiel #8
0
def main_cli():
    """Command-line entry point: write summaries of the input sequence files.

    Parses the command line, sets the logging level through the
    ``LOGGING_LEVEL_ENV_VAR`` environment variable, obtains summaries from
    ``seqstats.get_seq_summaries_from_files``, and passes each one to
    ``write_summary`` in sorted key order, followed by the entry stored under
    the ``'global'`` key, which is labelled ``'overall'``.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    # The help text previously described splitting into output files (copied
    # from another tool); this program only summarizes its input.
    parser.add_argument(
        'input_files',
        metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help='Input sequence file(s) to be summarized.')
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extensions of input file(s). However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(
                  ', '.join(FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d', '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    # Imported here (not at module top) so the environment variable is set
    # before the logger is created.
    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # --debug is checked last, so it wins when both flags are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    # The logger is not used below; the call is kept so logging is still
    # initialised under the level chosen above.
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.seqops import seqstats

    ##########################################################################
    ## handle args

    # argparse leaves `input_format` as None when --format is omitted and
    # otherwise restricts it to `choices`, so no extra normalization is
    # needed before passing it on.
    summaries = seqstats.get_seq_summaries_from_files(
        args.input_files,
        format=args.input_format,
        data_type=args.data_type)
    global_summary = summaries.pop('global')

    for key in sorted(summaries):
        write_summary(key, summaries[key])
    write_summary('overall', global_summary)
Beispiel #9
0
def main_cli():
    """Command-line entry point for vetting the sequences in one input file.

    Arguments are read from ``sys.argv`` via ``argparse``. The logging level
    is communicated to ``seqsift`` through the ``LOGGING_LEVEL_ENV_VAR``
    environment variable *before* the remaining ``seqsift`` modules are
    imported, which is why those imports live inside this function.

    Depending on the options given, the following files are written to the
    output directory (``--output-dir``, or the directory of the input file):

    * ``seqvet-reading-frame-lengths.txt`` (``--summarize-reading-frame-lengths``)
    * ``seqvet-duplicate-ids.txt`` (``--check-ids``, only if duplicates exist)
    * ``seqvet-mean-distances.txt`` and ``seqvet-max-distances.txt`` (always)
    * ``seqvet-reverse-complement-warnings.txt`` (only if
      ``seqsum.summarize_distances`` reports reverse-complement errors)

    The path ``seqvet-msa.txt`` is handed to ``seqsum.summarize_distances`` as
    the destination for a full alignment.

    Exits with status 1 (after logging an error and printing the usage to
    standard error) if the input format cannot be determined or if a
    translation-related option is combined with a non-nucleotide data type.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input_file',
                        metavar='INPUT-SEQ-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Input sequence file to be vetted.'))

    comparison_args = parser.add_argument_group(
        'Comparison Options',
        'Options to control the number and nature of sequence comparisons')
    comparison_args.add_argument(
        '-n',
        '--num-samples',
        type=int,
        default=0,
        help=('The number of randomly sampled sequences to which each '
              'sequence will be compared. If less than 1 (the default is '
              '0), all pairwise comparisons will be performed. For very '
              'large numbers of sequences, performing all pairwise '
              'comparisons will take a long time. This option will speed '
              'things up as long as the number specified is less than '
              'about half of the number of input sequences. If the '
              'number you are considering is close to half of the number '
              'of sequences, you should probably specify zero and do all '
              'combinations. You should not specify a number greater than '
              'half the number of sequences, because it will take longer '
              'and be less thorough than the default.'))
    comparison_args.add_argument(
        '--seed',
        action='store',
        type=int,
        help=('Random number seed to use for the analysis. This option '
              'is only relevant if a number greater than 0 is specified '
              'for the `-n/--num-samples` option.'))
    comparison_args.add_argument(
        '--compare-translated',
        action='store_true',
        help=('Compare amino acid sequences encoded by the longest '
              'reading frame found in each sequence. To use this option, '
              '`data-type` must be dna or rna. See "Translation Options" '
              'for controlling how the longest reading frame of each '
              'sequence is determined and translated.'))
    comparison_args.add_argument('--check-ids',
                                 action='store_true',
                                 help=('Check sequence IDs for duplicates.'))
    comparison_args.add_argument(
        '--summarize-reading-frame-lengths',
        action='store_true',
        help=('Report the length of the longest reading frame of '
              'each sequence. See "Translation Options" for controlling '
              'how reading frames are determined.'))
    comparison_args.add_argument(
        '-g',
        '--count-gaps',
        action='store_true',
        help=('Count gaps when calculating pairwise sequence distances. '
              'The default is to calculate (number of differences '
              'ignoring gaps / number of aligned sites ignoring sites '
              'with gaps) for each pairwise comparison. When this option '
              'is used, the distance is (number of differences including '
              'gap differences / total number of aligned sites).'))

    alignment_args = parser.add_argument_group(
        'Alignment Options',
        ('These options control if/how sequences are to be aligned prior '
         'to calculating distances.'))
    alignment_args.add_argument(
        '-a',
        '--aligned',
        action='store_true',
        help=('Treat input sequences as aligned. I.e., do not perform '
              'pairwise alignment before calculating distances between '
              'sequences (except when calculating distances for reverse '
              'and complemented sequences).'))
    alignment_args.add_argument(
        '--aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for pairwise '
              'alignments of sequences. '
              'The default is to look for muscle and then mafft in PATH, '
              'and if neither are found use the (slow) built-in '
              'function. Even if the `-a`/`--aligned` option is '
              'specified, the aligner will still be used for pairwise '
              'alignments when calculating distances of reverse and '
              'complemented sequences.'))
    alignment_args.add_argument(
        '--msa',
        action='store_true',
        help=('Perform a full multiple sequence alignment prior to '
              'comparing sequences. The default is to align each '
              'pair of sequences being compared. This option is '
              'overruled by the `-a`/`--aligned` option. '
              'If this option is used '
              'the resulting alignment is written to file.'))
    alignment_args.add_argument(
        '--msa-aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for full '
              'multiple sequence alignment. '
              'The default is to look for mafft and then muscle in PATH, '
              'and if neither are found the program will exit with an '
              'error message. If you do not have mafft or muscle '
              'you cannot use this option. '
              'This option is only used if the `-a`/`--aligned` option '
              'is not specified, and the `--msa` option is specified.'))

    translation_args = parser.add_argument_group(
        'Translation Options',
        ('These options control translation from nucleotide to amino acid '
         'sequences.'))
    translation_args.add_argument(
        '--table',
        type=int,
        choices=list(range(1, 7)) + list(range(9, 17)) + list(range(21, 26)),
        default=1,
        help=('The translation table to use for any options associated '
              'with translating nucleotide sequences to amino acids. '
              'Option should be the integer that corresponds to the '
              'desired translation table according to NCBI '
              '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
              'The default is 1 (the "standard" code).'))
    translation_args.add_argument(
        '--allow-partial',
        action='store_true',
        default=False,
        help=('Allow partial reading frames at the beginning (no start '
              'codon) and end (no stop codon) of sequences.'))
    translation_args.add_argument(
        '--read-after-stop',
        action='store_true',
        default=False,
        help=('A new reading frame begins immediately after a stop codon. '
              'The default is to start reading frame at next start codon '
              'after a stop codon. This option might be useful for exons.'))

    data_args = parser.add_argument_group(
        'Data Options', ('Options specifying the input data type and format'))
    data_args.add_argument(
        '-d',
        '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    data_args.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file. Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(', '.join(
                  FILE_FORMATS.supported_formats))))

    output_args = parser.add_argument_group(
        'Output Options', 'Options for controlling output of program')
    output_args.add_argument(
        '-o',
        '--output-dir',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the input file.'))

    messaging_args = parser.add_argument_group(
        'Messaging Options', ('These options control verbosity of messaging.'))
    messaging_args.add_argument(
        '--log-frequency',
        type=argparse_utils.arg_is_nonnegative_int,
        default=1000,
        help=('The frequency at which to log progress. Default is to log '
              'every 1000 sequence comparisons.'))
    messaging_args.add_argument('--quiet',
                                action='store_true',
                                help='Run without verbose messaging.')
    messaging_args.add_argument('--debug',
                                action='store_true',
                                help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # The level is set before get_logger() is called; --debug is checked last
    # and therefore wins over --quiet when both flags are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    def exit_with_usage(message):
        """Log `message` as an error, print usage to stderr, exit with 1."""
        log.error(message)
        # print_help() returns None and defaults to stdout, so it must be
        # given the target stream rather than having its result written.
        parser.print_help(sys.stderr)
        sys.exit(1)

    ##########################################################################
    ## package imports

    from seqsift.utils import GLOBAL_RNG, dataio, functions, alphabets
    from seqsift.seqops import seqsum, seqmod, seqstats
    from seqsift.utils.fileio import OpenFile

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    if args.num_samples > 0:
        # Compare against None so that an explicit `--seed 0` is honored.
        if args.seed is None:
            args.seed = random.randint(1, 999999999)
        GLOBAL_RNG.seed(args.seed)
        # Logged at WARNING level, presumably so the seed is still reported
        # when --quiet is used.
        log.warning('Seed: {0}'.format(args.seed))

    ## get input file format
    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_file)
    if not args.input_format:
        exit_with_usage(
            "Could not determine input format.\n"
            "You must either provide the input format\n"
            "using the '--format' option or have a recognizable\n"
            "file extension on the input file name.\n"
            "Here are the supported file extensions:\n{0}".format(
                str(FILE_FORMATS)))

    aligner_tools = ['muscle', 'mafft']
    if args.aligner:
        aligner_tools = [args.aligner]
    full_aligner_tools = ['mafft', 'muscle']
    if args.msa_aligner:
        full_aligner_tools = [args.msa_aligner]

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.input_file)

    full_alignment_out_path = os.path.join(args.output_dir, 'seqvet-msa.txt')
    alphabet = alphabets.DnaAlphabet()
    if args.data_type in ['aa', 'protein']:
        alphabet = alphabets.ProteinAlphabet()

    is_nucleotide = args.data_type in ['dna', 'rna']
    if args.summarize_reading_frame_lengths and not is_nucleotide:
        exit_with_usage(
            "`--summarize-reading-frame-lengths` is only compatible "
            "with DNA or RNA.")

    if args.compare_translated and not is_nucleotide:
        exit_with_usage(
            "`--compare-translated` is only compatible with DNA or RNA.")

    ##########################################################################
    ## heavy lifting

    seqs = dataio.get_seq_iter([args.input_file],
                               format=args.input_format,
                               data_type=args.data_type)

    if args.summarize_reading_frame_lengths:
        log.info('Summarizing longest reading frame lengths...')
        # The sequences are consumed again further down, so wrap the iterator
        # in a BufferedIter before the first pass over it.
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        lengths = seqsum.summarize_longest_read_lengths(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        length_path = os.path.join(args.output_dir,
                                   'seqvet-reading-frame-lengths.txt')
        log.info('Writing longest reading frame lengths to file...')
        with OpenFile(length_path, 'w') as out:
            out.write('seq_id\tlrf\trev_comp_lrf\n')
            for (l, rc_l, seq_id) in lengths:
                out.write('{0}\t{1}\t{2}\n'.format(seq_id, l, rc_l))

    if args.compare_translated:
        log.info('Translating longest reading frames for distance '
                 'calculations...')
        seqs = seqmod.translate_longest_reading_frames(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        # Distances are now computed on amino acid sequences.
        alphabet = alphabets.ProteinAlphabet()

    if args.check_ids:
        log.info('Checking sequence IDs...')
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        dups = seqstats.get_duplicate_ids(seqs)
        if len(dups) > 0:
            dup_path = functions.get_new_path(
                os.path.join(args.output_dir, 'seqvet-duplicate-ids.txt'))
            log.warning('Duplicate IDs found! Writing them to '
                        '{0}'.format(dup_path))
            with OpenFile(dup_path, 'w') as out:
                for dup in dups:
                    out.write('{0}\n'.format(dup))
        else:
            log.info('No duplicate sequence IDs were found.')

    log.info('Calculating pairwise distances...')
    distances, rev_comp_errors = seqsum.summarize_distances(
        seqs,
        sample_size=args.num_samples,
        per_site=True,
        aligned=args.aligned,
        ignore_gaps=(not args.count_gaps),
        alphabet=alphabet,
        do_full_alignment=args.msa,
        full_alignment_out_path=full_alignment_out_path,
        aligner_tools=aligner_tools,
        full_aligner_tools=full_aligner_tools,
        log_frequency=args.log_frequency)
    log.info('Done!')

    log.info('Writing mean distances to file...')
    # Largest mean distance first.
    distances = sorted([(k, v) for k, v in iteritems(distances)],
                       key=lambda x: x[1].mean,
                       reverse=True)
    mean_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-mean-distances.txt'))
    with OpenFile(mean_path, 'w') as out:
        out.write('seq_id\tmean_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.mean))

    log.info('Writing max distances to file...')
    # Largest maximum distance first.
    distances = sorted(distances, key=lambda x: x[1].maximum, reverse=True)
    max_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-max-distances.txt'))
    with OpenFile(max_path, 'w') as out:
        out.write('seq_id\tmax_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.maximum))

    if rev_comp_errors:
        # Report each unordered pair of sequence IDs only once, keeping the
        # first entry encountered in sorted order.
        rev_comp_errors = sorted(rev_comp_errors)
        rce_set = set()
        rce = []
        for (s1, s2, d, drc) in rev_comp_errors:
            pair = tuple(sorted([s1, s2]))
            if pair in rce_set:
                continue
            rce_set.add(pair)
            rce.append((pair[0], pair[1], d, drc))
        log.info('Writing potential reverse-complement errors to file...')
        path = functions.get_new_path(
            os.path.join(args.output_dir,
                         'seqvet-reverse-complement-warnings.txt'))
        with OpenFile(path, 'w') as out:
            out.write('seq1\tseq2\tdistance\trev_comp_distance\n')
            for (seq1, seq2, d, drc) in rce:
                out.write('{0}\t{1}\t{2}\t{3}\n'.format(seq1, seq2, d, drc))
Beispiel #10
0
def main_cli():
    """Command-line entry point for randomly sub-sampling sequences.

    Arguments are read from ``sys.argv`` via ``argparse``. The logging level
    is communicated to ``seqsift`` through the ``LOGGING_LEVEL_ENV_VAR``
    environment variable before the remaining ``seqsift`` modules are
    imported, which is why those imports live inside this function.

    ``--num-samples`` sequences are drawn from the input file(s) with
    ``functions.sample_iter`` and written to standard output in the same
    format as the input.

    Exits with status 1 (after logging an error and printing the usage to
    standard error) if the input format is neither given with ``--format``
    nor recognizable from the first input file.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'input_files',
        metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help=('Input sequence file(s) from which to randomly sub-sample '
              'sequences (without replacement).'))
    parser.add_argument('-n',
                        '--num-samples',
                        type=int,
                        required=True,
                        help=('The number of sequences to randomly sample.'))
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(', '.join(
                  FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d',
        '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help=('Random number seed.'))
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # The level is set before get_logger() is called; --debug is checked last
    # and therefore wins over --quiet when both flags are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import dataio, GLOBAL_RNG, functions

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    # Compare against None so that an explicit `--seed 0` is honored.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    # Logged at WARNING level, presumably so the seed is still reported when
    # --quiet is used.
    log.warning('Seed: {0}'.format(args.seed))

    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None and defaults to stdout, so it must be
        # given the target stream rather than having its result written.
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter(args.input_files,
                               format=args.input_format,
                               data_type=args.data_type)
    samples = functions.sample_iter(iterable=seqs,
                                    sample_size=args.num_samples)

    SeqIO.write(samples, handle=sys.stdout, format=args.input_format)
Beispiel #11
0
def main_cli():
    """Command-line entry point for keeping selected column ranges.

    Arguments are read from ``sys.argv`` via ``argparse``. The logging level
    is communicated to ``seqsift`` through the ``LOGGING_LEVEL_ENV_VAR``
    environment variable before the remaining ``seqsift`` modules are
    imported, which is why those imports live inside this function.

    Each ``-k/--keep`` occurrence contributes one pair of integers; the
    collected pairs are passed as ``slices_to_keep`` to ``seqmod.dice``, and
    the resulting sequences are written to standard output in the same format
    as the input.

    Exits with status 1 (after logging an error and printing the usage to
    standard error) if the input format is neither given with ``--format``
    nor recognizable from the first input file.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'input_files',
        metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help=('Input sequence alignment(s).'))
    parser.add_argument(
        '-k', '--keep',
        dest='slices_to_keep',
        action='append',
        nargs=2,
        metavar='COLUMN-INDEX',
        type=int,
        required=True,
        help=('Two integers specifying the beginning and end indices of '
              'columns to keep.'))
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(
                  ', '.join(FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d', '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # The level is set before get_logger() is called; --debug is checked last
    # and therefore wins over --quiet when both flags are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import dataio
    from seqsift.seqops import seqmod

    ##########################################################################
    ## handle args

    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None and defaults to stdout, so it must be
        # given the target stream rather than having its result written.
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter(args.input_files,
                               format=args.input_format,
                               data_type=args.data_type)
    new_seqs = seqmod.dice(seq_iter=seqs,
                           slices_to_keep=args.slices_to_keep)

    SeqIO.write(new_seqs,
                handle=sys.stdout,
                format=args.input_format)
Beispiel #12
0
def main():
    description = '{name} {version}'.format(**_program_info)
    usage = ("\n  %prog [options] <SEQ_INPUT_FILE> [<SEQ_OUTPUT_FILE>]")
    parser = OptionParser(usage=usage, description=description,
                          version=_program_info['version'],
                          add_help_option=True)
    format_opts = OptionGroup(parser, 'Format Options',
            'These options designate file formats and data type.')
    format_opts.add_option('-f', '--from', dest='from_format', type='string',
            help=('The format of the input sequence file. Valid options '
                  'include: {0}. By default, the format is guessed based on '
                  'the extension of the input file. However, if provided, '
                  'this option will always take precedence over the file '
                  'extension.'.format(
                        ', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option('-t', '--to', dest='to_format', type='string',
            help=('The desired format of the output sequence file. Valid '
                  'options include: {0}. By default, if an output file path '
                  'is provided, the format is guessed based on the extension '
                  'of this file. However, this option will always take '
                  'precedence over the file extension. Either this option or '
                  'an output file path with an extension is required; if '
                  'neither are provided the program will exit with an '
                  'error.'.format(', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option('-d', '--data-type', dest='data_type', type='string',
            default='dna',
            help=('The type of sequence data. The default is dna. Valid '
                  'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_option_group(format_opts)

    filter_opts = OptionGroup(parser, 'Filter Options',
            'These options allow filtering of data by columns or sequences.')
    filter_opts.add_option('--remove-duplicates',
            dest='remove_duplicates',
            default=False,
            action='store_true',
            help = ('Remove duplicate sequences (i.e., sequences with the same '
                    'ID and sequence). If a duplicate ID is found associated '
                    'with a different sequence, the program will exit with an '
                    'error.'))
    filter_opts.add_option('-x', '--ids-to-exclude',
            dest='ids_to_exclude',
            type='string',
            help=('Comma-delimited list of the ids of sequences to exclude.'))
    filter_opts.add_option('--remove-missing-columns',
            dest='remove_missing_columns',
            default=False,
            action='store_true',
            help=("Remove aligned columns with missing data. Characters to be "
                  "considered missing can be specified with the "
                  "--missing-characters option; the default is '?-'. "
                  "The proportion of rows that must contain these characters "
                  "for a row to be removed can be specified with the "
                  "--missing-column-proportion option; the default is 1.0. "
                  "Note, this option is only relevant to aligned sequences, "
                  "and will result in an error if the input sequences are not "
                  "aligned."))
    filter_opts.add_option('--missing-column-proportion',
            dest='missing_column_proportion',
            type='float',
            default=1.0,
            help=('The proportion of rows that must contain '
                  '--missing-characters for a column to be removed. '
                  'This option is only relevant in combination with the '
                  '--remove-missing-columns option.'))
    filter_opts.add_option('--remove-missing-sequences',
            dest='remove_missing_sequences',
            default=False,
            action = 'store_true',
            help=("Remove sequences with missing data. Characters to be "
                  "considered missing can be specified with the "
                  "--missing-characters option; the default is '?-'. "
                  "The proportion of the sites that must contain these "
                  "characters for a sequence to be removed can be specified "
                  "with the --missing-sequence-proportion option; the default "
                  "is 1.0."))
    filter_opts.add_option('--missing-sequence-proportion',
            dest='missing_sequence_proportion',
            type='float',
            default=1.0,
            help=('The proportion of sites that must contain '
                  '--missing-characters for a sequence to be removed. '
                  'This option is only relevant in combination with the '
                  '--remove-missing-sequences option.'))
    filter_opts.add_option('--missing-characters', dest='missing_characters',
            type='str',
            default='?-',
            help=("Characters to be considered missing and be used in "
                  "evaluating columns/sequences to remove with the "
                  "--remove-missing-columns and --remove-missing-sequences "
                  "options. The default is '?-'."))
    filter_opts.add_option('--remove-constant-columns',
            dest='remove_constant_columns',
            default=False,
            action='store_true',
            help=("Remove aligned columns with no variation."))
    parser.add_option_group(filter_opts)

    rev_comp_opts = OptionGroup(parser, 'Reverse Complement Options',
            'These options are for reverse complementing sequences.')
    rev_comp_opts.add_option('--rev-comp',
            dest='rev_comp',
            default = False,
            action = 'store_true',
            help=("Reverse complement all sequences. This option overrides "
                  "all other reverse-complement options."))
    rev_comp_opts.add_option('--fix-rev-comp-by',
            dest='fix_rev_comp_by',
            type = 'choice',
            choices = ['first', 'read'],
            help=("Try to correct reverse complement errors. "
                  "Options include 'first' and 'read'. If 'first' is "
                  "specified, sequences are returned in their orientation "
                  "that minimizes distance from the first sequence. "
                  "If 'read' is used, sequences are returned in their "
                  "orientation that has the longest read frame "
                  "(see 'Translation Options' for controlling translation "
                  "of reading frames)."))
    parser.add_option_group(rev_comp_opts)

    translation_opts = OptionGroup(parser, 'Translation Options',
            ('These options control translation from nucleotide to amino acid '
             'sequences.'))
    translation_opts.add_option('--table',
            type = 'choice',
            choices = list(range(1, 7)) + list(range(9, 17)) + list(range(21, 26)),
            default = 1,
            help = ('The translation table to use for any options associated '
                    'with translating nucleotide sequences to amino acids. '
                    'Option should be the integer that corresponds to the '
                    'desired translation table according to NCBI '
                    '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
                    'The default is 1 (the "standard" code).'))
    translation_opts.add_option('--allow-partial',
        default = False,
        action = 'store_true',
        help = ('Allow partial reading frames at the beginning (no start '
                'codon) and end (no stop codon) of sequences.'))
    translation_opts.add_option('--read-after-stop',
        default = False,
        action = 'store_true',
        help = ('A new reading frame begins immediately after a stop codon. '
                'The default is to start reading frame at next start codon '
                'after a stop codon. This option might be useful for exons.'))
    parser.add_option_group(translation_opts)

    distance_opts = OptionGroup(parser, 'Distance Options',
            ('These options control how distances between sequences are '
             'calculated.'))
    distance_opts.add_option('-g', '--count-gaps',
            default = False,
            action = 'store_true',
            help = ('Count gaps when calculating pairwise sequence distances. '
                    'The default is to calculate (number of differences '
                    'ignoring gaps / number of aligned sites ignoring sites '
                    'with gaps) for each pairwise comparison. When this option '
                    'is used, the distance is (number of differences including '
                    'gap differences / total number of aligned sites).'))
    parser.add_option_group(distance_opts)

    messaging_opts = OptionGroup(parser, 'Messaging Options',
            ('These options control verbosity of messaging.'))
    messaging_opts.add_option('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    messaging_opts.add_option('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')
    parser.add_option_group(messaging_opts)

    (options, args) = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if options.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if options.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name = __name__)

    ##########################################################################
    ## package imports

    from seqsift.seqops import seqmod, seqfilter
    from seqsift.utils import dataio

    ##########################################################################
    ## handle args
    
    if len(args) == 1:
        in_file_path = args[0]
        out_file_path = sys.stdout
    elif len(args) == 2:
        in_file_path = args[0]
        out_file_path = args[1]
    elif len(args) > 2:
        log.error("Too many arguments. Expecting at most 2 arguments:\n"
                   "The path to the input file (required), and the path to\n"
                   "output file (optional; defaults to standard output).")
        sys.stderr.write(str(parser.print_help()))
        sys.exit(1)
    elif len(args) < 1:
        log.error("Too few arguments. Expecting at least 1 argument:\n"
                   "the path to the input file.")
        sys.stderr.write(str(parser.print_help()))
        sys.exit(1)

    opt_dict = options.__dict__

    if options.from_format:
        in_format = opt_dict.pop('from_format')
    else:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file_path)
    if not in_format:
        log.error("Could not determine format of input file.\n"
                   "You must either provide the format of the input file\n"
                   "using the '--from-format' option or have a recognized\n"
                   "file extension on the input file. Here are the supported\n"
                   "file extensions:\n{0}".format(str(FILE_FORMATS)))
        sys.stderr.write(str(parser.print_help()))
        sys.exit(1)

    if options.to_format:
        out_format = opt_dict.pop('to_format')
    else:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file_path)
    if not out_format:
        log.error("Could not determine format of output file.\n"
                   "You must either provide the format of the output file\n"
                   "using the '--to-format' option or have a recognized\n"
                   "file extension on the output file. Here are the supported\n"
                   "file extensions:\n{0}".format(str(FILE_FORMATS)))
        sys.stderr.write(str(parser.print_help()))
        sys.exit(1)

    data_type = opt_dict.pop('data_type')

    if len(opt_dict) == 0:
        dataio.convert_format(in_file = in_file_path,
                       out_file = out_file_path,
                       in_format = in_format,
                       out_format = out_format,
                       data_type = data_type)
        sys.exit(0)

    if ((options.rev_comp or options.fix_rev_comp_by) and
            (data_type.lower() not in ['dna', 'rna'])):
        log.error("You have selected an option for reverse complementing\n"
                   "sequences but the data type is not DNA or RNA.")
        sys.stderr.write(str(parser.print_help()))
        sys.exit(1)

    seqs = dataio.get_seq_iter([in_file_path],
            format = in_format,
            data_type = data_type)

    if options.ids_to_exclude:
        to_exclude = [n.strip() for n in options.ids_to_exclude.split(',')]
        seqs = seqfilter.id_filter(seqs, to_exclude)

    if options.remove_duplicates:
        seqs = seqfilter.duplicate_id_filter(seqs)

    if options.remove_missing_sequences:
        seqs = seqfilter.row_filter(seqs,
                character_list = list(options.missing_characters),
                max_frequency = options.missing_sequence_proportion)

    if options.remove_missing_columns:
        seqs = seqfilter.column_filter(seqs,
                character_list = list(options.missing_characters),
                max_frequency = options.missing_column_proportion)

    if options.remove_constant_columns:
        seqs = seqfilter.constant_column_filter(seqs)

    if options.rev_comp:
        log.info('Reverse complementing all sequences...')
        seqs = seqmod.reverse_complement(seqs)
    elif options.fix_rev_comp_by == 'first':
        log.info('Reverse complementing to match first sequence...')
        seqs = seqmod.reverse_complement_to_first_seq(seqs,
                per_site = True,
                aligned = False,
                ignore_gaps = (not options.count_gaps),
                alphabet = None,
                aligner_tools = ['muscle', 'mafft'],
                log_frequency = 100)
    elif options.fix_rev_comp_by == 'read':
        log.info('Reverse complementing to longest reading frame...')
        seqs = seqmod.reverse_complement_to_longest_reading_frame(seqs,
                gap_characters=['-'],
                table = options.table,
                allow_partial = options.allow_partial,
                require_start_after_stop = (not options.read_after_stop),
                log_frequency = 100)

    SeqIO.write(seqs,
                handle = out_file_path,
                format = out_format)
Beispiel #13
0
def main_cli(argv=None):
    """Command-line entry point: summarize one or more sequence files.

    Parses the command line, configures the logging level, computes a
    summary for each input file plus a global summary, and passes each of
    them to ``write_summary``.

    Args:
        argv: Optional list of command-line argument strings. When ``None``
            (the default), ``argparse`` reads the arguments from
            ``sys.argv[1:]``, which matches the previous behavior.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'input_files',
        metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        # The previous help text described splitting the input into files of
        # `-n` sequences each; this parser has no `-n` option and the
        # function only summarizes its inputs.
        help='Input sequence file(s) to be summarized.')
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extensions of input file(s). However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(', '.join(
                  FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d',
        '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args(argv)

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # --debug takes precedence over --quiet; INFO is the default level.
    if args.debug:
        logging_level = "DEBUG"
    elif args.quiet:
        logging_level = "WARNING"
    else:
        logging_level = "INFO"
    os.environ[LOGGING_LEVEL_ENV_VAR] = logging_level
    # The returned logger is not used in this function; the call is kept in
    # case get_logger configures logging as a side effect -- TODO confirm.
    get_logger(name=__name__)

    ##########################################################################
    ## package imports

    # Imported only after the logging level has been placed in the
    # environment (presumably so the package's loggers pick it up -- verify
    # against seqsift.utils.messaging).
    from seqsift.seqops import seqstats

    ##########################################################################
    ## handle args

    # Any falsy format value is normalized to None before being passed on.
    input_format = args.input_format or None

    summaries = seqstats.get_seq_summaries_from_files(args.input_files,
                                                      format=input_format,
                                                      data_type=args.data_type)
    # The 'global' entry is removed from the mapping so it can be written
    # last, after the remaining entries in sorted key order.
    global_summary = summaries.pop('global')

    for name in sorted(summaries):
        write_summary(name, summaries[name])
    write_summary('overall', global_summary)