Example #1
0
def write_seqs_to_files(seqs,
                        max_num_seqs_per_file=float('inf'),
                        format='fasta',
                        compresslevel=None,
                        prefix='',
                        force=False):
    """
    Write sequences to one or more numbered files.

    A new file is started every `max_num_seqs_per_file` sequences. Each file
    is named `<prefix>_<NNNN><ext>`, where `NNNN` is a 1-based file counter
    left-padded with zeros to four characters and `ext` is looked up from
    `FILE_FORMATS.get_ext(format, compress)`.

    Arguments:
        seqs: iterable of sequence objects; each must provide a
            `format(format)` method returning the text to write.
        max_num_seqs_per_file: number of sequences after which a new file is
            started. The default (infinity) writes everything to one file.
        format: format name passed to `seq.format` and used to pick the file
            extension.
        compresslevel: forwarded to `fileio.OpenFile`; any truthy value also
            selects the compressed file extension.
        prefix: string prepended to every output file name.
        force: if False, refuse to overwrite an existing path.

    Raises:
        errors.PathExistsError: if an output path exists and `force` is
            False.
    """
    compress = bool(compresslevel)
    ext = FILE_FORMATS.get_ext(format, compress)
    file_idx = 0
    file_stream = None
    # The try/finally guarantees the current output stream is closed even if
    # formatting or writing a sequence (or opening the next file) raises.
    try:
        for seq_idx, seq in enumerate(seqs):
            if seq_idx % max_num_seqs_per_file == 0:
                # Time to roll over to the next output file.
                if file_stream is not None:
                    file_stream.close()
                    file_stream = None
                file_idx += 1
                path = '{0}_{1:0>4}{2}'.format(prefix, file_idx, ext)
                if os.path.exists(path) and (not force):
                    raise errors.PathExistsError(
                        'File {0} already exists'.format(path))
                file_stream = fileio.OpenFile(path,
                                              mode='w',
                                              compresslevel=compresslevel)
            file_stream.write('{0}'.format(seq.format(format)))
    finally:
        if file_stream is not None and (not file_stream.closed):
            file_stream.close()
Example #2
0
def write_seqs_to_files(seqs,
        max_num_seqs_per_file=float('inf'),
        format='fasta',
        compresslevel=None,
        prefix='',
        force=False):
    """
    Split an iterable of sequences across numbered output files.

    Every `max_num_seqs_per_file` sequences a new file is opened. File names
    follow the pattern `<prefix>_<NNNN><ext>`: `NNNN` is a 1-based counter
    zero-padded to four characters, and `ext` comes from
    `FILE_FORMATS.get_ext(format, compress)`.

    Arguments:
        seqs: iterable of sequence objects exposing `format(format)`, whose
            return value is written to the file.
        max_num_seqs_per_file: sequences per file; the default (infinity)
            puts all sequences in a single file.
        format: format name handed to `seq.format` and to the extension
            lookup.
        compresslevel: passed through to `fileio.OpenFile`; a truthy value
            also requests the compressed extension.
        prefix: leading part of each output file name.
        force: when False, an existing output path is an error.

    Raises:
        errors.PathExistsError: an output path already exists and `force`
            is False.
    """
    compress = bool(compresslevel)
    ext = FILE_FORMATS.get_ext(format, compress)
    file_idx = 0
    file_stream = None
    # Close the active stream on every exit path, including exceptions
    # raised while formatting/writing a sequence or opening the next file.
    try:
        for seq_idx, seq in enumerate(seqs):
            if seq_idx % max_num_seqs_per_file == 0:
                # Start a new output file.
                if file_stream is not None:
                    file_stream.close()
                    file_stream = None
                file_idx += 1
                path = '{0}_{1:0>4}{2}'.format(prefix, file_idx, ext)
                if os.path.exists(path) and (not force):
                    raise errors.PathExistsError(
                        'File {0} already exists'.format(path))
                file_stream = fileio.OpenFile(path, mode='w',
                        compresslevel=compresslevel)
            file_stream.write('{0}'.format(seq.format(format)))
    finally:
        if file_stream is not None and (not file_stream.closed):
            file_stream.close()
Example #3
0
def read_seq(file_obj, format=None, data_type='dna', ambiguities=True):
    """
    Returns a single SeqRecord from a file containing exactly one sequence
    record.

    Arguments:
        file_obj: the sequence source handed to `SeqIO.read`; it is also
            used to guess the format when `format` is not given.
        format: sequence file format. If None, it is determined with
            `FILE_FORMATS.get_format_from_file_object(file_obj)`.
        data_type, ambiguities: passed to `get_state_alphabet` to build the
            alphabet given to the parser.
    """
    # Identity comparison: `None` is a singleton, and `==` could be
    # overridden by an argument's `__eq__`.
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_obj)
    _LOG.debug("reading sequence from {0!r}.".format(file_obj))
    return SeqIO.read(file_obj,
                      format=format,
                      alphabet=get_state_alphabet(data_type, ambiguities))
Example #4
0
def read_seq(file_obj, format=None, data_type='dna', ambiguities=True):
    """
    Returns a single SeqRecord from a file containing exactly one sequence
    record.

    Arguments:
        file_obj: sequence source passed to `SeqIO.read`; also used to
            guess the format when `format` is None.
        format: sequence file format; when None it is looked up via
            `FILE_FORMATS.get_format_from_file_object(file_obj)`.
        data_type, ambiguities: forwarded to `get_state_alphabet`, whose
            result is given to the parser as the alphabet.
    """
    # `is None` rather than `== None`: None is a singleton and identity
    # cannot be fooled by a custom `__eq__`.
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_obj)
    _LOG.debug("reading sequence from {0!r}.".format(file_obj))
    return SeqIO.read(file_obj,
            format=format,
            alphabet=get_state_alphabet(data_type, ambiguities))
Example #5
0
def get_seq_dict(file_obj, format=None, data_type='dna', ambiguities=True):
    """
    Returns a dict of SeqRecords from a sequence file.  This loads all the
    sequences in the file into memory. This is efficient for small sequence
    files, but may cause memory issues with large files.

    Arguments:
        file_obj: sequence source; wrapped in a one-element list and passed
            to `get_seq_iter`.
        format: sequence file format; when None it is determined with
            `FILE_FORMATS.get_format_from_file_object(file_obj)`.
        data_type, ambiguities: forwarded unchanged to `get_seq_iter`.
    """
    # Compare to the None singleton by identity, not equality.
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_obj)
    return SeqIO.to_dict(get_seq_iter([file_obj],
            format=format,
            data_type=data_type,
            ambiguities=ambiguities))
Example #6
0
def convert_format(in_file,
                   out_file,
                   in_format=None,
                   out_format=None,
                   data_type='dna',
                   ambiguities=True):
    """
    Convert a sequence file from one format to another.

    Arguments:
        in_file: input sequence source passed to `SeqIO.convert`.
        out_file: output destination passed to `SeqIO.convert`.
        in_format: format of the input; when None it is determined with
            `FILE_FORMATS.get_format_from_file_object(in_file)`.
        out_format: format of the output; when None it is determined with
            `FILE_FORMATS.get_format_from_file_object(out_file)`.
        data_type, ambiguities: passed to `get_state_alphabet` to build the
            alphabet used for the conversion.

    Returns:
        The value returned by `SeqIO.convert` (presumably the number of
        records converted, going by the local name `nseqs` -- confirm).
    """
    # Compare to the None singleton by identity, not equality.
    if in_format is None:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file)
    if out_format is None:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file)
    _LOG.debug("converting {in_format}-formatted file {in_file!r} to "
               "{out_format}-formatted file {out_file!r}.".format(
                   in_file=in_file,
                   in_format=in_format,
                   out_file=out_file,
                   out_format=out_format))
    nseqs = SeqIO.convert(in_file=in_file,
                          in_format=in_format,
                          out_file=out_file,
                          out_format=out_format,
                          alphabet=get_state_alphabet(data_type, ambiguities))
    return nseqs
Example #7
0
def get_seq_dict(file_obj, format=None, data_type='dna', ambiguities=True):
    """
    Returns a dict of SeqRecords from a sequence file.  This loads all the
    sequences in the file into memory. This is efficient for small sequence
    files, but may cause memory issues with large files.

    Arguments:
        file_obj: sequence source; passed to `get_seq_iter` inside a
            one-element list.
        format: sequence file format; when None it is looked up via
            `FILE_FORMATS.get_format_from_file_object(file_obj)`.
        data_type, ambiguities: forwarded unchanged to `get_seq_iter`.
    """
    # `is None` is the correct test for the None singleton.
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_obj)
    return SeqIO.to_dict(
        get_seq_iter([file_obj],
                     format=format,
                     data_type=data_type,
                     ambiguities=ambiguities))
Example #8
0
def convert_format(in_file, out_file,
        in_format=None,
        out_format=None,
        data_type='dna',
        ambiguities=True):
    """
    Convert a sequence file from one format to another.

    Arguments:
        in_file: input sequence source handed to `SeqIO.convert`.
        out_file: output destination handed to `SeqIO.convert`.
        in_format: input format; when None it is looked up via
            `FILE_FORMATS.get_format_from_file_object(in_file)`.
        out_format: output format; when None it is looked up via
            `FILE_FORMATS.get_format_from_file_object(out_file)`.
        data_type, ambiguities: forwarded to `get_state_alphabet`, whose
            result is used as the alphabet for the conversion.

    Returns:
        Whatever `SeqIO.convert` returns (presumably the count of converted
        records, judging by the local name `nseqs` -- confirm).
    """
    # `is None` is the correct test for the None singleton.
    if in_format is None:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file)
    if out_format is None:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file)
    _LOG.debug("converting {in_format}-formatted file {in_file!r} to "
              "{out_format}-formatted file {out_file!r}.".format(
                    in_file=in_file,
                    in_format=in_format,
                    out_file=out_file,
                    out_format=out_format))
    nseqs = SeqIO.convert(
            in_file=in_file,
            in_format=in_format,
            out_file=out_file,
            out_format=out_format,
            alphabet=get_state_alphabet(data_type, ambiguities))
    return nseqs
Example #9
0
def get_indexed_seq_iter(file_path, format=None, data_type='dna',
        key_function=None,
        ambiguities=True):
    """
    Returns an indexed SeqRecord iterator from a sequence file.  Only supports
    sequential file formats (e.g., fasta and genbank).  The iterator acts like
    a dict, but only parses a sequence from a file when needed. For large
    sequence files, this is memory-efficient alternative to reading all the
    sequences into a dict.

    Arguments:
        file_path: path of the sequence file, passed to `SeqIO.index`.
        format: sequence file format; when None it is determined with
            `FILE_FORMATS.get_format_from_file_object(file_path)`.
        data_type, ambiguities: passed to `get_state_alphabet` to build the
            alphabet given to the indexer.
        key_function: forwarded unchanged to `SeqIO.index`.
    """
    # Compare to the None singleton by identity, not equality.
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_path)
    _LOG.debug("parsing indexed SeqRecord iterator from {0!r}.".format(
            file_path))
    return SeqIO.index(file_path,
            format=format,
            alphabet=get_state_alphabet(data_type, ambiguities),
            key_function=key_function)
Example #10
0
 def __init__(self, file_obj,
         format=None,
         data_type='dna',
         ambiguities=True):
     """
     Set up a sequence parser over `file_obj`.

     Arguments:
         file_obj: either an already-open file-like object or a `str`,
             which is treated as a path and opened for reading with
             `fileio.OpenFile`.
         format: sequence file format; when None it is determined with
             `FILE_FORMATS.get_format_from_file_object(file_obj)`.
         data_type, ambiguities: passed to `get_state_alphabet` to build
             the alphabet given to `SeqIO.parse`.
     """
     # Class-wide instance counter; used to build a fallback name.
     self.__class__.count += 1
     self.instance_name = '-'.join([self.__class__.__name__,
             str(self.count)])
     self.name = getattr(file_obj, 'name', self.instance_name)
     # `_close` records whether this object opened the file itself
     # (presumably so it can be closed later -- confirm against the rest
     # of the class).
     self._close = False
     self._file_obj = file_obj
     if isinstance(file_obj, str):
         self.name = file_obj
         self._file_obj = fileio.OpenFile(file_obj, 'r')
         self._close = True
     # Compare to the None singleton by identity, not equality.
     if format is None:
         format = FILE_FORMATS.get_format_from_file_object(file_obj)
     self._seqs = SeqIO.parse(self._file_obj,
             format=format,
             alphabet=get_state_alphabet(data_type, ambiguities))
Example #11
0
def get_indexed_seq_iter(file_path,
                         format=None,
                         data_type='dna',
                         key_function=None,
                         ambiguities=True):
    """
    Returns an indexed SeqRecord iterator from a sequence file.  Only supports
    sequential file formats (e.g., fasta and genbank).  The iterator acts like
    a dict, but only parses a sequence from a file when needed. For large
    sequence files, this is memory-efficient alternative to reading all the
    sequences into a dict.

    Arguments:
        file_path: path of the sequence file, handed to `SeqIO.index`.
        format: sequence file format; when None it is looked up via
            `FILE_FORMATS.get_format_from_file_object(file_path)`.
        data_type, ambiguities: forwarded to `get_state_alphabet`, whose
            result is given to the indexer as the alphabet.
        key_function: passed through unchanged to `SeqIO.index`.
    """
    # `is None` is the correct test for the None singleton.
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_path)
    _LOG.debug(
        "parsing indexed SeqRecord iterator from {0!r}.".format(file_path))
    return SeqIO.index(file_path,
                       format=format,
                       alphabet=get_state_alphabet(data_type, ambiguities),
                       key_function=key_function)
Example #12
0
 def __init__(self,
              file_obj,
              format=None,
              data_type='dna',
              ambiguities=True):
     """
     Set up a sequence parser over `file_obj`.

     Arguments:
         file_obj: an open file-like object, or a `str` that is treated
             as a path and opened for reading with `fileio.OpenFile`.
         format: sequence file format; when None it is looked up via
             `FILE_FORMATS.get_format_from_file_object(file_obj)`.
         data_type, ambiguities: forwarded to `get_state_alphabet`, whose
             result is given to `SeqIO.parse` as the alphabet.
     """
     # Bump the class-wide counter; it feeds the fallback instance name.
     self.__class__.count += 1
     self.instance_name = '-'.join(
         [self.__class__.__name__, str(self.count)])
     self.name = getattr(file_obj, 'name', self.instance_name)
     # `_close` is True only when the file was opened here (presumably
     # consulted when cleaning up -- confirm against the rest of the
     # class).
     self._close = False
     self._file_obj = file_obj
     if isinstance(file_obj, str):
         self.name = file_obj
         self._file_obj = fileio.OpenFile(file_obj, 'r')
         self._close = True
     # `is None` is the correct test for the None singleton.
     if format is None:
         format = FILE_FORMATS.get_format_from_file_object(file_obj)
     self._seqs = SeqIO.parse(self._file_obj,
                              format=format,
                              alphabet=get_state_alphabet(
                                  data_type, ambiguities))
Example #13
0
def main_cli():
    """
    Command-line entry point for vetting the sequences of one input file.

    Parses the command line, configures logging through the
    `LOGGING_LEVEL_ENV_VAR` environment variable, optionally summarizes
    reading-frame lengths and checks for duplicate IDs, then summarizes
    pairwise distances and writes the results as tab-delimited `seqvet-*`
    files in the output directory (the input file's directory by default).

    Exits with status 1 (after logging an error and printing the help text
    to stderr) when the input format cannot be determined or when a
    translation-dependent option is combined with a non-nucleotide data
    type.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input_file',
                        metavar='INPUT-SEQ-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Input sequence file to be vetted.'))

    comparison_args = parser.add_argument_group(
        'Comparison Options',
        'Options to control the number and nature of sequence comparisons')
    comparison_args.add_argument(
        '-n',
        '--num-samples',
        type=int,
        default=0,
        help=('The number of randomly sampled sequences to which each '
              'sequence will be compared. If less than 1 (the default is '
              '0), all pairwise comparisons will be performed. For very '
              'large numbers of sequences, performing all pairwise '
              'comparisons will take a long time. This option will speed '
              'things up as long as the number specified is less than '
              'about half of the number of input sequences. If the '
              'number you are considering is close to half of the number '
              'of sequences, you should probably specify zero and do all '
              'combinations. You should not specify a number greater than '
              'half the number of sequences, because it will take longer '
              'and be less thorough than the default.'))
    comparison_args.add_argument(
        '--seed',
        action='store',
        type=int,
        help=('Random number seed to use for the analysis. This option '
              'is only relevant if a number greater than 0 is specified '
              'for the `-n/--num-samples` option.'))
    comparison_args.add_argument(
        '--compare-translated',
        action='store_true',
        help=('Compare amino acid sequences encoded by the longest '
              'reading frame found in each sequence. To use this option, '
              '`data-type` must be dna or rna. See "Translation Options" '
              'for controlling how the longest reading frame of each '
              'sequence is determined and translated.'))
    comparison_args.add_argument('--check-ids',
                                 action='store_true',
                                 help=('Check sequence IDs for duplicates.'))
    comparison_args.add_argument(
        '--summarize-reading-frame-lengths',
        action='store_true',
        help=('Report the length of the longest reading frame of '
              'each sequence. See "Translation Options" for controlling '
              'how reading frames are determined.'))
    comparison_args.add_argument(
        '-g',
        '--count-gaps',
        action='store_true',
        help=('Count gaps when calculating pairwise sequence distances. '
              'The default is to calculate (number of differences '
              'ignoring gaps / number of aligned sites ignoring sites '
              'with gaps) for each pairwise comparison. When this option '
              'is used, the distance is (number of differences including '
              'gap differences / total number of aligned sites).'))

    alignment_args = parser.add_argument_group(
        'Alignment Options',
        ('These options control if/how sequences are to be aligned prior '
         'to calculating distances.'))
    alignment_args.add_argument(
        '-a',
        '--aligned',
        action='store_true',
        help=('Treat input sequences as aligned. I.e., do not perform '
              'pairwise alignment before calculating distances between '
              'sequences (except when calculating distances for reverse '
              'and complemented sequences).'))
    alignment_args.add_argument(
        '--aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for pairwise '
              'alignments of sequences. '
              'The default is to look for muscle and then mafft in PATH, '
              'and if neither are found use the (slow) built-in '
              'function. Even if the `-a`/`--aligned` option is '
              'specified, the aligner will still be used for pairwise '
              'alignments when calculating distances of reverse and '
              'complemented sequences.'))
    alignment_args.add_argument(
        '--msa',
        action='store_true',
        help=('Perform a full multiple sequence alignment prior to '
              'comparing sequences. The default is to align each '
              'pair of sequences being compared. This option is '
              'overruled by the `-a`/`--aligned` option. '
              'If this option is used '
              'the resulting alignment is written to file.'))
    alignment_args.add_argument(
        '--msa-aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for full '
              'multiple sequence alignment. '
              'The default is to look for mafft and then muscle in PATH, '
              'and if neither are found the program will exit with an '
              'error message. If you do not have mafft or muscle '
              'you cannot use this option. '
              'This option is only used if the `-a`/`--aligned` option '
              'is not specified, and the `--msa` option is specified.'))

    translation_args = parser.add_argument_group(
        'Translation Options',
        ('These options control translation from nucleotide to amino acid '
         'sequences.'))
    translation_args.add_argument(
        '--table',
        type=int,
        choices=list(range(1, 7)) + list(range(9, 17)) + list(range(21, 26)),
        default=1,
        help=('The translation table to use for any options associated '
              'with translating nucleotide sequences to amino acids. '
              'Option should be the integer that corresponds to the '
              'desired translation table according to NCBI '
              '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
              'The default is 1 (the "standard" code).'))
    translation_args.add_argument(
        '--allow-partial',
        action='store_true',
        default=False,
        help=('Allow partial reading frames at the beginning (no start '
              'codon) and end (no stop codon) of sequences.'))
    translation_args.add_argument(
        '--read-after-stop',
        action='store_true',
        default=False,
        help=('A new reading frame begins immediately after a stop codon. '
              'The default is to start reading frame at next start codon '
              'after a stop codon. This option might be useful for exons.'))

    data_args = parser.add_argument_group(
        'Data Options', ('Options specifying the input data type and format'))
    data_args.add_argument(
        '-d',
        '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    data_args.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file. Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(', '.join(
                  FILE_FORMATS.supported_formats))))

    output_args = parser.add_argument_group(
        'Output Options', 'Options for controlling output of program')
    output_args.add_argument(
        '-o',
        '--output-dir',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the input file.'))

    messaging_args = parser.add_argument_group(
        'Messaging Options', ('These options control verbosity of messaging.'))
    messaging_args.add_argument(
        '--log-frequency',
        type=argparse_utils.arg_is_nonnegative_int,
        default=1000,
        help=('The frequency at which to log progress. Default is to log '
              'every 1000 sequence comparisons.'))
    messaging_args.add_argument('--quiet',
                                action='store_true',
                                help='Run without verbose messaging.')
    messaging_args.add_argument('--debug',
                                action='store_true',
                                help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    # Imported here (not at module level) so the environment variable below
    # is set before the logger is created.
    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # `--debug` is checked last, so it wins over `--quiet` if both are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import GLOBAL_RNG, dataio, functions, alphabets
    from seqsift.seqops import seqsum, seqmod, seqstats
    from seqsift.utils.fileio import OpenFile

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    if args.num_samples > 0:
        # `is None` (not truthiness) so an explicit `--seed 0` is honored.
        if args.seed is None:
            args.seed = random.randint(1, 999999999)
        GLOBAL_RNG.seed(args.seed)
        # Logged at warning level so the seed is reported even with --quiet.
        log.warning('Seed: {0}'.format(args.seed))

    ## get input file format
    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_file)
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the input file name.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None and writes to the stream it is given.
        parser.print_help(sys.stderr)
        sys.exit(1)

    aligner_tools = ['muscle', 'mafft']
    if args.aligner:
        aligner_tools = [args.aligner]
    full_aligner_tools = ['mafft', 'muscle']
    if args.msa_aligner:
        full_aligner_tools = [args.msa_aligner]

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.input_file)

    full_alignment_out_path = os.path.join(args.output_dir, 'seqvet-msa.txt')
    alphabet = alphabets.DnaAlphabet()
    if args.data_type in ['aa', 'protein']:
        alphabet = alphabets.ProteinAlphabet()

    if (args.summarize_reading_frame_lengths
            and (args.data_type not in ['dna', 'rna'])):
        log.error("`--summarize-reading-frame-lengths` is only compatible "
                  "with DNA or RNA.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    if (args.compare_translated and (args.data_type not in ['dna', 'rna'])):
        log.error("`--compare-translated` is only compatible with DNA or RNA.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    ##########################################################################
    ## heavy lifting

    seqs = dataio.get_seq_iter([args.input_file],
                               format=args.input_format,
                               data_type=args.data_type)

    if args.summarize_reading_frame_lengths:
        log.info('Summarizing longest reading frame lengths...')
        # Wrapped in a BufferedIter, presumably so the sequences can be
        # iterated again by the later steps -- confirm against dataio.
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        lengths = seqsum.summarize_longest_read_lengths(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        length_path = os.path.join(args.output_dir,
                                   'seqvet-reading-frame-lengths.txt')
        log.info('Writing longest reading frame lengths to file...')
        with OpenFile(length_path, 'w') as out:
            out.write('seq_id\tlrf\trev_comp_lrf\n')
            for (l, rc_l, seq_id) in lengths:
                out.write('{0}\t{1}\t{2}\n'.format(seq_id, l, rc_l))

    if args.compare_translated:
        log.info('Translating longest reading frames for distance '
                 'calculations...')
        seqs = seqmod.translate_longest_reading_frames(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        # Distances are now computed on amino acid sequences.
        alphabet = alphabets.ProteinAlphabet()

    if args.check_ids:
        log.info('Checking sequence IDs...')
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        dups = seqstats.get_duplicate_ids(seqs)
        if len(dups) > 0:
            dup_path = functions.get_new_path(
                os.path.join(args.output_dir, 'seqvet-duplicate-ids.txt'))
            log.warning('Duplicate IDs found! Writing them to '
                        '{0}'.format(dup_path))
            with OpenFile(dup_path, 'w') as out:
                for dup in dups:
                    out.write('{0}\n'.format(dup))
        else:
            log.info('No duplicate sequence IDs were found.')

    log.info('Calculating pairwise distances...')
    distances, rev_comp_errors = seqsum.summarize_distances(
        seqs,
        sample_size=args.num_samples,
        per_site=True,
        aligned=args.aligned,
        ignore_gaps=(not args.count_gaps),
        alphabet=alphabet,
        do_full_alignment=args.msa,
        full_alignment_out_path=full_alignment_out_path,
        aligner_tools=aligner_tools,
        full_aligner_tools=full_aligner_tools,
        log_frequency=args.log_frequency)
    log.info('Done!')

    log.info('Writing mean distances to file...')
    # List of (seq_id, summary) pairs, largest mean distance first.
    distances = sorted(iteritems(distances),
                       key=lambda x: x[1].mean,
                       reverse=True)
    mean_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-mean-distances.txt'))
    with OpenFile(mean_path, 'w') as out:
        out.write('seq_id\tmean_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.mean))

    log.info('Writing max distances to file...')
    distances = sorted(distances, key=lambda x: x[1].maximum, reverse=True)
    max_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-max-distances.txt'))
    with OpenFile(max_path, 'w') as out:
        out.write('seq_id\tmax_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.maximum))

    if rev_comp_errors:
        rev_comp_errors = sorted(rev_comp_errors)
        # Report each unordered pair of sequences only once.
        rce_set = set()
        rce = []
        for (s1, s2, d, drc) in rev_comp_errors:
            pair = tuple(sorted([s1, s2]))
            if pair in rce_set:
                continue
            rce_set.add(pair)
            rce.append((pair[0], pair[1], d, drc))
        log.info('Writing potential reverse-complement errors to file...')
        path = functions.get_new_path(
            os.path.join(args.output_dir,
                         'seqvet-reverse-complement-warnings.txt'))
        with OpenFile(path, 'w') as out:
            out.write('seq1\tseq2\tdistance\trev_comp_distance\n')
            for (seq1, seq2, d, drc) in rce:
                out.write('{0}\t{1}\t{2}\t{3}\n'.format(seq1, seq2, d, drc))
Example #14
0
def main_cli():
    """
    Command-line entry point for randomly sub-sampling sequences.

    Parses the command line, configures logging through the
    `LOGGING_LEVEL_ENV_VAR` environment variable, seeds `GLOBAL_RNG`,
    samples `--num-samples` sequences from the input file(s) with
    `functions.sample_iter`, and writes them to standard output in the
    input format.

    Exits with status 1 (after logging an error and printing the help text
    to stderr) when the input format cannot be determined.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'input_files',
        metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help=('Input sequence file(s) from which to randomly sub-sample '
              'sequences (without replacement).'))
    parser.add_argument('-n',
                        '--num-samples',
                        type=int,
                        required=True,
                        help=('The number of sequences to randomly sample.'))
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(', '.join(
                  FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d',
        '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help=('Random number seed.'))
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    # Imported here (not at module level) so the environment variable below
    # is set before the logger is created.
    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # `--debug` is checked last, so it wins over `--quiet` if both are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import dataio, GLOBAL_RNG, functions

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    # `is None` (not truthiness) so an explicit `--seed 0` is honored.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    # Logged at warning level so the seed is reported even with --quiet.
    log.warning('Seed: {0}'.format(args.seed))

    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None and writes to the stream it is given.
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter(args.input_files,
                               format=args.input_format,
                               data_type=args.data_type)
    samples = functions.sample_iter(iterable=seqs,
                                    sample_size=args.num_samples)

    SeqIO.write(samples, handle=sys.stdout, format=args.input_format)
Example #15
0
def main():
    """Command-line entry point for converting and filtering sequences.

    Expects one positional argument (the input sequence file) and an
    optional second one (the output file; standard output is used when it
    is omitted). The input and output formats come from the `--from` and
    `--to` options or, failing that, from
    `FILE_FORMATS.get_format_from_file_object`. The sequences are then
    passed through the requested filters and reverse-complement steps and
    written with `SeqIO.write`.

    Exits with status 1, after logging an error and printing the help text
    to standard error, when the arguments are invalid or a format cannot be
    determined.
    """
    description = '{name} {version}'.format(**_program_info)
    usage = ("\n  %prog [options] <SEQ_INPUT_FILE> [<SEQ_OUTPUT_FILE>]")
    parser = OptionParser(usage=usage,
                          description=description,
                          version=_program_info['version'],
                          add_help_option=True)
    format_opts = OptionGroup(
        parser, 'Format Options',
        'These options designate file formats and data type.')
    format_opts.add_option(
        '-f',
        '--from',
        dest='from_format',
        type='string',
        help=('The format of the input sequence file. Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the input file. However, if provided, '
              'this option will always take precedence over the file '
              'extension.'.format(', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option(
        '-t',
        '--to',
        dest='to_format',
        type='string',
        help=('The desired format of the output sequence file. Valid '
              'options include: {0}. By default, if an output file path '
              'is provided, the format is guessed based on the extension '
              'of this file. However, this option will always take '
              'precedence over the file extension. Either this option or '
              'an output file path with an extension is required; if '
              'neither are provided the program will exit with an '
              'error.'.format(', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option(
        '-d',
        '--data-type',
        dest='data_type',
        type='string',
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_option_group(format_opts)

    filter_opts = OptionGroup(
        parser, 'Filter Options',
        'These options allow filtering of data by columns or sequences.')
    filter_opts.add_option(
        '--remove-duplicates',
        dest='remove_duplicates',
        default=False,
        action='store_true',
        help=('Remove duplicate sequences (i.e., sequences with the same '
              'ID and sequence). If a duplicate ID is found associated '
              'with a different sequence, the program will exit with an '
              'error.'))
    filter_opts.add_option(
        '-x',
        '--ids-to-exclude',
        dest='ids_to_exclude',
        type='string',
        help=('Comma-delimited list of the ids of sequences to exclude.'))
    filter_opts.add_option(
        '--remove-missing-columns',
        dest='remove_missing_columns',
        default=False,
        action='store_true',
        help=("Remove aligned columns with missing data. Characters to be "
              "considered missing can be specified with the "
              "--missing-characters option; the default is '?-'. "
              "The proportion of rows that must contain these characters "
              "for a column to be removed can be specified with the "
              "--missing-column-proportion option; the default is 1.0. "
              "Note, this option is only relevant to aligned sequences, "
              "and will result in an error if the input sequences are not "
              "aligned."))
    filter_opts.add_option(
        '--missing-column-proportion',
        dest='missing_column_proportion',
        type='float',
        default=1.0,
        help=('The proportion of rows that must contain '
              '--missing-characters for a column to be removed. '
              'This option is only relevant in combination with the '
              '--remove-missing-columns option.'))
    filter_opts.add_option(
        '--remove-missing-sequences',
        dest='remove_missing_sequences',
        default=False,
        action='store_true',
        help=("Remove sequences with missing data. Characters to be "
              "considered missing can be specified with the "
              "--missing-characters option; the default is '?-'. "
              "The proportion of the sites that must contain these "
              "characters for a sequence to be removed can be specified "
              "with the --missing-sequence-proportion option; the default "
              "is 1.0."))
    filter_opts.add_option(
        '--missing-sequence-proportion',
        dest='missing_sequence_proportion',
        type='float',
        default=1.0,
        help=('The proportion of sites that must contain '
              '--missing-characters for a sequence to be removed. '
              'This option is only relevant in combination with the '
              '--remove-missing-sequences option.'))
    filter_opts.add_option(
        '--missing-characters',
        dest='missing_characters',
        type='str',
        default='?-',
        help=("Characters to be considered missing and be used in "
              "evaluating columns/sequences to remove with the "
              "--remove-missing-columns and --remove-missing-sequences "
              "options. The default is '?-'."))
    filter_opts.add_option('--remove-constant-columns',
                           dest='remove_constant_columns',
                           default=False,
                           action='store_true',
                           help=("Remove aligned columns with no variation."))
    parser.add_option_group(filter_opts)

    rev_comp_opts = OptionGroup(
        parser, 'Reverse Complement Options',
        'These options are for reverse complementing sequences.')
    rev_comp_opts.add_option(
        '--rev-comp',
        dest='rev_comp',
        default=False,
        action='store_true',
        help=("Reverse complement all sequences. This option overrides "
              "all other reverse-complement options."))
    rev_comp_opts.add_option(
        '--fix-rev-comp-by',
        dest='fix_rev_comp_by',
        type='choice',
        choices=['first', 'read'],
        help=("Try to correct reverse complement errors. "
              "Options include 'first' and 'read'. If 'first' is "
              "specified, sequences are returned in their orientation "
              "that minimizes distance from the first sequence. "
              "If 'read' is used, sequences are returned in their "
              "orientation that has the longest read frame "
              "(see 'Translation Options' for controlling translation "
              "of reading frames)."))
    parser.add_option_group(rev_comp_opts)

    translation_opts = OptionGroup(
        parser, 'Translation Options',
        ('These options control translation from nucleotide to amino acid '
         'sequences.'))
    # optparse checks the raw command-line string against `choices`, so the
    # valid table numbers have to be listed as strings (integer choices
    # would reject every explicitly supplied value). The selected value is
    # converted back to an int where it is used below.
    table_choices = [
        str(i) for i in
        list(range(1, 7)) + list(range(9, 17)) + list(range(21, 26))]
    translation_opts.add_option(
        '--table',
        type='choice',
        choices=table_choices,
        default='1',
        help=('The translation table to use for any options associated '
              'with translating nucleotide sequences to amino acids. '
              'Option should be the integer that corresponds to the '
              'desired translation table according to NCBI '
              '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
              'The default is 1 (the "standard" code).'))
    translation_opts.add_option(
        '--allow-partial',
        default=False,
        action='store_true',
        help=('Allow partial reading frames at the beginning (no start '
              'codon) and end (no stop codon) of sequences.'))
    translation_opts.add_option(
        '--read-after-stop',
        default=False,
        action='store_true',
        help=('A new reading frame begins immediately after a stop codon. '
              'The default is to start reading frame at next start codon '
              'after a stop codon. This option might be useful for exons.'))
    parser.add_option_group(translation_opts)

    distance_opts = OptionGroup(
        parser, 'Distance Options',
        ('These options control how distances between sequences are '
         'calculated.'))
    distance_opts.add_option(
        '-g',
        '--count-gaps',
        default=False,
        action='store_true',
        help=('Count gaps when calculating pairwise sequence distances. '
              'The default is to calculate (number of differences '
              'ignoring gaps / number of aligned sites ignoring sites '
              'with gaps) for each pairwise comparison. When this option '
              'is used, the distance is (number of differences including '
              'gap differences / total number of aligned sites).'))
    parser.add_option_group(distance_opts)

    messaging_opts = OptionGroup(
        parser, 'Messaging Options',
        ('These options control verbosity of messaging.'))
    messaging_opts.add_option('--quiet',
                              action='store_true',
                              help='Run without verbose messaging.')
    messaging_opts.add_option('--debug',
                              action='store_true',
                              help='Run in debugging mode.')
    parser.add_option_group(messaging_opts)

    (options, args) = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # The logging level is set through the environment before the logger
    # is created; --debug takes precedence over --quiet.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if options.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if options.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    def exit_with_usage(message):
        # Log the problem, show the help text on stderr, and abort.
        # print_help() returns None, so it must be given the stream rather
        # than having its return value written out.
        log.error(message)
        parser.print_help(sys.stderr)
        sys.exit(1)

    ##########################################################################
    ## package imports

    from seqsift.seqops import seqmod, seqfilter
    from seqsift.utils import dataio

    ##########################################################################
    ## handle args

    if len(args) > 2:
        exit_with_usage(
            "Too many arguments. Expecting at most 2 arguments:\n"
            "The path to the input file (required), and the path to\n"
            "output file (optional; defaults to standard output).")
    if len(args) < 1:
        exit_with_usage(
            "Too few arguments. Expecting at least 1 argument:\n"
            "the path to the input file.")
    in_file_path = args[0]
    # Without an explicit output path the sequences go to standard output.
    out_file_path = args[1] if len(args) == 2 else sys.stdout

    opt_dict = options.__dict__

    if options.from_format:
        in_format = opt_dict.pop('from_format')
    else:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file_path)
    if not in_format:
        exit_with_usage(
            "Could not determine format of input file.\n"
            "You must either provide the format of the input file\n"
            "using the '--from' option or have a recognized\n"
            "file extension on the input file. Here are the supported\n"
            "file extensions:\n{0}".format(str(FILE_FORMATS)))

    if options.to_format:
        out_format = opt_dict.pop('to_format')
    else:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file_path)
    if not out_format:
        exit_with_usage(
            "Could not determine format of output file.\n"
            "You must either provide the format of the output file\n"
            "using the '--to' option or have a recognized\n"
            "file extension on the output file. Here are the supported\n"
            "file extensions:\n{0}".format(str(FILE_FORMATS)))

    data_type = opt_dict.pop('data_type')

    # NOTE(review): this shortcut looks intended for "plain conversion, no
    # other options", but `opt_dict` still holds an entry (its default
    # value) for every other defined option, so it is never empty here and
    # this branch does not run. Left as-is; confirm the intent before
    # changing the condition.
    if len(opt_dict) == 0:
        dataio.convert_format(in_file=in_file_path,
                              out_file=out_file_path,
                              in_format=in_format,
                              out_format=out_format,
                              data_type=data_type)
        sys.exit(0)

    if ((options.rev_comp or options.fix_rev_comp_by)
            and (data_type.lower() not in ['dna', 'rna'])):
        exit_with_usage(
            "You have selected an option for reverse complementing\n"
            "sequences but the data type is not DNA or RNA.")

    seqs = dataio.get_seq_iter([in_file_path],
                               format=in_format,
                               data_type=data_type)

    # Each enabled filter wraps the sequence iterator returned by the
    # previous step.
    if options.ids_to_exclude:
        to_exclude = [n.strip() for n in options.ids_to_exclude.split(',')]
        seqs = seqfilter.id_filter(seqs, to_exclude)

    if options.remove_duplicates:
        seqs = seqfilter.duplicate_id_filter(seqs)

    if options.remove_missing_sequences:
        seqs = seqfilter.row_filter(
            seqs,
            character_list=list(options.missing_characters),
            max_frequency=options.missing_sequence_proportion)

    if options.remove_missing_columns:
        seqs = seqfilter.column_filter(
            seqs,
            character_list=list(options.missing_characters),
            max_frequency=options.missing_column_proportion)

    if options.remove_constant_columns:
        seqs = seqfilter.constant_column_filter(seqs)

    # --rev-comp overrides --fix-rev-comp-by.
    if options.rev_comp:
        log.info('Reverse complementing all sequences...')
        seqs = seqmod.reverse_complement(seqs)
    elif options.fix_rev_comp_by == 'first':
        log.info('Reverse complementing to match first sequence...')
        seqs = seqmod.reverse_complement_to_first_seq(
            seqs,
            per_site=True,
            aligned=False,
            ignore_gaps=(not options.count_gaps),
            alphabet=None,
            aligner_tools=['muscle', 'mafft'],
            log_frequency=100)
    elif options.fix_rev_comp_by == 'read':
        log.info('Reverse complementing to longest reading frame...')
        seqs = seqmod.reverse_complement_to_longest_reading_frame(
            seqs,
            gap_characters=['-'],
            table=int(options.table),
            allow_partial=options.allow_partial,
            require_start_after_stop=(not options.read_after_stop),
            log_frequency=100)

    SeqIO.write(seqs, handle=out_file_path, format=out_format)
Example #16
0
def main_cli():
    """Command-line entry point for keeping selected column ranges.

    Parses the input file(s) and one or more `-k/--keep` index pairs,
    reads the sequences with `dataio.get_seq_iter`, hands them together
    with the requested slices to `seqmod.dice`, and writes the result to
    standard output in the input format.

    Exits with status 1, after logging an error and printing the help text
    to standard error, when the input format cannot be determined.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'input_files', metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help=('Input sequence alignment(s).'))
    # May be given several times; each use appends one [begin, end] pair.
    parser.add_argument(
        '-k', '--keep',
        dest='slices_to_keep',
        action='append',
        nargs=2,
        metavar='COLUMN-INDEX',
        type=int,
        required=True,
        help=('Two integers specifying the beginning and end indices of '
              'columns to keep.'))
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(
                  ', '.join(FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d', '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # The logging level is set through the environment before the logger
    # is created; --debug takes precedence over --quiet.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import dataio
    from seqsift.seqops import seqmod

    ##########################################################################
    ## handle args

    # Fall back to guessing the format from the first input file.
    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None, so pass it the stream directly
        # instead of writing its return value.
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter(args.input_files,
                               format=args.input_format,
                               data_type=args.data_type)
    new_seqs = seqmod.dice(seq_iter=seqs,
                           slices_to_keep=args.slices_to_keep)

    SeqIO.write(new_seqs,
                handle=sys.stdout,
                format=args.input_format)
Example #17
0
def main_cli():
    """Command-line entry point for sampling sequences from input files.

    Seeds `GLOBAL_RNG` (with `--seed`, or a freshly drawn seed that is
    logged so the run can be repeated), reads the sequences with
    `dataio.get_seq_iter`, passes them to `functions.sample_iter` with the
    requested sample size, and writes the result to standard output in the
    input format.

    Exits with status 1, after logging an error and printing the help text
    to standard error, when the input format cannot be determined.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'input_files', metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help=('Input sequence file(s) from which to sample sequences.'))
    parser.add_argument(
        '-n', '--num-samples',
        type=int,
        required=True,
        help=('The number of sequences to sample.'))
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(
                  ', '.join(FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d', '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help=('Random number seed.'))
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # The logging level is set through the environment before the logger
    # is created; --debug takes precedence over --quiet.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import dataio, GLOBAL_RNG, functions

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    # Compare against None so that an explicit `--seed 0` is honoured
    # rather than silently replaced by a random seed.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    # Logged at warning level, so the seed is still reported with --quiet.
    log.warning('Seed: {0}'.format(args.seed))

    # Fall back to guessing the format from the first input file.
    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None, so pass it the stream directly
        # instead of writing its return value.
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter(args.input_files,
                               format=args.input_format,
                               data_type=args.data_type)
    samples = functions.sample_iter(iterable=seqs,
                                    sample_size=args.num_samples)

    SeqIO.write(samples,
                handle=sys.stdout,
                format=args.input_format)
Example #18
0
def main_cli():
    """Command-line entry point for splitting sequences into several files.

    Reads sequences from the input file(s) and writes them, at most
    `-n/--num-seqs-per-file` per file, to numbered output files named
    `<prefix>_<NNNN><ext>`. The prefix defaults to the first input path
    without its extension and is relocated into `--output-dir` when that
    option is given. Formats reported as sequential by
    `FILE_FORMATS.is_sequential` are handled by
    `dataio.write_seqs_to_files`; other formats are read in batches and
    written with `SeqIO.write`.

    Exits with status 1 when the input format cannot be determined or when
    an output file already exists and `--force` was not given.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'input_files', metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help=('Input sequence file(s) to be output into files with '
              '`-n` sequences per file.'))
    # Not marked as required: a required option would make the declared
    # default unreachable.
    parser.add_argument(
        '-n', '--num-seqs-per-file',
        type=int,
        default=4000000,
        help=('The maximum number of sequences to put in each output '
              'file. The default is 4000000.'))
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(
                  ', '.join(FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d', '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_argument(
        '-c', '--compress',
        action='store_true',
        help='Compress (gzip) output files.')
    parser.add_argument(
        '-o', '--output-dir',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the input file.'))
    parser.add_argument(
        '-p', '--prefix',
        action='store',
        type=str,
        help=('Prefix to use at beginning of output files. The default '
              'is to use the first input file name.'))
    # NOTE(review): --log-frequency is parsed but never read in this
    # function; confirm whether it should be forwarded to the writers.
    parser.add_argument(
        '--log-frequency',
        type=argparse_utils.arg_is_nonnegative_int,
        default=100000,
        help=('The frequency at which to log progress. Default is to log '
              'every 100000 sequences.'))
    parser.add_argument(
        '--force',
        action='store_true',
        help=('Overwrite files if they already exist.'))
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # The logging level is set through the environment before the logger
    # is created; --debug takes precedence over --quiet.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import dataio, errors
    from seqsift.utils.fileio import OpenFile

    ##########################################################################
    ## handle args

    # Fall back to guessing the format from the first input file.
    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None, so pass it the stream directly
        # instead of writing its return value.
        parser.print_help(sys.stderr)
        sys.exit(1)

    if not args.prefix:
        args.prefix = os.path.splitext(args.input_files[0])[0]
    if args.output_dir:
        # Keep only the file-name part of the prefix inside --output-dir.
        args.prefix = os.path.join(args.output_dir,
                                   os.path.basename(args.prefix))

    out_ext = FILE_FORMATS.get_ext(args.input_format,
                                   compressed=args.compress)

    compresslevel = 9 if args.compress else None

    # handle sequential formats on the fly
    if FILE_FORMATS.is_sequential(args.input_format):
        seq_iter = dataio.get_seq_iter(
            file_objs=args.input_files,
            format=args.input_format,
            data_type=args.data_type)

        try:
            dataio.write_seqs_to_files(
                seq_iter,
                max_num_seqs_per_file=args.num_seqs_per_file,
                format=args.input_format,
                compresslevel=compresslevel,
                prefix=args.prefix,
                force=args.force)
        except errors.PathExistsError as e:
            log.error('ERROR:\n'
                      'Output files already exist! You can specify a different\n'
                      'prefix or use the `--force` option to overwrite the\n'
                      'existing files. Here is the stack trace:\n\n{0}\n'.format(
                          e))
            sys.exit(1)

    # use SeqIO for non-sequential formats
    else:
        batch_iter = dataio.get_seq_batch_iter_from_files(
            file_objs=args.input_files,
            number_per_batch=args.num_seqs_per_file,
            format=args.input_format,
            data_type=args.data_type)

        for batch_idx, seq_iter in enumerate(batch_iter):
            # Output files are numbered from 1, zero-padded to four digits.
            out_path = '{0}_{1:0>4}{2}'.format(args.prefix, batch_idx + 1,
                                               out_ext)
            if os.path.exists(out_path) and (not args.force):
                log.error('ERROR:\n'
                          'Output files already exist! You can specify a '
                          'different\nprefix or use the `--force` option to '
                          'overwrite the\nexisting files.')
                sys.exit(1)
            out = OpenFile(out_path, mode='w', compresslevel=compresslevel)
            # Close the handle even if writing this batch fails.
            try:
                SeqIO.write(seq_iter,
                            handle=out,
                            format=args.input_format)
            finally:
                out.close()
Example #19
0
def main():
    """Command-line entry point for converting and filtering sequence files.

    Parses the command line (one required input path, one optional output
    path; output defaults to standard output), determines the input and
    output formats from the `--from`/`--to` options or from the file
    objects, applies the requested filters and reverse-complement
    corrections to the sequence iterator, and writes the result with
    `SeqIO.write`.

    Exits with status 1 (after logging an error and printing the help text
    to standard error) when the arguments are invalid, a format cannot be
    determined, or a reverse-complement option is combined with a data type
    other than DNA/RNA.
    """
    description = '{name} {version}'.format(**_program_info)
    usage = ("\n  %prog [options] <SEQ_INPUT_FILE> [<SEQ_OUTPUT_FILE>]")
    parser = OptionParser(usage=usage, description=description,
                          version=_program_info['version'],
                          add_help_option=True)
    format_opts = OptionGroup(parser, 'Format Options',
            'These options designate file formats and data type.')
    format_opts.add_option('-f', '--from', dest='from_format', type='string',
            help=('The format of the input sequence file. Valid options '
                  'include: {0}. By default, the format is guessed based on '
                  'the extension of the input file. However, if provided, '
                  'this option will always take precedence over the file '
                  'extension.'.format(
                        ', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option('-t', '--to', dest='to_format', type='string',
            help=('The desired format of the output sequence file. Valid '
                  'options include: {0}. By default, if an output file path '
                  'is provided, the format is guessed based on the extension '
                  'of this file. However, this option will always take '
                  'precedence over the file extension. Either this option or '
                  'an output file path with an extension is required; if '
                  'neither are provided the program will exit with an '
                  'error.'.format(', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option('-d', '--data-type', dest='data_type', type='string',
            default='dna',
            help=('The type of sequence data. The default is dna. Valid '
                  'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_option_group(format_opts)

    filter_opts = OptionGroup(parser, 'Filter Options',
            'These options allow filtering of data by columns or sequences.')
    filter_opts.add_option('--remove-duplicates',
            dest='remove_duplicates',
            default=False,
            action='store_true',
            help = ('Remove duplicate sequences (i.e., sequences with the same '
                    'ID and sequence). If a duplicate ID is found associated '
                    'with a different sequence, the program will exit with an '
                    'error.'))
    filter_opts.add_option('-x', '--ids-to-exclude',
            dest='ids_to_exclude',
            type='string',
            help=('Comma-delimited list of the ids of sequences to exclude.'))
    filter_opts.add_option('--remove-missing-columns',
            dest='remove_missing_columns',
            default=False,
            action='store_true',
            help=("Remove aligned columns with missing data. Characters to be "
                  "considered missing can be specified with the "
                  "--missing-characters option; the default is '?-'. "
                  "The proportion of rows that must contain these characters "
                  "for a column to be removed can be specified with the "
                  "--missing-column-proportion option; the default is 1.0. "
                  "Note, this option is only relevant to aligned sequences, "
                  "and will result in an error if the input sequences are not "
                  "aligned."))
    filter_opts.add_option('--missing-column-proportion',
            dest='missing_column_proportion',
            type='float',
            default=1.0,
            help=('The proportion of rows that must contain '
                  '--missing-characters for a column to be removed. '
                  'This option is only relevant in combination with the '
                  '--remove-missing-columns option.'))
    filter_opts.add_option('--remove-missing-sequences',
            dest='remove_missing_sequences',
            default=False,
            action = 'store_true',
            help=("Remove sequences with missing data. Characters to be "
                  "considered missing can be specified with the "
                  "--missing-characters option; the default is '?-'. "
                  "The proportion of the sites that must contain these "
                  "characters for a sequence to be removed can be specified "
                  "with the --missing-sequence-proportion option; the default "
                  "is 1.0."))
    filter_opts.add_option('--missing-sequence-proportion',
            dest='missing_sequence_proportion',
            type='float',
            default=1.0,
            help=('The proportion of sites that must contain '
                  '--missing-characters for a sequence to be removed. '
                  'This option is only relevant in combination with the '
                  '--remove-missing-sequences option.'))
    filter_opts.add_option('--missing-characters', dest='missing_characters',
            type='str',
            default='?-',
            help=("Characters to be considered missing and be used in "
                  "evaluating columns/sequences to remove with the "
                  "--remove-missing-columns and --remove-missing-sequences "
                  "options. The default is '?-'."))
    filter_opts.add_option('--remove-constant-columns',
            dest='remove_constant_columns',
            default=False,
            action='store_true',
            help=("Remove aligned columns with no variation."))
    parser.add_option_group(filter_opts)

    rev_comp_opts = OptionGroup(parser, 'Reverse Complement Options',
            'These options are for reverse complementing sequences.')
    rev_comp_opts.add_option('--rev-comp',
            dest='rev_comp',
            default = False,
            action = 'store_true',
            help=("Reverse complement all sequences. This option overrides "
                  "all other reverse-complement options."))
    rev_comp_opts.add_option('--fix-rev-comp-by',
            dest='fix_rev_comp_by',
            type = 'choice',
            choices = ['first', 'read'],
            help=("Try to correct reverse complement errors. "
                  "Options include 'first' and 'read'. If 'first' is "
                  "specified, sequences are returned in their orientation "
                  "that minimizes distance from the first sequence. "
                  "If 'read' is used, sequences are returned in their "
                  "orientation that has the longest read frame "
                  "(see 'Translation Options' for controlling translation "
                  "of reading frames)."))
    parser.add_option_group(rev_comp_opts)

    translation_opts = OptionGroup(parser, 'Translation Options',
            ('These options control translation from nucleotide to amino acid '
             'sequences.'))
    # optparse's 'choice' type tests the raw command-line *string* for
    # membership in `choices`, so the table ids must be stored as strings
    # (integer choices would reject every value the user can type). The
    # selected value is converted back to an int where it is used below.
    table_ids = list(range(1, 7)) + list(range(9, 17)) + list(range(21, 26))
    translation_opts.add_option('--table',
            type = 'choice',
            choices = [str(table_id) for table_id in table_ids],
            default = '1',
            help = ('The translation table to use for any options associated '
                    'with translating nucleotide sequences to amino acids. '
                    'Option should be the integer that corresponds to the '
                    'desired translation table according to NCBI '
                    '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
                    'The default is 1 (the "standard" code).'))
    translation_opts.add_option('--allow-partial',
        default = False,
        action = 'store_true',
        help = ('Allow partial reading frames at the beginning (no start '
                'codon) and end (no stop codon) of sequences.'))
    translation_opts.add_option('--read-after-stop',
        default = False,
        action = 'store_true',
        help = ('A new reading frame begins immediately after a stop codon. '
                'The default is to start reading frame at next start codon '
                'after a stop codon. This option might be useful for exons.'))
    parser.add_option_group(translation_opts)

    distance_opts = OptionGroup(parser, 'Distance Options',
            ('These options control how distances between sequences are '
             'calculated.'))
    distance_opts.add_option('-g', '--count-gaps',
            default = False,
            action = 'store_true',
            help = ('Count gaps when calculating pairwise sequence distances. '
                    'The default is to calculate (number of differences '
                    'ignoring gaps / number of aligned sites ignoring sites '
                    'with gaps) for each pairwise comparison. When this option '
                    'is used, the distance is (number of differences including '
                    'gap differences / total number of aligned sites).'))
    parser.add_option_group(distance_opts)

    messaging_opts = OptionGroup(parser, 'Messaging Options',
            ('These options control verbosity of messaging.'))
    messaging_opts.add_option('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    messaging_opts.add_option('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')
    parser.add_option_group(messaging_opts)

    (options, args) = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # The logging level is communicated through the environment, so it has
    # to be set before the logger is created; --debug wins over --quiet.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if options.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if options.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name = __name__)

    def _exit_with_usage(message):
        """Log `message` as an error, print the help to stderr, and exit 1."""
        log.error(message)
        # print_help() returns None and defaults to stdout; pass the stream
        # explicitly instead of writing its return value.
        parser.print_help(sys.stderr)
        sys.exit(1)

    ##########################################################################
    ## package imports

    from seqsift.seqops import seqmod, seqfilter
    from seqsift.utils import dataio

    ##########################################################################
    ## handle args

    if len(args) == 1:
        in_file_path = args[0]
        out_file_path = sys.stdout
    elif len(args) == 2:
        in_file_path = args[0]
        out_file_path = args[1]
    elif len(args) > 2:
        _exit_with_usage(
                "Too many arguments. Expecting at most 2 arguments:\n"
                "The path to the input file (required), and the path to\n"
                "output file (optional; defaults to standard output).")
    elif len(args) < 1:
        _exit_with_usage(
                "Too few arguments. Expecting at least 1 argument:\n"
                "the path to the input file.")

    # NOTE: this is the live attribute dict of `options`; popping a key also
    # removes the corresponding attribute from `options`.
    opt_dict = options.__dict__

    if options.from_format:
        in_format = opt_dict.pop('from_format')
    else:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file_path)
    if not in_format:
        _exit_with_usage(
                "Could not determine format of input file.\n"
                "You must either provide the format of the input file\n"
                "using the '--from' option or have a recognized\n"
                "file extension on the input file. Here are the supported\n"
                "file extensions:\n{0}".format(str(FILE_FORMATS)))

    if options.to_format:
        out_format = opt_dict.pop('to_format')
    else:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file_path)
    if not out_format:
        _exit_with_usage(
                "Could not determine format of output file.\n"
                "You must either provide the format of the output file\n"
                "using the '--to' option or have a recognized\n"
                "file extension on the output file. Here are the supported\n"
                "file extensions:\n{0}".format(str(FILE_FORMATS)))

    data_type = opt_dict.pop('data_type')

    # NOTE(review): optparse stores an entry for every option dest (default
    # or None), so `opt_dict` is never empty here and this plain-conversion
    # shortcut is currently unreachable. Left as-is because enabling it
    # depends on the behavior of dataio.convert_format -- TODO confirm the
    # intent and replace with an explicit "no filter/modify option was
    # requested" test.
    if len(opt_dict) == 0:
        dataio.convert_format(in_file = in_file_path,
                       out_file = out_file_path,
                       in_format = in_format,
                       out_format = out_format,
                       data_type = data_type)
        sys.exit(0)

    if ((options.rev_comp or options.fix_rev_comp_by) and
            (data_type.lower() not in ['dna', 'rna'])):
        _exit_with_usage(
                "You have selected an option for reverse complementing\n"
                "sequences but the data type is not DNA or RNA.")

    seqs = dataio.get_seq_iter([in_file_path],
            format = in_format,
            data_type = data_type)

    # Each requested filter wraps the sequence iterator in turn.
    if options.ids_to_exclude:
        to_exclude = [n.strip() for n in options.ids_to_exclude.split(',')]
        seqs = seqfilter.id_filter(seqs, to_exclude)

    if options.remove_duplicates:
        seqs = seqfilter.duplicate_id_filter(seqs)

    if options.remove_missing_sequences:
        seqs = seqfilter.row_filter(seqs,
                character_list = list(options.missing_characters),
                max_frequency = options.missing_sequence_proportion)

    if options.remove_missing_columns:
        seqs = seqfilter.column_filter(seqs,
                character_list = list(options.missing_characters),
                max_frequency = options.missing_column_proportion)

    if options.remove_constant_columns:
        seqs = seqfilter.constant_column_filter(seqs)

    # --rev-comp takes precedence over --fix-rev-comp-by.
    if options.rev_comp:
        log.info('Reverse complementing all sequences...')
        seqs = seqmod.reverse_complement(seqs)
    elif options.fix_rev_comp_by == 'first':
        log.info('Reverse complementing to match first sequence...')
        seqs = seqmod.reverse_complement_to_first_seq(seqs,
                per_site = True,
                aligned = False,
                ignore_gaps = (not options.count_gaps),
                alphabet = None,
                aligner_tools = ['muscle', 'mafft'],
                log_frequency = 100)
    elif options.fix_rev_comp_by == 'read':
        log.info('Reverse complementing to longest reading frame...')
        seqs = seqmod.reverse_complement_to_longest_reading_frame(seqs,
                gap_characters=['-'],
                # --table is parsed as a string choice; pass the integer id.
                table = int(options.table),
                allow_partial = options.allow_partial,
                require_start_after_stop = (not options.read_after_stop),
                log_frequency = 100)

    SeqIO.write(seqs,
                handle = out_file_path,
                format = out_format)