Example #1
0
def get_differences(seq1,
                    seq2,
                    aligned=False,
                    ignore_gaps=True,
                    alphabet=None,
                    aligner_tools=None):
    """Find the sites at which two sequences differ.

    Sites where either sequence holds ``alphabet.missing`` are skipped and
    never counted. Sites where either sequence holds ``alphabet.gap`` are
    skipped when `ignore_gaps` is true; otherwise they are counted and
    compared literally (case-insensitively). Every other site is counted, and
    is reported as a difference only when the entries of the two residues in
    ``alphabet.all_residue_codes`` have nothing in common.

    Args:
        seq1: First sequence; must support ``len()`` and iteration over
            single-character residues.
        seq2: Second sequence, same requirements as `seq1`.
        aligned: If false, the pair is first passed through ``align_pair``.
        ignore_gaps: If true, sites with a gap in either sequence are neither
            counted nor reported.
        alphabet: Object providing ``missing``, ``gap`` and
            ``all_residue_codes``. A falsy value selects
            ``alphabets.DnaAlphabet()``.
        aligner_tools: Handed to ``align_pair`` when `aligned` is false.
            ``None`` selects ``['mafft', 'muscle']``.

    Returns:
        A ``(diffs, num_comparisons)`` tuple. ``diffs`` maps the 0-based site
        index to the ``(residue1, residue2)`` pair (upper-cased) found there;
        ``num_comparisons`` is the number of sites that were counted.

    Raises:
        AlignmentError: If the two sequences differ in length (after the
            optional alignment step).
    """
    if not alphabet:
        alphabet = alphabets.DnaAlphabet()
    # A fresh list per call; a list literal in the signature would be one
    # object shared by every call.
    if aligner_tools is None:
        aligner_tools = ['mafft', 'muscle']
    if not aligned:
        seq1, seq2 = align_pair(seq1, seq2, aligner_tools)
    if len(seq1) != len(seq2):
        raise AlignmentError('Sequences are not aligned')
    residue_codes = alphabet.all_residue_codes
    missing = alphabet.missing
    gap = alphabet.gap
    diffs = {}
    num_comparisons = 0
    # Lengths are equal at this point, so zip() cannot drop any site.
    for i, (raw1, raw2) in enumerate(zip(seq1, seq2)):
        if raw1 == missing or raw2 == missing:
            continue
        res1, res2 = raw1.upper(), raw2.upper()
        if raw1 == gap or raw2 == gap:
            if ignore_gaps:
                continue
            # Gaps are compared as plain characters, not via residue codes.
            num_comparisons += 1
            if res1 != res2:
                diffs[i] = (res1, res2)
            continue
        num_comparisons += 1
        if res1 == res2:
            continue
        # Different symbols only count as a difference when their code
        # entries do not overlap.
        if not set(residue_codes[res1]).intersection(residue_codes[res2]):
            diffs[i] = (res1, res2)
    return diffs, num_comparisons
Example #2
0
 def test_dna_alphabet(self):
     # Compare a freshly built DnaAlphabet with the expected values held on
     # this test case, attribute by attribute.
     dna = alphabets.DnaAlphabet()
     for attr in ('states', 'ambiguity_codes', 'residue_ambiguity_codes',
                  'all_residue_codes'):
         self.assertEqual(getattr(self, attr), getattr(dna, attr))
     # Special symbols.
     self.assertEqual(dna.gap, '-')
     self.assertEqual(dna.missing, '?')
     # Symbol returned by get_symbol for each state string.
     expected_symbols = (('AG', 'R'), ('AGCT-', '?'), ('AGCT', 'N'))
     for states, symbol in expected_symbols:
         self.assertEqual(dna.get_symbol(states), symbol)
Example #3
0
class LociFileIter(object):
    """Iterate over the loci stored in a loci-formatted file.

    Each step returns the list of ``SeqRecord`` objects built from the lines
    matching ``seq_pattern`` that precede the next line matching
    ``inter_locus_pattern``. Any line matching neither pattern raises an
    ``Exception``.
    """
    # Number of instances created so far; used to build default names.
    count = 0
    dna_alphabet = alphabets.DnaAlphabet()
    # Every valid symbol in both cases. Sorted so the string (and hence the
    # compiled pattern) does not depend on set ordering.
    dna_symbols = ''.join(sorted(
        set([x.upper() for x in dna_alphabet.get_valid_symbols()] +
            [x.lower() for x in dna_alphabet.get_valid_symbols()])))
    # The symbols go inside a character class, so they are escaped: an
    # unescaped '-' between two other symbols would be read as a range.
    seq_pattern = re.compile(
        r'^>(?P<name>.+)\s+(?P<seq>[{0}]+)$'.format(re.escape(dna_symbols)))
    inter_locus_pattern = re.compile(r'^//.*$')

    def __init__(self, file_obj):
        """Wrap `file_obj`, which is either a path or an iterable of lines.

        A ``str`` is treated as a path: it is opened with
        ``fileio.OpenFile`` and closed again once iteration is exhausted.
        Anything else is iterated as-is and left open.
        """
        self.__class__.count += 1
        self.instance_name = '-'.join(
            [self.__class__.__name__, str(self.count)])
        self.name = getattr(file_obj, 'name', self.instance_name)
        self._close = False
        self._file_obj = file_obj
        if isinstance(file_obj, str):
            self.name = file_obj
            self._file_obj = fileio.OpenFile(file_obj, 'r')
            self._close = True

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next locus as a list of ``SeqRecord`` objects.

        Raises:
            StopIteration: When the underlying file has no further locus
                terminator. A file opened by this object is closed first.
        """
        try:
            # Builtin next() works on generators in both Python 2 and 3;
            # the generator's own ``.next()`` method exists only in Python 2.
            return next(self._next_locus())
        except StopIteration:
            if self._close:
                self._file_obj.close()
            raise

    def _next_locus(self):
        """Generate loci by reading on from the file's current position."""
        seqs = []
        for line in self._file_obj:
            stripped = line.strip()
            seq_match = self.seq_pattern.match(stripped)
            if seq_match:
                s = Seq(seq_match.group('seq'),
                        alphabet=get_state_alphabet('dna', ambiguities=True))
                name = seq_match.group('name').strip()
                seqs.append(SeqRecord(seq=s, id=name, name=name))
            elif self.inter_locus_pattern.match(stripped):
                yield seqs
                # Only reached when this generator is resumed directly;
                # start the following locus with an empty list.
                seqs = []
            else:
                raise Exception('unexpected format of line in loci-formatted '
                                'file {0}:\n{1}\n'.format(self.name, stripped))
Example #4
0
class PyMsBayesComparisons(object):
    """Hold the comparisons for one locus and format them for output.

    The class methods ``process_loci_file`` and
    ``process_loci_files_as_pairs`` read loci files, filter the sequences,
    write one fasta file per retained comparison and finally write
    ``config-sample-table.txt`` and ``pi-estimates.txt``.
    """
    # Number of instances created so far; used for default names and loci.
    count = 0
    alphabet = alphabets.DnaAlphabet()
    sample_table_header = ('# taxon\tlocus\tploidy_multiplier\t'
                           'rate_multiplier\tnsamples1\tnsamples2\tkappa\t'
                           'nsites\ta\tc\tg\tpath\n'
                           'BEGIN SAMPLE_TBL\n')
    sample_table_footer = 'END SAMPLE_TBL\n'
    pi_header = 'taxon\tlocus\tpi1\tpi2\n'

    def __init__(self, comparisons, name=None, locus=None):
        """Store `comparisons`; falsy `name`/`locus` get count-based defaults."""
        self.__class__.count += 1
        if not name:
            name = self.__class__.__name__ + '-' + str(self.count)
        self.name = name
        if not locus:
            locus = 'locus{0}'.format(self.count)
        self.locus = locus
        self.comparisons = comparisons

    def add_sequence(self, seq_record, strict=False):
        """Offer `seq_record` to every comparison.

        Raises:
            Exception: If `strict` is true and every comparison's
                ``add_sequence`` returned ``None`` for the record.
        """
        ret = None
        for comp in self.comparisons:
            r = comp.add_sequence(seq_record)
            if r is not None:
                ret = r
        if strict and (ret is None):
            raise Exception('{0} is not in comparisons'.format(seq_record.id))

    def extend_sequences(self, seq_record_iter, strict=False):
        """Call ``add_sequence`` for each record in `seq_record_iter`."""
        for s in seq_record_iter:
            self.add_sequence(s, strict=strict)

    def _get_smallest_number_of_sequences(self):
        """Return the minimum ``smallest_sample_size``, or None if empty."""
        sizes = [comp.smallest_sample_size for comp in self.comparisons]
        if not sizes:
            return None
        return min(sizes)

    smallest_sample_size = property(_get_smallest_number_of_sequences)

    def _get_shortest_alignment(self):
        """Return the minimum ``alignment_length``, or None if empty."""
        lengths = [comp.alignment_length for comp in self.comparisons]
        if not lengths:
            return None
        return min(lengths)

    shortest_alignment = property(_get_shortest_alignment)

    def write_comparisons(self,
                          fasta_dir,
                          config_dir=None,
                          estimate_hky_parameters=False):
        """Write each comparison's sequences and build its table rows.

        Args:
            fasta_dir: Directory receiving one
                ``<comparison_str>-<locus>.fasta`` file per comparison.
            config_dir: If given, the path column of the sample table is
                written relative to this directory.
            estimate_hky_parameters: If true, ``estimate_hky_parameters()``
                is called on comparisons whose ``estimated_hky_parameters``
                is falsy.

        Returns:
            A ``(sample_table_rows, pi_rows)`` tuple of strings, one
            tab-separated line per comparison in each.
        """
        s = StringIO()
        pi_s = StringIO()
        for comp in self.comparisons:
            if estimate_hky_parameters and (not comp.estimated_hky_parameters):
                comp.estimate_hky_parameters()
            nsamples = comp.number_of_sequences
            taxon = comp.comparison_str
            path = os.path.join(fasta_dir,
                                '{0}-{1}.fasta'.format(taxon, self.locus))
            rel_path = path
            if config_dir:
                rel_path = os.path.relpath(path, config_dir)
            comp.write_sequences(path)
            s.write('{taxon}\t{locus}\t{ploidy_multiplier}\t'
                    '{rate_multiplier}\t{nsamples1}\t{nsamples2}\t{kappa}\t'
                    '{nsites}\t{a}\t{c}\t{g}\t{path}\n'.format(
                        taxon=taxon,
                        locus=self.locus,
                        ploidy_multiplier=comp.ploidy_multiplier,
                        rate_multiplier=comp.rate_multiplier,
                        nsamples1=nsamples[0],
                        nsamples2=nsamples[1],
                        kappa=comp.kappa,
                        nsites=comp.alignment_length,
                        a=comp.a,
                        c=comp.c,
                        g=comp.g,
                        path=rel_path))
            comp.estimate_pi()
            pi_s.write('{taxon}\t{locus}\t{pi1}\t{pi2}\n'.format(
                taxon=taxon, locus=self.locus, pi1=comp.pi[0], pi2=comp.pi[1]))
        return s.getvalue(), pi_s.getvalue()

    @classmethod
    def _ambiguity_characters(cls):
        """Return a new list of the ambiguity codes plus the gap symbol."""
        # list() is required on Python 3, where keys() is a view that cannot
        # be concatenated with a list.
        return list(cls.alphabet.ambiguity_codes.keys()) + [cls.alphabet.gap]

    @classmethod
    def _filter_sequences(cls, locus, max_ambiguities_per_seq):
        """Apply the row filter, then the column filter, to `locus`."""
        # remove rows with many ambiguities
        seqs = seqfilter.row_filter(
            locus,
            character_list=cls._ambiguity_characters(),
            max_frequency=max_ambiguities_per_seq)
        # remove all columns with ambiguities
        return seqfilter.column_filter(
            seqs,
            character_list=cls._ambiguity_characters(),
            max_frequency=0.00001)

    @classmethod
    def _write_output_files(cls, config_out_dir, config_rows, pi_rows):
        """Write the sample table and pi estimates into `config_out_dir`."""
        config_out_path = os.path.join(config_out_dir,
                                       'config-sample-table.txt')
        pi_out_path = os.path.join(config_out_dir, 'pi-estimates.txt')
        with OpenFile(config_out_path, 'w') as out:
            out.write(cls.sample_table_header)
            out.write(config_rows)
            out.write(cls.sample_table_footer)
        with OpenFile(pi_out_path, 'w') as out:
            out.write(cls.pi_header)
            out.write(pi_rows)

    @staticmethod
    def _get_comparison_prefix(loci_file_obj):
        """Return the base name of the file up to its first '.', or None.

        `loci_file_obj` is tried as a path first, then its ``name``
        attribute is tried.
        """
        try:
            return os.path.basename(loci_file_obj).split('.')[0]
        except (AttributeError, TypeError):
            pass
        try:
            return os.path.basename(loci_file_obj.name).split('.')[0]
        except (AttributeError, TypeError):
            return None

    @classmethod
    def process_loci_file(cls,
                          loci_file_obj,
                          pop_id_maps,
                          fasta_out_dir,
                          config_out_dir=None,
                          minimum_sample_size=2,
                          minimum_alignment_length=50,
                          max_ambiguities_per_seq=0.2,
                          require_shared_loci=False,
                          estimate_hky_parameters=False):
        """Build one comparison per population pair for every locus.

        Args:
            loci_file_obj: Passed to ``dataio.LociFileIter``.
            pop_id_maps: Iterable of two-key mappings from population name
                to the sequence ids of that population.
            fasta_out_dir: Directory for the per-comparison fasta files.
            config_out_dir: Directory for the two summary files; defaults
                to `fasta_out_dir`.
            minimum_sample_size: Comparisons whose ``smallest_sample_size``
                is below this are dropped.
            minimum_alignment_length: Comparisons whose
                ``alignment_length`` is below this are dropped.
            max_ambiguities_per_seq: ``max_frequency`` for the row filter.
            require_shared_loci: If true, a locus is skipped unless every
                population pair produced a retained comparison.
            estimate_hky_parameters: Forwarded to ``write_comparisons``.

        Returns:
            The number of loci read from the file (0 if it held none).
        """
        if not config_out_dir:
            config_out_dir = fasta_out_dir
        config_stream = StringIO()
        pi_stream = StringIO()
        # Tracked separately so an empty file yields 0 instead of failing on
        # an unbound loop variable.
        num_loci = 0
        for i, locus in enumerate(dataio.LociFileIter(loci_file_obj)):
            num_loci = i + 1
            comps = []
            for id_map in pop_id_maps:
                assert (len(id_map) == 2)
                pop1, pop2 = sorted(id_map.keys())
                c = Comparison(
                    population_1_ids=id_map[pop1],
                    population_2_ids=id_map[pop2],
                    population_1_name=pop1,
                    population_2_name=pop2,
                )
                c.extend_sequences(
                    cls._filter_sequences(locus, max_ambiguities_per_seq))
                if ((c.alignment_length >= minimum_alignment_length)
                        and (c.smallest_sample_size >= minimum_sample_size)):
                    comps.append(c)
            if not comps:
                continue
            if require_shared_loci and (len(comps) < len(pop_id_maps)):
                continue
            pymsbayes_comps = cls(comparisons=comps,
                                  locus='locus{0}'.format(i))
            config_str, pi_str = pymsbayes_comps.write_comparisons(
                fasta_dir=fasta_out_dir,
                config_dir=config_out_dir,
                estimate_hky_parameters=estimate_hky_parameters)
            config_stream.write(config_str)
            pi_stream.write(pi_str)
        cls._write_output_files(config_out_dir,
                                config_stream.getvalue(),
                                pi_stream.getvalue())
        return num_loci

    @classmethod
    def process_loci_files_as_pairs(cls,
                                    loci_file_objects,
                                    id_component_delimiter,
                                    id_component_index,
                                    fasta_out_dir,
                                    config_out_dir=None,
                                    minimum_sample_size=2,
                                    minimum_alignment_length=50,
                                    max_ambiguities_per_seq=0.2,
                                    estimate_hky_parameters=False):
        """Build one comparison per locus from each of several loci files.

        Args:
            loci_file_objects: Iterable of paths or file objects, each
                passed to ``dataio.LociFileIter``.
            id_component_delimiter: Forwarded to
                ``Comparison.get_comparison_from_seqs``.
            id_component_index: Forwarded to
                ``Comparison.get_comparison_from_seqs``.
            fasta_out_dir: Directory for the per-comparison fasta files.
            config_out_dir: Directory for the two summary files; defaults
                to `fasta_out_dir`.
            minimum_sample_size: Comparisons whose ``smallest_sample_size``
                is below this are dropped.
            minimum_alignment_length: Comparisons whose
                ``alignment_length`` is below this are dropped.
            max_ambiguities_per_seq: ``max_frequency`` for the row filter.
            estimate_hky_parameters: Forwarded to ``write_comparisons``.

        Returns:
            The total number of loci read across all files.
        """
        if not config_out_dir:
            config_out_dir = fasta_out_dir
        config_stream = StringIO()
        pi_stream = StringIO()
        locus_count = 0
        for loci_file_obj in loci_file_objects:
            comparison_prefix = cls._get_comparison_prefix(loci_file_obj)
            for locus in dataio.LociFileIter(loci_file_obj):
                locus_count += 1
                seqs = cls._filter_sequences(locus, max_ambiguities_per_seq)
                c = Comparison.get_comparison_from_seqs(
                    sequences=seqs,
                    id_component_delimiter=id_component_delimiter,
                    id_component_index=id_component_index,
                    comparison_prefix=comparison_prefix,
                )
                if not ((c.alignment_length >= minimum_alignment_length) and
                        (c.smallest_sample_size >= minimum_sample_size)):
                    continue
                pymsbayes_comps = cls(comparisons=[c])
                config_str, pi_str = pymsbayes_comps.write_comparisons(
                    fasta_dir=fasta_out_dir,
                    config_dir=config_out_dir,
                    estimate_hky_parameters=estimate_hky_parameters)
                config_stream.write(config_str)
                pi_stream.write(pi_str)
        cls._write_output_files(config_out_dir,
                                config_stream.getvalue(),
                                pi_stream.getvalue())
        return locus_count
Example #5
0
def _build_arg_parser():
    """Build and return the command-line argument parser for `main_cli`."""
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input_file',
                        metavar='INPUT-SEQ-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Input sequence file to be vetted.'))

    comparison_args = parser.add_argument_group(
        'Comparison Options',
        'Options to control the number and nature of sequence comparisons')
    comparison_args.add_argument(
        '-n',
        '--num-samples',
        type=int,
        default=0,
        help=('The number of randomly sampled sequences to which each '
              'sequence will be compared. If less than 1 (the default is '
              '0), all pairwise comparisons will be performed. For very '
              'large numbers of sequences, performing all pairwise '
              'comparisons will take a long time. This option will speed '
              'things up as long as the number specified is less than '
              'about half of the number of input sequences. If the '
              'number you are considering is close to half of the number '
              'of sequences, you should probably specify zero and do all '
              'combinations. You should not specify a number greater than '
              'half the number of sequences, because it will take longer '
              'and be less thorough than the default.'))
    comparison_args.add_argument(
        '--seed',
        action='store',
        type=int,
        help=('Random number seed to use for the analysis. This option '
              'is only relevant if a number greater than 0 is specified '
              'for the `-n/--num-samples` option.'))
    comparison_args.add_argument(
        '--compare-translated',
        action='store_true',
        help=('Compare amino acid sequences encoded by the longest '
              'reading frame found in each sequence. To use this option, '
              '`data-type` must be dna or rna. See "Translation Options" '
              'for controlling how the longest reading frame of each '
              'sequence is determined and translated.'))
    comparison_args.add_argument('--check-ids',
                                 action='store_true',
                                 help=('Check sequence IDs for duplicates.'))
    comparison_args.add_argument(
        '--summarize-reading-frame-lengths',
        action='store_true',
        help=('Report the length of the longest reading frame of '
              'each sequence. See "Translation Options" for controlling '
              'how reading frames are determined.'))
    comparison_args.add_argument(
        '-g',
        '--count-gaps',
        action='store_true',
        help=('Count gaps when calculating pairwise sequence distances. '
              'The default is to calculate (number of differences '
              'ignoring gaps / number of aligned sites ignoring sites '
              'with gaps) for each pairwise comparison. When this option '
              'is used, the distance is (number of differences including '
              'gap differences / total number of aligned sites).'))

    alignment_args = parser.add_argument_group(
        'Alignment Options',
        ('These options control if/how sequences are to be aligned prior '
         'to calculating distances.'))
    alignment_args.add_argument(
        '-a',
        '--aligned',
        action='store_true',
        help=('Treat input sequences as aligned. I.e., do not perform '
              'pairwise alignment before calculating distances between '
              'sequences (except when calculating distances for reverse '
              'and complemented sequences).'))
    alignment_args.add_argument(
        '--aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for pairwise '
              'alignments of sequences. '
              'The default is to look for muscle and then mafft in PATH, '
              'and if neither are found use the (slow) built-in '
              'function. Even if the `-a`/`--aligned` option is '
              'specified, the aligner will still be used for pairwise '
              'alignments when calculating distances of reverse and '
              'complemented sequences.'))
    alignment_args.add_argument(
        '--msa',
        action='store_true',
        help=('Perform a full multiple sequence alignment prior to '
              'comparing sequences. The default is to align each '
              'pair of sequences being compared. This option is '
              'overruled by the `-a`/`--aligned` option. '
              'If this option is used '
              'the resulting alignment is written to file.'))
    alignment_args.add_argument(
        '--msa-aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for full '
              'multiple sequence alignment. '
              'The default is to look for mafft and then muscle in PATH, '
              'and if neither are found the program will exit with an '
              'error message. If you do not have mafft or muscle '
              'you cannot use this option. '
              'This option is only used if the `-a`/`--aligned` option '
              'is not specified, and the `--msa` option is specified.'))

    translation_args = parser.add_argument_group(
        'Translation Options',
        ('These options control translation from nucleotide to amino acid '
         'sequences.'))
    translation_args.add_argument(
        '--table',
        type=int,
        choices=list(range(1, 7)) + list(range(9, 17)) + list(range(21, 26)),
        default=1,
        help=('The translation table to use for any options associated '
              'with translating nucleotide sequences to amino acids. '
              'Option should be the integer that corresponds to the '
              'desired translation table according to NCBI '
              '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
              'The default is 1 (the "standard" code).'))
    translation_args.add_argument(
        '--allow-partial',
        action='store_true',
        default=False,
        help=('Allow partial reading frames at the beginning (no start '
              'codon) and end (no stop codon) of sequences.'))
    translation_args.add_argument(
        '--read-after-stop',
        action='store_true',
        default=False,
        help=('A new reading frame begins immediately after a stop codon. '
              'The default is to start reading frame at next start codon '
              'after a stop codon. This option might be useful for exons.'))

    data_args = parser.add_argument_group(
        'Data Options', ('Options specifying the input data type and format'))
    data_args.add_argument(
        '-d',
        '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    data_args.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file. Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(', '.join(
                  FILE_FORMATS.supported_formats))))

    output_args = parser.add_argument_group(
        'Output Options', 'Options for controlling output of program')
    output_args.add_argument(
        '-o',
        '--output-dir',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the input file.'))

    messaging_args = parser.add_argument_group(
        'Messaging Options', ('These options control verbosity of messaging.'))
    messaging_args.add_argument(
        '--log-frequency',
        type=argparse_utils.arg_is_nonnegative_int,
        default=1000,
        help=('The frequency at which to log progress. Default is to log '
              'every 1000 sequence comparisons.'))
    messaging_args.add_argument('--quiet',
                                action='store_true',
                                help='Run without verbose messaging.')
    messaging_args.add_argument('--debug',
                                action='store_true',
                                help='Run in debugging mode.')
    return parser


def _exit_with_usage_error(parser, log, message):
    """Log `message` as an error, print the help to stderr and exit(1)."""
    log.error(message)
    # print_help() returns None and writes to the stream it is given, so the
    # stream is passed in rather than writing the return value.
    parser.print_help(sys.stderr)
    sys.exit(1)


def main_cli():
    """Command-line entry point: vet the sequences of one input file.

    Parses ``sys.argv``, optionally summarizes reading-frame lengths,
    translates the sequences and checks for duplicate ids, then computes
    pairwise distances and writes the mean/max distance tables (and any
    potential reverse-complement errors) into the output directory.

    Exits with status 1 when the input format cannot be determined or when a
    translation-related option is combined with a data type other than dna
    or rna.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # --debug wins over --quiet because it is applied last.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    # Imported only after the logging level has been exported; presumably so
    # that loggers created during import pick it up -- keep this ordering.
    from seqsift.utils import GLOBAL_RNG, dataio, functions, alphabets
    from seqsift.seqops import seqsum, seqmod, seqstats
    from seqsift.utils.fileio import OpenFile

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    if args.num_samples > 0:
        # `is None` so that an explicit `--seed 0` is honoured.
        if args.seed is None:
            args.seed = random.randint(1, 999999999)
        GLOBAL_RNG.seed(args.seed)
        log.warning('Seed: {0}'.format(args.seed))

    ## get input file format
    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_file)
    if not args.input_format:
        _exit_with_usage_error(
            parser, log,
            "Could not determine input format.\n"
            "You must either provide the input format\n"
            "using the '--format' option or have a recognizable\n"
            "file extension on the input file name.\n"
            "Here are the supported file extensions:\n{0}".format(
                str(FILE_FORMATS)))

    aligner_tools = ['muscle', 'mafft']
    if args.aligner:
        aligner_tools = [args.aligner]
    full_aligner_tools = ['mafft', 'muscle']
    if args.msa_aligner:
        full_aligner_tools = [args.msa_aligner]

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.input_file)

    full_alignment_out_path = os.path.join(args.output_dir, 'seqvet-msa.txt')
    alphabet = alphabets.DnaAlphabet()
    if args.data_type in ['aa', 'protein']:
        alphabet = alphabets.ProteinAlphabet()

    is_nucleotide = args.data_type in ['dna', 'rna']
    if args.summarize_reading_frame_lengths and not is_nucleotide:
        _exit_with_usage_error(
            parser, log,
            "`--summarize-reading-frame-lengths` is only compatible "
            "with DNA or RNA.")

    if args.compare_translated and not is_nucleotide:
        _exit_with_usage_error(
            parser, log,
            "`--compare-translated` is only compatible with DNA or RNA.")

    ##########################################################################
    ## heavy lifting

    seqs = dataio.get_seq_iter([args.input_file],
                               format=args.input_format,
                               data_type=args.data_type)

    if args.summarize_reading_frame_lengths:
        log.info('Summarizing longest reading frame lengths...')
        # The sequences are read again below, so wrap them in a BufferedIter.
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        lengths = seqsum.summarize_longest_read_lengths(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        length_path = os.path.join(args.output_dir,
                                   'seqvet-reading-frame-lengths.txt')
        log.info('Writing longest reading frame lengths to file...')
        with OpenFile(length_path, 'w') as out:
            out.write('seq_id\tlrf\trev_comp_lrf\n')
            for (lrf, rev_comp_lrf, seq_id) in lengths:
                out.write('{0}\t{1}\t{2}\n'.format(seq_id, lrf,
                                                   rev_comp_lrf))

    if args.compare_translated:
        log.info('Translating longest reading frames for distance '
                 'calculations...')
        seqs = seqmod.translate_longest_reading_frames(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        alphabet = alphabets.ProteinAlphabet()

    if args.check_ids:
        log.info('Checking sequence IDs...')
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        dups = seqstats.get_duplicate_ids(seqs)
        if len(dups) > 0:
            dup_path = functions.get_new_path(
                os.path.join(args.output_dir, 'seqvet-duplicate-ids.txt'))
            log.warning('Duplicate IDs found! Writing them to '
                        '{0}'.format(dup_path))
            with OpenFile(dup_path, 'w') as out:
                for dup in dups:
                    out.write('{0}\n'.format(dup))
        else:
            log.info('No duplicate sequence IDs were found.')

    log.info('Calculating pairwise distances...')
    distances, rev_comp_errors = seqsum.summarize_distances(
        seqs,
        sample_size=args.num_samples,
        per_site=True,
        aligned=args.aligned,
        ignore_gaps=(not args.count_gaps),
        alphabet=alphabet,
        do_full_alignment=args.msa,
        full_alignment_out_path=full_alignment_out_path,
        aligner_tools=aligner_tools,
        full_aligner_tools=full_aligner_tools,
        log_frequency=args.log_frequency)
    log.info('Done!')

    log.info('Writing mean distances to file...')
    distances = sorted(iteritems(distances),
                       key=lambda x: x[1].mean,
                       reverse=True)
    mean_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-mean-distances.txt'))
    with OpenFile(mean_path, 'w') as out:
        out.write('seq_id\tmean_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.mean))

    log.info('Writing max distances to file...')
    distances = sorted(distances, key=lambda x: x[1].maximum, reverse=True)
    max_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-max-distances.txt'))
    with OpenFile(max_path, 'w') as out:
        out.write('seq_id\tmax_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.maximum))

    if rev_comp_errors:
        # Keep only the first entry (in sorted order) for each unordered
        # pair of sequence ids.
        rev_comp_errors = sorted(rev_comp_errors)
        rce_set = set()
        rce = []
        for (s1, s2, d, drc) in rev_comp_errors:
            pair = tuple(sorted([s1, s2]))
            if pair in rce_set:
                continue
            rce_set.add(pair)
            rce.append((pair[0], pair[1], d, drc))
        log.info('Writing potential reverse-complement errors to file...')
        path = functions.get_new_path(
            os.path.join(args.output_dir,
                         'seqvet-reverse-complement-warnings.txt'))
        with OpenFile(path, 'w') as out:
            out.write('seq1\tseq2\tdistance\trev_comp_distance\n')
            for (seq1, seq2, d, drc) in rce:
                out.write('{0}\t{1}\t{2}\t{3}\n'.format(seq1, seq2, d, drc))