Python DnaAlphabet Examples

Programming Language: Python

Namespace/Package Name: seqsift.utils.alphabets

Method/Function: DnaAlphabet

Examples at hotexamples.com: 5

Python DnaAlphabet - 5 examples found. These are the top-rated real-world Python examples of seqsift.utils.alphabets.DnaAlphabet, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

# Immutable default for ``aligner_tools``; copied to a fresh list on each call
# so that no mutable object is shared between calls.
_DEFAULT_ALIGNER_TOOLS = ('mafft', 'muscle')


def get_differences(seq1,
                    seq2,
                    aligned=False,
                    ignore_gaps=True,
                    alphabet=None,
                    aligner_tools=_DEFAULT_ALIGNER_TOOLS):
    """Compare two sequences site by site and collect the sites that differ.

    Args:
        seq1, seq2: The sequences to compare. They are indexed by position
            and each element must support ``.upper()``.
        aligned: If false, the pair is first aligned with ``align_pair``.
        ignore_gaps: If true, sites where either sequence has the alphabet's
            gap symbol are skipped entirely. Otherwise they are counted, and
            recorded as a difference when the two symbols are not equal.
        alphabet: Object providing ``missing``, ``gap`` and
            ``all_residue_codes``. A falsy value selects
            ``alphabets.DnaAlphabet()``.
        aligner_tools: Passed through to ``align_pair`` when ``aligned`` is
            false.

    Returns:
        A tuple ``(diffs, num_comparisons)``. ``diffs`` maps the index of
        each differing site to the pair of upper-cased symbols found there;
        ``num_comparisons`` is the number of sites that were compared.

    Raises:
        AlignmentError: If the (aligned) sequences differ in length.
    """
    if aligner_tools is _DEFAULT_ALIGNER_TOOLS:
        aligner_tools = list(_DEFAULT_ALIGNER_TOOLS)
    if not alphabet:
        alphabet = alphabets.DnaAlphabet()
    if not aligned:
        seq1, seq2 = align_pair(seq1, seq2, aligner_tools)
    if len(seq1) != len(seq2):
        raise AlignmentError('Sequences are not aligned')
    residue_codes = alphabet.all_residue_codes
    missing = alphabet.missing
    gap = alphabet.gap
    diffs = {}
    num_comparisons = 0
    for i in range(len(seq1)):
        raw1 = seq1[i]
        raw2 = seq2[i]
        # Sites with missing data in either sequence are never compared.
        if (raw1 == missing) or (raw2 == missing):
            continue
        res1 = raw1.upper()
        res2 = raw2.upper()
        if (raw1 == gap) or (raw2 == gap):
            if ignore_gaps:
                continue
            num_comparisons += 1
            if res1 != res2:
                diffs[i] = (res1, res2)
            continue
        num_comparisons += 1
        if res1 == res2:
            continue
        # Two different codes only count as a difference when the sets of
        # residues they stand for do not overlap.
        if not set(residue_codes[res1]).intersection(residue_codes[res2]):
            diffs[i] = (res1, res2)
    return diffs, num_comparisons

Example #2

Show file

 def test_dna_alphabet(self):
     """Check DnaAlphabet's code tables, special symbols and get_symbol."""
     dna = alphabets.DnaAlphabet()
     # Each table on the alphabet must equal the attribute of the same name
     # held by this test case.
     table_names = (
         'states',
         'ambiguity_codes',
         'residue_ambiguity_codes',
         'all_residue_codes',
     )
     for table_name in table_names:
         self.assertEqual(getattr(self, table_name),
                          getattr(dna, table_name))
     # Special symbols, checked as (actual, expected) pairs.
     for actual, expected in ((dna.gap, '-'), (dna.missing, '?')):
         self.assertEqual(actual, expected)
     # Residue sets and the single symbol each one should collapse to.
     symbol_cases = (
         ('AG', 'R'),
         ('AGCT-', '?'),
         ('AGCT', 'N'),
     )
     for residues, expected in symbol_cases:
         self.assertEqual(dna.get_symbol(residues), expected)

Example #3

Show file

File: dataio.py Project: joaks1/SeqSift

class LociFileIter(object):
    """Iterate over the loci stored in a loci-formatted sequence file.

    Sequence lines look like ``>name SEQUENCE`` and a line starting with
    ``//`` closes a locus. Each step of the iterator yields the list of
    ``SeqRecord`` objects read since the previous ``//`` line.

    The source may be an open file-like object or a path; a path is opened
    here and closed again once iteration is exhausted.
    """
    # Number of instances created so far; used to build a default name.
    count = 0
    dna_alphabet = alphabets.DnaAlphabet()
    dna_symbols = ''.join(
        set([x.upper() for x in dna_alphabet.get_valid_symbols()] +
            [x.lower() for x in dna_alphabet.get_valid_symbols()]))
    # The symbols are escaped before being placed in the character class:
    # they come out of a set in arbitrary order, so an unescaped '-' could
    # otherwise land between two characters and be read as a range.
    seq_pattern = re.compile(
        r'^>(?P<name>.+)\s+(?P<seq>[{0}]+)$'.format(re.escape(dna_symbols)))
    inter_locus_pattern = re.compile(r'^//.*$')

    def __init__(self, file_obj):
        """Wrap `file_obj`, opening it first when it is a path string."""
        self.__class__.count += 1
        self.instance_name = '-'.join(
            [self.__class__.__name__, str(self.count)])
        self.name = getattr(file_obj, 'name', self.instance_name)
        # Only close the underlying file if it was opened here.
        self._close = False
        self._file_obj = file_obj
        if isinstance(file_obj, str):
            self.name = file_obj
            self._file_obj = fileio.OpenFile(file_obj, 'r')
            self._close = True

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next locus as a list of sequence records.

        Raises:
            StopIteration: When the file has no further complete locus. A
                file opened by this object is closed first.
        """
        try:
            # The builtin next() works on generators under both Python 2
            # and 3; the generator's own .next() method exists only in 2.
            return next(self._next_locus())
        except StopIteration:
            if self._close:
                self._file_obj.close()
            raise

    def _next_locus(self):
        """Generator that reads lines until a locus separator is reached.

        A fresh generator is created for every locus; it resumes from the
        current position of the shared file object.

        NOTE(review): records read after the last ``//`` line are dropped
        when the file ends without a closing separator -- confirm that
        loci files always end with one.
        """
        seqs = []
        for line in self._file_obj:
            l = line.strip()
            m = self.seq_pattern.match(l)
            x = self.inter_locus_pattern.match(l)
            if m:
                s = Seq(m.group('seq'),
                        alphabet=get_state_alphabet('dna', ambiguities=True))
                name = m.group('name').strip()
                seqs.append(SeqRecord(seq=s, id=name, name=name))
            elif x:
                yield seqs
            else:
                raise Exception('unexpected format of line in loci-formatted '
                                'file {0}:\n{1}\n'.format(self.name, l))

Example #4

Show file

class PyMsBayesComparisons(object):
    """A set of population comparisons for a single locus.

    Holds a list of comparison objects and knows how to write their
    sequences to fasta files and to produce the matching rows of a sample
    table and of a table of pi estimates. The two ``process_*`` class
    methods drive this for whole loci files.
    """
    # Number of instances created so far; used for default names.
    count = 0
    alphabet = alphabets.DnaAlphabet()
    sample_table_header = ('# taxon\tlocus\tploidy_multiplier\t'
                           'rate_multiplier\tnsamples1\tnsamples2\tkappa\t'
                           'nsites\ta\tc\tg\tpath\n'
                           'BEGIN SAMPLE_TBL\n')
    sample_table_footer = 'END SAMPLE_TBL\n'
    pi_header = 'taxon\tlocus\tpi1\tpi2\n'

    def __init__(self, comparisons, name=None, locus=None):
        """Store `comparisons`; `name` and `locus` default to numbered labels."""
        self.__class__.count += 1
        if not name:
            name = self.__class__.__name__ + '-' + str(self.count)
        self.name = name
        if not locus:
            locus = 'locus{0}'.format(self.count)
        self.locus = locus
        self.comparisons = comparisons

    def add_sequence(self, seq_record, strict=False):
        """Offer `seq_record` to every comparison.

        Raises:
            Exception: If `strict` is true and every comparison's
                ``add_sequence`` returned None for the record.
        """
        ret = None
        for comp in self.comparisons:
            r = comp.add_sequence(seq_record)
            if r is not None:
                ret = r
        if strict and (ret is None):
            raise Exception('{0} is not in comparisons'.format(seq_record.id))

    def extend_sequences(self, seq_record_iter, strict=False):
        """Call `add_sequence` for every record in `seq_record_iter`."""
        for s in seq_record_iter:
            self.add_sequence(s, strict=strict)

    def _get_smallest_number_of_sequences(self):
        """Return the smallest ``smallest_sample_size`` (None if empty)."""
        smallest = None
        for comp in self.comparisons:
            s = comp.smallest_sample_size
            if (smallest is None) or (s < smallest):
                smallest = s
        return smallest

    smallest_sample_size = property(_get_smallest_number_of_sequences)

    def _get_shortest_alignment(self):
        """Return the smallest ``alignment_length`` (None if empty)."""
        shortest = None
        for comp in self.comparisons:
            s = comp.alignment_length
            if (shortest is None) or (s < shortest):
                shortest = s
        return shortest

    shortest_alignment = property(_get_shortest_alignment)

    def write_comparisons(self,
                          fasta_dir,
                          config_dir=None,
                          estimate_hky_parameters=False):
        """Write each comparison's sequences and build its table rows.

        Args:
            fasta_dir: Directory in which the fasta files are written.
            config_dir: If given, the paths in the sample-table rows are
                made relative to this directory.
            estimate_hky_parameters: If true, ``estimate_hky_parameters()``
                is called on comparisons whose ``estimated_hky_parameters``
                is false.

        Returns:
            A tuple ``(sample_table_rows, pi_rows)`` of strings holding one
            line per comparison.
        """
        s = StringIO()
        pi_s = StringIO()
        for comp in self.comparisons:
            if estimate_hky_parameters and (not comp.estimated_hky_parameters):
                comp.estimate_hky_parameters()
            nsamples = comp.number_of_sequences
            taxon = comp.comparison_str
            path = os.path.join(fasta_dir,
                                '{0}-{1}.fasta'.format(taxon, self.locus))
            rel_path = path
            if config_dir:
                rel_path = os.path.relpath(path, config_dir)
            comp.write_sequences(path)
            s.write('{taxon}\t{locus}\t{ploidy_multiplier}\t'
                    '{rate_multiplier}\t{nsamples1}\t{nsamples2}\t{kappa}\t'
                    '{nsites}\t{a}\t{c}\t{g}\t{path}\n'.format(
                        taxon=taxon,
                        locus=self.locus,
                        ploidy_multiplier=comp.ploidy_multiplier,
                        rate_multiplier=comp.rate_multiplier,
                        nsamples1=nsamples[0],
                        nsamples2=nsamples[1],
                        kappa=comp.kappa,
                        nsites=comp.alignment_length,
                        a=comp.a,
                        c=comp.c,
                        g=comp.g,
                        path=rel_path))
            comp.estimate_pi()
            pi_s.write('{taxon}\t{locus}\t{pi1}\t{pi2}\n'.format(
                taxon=taxon, locus=self.locus, pi1=comp.pi[0], pi2=comp.pi[1]))
        return s.getvalue(), pi_s.getvalue()

    @classmethod
    def _ambiguity_characters(cls):
        """Return a new list of the ambiguity codes plus the gap symbol."""
        # list() is required on Python 3, where keys() returns a view that
        # cannot be concatenated with a list.
        return list(cls.alphabet.ambiguity_codes.keys()) + [cls.alphabet.gap]

    @classmethod
    def _filter_ambiguities(cls, locus, max_ambiguities_per_seq):
        """Apply the row filter, then the column filter, to `locus`."""
        # remove rows with many ambiguities
        seqs = seqfilter.row_filter(
            locus,
            character_list=cls._ambiguity_characters(),
            max_frequency=max_ambiguities_per_seq)
        # remove all columns with ambiguities
        return seqfilter.column_filter(
            seqs,
            character_list=cls._ambiguity_characters(),
            max_frequency=0.00001)

    @classmethod
    def _write_output_tables(cls, config_out_dir, config_stream, pi_stream):
        """Write the sample table and pi table into `config_out_dir`."""
        config_out_path = os.path.join(config_out_dir,
                                       'config-sample-table.txt')
        pi_out_path = os.path.join(config_out_dir, 'pi-estimates.txt')
        with OpenFile(config_out_path, 'w') as out:
            out.write(cls.sample_table_header)
            out.write(config_stream.getvalue())
            out.write(cls.sample_table_footer)
        with OpenFile(pi_out_path, 'w') as out:
            out.write(cls.pi_header)
            out.write(pi_stream.getvalue())

    @staticmethod
    def _get_comparison_prefix(loci_file_obj):
        """Best-effort prefix from a path or from a file object's name.

        Returns the base name up to its first '.', or None when neither
        the object nor its ``name`` attribute can be used as a path.
        """
        try:
            return os.path.basename(loci_file_obj).split('.')[0]
        except Exception:
            pass
        try:
            return os.path.basename(loci_file_obj.name).split('.')[0]
        except Exception:
            return None

    @classmethod
    def process_loci_file(cls,
                          loci_file_obj,
                          pop_id_maps,
                          fasta_out_dir,
                          config_out_dir=None,
                          minimum_sample_size=2,
                          minimum_alignment_length=50,
                          max_ambiguities_per_seq=0.2,
                          require_shared_loci=False,
                          estimate_hky_parameters=False):
        """Build comparisons for every locus of one loci file.

        For each locus and each map in `pop_id_maps` (a mapping with
        exactly two population names as keys), the filtered sequences are
        added to a new ``Comparison``. Comparisons that meet both minimums
        are written out, and the sample table and pi table are written to
        `config_out_dir` (default: `fasta_out_dir`).

        Args:
            require_shared_loci: If true, a locus is skipped unless every
                map in `pop_id_maps` produced a retained comparison.

        Returns:
            The number of loci read from the file (0 for a file without
            loci).
        """
        if not config_out_dir:
            config_out_dir = fasta_out_dir
        config_stream = StringIO()
        pi_stream = StringIO()
        # Tracked separately so the count is defined even with no loci.
        num_loci = 0
        for i, locus in enumerate(dataio.LociFileIter(loci_file_obj)):
            num_loci = i + 1
            comps = []
            for id_map in pop_id_maps:
                assert (len(id_map) == 2)
                pop1, pop2 = sorted(id_map.keys())
                c = Comparison(
                    population_1_ids=id_map[pop1],
                    population_2_ids=id_map[pop2],
                    population_1_name=pop1,
                    population_2_name=pop2,
                )
                seqs = cls._filter_ambiguities(locus, max_ambiguities_per_seq)
                c.extend_sequences(seqs)
                if ((c.alignment_length >= minimum_alignment_length)
                        and (c.smallest_sample_size >= minimum_sample_size)):
                    comps.append(c)
            if not comps:
                continue
            if require_shared_loci and (len(comps) < len(pop_id_maps)):
                continue
            pymsbayes_comps = cls(comparisons=comps,
                                  locus='locus{0}'.format(i))
            config_str, pi_str = pymsbayes_comps.write_comparisons(
                fasta_dir=fasta_out_dir,
                config_dir=config_out_dir,
                estimate_hky_parameters=estimate_hky_parameters)
            config_stream.write(config_str)
            pi_stream.write(pi_str)
        cls._write_output_tables(config_out_dir, config_stream, pi_stream)
        return num_loci

    @classmethod
    def process_loci_files_as_pairs(cls,
                                    loci_file_objects,
                                    id_component_delimiter,
                                    id_component_index,
                                    fasta_out_dir,
                                    config_out_dir=None,
                                    minimum_sample_size=2,
                                    minimum_alignment_length=50,
                                    max_ambiguities_per_seq=0.2,
                                    estimate_hky_parameters=False):
        """Build one comparison per locus from each of several loci files.

        Every locus is filtered and handed to
        ``Comparison.get_comparison_from_seqs`` together with a prefix
        derived from the file name. Comparisons that meet both minimums
        are written out, and the sample table and pi table are written to
        `config_out_dir` (default: `fasta_out_dir`).

        Returns:
            The total number of loci read across all files.
        """
        if not config_out_dir:
            config_out_dir = fasta_out_dir
        config_stream = StringIO()
        pi_stream = StringIO()
        locus_count = 0
        for loci_file_obj in loci_file_objects:
            comparison_prefix = cls._get_comparison_prefix(loci_file_obj)
            for locus in dataio.LociFileIter(loci_file_obj):
                locus_count += 1
                seqs = cls._filter_ambiguities(locus, max_ambiguities_per_seq)
                c = Comparison.get_comparison_from_seqs(
                    sequences=seqs,
                    id_component_delimiter=id_component_delimiter,
                    id_component_index=id_component_index,
                    comparison_prefix=comparison_prefix,
                )
                if not ((c.alignment_length >= minimum_alignment_length) and
                        (c.smallest_sample_size >= minimum_sample_size)):
                    continue
                pymsbayes_comps = cls(comparisons=[c])
                config_str, pi_str = pymsbayes_comps.write_comparisons(
                    fasta_dir=fasta_out_dir,
                    config_dir=config_out_dir,
                    estimate_hky_parameters=estimate_hky_parameters)
                config_stream.write(config_str)
                pi_stream.write(pi_str)
        cls._write_output_tables(config_out_dir, config_stream, pi_stream)
        return locus_count

Example #5

Show file

File: seqvet.py Project: joaks1/SeqSift

def _build_arg_parser():
    """Create the argument parser for the command-line interface."""
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input_file',
                        metavar='INPUT-SEQ-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Input sequence file to be vetted.'))

    comparison_args = parser.add_argument_group(
        'Comparison Options',
        'Options to control the number and nature of sequence comparisons')
    comparison_args.add_argument(
        '-n',
        '--num-samples',
        type=int,
        default=0,
        help=('The number of randomly sampled sequences to which each '
              'sequence will be compared. If less than 1 (the default is '
              '0), all pairwise comparisons will be performed. For very '
              'large numbers of sequences, performing all pairwise '
              'comparisons will take a long time. This option will speed '
              'things up as long as the number specified is less than '
              'about half of the number of input sequences. If the '
              'number you are considering is close to half of the number '
              'of sequences, you should probably specify zero and do all '
              'combinations. You should not specify a number greater than '
              'half the number of sequences, because it will take longer '
              'and be less thorough than the default.'))
    comparison_args.add_argument(
        '--seed',
        action='store',
        type=int,
        help=('Random number seed to use for the analysis. This option '
              'is only relevant if a number greater than 0 is specified '
              'for the `-n/--num-samples` option.'))
    comparison_args.add_argument(
        '--compare-translated',
        action='store_true',
        help=('Compare amino acid sequences encoded by the longest '
              'reading frame found in each sequence. To use this option, '
              '`data-type` must be dna or rna. See "Translation Options" '
              'for controlling how the longest reading frame of each '
              'sequence is determined and translated.'))
    comparison_args.add_argument('--check-ids',
                                 action='store_true',
                                 help=('Check sequence IDs for duplicates.'))
    comparison_args.add_argument(
        '--summarize-reading-frame-lengths',
        action='store_true',
        help=('Report the length of the longest reading frame of '
              'each sequence. See "Translation Options" for controlling '
              'how reading frames are determined.'))
    comparison_args.add_argument(
        '-g',
        '--count-gaps',
        action='store_true',
        help=('Count gaps when calculating pairwise sequence distances. '
              'The default is to calculate (number of differences '
              'ignoring gaps / number of aligned sites ignoring sites '
              'with gaps) for each pairwise comparison. When this option '
              'is used, the distance is (number of differences including '
              'gap differences / total number of aligned sites).'))

    alignment_args = parser.add_argument_group(
        'Alignment Options',
        ('These options control if/how sequences are to be aligned prior '
         'to calculating distances.'))
    alignment_args.add_argument(
        '-a',
        '--aligned',
        action='store_true',
        help=('Treat input sequences as aligned. I.e., do not perform '
              'pairwise alignment before calculating distances between '
              'sequences (except when calculating distances for reverse '
              'and complemented sequences).'))
    alignment_args.add_argument(
        '--aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for pairwise '
              'alignments of sequences. '
              'The default is to look for muscle and then mafft in PATH, '
              'and if neither are found use the (slow) built-in '
              'function. Even if the `-a`/`--aligned` option is '
              'specified, the aligner will still be used for pairwise '
              'alignments when calculating distances of reverse and '
              'complemented sequences.'))
    alignment_args.add_argument(
        '--msa',
        action='store_true',
        help=('Perform a full multiple sequence alignment prior to '
              'comparing sequences. The default is to align each '
              'pair of sequences being compared. This option is '
              'overruled by the `-a`/`--aligned` option. '
              'If this option is used '
              'the resulting alignment is written to file.'))
    alignment_args.add_argument(
        '--msa-aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for full '
              'multiple sequence alignment. '
              'The default is to look for mafft and then muscle in PATH, '
              'and if neither are found the program will exit with an '
              'error message. If you do not have mafft or muscle '
              'you cannot use this option. '
              'This option is only used if the `-a`/`--aligned` option '
              'is not specified, and the `--msa` option is specified.'))

    translation_args = parser.add_argument_group(
        'Translation Options',
        ('These options control translation from nucleotide to amino acid '
         'sequences.'))
    translation_args.add_argument(
        '--table',
        type=int,
        choices=list(range(1, 7)) + list(range(9, 17)) + list(range(21, 26)),
        default=1,
        help=('The translation table to use for any options associated '
              'with translating nucleotide sequences to amino acids. '
              'Option should be the integer that corresponds to the '
              'desired translation table according to NCBI '
              '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
              'The default is 1 (the "standard" code).'))
    translation_args.add_argument(
        '--allow-partial',
        action='store_true',
        default=False,
        help=('Allow partial reading frames at the beginning (no start '
              'codon) and end (no stop codon) of sequences.'))
    translation_args.add_argument(
        '--read-after-stop',
        action='store_true',
        default=False,
        help=('A new reading frame begins immediately after a stop codon. '
              'The default is to start reading frame at next start codon '
              'after a stop codon. This option might be useful for exons.'))

    data_args = parser.add_argument_group(
        'Data Options', ('Options specifying the input data type and format'))
    data_args.add_argument(
        '-d',
        '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    data_args.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file. Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(', '.join(
                  FILE_FORMATS.supported_formats))))

    output_args = parser.add_argument_group(
        'Output Options', 'Options for controlling output of program')
    output_args.add_argument(
        '-o',
        '--output-dir',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the input file.'))

    messaging_args = parser.add_argument_group(
        'Messaging Options', ('These options control verbosity of messaging.'))
    messaging_args.add_argument(
        '--log-frequency',
        type=argparse_utils.arg_is_nonnegative_int,
        default=1000,
        help=('The frequency at which to log progress. Default is to log '
              'every 1000 sequence comparisons.'))
    messaging_args.add_argument('--quiet',
                                action='store_true',
                                help='Run without verbose messaging.')
    messaging_args.add_argument('--debug',
                                action='store_true',
                                help='Run in debugging mode.')
    return parser


def _exit_with_usage(parser):
    """Print the help text to stderr and exit with status 1."""
    # print_help() returns None and writes to the stream it is given, so
    # the stream is passed in rather than writing its return value.
    parser.print_help(sys.stderr)
    sys.exit(1)


def main_cli():
    """Command-line entry point for vetting a sequence file.

    Parses the command line, optionally summarizes reading-frame lengths,
    translates the sequences and checks for duplicate IDs, then computes
    pairwise distances and writes the mean and maximum distance of every
    sequence (and any potential reverse-complement errors) to files in the
    output directory. Exits with status 1 on invalid option combinations
    or when the input format cannot be determined.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # The level is set in the environment before the logger is requested
    # and before the package imports below.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import GLOBAL_RNG, dataio, functions, alphabets
    from seqsift.seqops import seqsum, seqmod, seqstats
    from seqsift.utils.fileio import OpenFile

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    if args.num_samples > 0:
        # Compare against None so that an explicit seed of 0 is honored.
        if args.seed is None:
            args.seed = random.randint(1, 999999999)
        GLOBAL_RNG.seed(args.seed)
        log.warning('Seed: {0}'.format(args.seed))

    ## get input file format
    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_file)
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the input file name.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        _exit_with_usage(parser)

    aligner_tools = ['muscle', 'mafft']
    if args.aligner:
        aligner_tools = [args.aligner]
    full_aligner_tools = ['mafft', 'muscle']
    if args.msa_aligner:
        full_aligner_tools = [args.msa_aligner]

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.input_file)

    full_alignment_out_path = os.path.join(args.output_dir, 'seqvet-msa.txt')
    alphabet = alphabets.DnaAlphabet()
    if args.data_type in ['aa', 'protein']:
        alphabet = alphabets.ProteinAlphabet()

    if (args.summarize_reading_frame_lengths
            and (args.data_type not in ['dna', 'rna'])):
        log.error("`--summarize-reading-frame-lengths` is only compatible "
                  "with DNA or RNA.")
        _exit_with_usage(parser)

    if (args.compare_translated and (args.data_type not in ['dna', 'rna'])):
        log.error("`--compare-translated` is only compatible with DNA or RNA.")
        _exit_with_usage(parser)

    ##########################################################################
    ## heavy lifting

    seqs = dataio.get_seq_iter([args.input_file],
                               format=args.input_format,
                               data_type=args.data_type)

    if args.summarize_reading_frame_lengths:
        log.info('Summarizing longest reading frame lengths...')
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        lengths = seqsum.summarize_longest_read_lengths(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        length_path = os.path.join(args.output_dir,
                                   'seqvet-reading-frame-lengths.txt')
        log.info('Writing longest reading frame lengths to file...')
        with OpenFile(length_path, 'w') as out:
            out.write('seq_id\tlrf\trev_comp_lrf\n')
            for (l, rc_l, seq_id) in lengths:
                out.write('{0}\t{1}\t{2}\n'.format(seq_id, l, rc_l))

    if args.compare_translated:
        log.info('Translating longest reading frames for distance '
                 'calculations...')
        seqs = seqmod.translate_longest_reading_frames(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        # Distances are now computed on the translated sequences.
        alphabet = alphabets.ProteinAlphabet()

    if args.check_ids:
        log.info('Checking sequence IDs...')
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        dups = seqstats.get_duplicate_ids(seqs)
        if len(dups) > 0:
            dup_path = functions.get_new_path(
                os.path.join(args.output_dir, 'seqvet-duplicate-ids.txt'))
            log.warning('Duplicate IDs found! Writing them to '
                        '{0}'.format(dup_path))
            with OpenFile(dup_path, 'w') as out:
                for dup in dups:
                    out.write('{0}\n'.format(dup))
        else:
            log.info('No duplicate sequence IDs were found.')

    log.info('Calculating pairwise distances...')
    distances, rev_comp_errors = seqsum.summarize_distances(
        seqs,
        sample_size=args.num_samples,
        per_site=True,
        aligned=args.aligned,
        ignore_gaps=(not args.count_gaps),
        alphabet=alphabet,
        do_full_alignment=args.msa,
        full_alignment_out_path=full_alignment_out_path,
        aligner_tools=aligner_tools,
        full_aligner_tools=full_aligner_tools,
        log_frequency=args.log_frequency)
    log.info('Done!')

    log.info('Writing mean distances to file...')
    distances = sorted(iteritems(distances),
                       key=lambda x: x[1].mean,
                       reverse=True)
    mean_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-mean-distances.txt'))
    with OpenFile(mean_path, 'w') as out:
        out.write('seq_id\tmean_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.mean))

    log.info('Writing max distances to file...')
    distances = sorted(distances, key=lambda x: x[1].maximum, reverse=True)
    max_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-max-distances.txt'))
    with OpenFile(max_path, 'w') as out:
        out.write('seq_id\tmax_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.maximum))

    if rev_comp_errors:
        rev_comp_errors = sorted(rev_comp_errors)
        # Report each unordered pair of sequences only once.
        rce_set = set()
        rce = []
        for (s1, s2, d, drc) in rev_comp_errors:
            pair = tuple(sorted([s1, s2]))
            if pair in rce_set:
                continue
            rce_set.add(pair)
            rce.append((pair[0], pair[1], d, drc))
        log.info('Writing potential reverse-complement errors to file...')
        path = functions.get_new_path(
            os.path.join(args.output_dir,
                         'seqvet-reverse-complement-warnings.txt'))
        with OpenFile(path, 'w') as out:
            out.write('seq1\tseq2\tdistance\trev_comp_distance\n')
            for (seq1, seq2, d, drc) in rce:
                out.write('{0}\t{1}\t{2}\t{3}\n'.format(seq1, seq2, d, drc))