Exemple #1
0
 def setUp(self):
     """Build the state tuple and the ambiguity-code tables used as fixtures."""
     self.states = tuple(sorted("ACGT-"))
     # Ambiguity symbols whose state sets contain residues only (no gap).
     residue_codes = {
         "R": ("A", "G"),
         "Y": ("C", "T"),
         "K": ("G", "T"),
         "M": ("A", "C"),
         "S": ("C", "G"),
         "W": ("A", "T"),
         "V": ("A", "C", "G"),
         "H": ("A", "C", "T"),
         "D": ("A", "G", "T"),
         "B": ("C", "G", "T"),
         "N": ("A", "C", "G", "T"),
     }
     # The full table is the residue table plus "?", which maps to every
     # state (gap included).
     self.ambiguity_codes = dict(residue_codes)
     self.ambiguity_codes["?"] = self.states
     # Inverse lookup: state tuple -> ambiguity symbol.
     self.reverse_ambiguity_codes = dict(
         (state_tuple, symbol)
         for symbol, state_tuple in iteritems(self.ambiguity_codes))
     self.residue_ambiguity_codes = residue_codes
     # Every residue code: each unambiguous base maps to itself, followed by
     # the residue-only ambiguity symbols.
     self.all_residue_codes = dict((base, (base,)) for base in "ACGT")
     self.all_residue_codes.update(self.residue_ambiguity_codes)
Exemple #2
0
 def test_format_conversion_protein(self):
     """
     Convert each caenophidia data file to every other output format with
     seqaid (data type 'aa') and compare the input and output sequences.

     Conversions to the same extension and to 'genbank' are skipped.
     """
     data_files = ('caenophidia.fasta', 'caenophidia.phylip',
                   'caenophidia.nexus')
     for filename in data_files:
         in_ext = os.path.splitext(filename)[-1]
         # The phylip data file is parsed with the relaxed phylip format;
         # the other format names are the extension without the dot.
         in_format = ('phylip-relaxed' if in_ext == '.phylip'
                      else in_ext.replace('.', ''))
         in_path = package_paths.data_path(filename)
         for out_ext, out_format in iteritems(self.to_formats):
             if out_ext == in_ext or out_format == 'genbank':
                 continue
             out_path = self.getTestFile(filename.replace(in_ext, out_ext))
             _LOG.info('converting {0} to {1}'.format(filename, out_ext))
             self.exe_seqaid([in_path, out_path, '-d', 'aa'])
             original = SeqIO.parse(in_path,
                                    format=in_format,
                                    alphabet=IUPAC.extended_protein)
             converted = SeqIO.parse(out_path,
                                     format=out_format,
                                     alphabet=IUPAC.extended_protein)
             self.assertSameSequenceData(original, converted, aligned=True)
Exemple #3
0
 def setUp(self):
     """
     Locate the seqaid script, create the test directory, write a small
     five-sequence DNA alignment, and set up the format tables.
     """
     self.seqaid = package_paths.scripts_path("seqaid.py")
     self.mkTestDir()
     # Five records with ids '1'..'5'; each gets its own quality list of
     # ones, one entry per site.
     residues_by_record = ('ACGT?', 'ACGT-', 'ACGT?', 'ACGT-', 'ACGT?')
     self.simple_alignment = [
         SeqRecord(Seq(residues, alphabet=IUPAC.ambiguous_dna),
                   id=str(number),
                   letter_annotations={
                       'phred_quality': [1] * len(residues)
                   })
         for number, residues in enumerate(residues_by_record, 1)
     ]
     self.simple_alignment_path = None
     self.write_alignment()
     self.from_formats = copy.deepcopy(FILE_FORMATS)
     # Every format except 'fastq' is used as a conversion target.
     self.to_formats = dict(
         (ext, fmt) for ext, fmt in iteritems(FILE_FORMATS)
         if fmt != 'fastq')
Exemple #4
0
 def test_get_counts(self):
     """Check that get_freqs returns the expected frequencies, summing to 1."""
     sample = [0, 0, 0, 1, 1, 1, 1, 2, 3, 4]
     expected = {0: 0.3, 1: 0.4, 2: 0.1, 3: 0.1, 4: 0.1}
     observed = get_freqs(sample)
     self.assertAlmostEqual(sum(observed.values()), 1.0)
     for value, frequency in iteritems(observed):
         self.assertAlmostEqual(frequency, expected[value])
Exemple #5
0
 def test_get_counts(self):
     """get_freqs must yield frequencies that sum to 1 and match `expected`."""
     values = [0, 0, 0, 1, 1, 1, 1, 2, 3, 4]
     expected = {0: 0.3, 1: 0.4, 2: 0.1, 3: 0.1, 4: 0.1}
     result = get_freqs(values)
     total = sum(result.values())
     self.assertAlmostEqual(total, 1.0)
     for key, freq in iteritems(result):
         self.assertAlmostEqual(freq, expected[key])
Exemple #6
0
def get_freqs(elements):
    """
    Return a dict holding each count from `get_counts(elements)` divided by
    the sum of all counts, keyed the same way as the counts.
    """
    tallies = get_counts(elements)
    denominator = float(sum(itervalues(tallies)))
    return dict((key, tally / denominator)
                for key, tally in iteritems(tallies))
Exemple #7
0
def get_freqs(elements):
    """
    Convert the counts returned by `get_counts(elements)` to proportions.

    Each count is divided by the (float) sum of all counts; the result is a
    new dict with the same keys as the counts.
    """
    counts = get_counts(elements)
    grand_total = float(sum(itervalues(counts)))
    return {item: count / grand_total for item, count in iteritems(counts)}
Exemple #8
0
 def test_format_conversion(self):
     """
     Write the simple alignment in every source format, convert each file
     to every target format with a different extension using seqaid, and
     compare the sequences parsed from the input and output files.
     """
     for src_ext, src_format in iteritems(self.from_formats):
         stream, src_path = self.getTestStream('simple' + src_ext)
         SeqIO.write(self.simple_alignment, stream, format=src_format)
         stream.close()
         for dest_ext, dest_format in iteritems(self.to_formats):
             if dest_ext != src_ext:
                 dest_path = self.getTestFile(
                     src_path.replace(src_ext, dest_ext))
                 self.exe_seqaid([src_path, dest_path])
                 original = SeqIO.parse(src_path,
                                        format=src_format,
                                        alphabet=IUPAC.ambiguous_dna)
                 converted = SeqIO.parse(dest_path,
                                         format=dest_format,
                                         alphabet=IUPAC.ambiguous_dna)
                 self.assertSameSequences(original, converted,
                                          aligned=True)
Exemple #9
0
 def test_limnonectes(self):
     """
     Convert the limnonectes DNA data between every pair of fasta,
     phylip-relaxed and nexus formats (same-format pairs included); each
     call to convert_format must return 80 and preserve the data.
     """
     formats = {
         'fasta': '.fasta',
         'phylip-relaxed': '.phylip',
         'nexus': '.nexus',
     }
     format_pairs = list(iteritems(formats))
     for src_format, src_ext in format_pairs:
         src_path = package_paths.data_path('limnonectes' + src_ext)
         for dest_format, dest_ext in format_pairs:
             dest_path = self.getTestFile('limnonectes' + dest_ext)
             count = convert_format(in_file=src_path,
                                    in_format=src_format,
                                    out_file=dest_path,
                                    out_format=dest_format,
                                    data_type='dna')
             self.assertEqual(count, 80)
             original = SeqIO.parse(src_path,
                                    format=src_format,
                                    alphabet=IUPAC.ambiguous_dna)
             converted = SeqIO.parse(dest_path,
                                     format=dest_format,
                                     alphabet=IUPAC.ambiguous_dna)
             self.assertSameData(original, converted)
Exemple #10
0
 def test_caenophidia(self):
     """
     Convert the caenophidia protein data between every pair of fasta,
     phylip-relaxed and nexus formats (same-format pairs included); each
     call to convert_format must return 114 and preserve the data.
     """
     formats = {
         'fasta': '.fasta',
         'phylip-relaxed': '.phylip',
         'nexus': '.nexus',
     }
     format_pairs = list(iteritems(formats))
     for src_format, src_ext in format_pairs:
         src_path = package_paths.data_path('caenophidia' + src_ext)
         for dest_format, dest_ext in format_pairs:
             dest_path = self.getTestFile('caenophidia' + dest_ext)
             count = convert_format(in_file=src_path,
                                    in_format=src_format,
                                    out_file=dest_path,
                                    out_format=dest_format,
                                    data_type='protein')
             self.assertEqual(count, 114)
             original = SeqIO.parse(src_path,
                                    format=src_format,
                                    alphabet=IUPAC.extended_protein)
             converted = SeqIO.parse(dest_path,
                                     format=dest_format,
                                     alphabet=IUPAC.extended_protein)
             self.assertSameData(original, converted)
Exemple #11
0
 def setUp(self):
     """Create the expected states and ambiguity-code tables for the tests."""
     self.states = tuple(sorted('ACGT-'))
     # Residue-only ambiguity symbols (none of these state sets has a gap).
     self.residue_ambiguity_codes = {
         'R': ('A', 'G'),
         'Y': ('C', 'T'),
         'K': ('G', 'T'),
         'M': ('A', 'C'),
         'S': ('C', 'G'),
         'W': ('A', 'T'),
         'V': ('A', 'C', 'G'),
         'H': ('A', 'C', 'T'),
         'D': ('A', 'G', 'T'),
         'B': ('C', 'G', 'T'),
         'N': ('A', 'C', 'G', 'T'),
     }
     # All ambiguity symbols: the residue-only ones plus '?', which maps to
     # the complete state tuple (gap included).
     self.ambiguity_codes = dict(self.residue_ambiguity_codes)
     self.ambiguity_codes['?'] = self.states
     # State tuple -> symbol, the inverse of `ambiguity_codes`.
     self.reverse_ambiguity_codes = {
         state_tuple: symbol
         for symbol, state_tuple in iteritems(self.ambiguity_codes)
     }
     # Unambiguous bases map to themselves; ambiguity symbols are added on
     # top of them.
     self.all_residue_codes = {base: (base,) for base in 'ACGT'}
     for symbol, state_tuple in iteritems(self.residue_ambiguity_codes):
         self.all_residue_codes[symbol] = state_tuple
Exemple #12
0
 def test_limnonectes(self):
     """
     For every (input format, output format) pair among fasta,
     phylip-relaxed and nexus, convert the limnonectes DNA file, check that
     convert_format returns 80, and check the parsed data are unchanged.
     """
     name = 'limnonectes'
     formats = {
         'fasta': '.fasta',
         'phylip-relaxed': '.phylip',
         'nexus': '.nexus'
     }
     for in_format, in_ext in iteritems(formats):
         source = package_paths.data_path(name + in_ext)
         for out_format, out_ext in iteritems(formats):
             target = self.getTestFile(name + out_ext)
             conversion = dict(in_file=source,
                               in_format=in_format,
                               out_file=target,
                               out_format=out_format,
                               data_type='dna')
             self.assertEqual(convert_format(**conversion), 80)
             before = SeqIO.parse(source,
                                  format=in_format,
                                  alphabet=IUPAC.ambiguous_dna)
             after = SeqIO.parse(target,
                                 format=out_format,
                                 alphabet=IUPAC.ambiguous_dna)
             self.assertSameData(before, after)
Exemple #13
0
 def test_caenophidia(self):
     """
     For every (input format, output format) pair among fasta,
     phylip-relaxed and nexus, convert the caenophidia protein file, check
     that convert_format returns 114, and check the parsed data are
     unchanged.
     """
     name = 'caenophidia'
     formats = {
         'fasta': '.fasta',
         'phylip-relaxed': '.phylip',
         'nexus': '.nexus'
     }
     for in_format, in_ext in iteritems(formats):
         source = package_paths.data_path(name + in_ext)
         for out_format, out_ext in iteritems(formats):
             target = self.getTestFile(name + out_ext)
             conversion = dict(in_file=source,
                               in_format=in_format,
                               out_file=target,
                               out_format=out_format,
                               data_type='protein')
             self.assertEqual(convert_format(**conversion), 114)
             before = SeqIO.parse(source,
                                  format=in_format,
                                  alphabet=IUPAC.extended_protein)
             after = SeqIO.parse(target,
                                 format=out_format,
                                 alphabet=IUPAC.extended_protein)
             self.assertSameData(before, after)
Exemple #14
0
 def _set_ambiguity_codes(self, symbol_to_states_dict):
     """
     Rebuild the ambiguity-code lookup tables from `symbol_to_states_dict`.

     `symbol_to_states_dict` maps each ambiguity symbol to an iterable of
     states, every one of which must be in `self.states`.

     For each symbol the states are passed through
     `self.standardize_states` and the resulting tuple is stored in
     `self._ambiguity_codes` (symbol -> tuple) and
     `self._reverse_ambiguity_codes` (tuple -> symbol). The symbol whose
     tuple equals `self.states` becomes `self._missing`. Symbols whose
     tuple does not contain the gap (or all symbols, when `self.gap` is
     falsy) are also recorded in `self._residue_ambiguity_codes`.

     Raises ValueError if a symbol maps to a state not in `self.states`.
     """
     self._ambiguity_codes = {}
     self._reverse_ambiguity_codes = {}
     # NOTE(review): `_residue_ambiguity_codes` is not reset here, unlike
     # the two tables above; presumably it is initialized elsewhere --
     # confirm.
     for ambig, states in iteritems(symbol_to_states_dict):
         for s in states:
             if s not in self.states:
                 # Field 1 is the offending state; field 0 is the symbol.
                 raise ValueError('Ambiguity {0!r} maps to an invalid '
                         'state {1!r}'.format(ambig, s))
         tup = self.standardize_states(states)
         self._ambiguity_codes[ambig] = tup
         self._reverse_ambiguity_codes[tup] = ambig
         if tup == self.states:
             self._missing = ambig
         if not self.gap or (self.gap not in tup):
             self._residue_ambiguity_codes[ambig] = tup
Exemple #15
0
 def _set_ambiguity_codes(self, symbol_to_states_dict):
     """
     Rebuild the ambiguity-code lookup tables from `symbol_to_states_dict`.

     `symbol_to_states_dict` maps each ambiguity symbol to an iterable of
     states, every one of which must be in `self.states`.

     For each symbol the states are passed through
     `self.standardize_states` and the resulting tuple is stored in
     `self._ambiguity_codes` (symbol -> tuple) and
     `self._reverse_ambiguity_codes` (tuple -> symbol). The symbol whose
     tuple equals `self.states` becomes `self._missing`. Symbols whose
     tuple does not contain the gap (or all symbols, when `self.gap` is
     falsy) are also recorded in `self._residue_ambiguity_codes`.

     Raises ValueError if a symbol maps to a state not in `self.states`.
     """
     self._ambiguity_codes = {}
     self._reverse_ambiguity_codes = {}
     # NOTE(review): `_residue_ambiguity_codes` is not reset here, unlike
     # the two tables above; presumably it is initialized elsewhere --
     # confirm.
     for ambig, states in iteritems(symbol_to_states_dict):
         for s in states:
             if s not in self.states:
                 # Field 1 is the offending state; field 0 is the symbol.
                 raise ValueError('Ambiguity {0!r} maps to an invalid '
                                  'state {1!r}'.format(ambig, s))
         tup = self.standardize_states(states)
         self._ambiguity_codes[ambig] = tup
         self._reverse_ambiguity_codes[tup] = ambig
         if tup == self.states:
             self._missing = ambig
         if not self.gap or (self.gap not in tup):
             self._residue_ambiguity_codes[ambig] = tup
Exemple #16
0
def mode_list(samples, bin_width = 'auto', zero_value = 'boundary'):
    """
    Return a list of modes, or mode bins, from a list of values.

    Arguments include:

        `samples` is an iterable set of values, which can be integers, strings
        or floats. One-shot iterables (e.g., generators) are accepted; the
        values are copied into a list before they are processed.

        `bin_width` controls the behavior of the mode estimation, with the
        following options:

            `bin_width = 'a'|'auto'` - The default. The function automatically
            determines whether to treat the values as discrete or continuous by
            checking for floating point numbers in the sample. If there are no
            floats the samples are treated as discrete and a list of the most
            common values is returned. If there are floats, a bin width is
            determined by calling `get_bin_width(samples, algorithm='custom')`.
            The values are then binned using this bin width and the
            `zero_value` argument, and a list of tuples is returned, each tuple
            represents the lower and upper bounds of the most common bins.

            `bin_width = None|0` - The samples are treated as
            discrete and a list of the most common values is returned.

            `bin_width = <NUMBER OTHER THAN ZERO>` - The samples are treated as
            floats and are binned into categories of width `bin_width` to
            determine the mode.

            `bin_width =
                'c'|'custom'
                'f'|'fd'|'freedman-diaconis'|
                's'|'sturges'|
                'r'|'rice'|
                'd'|'doane'`
            The 'best' bin width is determined using the specified algorithm
            (see `get_bin_width` function for details regarding the algorithm
            options).

        `zero_value` zero always corresponds to a bin, and this option controls
        whether zero is at the center of a bin or at the edge. Options include:

            `zero_value = 'b'|'boundary'` - zero is a boundary between bins.
            In most cases choosing between 'b' and 'c' will be arbitrary, but
            if the samples are bounded by zero (i.e., the parameter is either
            strictly positive or negative), zero should be set as a boundary.

            `zero_value = 'c'|'center'` - zero is at the center of a bin.  If
            the samples are bounded by zero, use 'boundary' instead.  However,
            this option can be useful if the samples span zero and are
            suspected to be centered at zero.

    The function returns:

        If `samples` holds a single value, a one-element list containing that
        value is returned, regardless of `bin_width`.

        If values are treated as discrete (i.e., `bin_width = None`), a list
        containing the most common values is returned. The list will contain
        multiple values if there is a tie.

        If values are treated as floats (i.e. `bin_width != None`), a list of
        tuples containing the lower and upper boundaries of the most common
        bins is returned. The list will contain multiple tuples each
        representing a most common bin, if there is a tie.

    The function raises:

        `ValueError` if `samples` is empty, or if the values are binned and
        `zero_value` is not one of the supported options.

    Some examples:
        >>> from pymsbayes.utils.stats import mode_list
        >>> x = list(range(10)) + [2]
        >>> mode_list(x)  # treat values as discrete by default
        [2]
        >>> x += [6]
        >>> mode_list(x)  # a tie!
        [2, 6]
        >>> x = ['a', 'b', 'b', 'c', 'c', 'b']
        >>> # strings work too when treated as discrete
        >>> mode_list(x)
        ['b']
        >>> import random
        >>> x = [random.Random().expovariate(1) for i in range(10000)]
        >>> # specify bin width for continuous values
        >>> mode_list(x, bin_width='auto')
        [(0.0, 0.10405355148832289)]
        >>> x = [random.Random().normalvariate(1, 1) for i in range(10000)]
        >>> mode_list(x, bin_width='auto')
        [(0.8910191831744725, 1.0183076379136828)]
        >>> x = [random.Random().normalvariate(0, 1) for i in range(10000)]
        >>> # zero is a bin boundary by default
        >>> mode_list(x, bin_width='auto')
        [(-0.1263661814981197, 0.0)]
        >>> # specify zero_value as bin center to get mode that spans zero
        >>> mode_list(x, bin_width='auto', zero_value='center')
        [(-0.06318309074905985, 0.06318309074905985)]

    The beginnings of this function were based on the mode function in DendroPy
    (Copyright Jeet Sukumaran and Mark T. Holder; licensed under BSD License;
    http://pythonhosted.org/DendroPy/):

    Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
    for phylogenetic computing. Bioinformatics 26: 1569-1571.
    """
    # Materialize once so that one-shot iterables are not exhausted by the
    # emptiness/length checks before the values are counted.
    samples = list(samples)
    if not samples:
        raise ValueError('empty samples')
    if len(samples) == 1:
        return samples
    zero_value = zero_value.strip().lower()
    discrete = False
    if not bin_width:
        discrete = True
    elif hasattr(bin_width, 'lower'):
        # A string selects automatic detection or a bin-width algorithm.
        bin_width = bin_width.strip().lower()
        if bin_width in ('a', 'auto'):
            discrete = not has_floats(samples)
            bin_width = None if discrete else 'c'
        if not discrete:
            bin_width = get_bin_width(samples, bin_width)
            if bin_width == 0.0:
                # The algorithm gave a zero width: fall back to a tenth of
                # the sample range, or to a small constant if the range is
                # zero too.
                bin_width = (max(samples) - min(samples)) / float(10)
                if bin_width == 0.0:
                    bin_width = 0.001
    counts = {}
    if discrete:
        for s in samples:
            counts[s] = counts.get(s, 0) + 1
        max_count = max(counts.values())
        return [val for val, cnt in counts.items() if cnt == max_count]
    bw = float(bin_width)
    # `shift` moves the bin edges by half a bin when zero is a bin center;
    # `bounds` holds the distances from `index * bw` down to the lower edge
    # and up to the upper edge of a bin.
    if zero_value in ('b', 'boundary'):
        shift = 0.0
        bounds = (0.0, bw)
    elif zero_value in ('c', 'center'):
        shift = 0.5
        bounds = ((bw / 2), (bw / 2))
    else:
        raise ValueError('unsupported `zero_value` argument: '
                '{0!r}'.format(zero_value))
    for s in samples:
        index = int(math.floor((s / bw) + shift))
        counts[index] = counts.get(index, 0) + 1
    max_count = max(counts.values())
    return [((index * bw) - bounds[0], (index * bw) + bounds[1])
            for index, cnt in counts.items() if cnt == max_count]
Exemple #17
0
def main_cli():
    """
    Command-line entry point: vet the sequences of one input file.

    Parses the command-line arguments, configures logging, reads the input
    sequences, and then, depending on the options given:

      - writes the longest reading frame length of each sequence,
      - translates the longest reading frames before comparing sequences,
      - writes duplicated sequence IDs,

    and always calculates pairwise distances and writes the mean and
    maximum distance of each sequence, plus any potential
    reverse-complement errors that were reported, to files in the output
    directory.

    Exits with status 1 (after logging an error and printing the help
    message to stderr) if the input format cannot be determined or if a
    translation option is combined with a non-nucleotide data type.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input_file',
                        metavar='INPUT-SEQ-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Input sequence file to be vetted.'))

    comparison_args = parser.add_argument_group(
        'Comparison Options',
        'Options to control the number and nature of sequence comparisons')
    comparison_args.add_argument(
        '-n',
        '--num-samples',
        type=int,
        default=0,
        help=('The number of randomly sampled sequences to which each '
              'sequence will be compared. If less than 1 (the default is '
              '0), all pairwise comparisons will be performed. For very '
              'large numbers of sequences, performing all pairwise '
              'comparisons will take a long time. This option will speed '
              'things up as long as the number specified is less than '
              'about half of the number of input sequences. If the '
              'number you are considering is close to half of the number '
              'of sequences, you should probably specify zero and do all '
              'combinations. You should not specify a number greater than '
              'half the number of sequences, because it will take longer '
              'and be less thorough than the default.'))
    comparison_args.add_argument(
        '--seed',
        action='store',
        type=int,
        help=('Random number seed to use for the analysis. This option '
              'is only relevant if a number greater than 0 is specified '
              'for the `-n/--num-samples` option.'))
    comparison_args.add_argument(
        '--compare-translated',
        action='store_true',
        help=('Compare amino acid sequences encoded by the longest '
              'reading frame found in each sequence. To use this option, '
              '`data-type` must be dna or rna. See "Translation Options" '
              'for controlling how the longest reading frame of each '
              'sequence is determined and translated.'))
    comparison_args.add_argument('--check-ids',
                                 action='store_true',
                                 help=('Check sequence IDs for duplicates.'))
    comparison_args.add_argument(
        '--summarize-reading-frame-lengths',
        action='store_true',
        help=('Report the length of the longest reading frame of '
              'each sequence. See "Translation Options" for controlling '
              'how reading frames are determined.'))
    comparison_args.add_argument(
        '-g',
        '--count-gaps',
        action='store_true',
        help=('Count gaps when calculating pairwise sequence distances. '
              'The default is to calculate (number of differences '
              'ignoring gaps / number of aligned sites ignoring sites '
              'with gaps) for each pairwise comparison. When this option '
              'is used, the distance is (number of differences including '
              'gap differences / total number of aligned sites).'))

    alignment_args = parser.add_argument_group(
        'Alignment Options',
        ('These options control if/how sequences are to be aligned prior '
         'to calculating distances.'))
    alignment_args.add_argument(
        '-a',
        '--aligned',
        action='store_true',
        help=('Treat input sequences as aligned. I.e., do not perform '
              'pairwise alignment before calculating distances between '
              'sequences (except when calculating distances for reverse '
              'and complemented sequences).'))
    alignment_args.add_argument(
        '--aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for pairwise '
              'alignments of sequences. '
              'The default is to look for muscle and then mafft in PATH, '
              'and if neither are found use the (slow) built-in '
              'function. Even if the `-a`/`--aligned` option is '
              'specified, the aligner will still be used for pairwise '
              'alignments when calculating distances of reverse and '
              'complemented sequences.'))
    alignment_args.add_argument(
        '--msa',
        action='store_true',
        help=('Perform a full multiple sequence alignment prior to '
              'comparing sequences. The default is to align each '
              'pair of sequences being compared. This option is '
              'overruled by the `-a`/`--aligned` option. '
              'If this option is used '
              'the resulting alignment is written to file.'))
    alignment_args.add_argument(
        '--msa-aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for full '
              'multiple sequence alignment. '
              'The default is to look for mafft and then muscle in PATH, '
              'and if neither are found the program will exit with an '
              'error message. If you do not have mafft or muscle '
              'you cannot use this option. '
              'This option is only used if the `-a`/`--aligned` option '
              'is not specified, and the `--msa` option is specified.'))

    translation_args = parser.add_argument_group(
        'Translation Options',
        ('These options control translation from nucleotide to amino acid '
         'sequences.'))
    translation_args.add_argument(
        '--table',
        type=int,
        choices=list(range(1, 7)) + list(range(9, 17)) + list(range(21, 26)),
        default=1,
        help=('The translation table to use for any options associated '
              'with translating nucleotide sequences to amino acids. '
              'Option should be the integer that corresponds to the '
              'desired translation table according to NCBI '
              '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
              'The default is 1 (the "standard" code).'))
    translation_args.add_argument(
        '--allow-partial',
        action='store_true',
        default=False,
        help=('Allow partial reading frames at the beginning (no start '
              'codon) and end (no stop codon) of sequences.'))
    translation_args.add_argument(
        '--read-after-stop',
        action='store_true',
        default=False,
        help=('A new reading frame begins immediately after a stop codon. '
              'The default is to start reading frame at next start codon '
              'after a stop codon. This option might be useful for exons.'))

    data_args = parser.add_argument_group(
        'Data Options', ('Options specifying the input data type and format'))
    data_args.add_argument(
        '-d',
        '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    data_args.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file. Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(', '.join(
                  FILE_FORMATS.supported_formats))))

    output_args = parser.add_argument_group(
        'Output Options', 'Options for controlling output of program')
    output_args.add_argument(
        '-o',
        '--output-dir',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the input file.'))

    messaging_args = parser.add_argument_group(
        'Messaging Options', ('These options control verbosity of messaging.'))
    messaging_args.add_argument(
        '--log-frequency',
        type=argparse_utils.arg_is_nonnegative_int,
        default=1000,
        help=('The frequency at which to log progress. Default is to log '
              'every 1000 sequence comparisons.'))
    messaging_args.add_argument('--quiet',
                                action='store_true',
                                help='Run without verbose messaging.')
    messaging_args.add_argument('--debug',
                                action='store_true',
                                help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # `--debug` is checked last, so it wins over `--quiet` when both are set.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    # NOTE(review): these imports presumably come after the logging level is
    # set so that the package modules pick it up -- confirm.
    from seqsift.utils import GLOBAL_RNG, dataio, functions, alphabets
    from seqsift.seqops import seqsum, seqmod, seqstats
    from seqsift.utils.fileio import OpenFile

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    if args.num_samples > 0:
        # Only draw a seed when none was given; an explicit `--seed 0` is
        # a valid seed and must be honored.
        if args.seed is None:
            args.seed = random.randint(1, 999999999)
        GLOBAL_RNG.seed(args.seed)
        log.warning('Seed: {0}'.format(args.seed))

    ## get input file format
    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_file)
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the input file name.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        parser.print_help(sys.stderr)
        sys.exit(1)

    aligner_tools = ['muscle', 'mafft']
    if args.aligner:
        aligner_tools = [args.aligner]
    full_aligner_tools = ['mafft', 'muscle']
    if args.msa_aligner:
        full_aligner_tools = [args.msa_aligner]

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.input_file)

    full_alignment_out_path = os.path.join(args.output_dir, 'seqvet-msa.txt')
    alphabet = alphabets.DnaAlphabet()
    if args.data_type in ['aa', 'protein']:
        alphabet = alphabets.ProteinAlphabet()

    if (args.summarize_reading_frame_lengths
            and (args.data_type not in ['dna', 'rna'])):
        log.error("`--summarize-reading-frame-lengths` is only compatible "
                  "with DNA or RNA.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    if (args.compare_translated and (args.data_type not in ['dna', 'rna'])):
        log.error("`--compare-translated` is only compatible with DNA or "
                  "RNA.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    ##########################################################################
    ## heavy lifting

    seqs = dataio.get_seq_iter([args.input_file],
                               format=args.input_format,
                               data_type=args.data_type)

    if args.summarize_reading_frame_lengths:
        log.info('Summarizing longest reading frame lengths...')
        # NOTE(review): the sequences are wrapped in a BufferedIter before
        # this pass, presumably so they can be iterated again below --
        # confirm.
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        lengths = seqsum.summarize_longest_read_lengths(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        length_path = os.path.join(args.output_dir,
                                   'seqvet-reading-frame-lengths.txt')
        log.info('Writing longest reading frame lengths to file...')
        with OpenFile(length_path, 'w') as out:
            out.write('seq_id\tlrf\trev_comp_lrf\n')
            for (l, rc_l, seq_id) in lengths:
                out.write('{0}\t{1}\t{2}\n'.format(seq_id, l, rc_l))

    if args.compare_translated:
        log.info('Translating longest reading frames for distance '
                 'calculations...')
        seqs = seqmod.translate_longest_reading_frames(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        # Distances are now calculated on the translated sequences.
        alphabet = alphabets.ProteinAlphabet()

    if args.check_ids:
        log.info('Checking sequence IDs...')
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        dups = seqstats.get_duplicate_ids(seqs)
        if len(dups) > 0:
            dup_path = functions.get_new_path(
                os.path.join(args.output_dir, 'seqvet-duplicate-ids.txt'))
            log.warning('Duplicate IDs found! Writing them to '
                        '{0}'.format(dup_path))
            with OpenFile(dup_path, 'w') as out:
                for dup in dups:
                    out.write('{0}\n'.format(dup))
        else:
            log.info('No duplicate sequence IDs were found.')

    log.info('Calculating pairwise distances...')
    distances, rev_comp_errors = seqsum.summarize_distances(
        seqs,
        sample_size=args.num_samples,
        per_site=True,
        aligned=args.aligned,
        ignore_gaps=(not args.count_gaps),
        alphabet=alphabet,
        do_full_alignment=args.msa,
        full_alignment_out_path=full_alignment_out_path,
        aligner_tools=aligner_tools,
        full_aligner_tools=full_aligner_tools,
        log_frequency=args.log_frequency)
    log.info('Done!')

    log.info('Writing mean distances to file...')
    # Largest mean distance first.
    distances = sorted([(k, v) for k, v in iteritems(distances)],
                       key=lambda x: x[1].mean,
                       reverse=True)
    mean_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-mean-distances.txt'))
    with OpenFile(mean_path, 'w') as out:
        out.write('seq_id\tmean_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.mean))

    log.info('Writing max distances to file...')
    # Largest maximum distance first.
    distances = sorted(distances, key=lambda x: x[1].maximum, reverse=True)
    max_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-max-distances.txt'))
    with OpenFile(max_path, 'w') as out:
        out.write('seq_id\tmax_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.maximum))

    if rev_comp_errors:
        rev_comp_errors = sorted(rev_comp_errors)
        # Report each unordered pair of sequences only once.
        rce_set = set()
        rce = []
        for (s1, s2, d, drc) in rev_comp_errors:
            pair = tuple(sorted([s1, s2]))
            if pair in rce_set:
                continue
            rce_set.add(pair)
            rce.append((pair[0], pair[1], d, drc))
        log.info('Writing potential reverse-complement errors to file...')
        path = functions.get_new_path(
            os.path.join(args.output_dir,
                         'seqvet-reverse-complement-warnings.txt'))
        with OpenFile(path, 'w') as out:
            out.write('seq1\tseq2\tdistance\trev_comp_distance\n')
            for (seq1, seq2, d, drc) in rce:
                out.write('{0}\t{1}\t{2}\t{3}\n'.format(seq1, seq2, d, drc))
Exemple #18
0
def mode_list(samples, bin_width='auto', zero_value='boundary'):
    """
    Return a list of modes, or mode bins, from a list of values.

    Arguments include:

        `samples` is an iterable set of values, which can be integers, strings
        or floats. Any iterable is accepted, including one-shot iterators and
        generators; it is copied into a list before processing.

        `bin_width` controls the behavior of the mode estimation, with the
        following options:

            `bin_width = 'a'|'auto'` - The default. The function automatically
            determines whether to treat the values as discrete or continuous by
            checking for floating point numbers in the sample. If there are no
            floats the samples are treated as discrete and a list of the most
            common values is returned. If there are floats, a bin width is
            determined by calling `get_bin_width(samples, 'c')`.
            The values are then binned using this bin width and the
            `zero_value` argument, and a list of tuples is returned, each tuple
            represents the lower and upper bounds of the most common bins.

            `bin_width = None|0` - The samples are treated as
            discrete and a list of the most common values is returned.

            `bin_width = <NUMBER OTHER THAN ZERO>` - The samples are treated as
            floats and are binned into categories of width `bin_width` to
            determine the mode.

            `bin_width =
                'c'|'custom'
                'f'|'fd'|'freedman-diaconis'|
                's'|'sturges'|
                'r'|'rice'|
                'd'|'doane'`
            The 'best' bin width is determined using the specified algorithm
            (see `get_bin_width` function for details regarding the algorithm
            options). If the algorithm returns a width of zero, one tenth of
            the sample range is used instead (or 0.001 if the range is also
            zero).

        `zero_value` zero always corresponds to a bin, and this option controls
        whether zero is at the center of a bin or at the edge. It is only
        consulted when the samples are binned. Options include:

            `zero_value = 'b'|'boundary'` - zero is a boundary between bins.
            In most cases choosing between 'b' and 'c' will be arbitrary, but
            if the samples are bounded by zero (i.e., the parameter is either
            strictly positive or negative), zero should be set as a boundary.

            `zero_value = 'c'|'center'` - zero is at the center of a bin.  If
            the samples are bounded by zero, use 'boundary' instead.  However,
            this option can be useful if the samples span zero and are
            suspected to be centered at zero.

    The function returns:

        If `samples` contains exactly one value, a list containing that value
        is returned, regardless of `bin_width`.

        If values are treated as discrete (i.e., `bin_width = None`), a list
        containing the most common values is returned. The list will contain
        multiple values if there is a tie.

        If values are treated as floats (i.e. `bin_width != None`), a list of
        tuples containing the lower and upper boundaries of the most common
        bins is returned. The list will contain multiple tuples each
        representing a most common bin, if there is a tie.

    The function raises:

        `ValueError` if `samples` is empty, or if the samples are binned and
        `zero_value` is not one of the supported options.

    Some examples (the binned results come from random draws and will vary):
        >>> from pymsbayes.utils.stats import mode_list
        >>> x = list(range(10)) + [2]
        >>> mode_list(x)  # treat values as discrete by default
        [2]
        >>> x += [6]
        >>> mode_list(x)  # a tie!
        [2, 6]
        >>> x = ['a', 'b', 'b', 'c', 'c', 'b']
        >>> # strings work too when treated as discrete
        >>> mode_list(x)
        ['b']
        >>> import random
        >>> x = [random.Random().expovariate(1) for i in range(10000)]
        >>> # specify bin width for continuous values
        >>> mode_list(x, bin_width='auto')
        [(0.0, 0.10405355148832289)]
        >>> x = [random.Random().normalvariate(1, 1) for i in range(10000)]
        >>> mode_list(x, bin_width='auto')
        [(0.8910191831744725, 1.0183076379136828)]
        >>> x = [random.Random().normalvariate(0, 1) for i in range(10000)]
        >>> # zero is a bin boundary by default
        >>> mode_list(x, bin_width='auto')
        [(-0.1263661814981197, 0.0)]
        >>> # specify zero_value as bin center to get mode that spans zero
        >>> mode_list(x, bin_width='auto', zero_value='center')
        [(-0.06318309074905985, 0.06318309074905985)]

    The beginnings of this function were based on the mode function in DendroPy
    (Copyright Jeet Sukumaran and Mark T. Holder; licensed under BSD License;
    http://pythonhosted.org/DendroPy/):

    Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
    for phylogenetic computing. Bioinformatics 26: 1569-1571.
    """
    # Materialize once: the samples are traversed several times below (length
    # check, float detection, bin-width estimation, counting), which would
    # silently exhaust a one-shot iterator or generator.
    samples = list(samples)
    if not samples:
        raise ValueError('empty samples')
    if len(samples) == 1:
        return samples
    zero_value = zero_value.strip().lower()
    discrete = False
    if not bin_width:
        # None or 0: the caller explicitly asked for discrete treatment.
        discrete = True
    elif hasattr(bin_width, 'lower'):
        # String argument: either 'auto' or the name of a bin-width algorithm.
        bin_width = bin_width.strip().lower()
        if bin_width in ('a', 'auto'):
            discrete = not has_floats(samples)
            bin_width = None if discrete else 'c'
        if not discrete:
            bin_width = get_bin_width(samples, bin_width)
            if bin_width == 0.0:
                # Degenerate estimate; fall back to a tenth of the range.
                bin_width = (max(samples) - min(samples)) / float(10)
                if bin_width == 0.0:
                    bin_width = 0.001

    counts = {}
    if discrete:
        for s in samples:
            counts[s] = counts.get(s, 0) + 1
        max_count = max(counts.values())
        # Ties are reported in the dictionary's iteration order, which is what
        # a stable descending sort on the counts would also produce.
        return [val for val, cnt in counts.items() if cnt == max_count]

    bw = float(bin_width)
    # `shift` moves the bin edges so that zero is either an edge or a center;
    # `bounds` holds the distances from `index * bw` down to the lower edge
    # and up to the upper edge of a bin.
    if zero_value in ('b', 'boundary'):
        shift = 0.0
        bounds = (0.0, bw)
    elif zero_value in ('c', 'center'):
        shift = 0.5
        bounds = ((bw / 2), (bw / 2))
    else:
        raise ValueError('unsupported `zero_value` argument: '
                         '{0!r}'.format(zero_value))
    for s in samples:
        index = int(math.floor((s / bw) + shift))
        counts[index] = counts.get(index, 0) + 1
    max_count = max(counts.values())
    return [((val * bw) - bounds[0], (val * bw) + bounds[1])
            for val, cnt in counts.items() if cnt == max_count]
Exemple #19
0
 def test_dict(self):
     """Check that ``self.seqs`` is a dict of ``SeqRecord`` values keyed by record id."""
     self.assertIsInstance(self.seqs, dict)
     # Every value must be a SeqRecord whose own id matches its dictionary key.
     for seq_id, record in iteritems(self.seqs):
         self.assertIsInstance(record, SeqRecord)
         self.assertEqual(record.id, seq_id)
Exemple #20
0
 def test_dict(self):
     """Verify the parsed sequences form a dict mapping each id to its ``SeqRecord``."""
     self.assertIsInstance(self.seqs, dict)
     # Walk the mapping and confirm each entry's type and key/id agreement.
     for key, seq_record in iteritems(self.seqs):
         self.assertIsInstance(seq_record, SeqRecord)
         self.assertEqual(seq_record.id, key)