def get_differences(seq1, seq2, aligned=False, ignore_gaps=True, alphabet=None, aligner_tools=['mafft', 'muscle']):
    """Find the sites at which two sequences differ.

    Arguments:
        seq1, seq2: the sequences to compare (indexable, with ``len``).
        aligned: if false, the pair is first aligned with ``align_pair``.
        ignore_gaps: if true, sites where either sequence has the alphabet's
            gap symbol are neither compared nor counted.
        alphabet: object providing ``missing``, ``gap`` and
            ``all_residue_codes``; a ``DnaAlphabet`` is used when falsy.
        aligner_tools: passed through to ``align_pair``.

    Returns:
        A tuple ``(diffs, num_comparisons)`` where ``diffs`` maps the site
        index to the upper-cased pair of residues that differ, and
        ``num_comparisons`` is the number of sites that were compared.

    Raises:
        AlignmentError: if the two sequences do not have the same length.
    """
    alphabet = alphabet or alphabets.DnaAlphabet()
    if not aligned:
        seq1, seq2 = align_pair(seq1, seq2, aligner_tools)
    site_count = len(seq1)
    if site_count != len(seq2):
        raise AlignmentError('Sequences are not aligned')
    codes = alphabet.all_residue_codes
    differences = {}
    compared = 0
    for site in range(site_count):
        res1 = seq1[site]
        res2 = seq2[site]
        # Sites with missing data in either sequence are never compared.
        if (res1 == alphabet.missing) or (res2 == alphabet.missing):
            continue
        has_gap = (res1 == alphabet.gap) or (res2 == alphabet.gap)
        if has_gap and ignore_gaps:
            continue
        compared += 1
        upper1 = res1.upper()
        upper2 = res2.upper()
        if upper1 == upper2:
            continue
        if has_gap:
            # A gap against anything else is always a difference.
            differences[site] = (upper1, upper2)
            continue
        # Two different codes only differ if the residue sets they stand
        # for share nothing (e.g., an ambiguity code vs one of its states
        # is not a difference).
        if set(codes[upper1]).isdisjoint(set(codes[upper2])):
            differences[site] = (upper1, upper2)
    return differences, compared
def test_dna_alphabet(self):
    """Check the code tables and symbols exposed by ``DnaAlphabet``."""
    dna = alphabets.DnaAlphabet()
    # The expected tables live on the test case under the same attribute
    # names that the alphabet uses.
    table_names = (
        'states',
        'ambiguity_codes',
        'residue_ambiguity_codes',
        'all_residue_codes',
    )
    for table_name in table_names:
        self.assertEqual(getattr(self, table_name), getattr(dna, table_name))
    self.assertEqual(dna.gap, '-')
    self.assertEqual(dna.missing, '?')
    # (set of residues, symbol expected to represent it)
    expected_symbols = (
        ('AG', 'R'),
        ('AGCT-', '?'),
        ('AGCT', 'N'),
    )
    for residues, symbol in expected_symbols:
        self.assertEqual(dna.get_symbol(residues), symbol)
class LociFileIter(object):
    """Iterate over the loci stored in a loci-formatted sequence file.

    Every step of the iteration returns the list of ``SeqRecord`` objects
    parsed from the sequence lines (``>name SEQUENCE``) read since the
    previous locus separator (a line starting with ``//``).

    The constructor accepts either an open file-like object, which is left
    open, or a path string, which is opened here and closed when the
    iteration is exhausted.

    NOTE(review): sequence lines that follow the last ``//`` separator are
    read but never returned, because a locus is only emitted when its
    separator is seen. Confirm this is intended.
    """

    # Number of instances created so far; used to build default names.
    count = 0
    dna_alphabet = alphabets.DnaAlphabet()
    # Every valid DNA symbol in both upper and lower case (unordered).
    dna_symbols = ''.join(
        set([x.upper() for x in dna_alphabet.get_valid_symbols()] +
            [x.lower() for x in dna_alphabet.get_valid_symbols()]))
    # The symbols are escaped so that none of them (e.g., a '-' landing
    # between two other characters) can be read as regex syntax inside the
    # character class.
    seq_pattern = re.compile(
        r'^>(?P<name>.+)\s+(?P<seq>[{0}]+)$'.format(re.escape(dna_symbols)))
    inter_locus_pattern = re.compile(r'^//.*$')

    def __init__(self, file_obj):
        """Wrap `file_obj`, which is a path string or a file-like object."""
        self.__class__.count += 1
        self.instance_name = '-'.join(
            [self.__class__.__name__, str(self.count)])
        self.name = getattr(file_obj, 'name', self.instance_name)
        self._close = False
        self._file_obj = file_obj
        if isinstance(file_obj, str):
            # We opened the file, so we are responsible for closing it.
            self.name = file_obj
            self._file_obj = fileio.OpenFile(file_obj, 'r')
            self._close = True

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next locus as a list of sequence records.

        Raises:
            StopIteration: when no further locus separator is found; a file
                opened by this object is closed first.
        """
        try:
            # The builtin `next` works for generators on Python 2 and 3,
            # unlike the Python-2-only `generator.next()` method.
            return next(self._next_locus())
        except StopIteration:
            if self._close:
                self._file_obj.close()
            raise

    def _next_locus(self):
        """Yield the records read up to the next locus separator.

        A fresh generator is created for every locus; it keeps consuming
        the shared file object from wherever the previous one stopped.

        Raises:
            Exception: if a line is neither a sequence line nor a locus
                separator.
        """
        seqs = []
        for line in self._file_obj:
            stripped = line.strip()
            seq_match = self.seq_pattern.match(stripped)
            separator_match = self.inter_locus_pattern.match(stripped)
            if seq_match:
                s = Seq(seq_match.group('seq'),
                        alphabet=get_state_alphabet('dna', ambiguities=True))
                name = seq_match.group('name').strip()
                seqs.append(SeqRecord(seq=s, id=name, name=name))
            elif separator_match:
                yield seqs
            else:
                raise Exception('unexpected format of line in loci-formatted '
                                'file {0}:\n{1}\n'.format(self.name, stripped))
class PyMsBayesComparisons(object):
    """A set of population-pair comparisons for a single locus.

    Instances hold a list of comparison objects and can write them out as
    fasta files together with the matching rows of a sample table and a
    table of pi estimates. The class methods build such tables for every
    locus found in one or more loci-formatted files.
    """

    # Number of instances created so far; used for default names and loci.
    count = 0
    alphabet = alphabets.DnaAlphabet()
    sample_table_header = ('# taxon\tlocus\tploidy_multiplier\t'
                           'rate_multiplier\tnsamples1\tnsamples2\tkappa\t'
                           'nsites\ta\tc\tg\tpath\n'
                           'BEGIN SAMPLE_TBL\n')
    sample_table_footer = 'END SAMPLE_TBL\n'
    pi_header = 'taxon\tlocus\tpi1\tpi2\n'

    def __init__(self, comparisons, name=None, locus=None):
        """Store `comparisons`; `name` and `locus` default to counter-based
        labels."""
        self.__class__.count += 1
        if not name:
            name = self.__class__.__name__ + '-' + str(self.count)
        self.name = name
        if not locus:
            locus = 'locus{0}'.format(self.count)
        self.locus = locus
        self.comparisons = comparisons

    def add_sequence(self, seq_record, strict=False):
        """Offer `seq_record` to every comparison.

        Raises:
            Exception: if `strict` is true and every comparison's
                ``add_sequence`` returned None for the record.
        """
        accepted = False
        for comp in self.comparisons:
            if comp.add_sequence(seq_record) is not None:
                accepted = True
        if strict and (not accepted):
            raise Exception('{0} is not in comparisons'.format(seq_record.id))

    def extend_sequences(self, seq_record_iter, strict=False):
        """Call `add_sequence` for every record in `seq_record_iter`."""
        for s in seq_record_iter:
            self.add_sequence(s, strict=strict)

    def _get_smallest_number_of_sequences(self):
        """Return the smallest ``smallest_sample_size`` among the
        comparisons, or None if there are no comparisons."""
        sizes = [comp.smallest_sample_size for comp in self.comparisons]
        if not sizes:
            return None
        return min(sizes)

    smallest_sample_size = property(_get_smallest_number_of_sequences)

    def _get_shortest_alignment(self):
        """Return the smallest ``alignment_length`` among the comparisons,
        or None if there are no comparisons."""
        lengths = [comp.alignment_length for comp in self.comparisons]
        if not lengths:
            return None
        return min(lengths)

    shortest_alignment = property(_get_shortest_alignment)

    def write_comparisons(self, fasta_dir, config_dir=None,
                          estimate_hky_parameters=False):
        """Write each comparison's sequences and build its table rows.

        Arguments:
            fasta_dir: directory in which the fasta files are written.
            config_dir: if given, fasta paths in the sample table are
                written relative to this directory.
            estimate_hky_parameters: if true, HKY parameters are estimated
                for comparisons that have not estimated them yet.

        Returns:
            A tuple of two strings: the sample-table rows and the
            pi-estimate rows (one row of each per comparison).
        """
        s = StringIO()
        pi_s = StringIO()
        for comp in self.comparisons:
            if estimate_hky_parameters and (not comp.estimated_hky_parameters):
                comp.estimate_hky_parameters()
            nsamples = comp.number_of_sequences
            taxon = comp.comparison_str
            path = os.path.join(fasta_dir,
                                '{0}-{1}.fasta'.format(taxon, self.locus))
            rel_path = path
            if config_dir:
                rel_path = os.path.relpath(path, config_dir)
            comp.write_sequences(path)
            s.write('{taxon}\t{locus}\t{ploidy_multiplier}\t'
                    '{rate_multiplier}\t{nsamples1}\t{nsamples2}\t{kappa}\t'
                    '{nsites}\t{a}\t{c}\t{g}\t{path}\n'.format(
                        taxon=taxon,
                        locus=self.locus,
                        ploidy_multiplier=comp.ploidy_multiplier,
                        rate_multiplier=comp.rate_multiplier,
                        nsamples1=nsamples[0],
                        nsamples2=nsamples[1],
                        kappa=comp.kappa,
                        nsites=comp.alignment_length,
                        a=comp.a,
                        c=comp.c,
                        g=comp.g,
                        path=rel_path))
            comp.estimate_pi()
            pi_s.write('{taxon}\t{locus}\t{pi1}\t{pi2}\n'.format(
                taxon=taxon,
                locus=self.locus,
                pi1=comp.pi[0],
                pi2=comp.pi[1]))
        return s.getvalue(), pi_s.getvalue()

    @classmethod
    def _ambiguous_characters(cls):
        """Return the ambiguity codes plus the gap symbol as a list."""
        # `list()` is required on Python 3, where `keys()` is a view that
        # cannot be concatenated with a list.
        return list(cls.alphabet.ambiguity_codes.keys()) + [cls.alphabet.gap]

    @classmethod
    def _filter_ambiguous_sites(cls, locus, max_ambiguities_per_seq):
        """Apply the row filter, then the column filter, to `locus`."""
        # remove rows with many ambiguities
        seqs = seqfilter.row_filter(
            locus,
            character_list=cls._ambiguous_characters(),
            max_frequency=max_ambiguities_per_seq)
        # remove all columns with ambiguities
        return seqfilter.column_filter(
            seqs,
            character_list=cls._ambiguous_characters(),
            max_frequency=0.00001)

    @classmethod
    def _write_summary_files(cls, config_out_dir, sample_table_rows, pi_rows):
        """Write the sample table and pi-estimate files to
        `config_out_dir`."""
        config_out_path = os.path.join(config_out_dir,
                                       'config-sample-table.txt')
        pi_out_path = os.path.join(config_out_dir, 'pi-estimates.txt')
        with OpenFile(config_out_path, 'w') as out:
            out.write(cls.sample_table_header)
            out.write(sample_table_rows)
            out.write(cls.sample_table_footer)
        with OpenFile(pi_out_path, 'w') as out:
            out.write(cls.pi_header)
            out.write(pi_rows)

    @staticmethod
    def _get_comparison_prefix(loci_file_obj):
        """Return the base file name up to its first '.', or None.

        `loci_file_obj` is tried as a path first and then as an object
        with a ``name`` attribute.
        """
        try:
            return os.path.basename(loci_file_obj).split('.')[0]
        except (TypeError, AttributeError):
            # Not a path string; fall through to the `name` attribute.
            pass
        try:
            return os.path.basename(loci_file_obj.name).split('.')[0]
        except (TypeError, AttributeError):
            return None

    @classmethod
    def process_loci_file(cls, loci_file_obj, pop_id_maps, fasta_out_dir,
                          config_out_dir=None, minimum_sample_size=2,
                          minimum_alignment_length=50,
                          max_ambiguities_per_seq=0.2,
                          require_shared_loci=False,
                          estimate_hky_parameters=False):
        """Write comparisons for every locus of one loci file.

        Arguments:
            loci_file_obj: passed to ``dataio.LociFileIter``.
            pop_id_maps: iterable of two-item mappings from population name
                to that population's sequence IDs; one comparison is built
                per mapping and locus.
            fasta_out_dir: directory for the fasta files.
            config_out_dir: directory for the sample table and pi estimate
                files; defaults to `fasta_out_dir`.
            minimum_sample_size, minimum_alignment_length: comparisons
                below either threshold are dropped.
            max_ambiguities_per_seq: maximum frequency passed to the row
                filter.
            require_shared_loci: if true, a locus is skipped unless every
                mapping produced an acceptable comparison.
            estimate_hky_parameters: passed to `write_comparisons`.

        Returns:
            The number of loci read from the file (0 if it had none).
        """
        if not config_out_dir:
            config_out_dir = fasta_out_dir
        config_stream = StringIO()
        pi_stream = StringIO()
        # Tracked explicitly so that a file without loci returns 0 instead
        # of failing on an unbound loop variable.
        num_loci = 0
        for i, locus in enumerate(dataio.LociFileIter(loci_file_obj)):
            num_loci = i + 1
            comps = []
            for id_map in pop_id_maps:
                assert (len(id_map) == 2)
                pop1, pop2 = sorted(id_map.keys())
                c = Comparison(
                    population_1_ids=id_map[pop1],
                    population_2_ids=id_map[pop2],
                    population_1_name=pop1,
                    population_2_name=pop2,
                )
                seqs = cls._filter_ambiguous_sites(locus,
                                                   max_ambiguities_per_seq)
                c.extend_sequences(seqs)
                if ((c.alignment_length >= minimum_alignment_length) and
                        (c.smallest_sample_size >= minimum_sample_size)):
                    comps.append(c)
            if not comps:
                continue
            if require_shared_loci and (len(comps) < len(pop_id_maps)):
                continue
            pymsbayes_comps = cls(comparisons=comps,
                                  locus='locus{0}'.format(i))
            config_str, pi_str = pymsbayes_comps.write_comparisons(
                fasta_dir=fasta_out_dir,
                config_dir=config_out_dir,
                estimate_hky_parameters=estimate_hky_parameters)
            config_stream.write(config_str)
            pi_stream.write(pi_str)
        cls._write_summary_files(config_out_dir,
                                 config_stream.getvalue(),
                                 pi_stream.getvalue())
        return num_loci

    @classmethod
    def process_loci_files_as_pairs(cls, loci_file_objects,
                                    id_component_delimiter,
                                    id_component_index, fasta_out_dir,
                                    config_out_dir=None,
                                    minimum_sample_size=2,
                                    minimum_alignment_length=50,
                                    max_ambiguities_per_seq=0.2,
                                    estimate_hky_parameters=False):
        """Write one comparison per locus for each of several loci files.

        Each file is treated as a single comparison whose prefix is the
        file's base name (up to the first '.') when one can be determined.

        Arguments:
            loci_file_objects: iterable of paths or file-like objects, each
                passed to ``dataio.LociFileIter``.
            id_component_delimiter, id_component_index: passed to
                ``Comparison.get_comparison_from_seqs``.
            fasta_out_dir: directory for the fasta files.
            config_out_dir: directory for the sample table and pi estimate
                files; defaults to `fasta_out_dir`.
            minimum_sample_size, minimum_alignment_length: comparisons
                below either threshold are skipped.
            max_ambiguities_per_seq: maximum frequency passed to the row
                filter.
            estimate_hky_parameters: passed to `write_comparisons`.

        Returns:
            The total number of loci read across all files.
        """
        if not config_out_dir:
            config_out_dir = fasta_out_dir
        config_stream = StringIO()
        pi_stream = StringIO()
        locus_count = 0
        for loci_file_obj in loci_file_objects:
            comparison_prefix = cls._get_comparison_prefix(loci_file_obj)
            for locus in dataio.LociFileIter(loci_file_obj):
                locus_count += 1
                seqs = cls._filter_ambiguous_sites(locus,
                                                   max_ambiguities_per_seq)
                c = Comparison.get_comparison_from_seqs(
                    sequences=seqs,
                    id_component_delimiter=id_component_delimiter,
                    id_component_index=id_component_index,
                    comparison_prefix=comparison_prefix,
                )
                if not ((c.alignment_length >= minimum_alignment_length) and
                        (c.smallest_sample_size >= minimum_sample_size)):
                    continue
                pymsbayes_comps = cls(comparisons=[c])
                config_str, pi_str = pymsbayes_comps.write_comparisons(
                    fasta_dir=fasta_out_dir,
                    config_dir=config_out_dir,
                    estimate_hky_parameters=estimate_hky_parameters)
                config_stream.write(config_str)
                pi_stream.write(pi_str)
        cls._write_summary_files(config_out_dir,
                                 config_stream.getvalue(),
                                 pi_stream.getvalue())
        return locus_count
def main_cli():
    """Command-line entry point: vet the sequences of an input file.

    Parses the command line, optionally summarizes the longest reading
    frames and checks for duplicate sequence IDs, and then calculates
    pairwise distances among the sequences. Results are written to
    ``seqvet-*.txt`` files in the output directory (by default the
    directory of the input file).

    Exits with status 1, after logging an error and printing the usage to
    stderr, if the input format cannot be determined or if translation
    options are combined with a data type other than dna or rna.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input_file', metavar='INPUT-SEQ-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Input sequence file to be vetted.'))

    comparison_args = parser.add_argument_group(
        'Comparison Options',
        'Options to control the number and nature of sequence comparisons')
    comparison_args.add_argument(
        '-n', '--num-samples',
        type=int,
        default=0,
        help=('The number of randomly sampled sequences to which each '
              'sequence will be compared. If less than 1 (the default is '
              '0), all pairwise comparisons will be performed. For very '
              'large numbers of sequences, performing all pairwise '
              'comparisons will take a long time. This option will speed '
              'things up as long as the number specified is less than '
              'about half of the number of input sequences. If the '
              'number you are considering is close to half of the number '
              'of sequences, you should probably specify zero and do all '
              'combinations. You should not specify a number greater than '
              'half the number of sequences, because it will take longer '
              'and be less thorough than the default.'))
    comparison_args.add_argument(
        '--seed',
        action='store',
        type=int,
        help=('Random number seed to use for the analysis. This option '
              'is only relevant if a number greater than 0 is specified '
              'for the `-n/--num-samples` option.'))
    comparison_args.add_argument(
        '--compare-translated',
        action='store_true',
        help=('Compare amino acid sequences encoded by the longest '
              'reading frame found in each sequence. To use this option, '
              '`data-type` must be dna or rna. See "Translation Options" '
              'for controlling how the longest reading frame of each '
              'sequence is determined and translated.'))
    comparison_args.add_argument(
        '--check-ids',
        action='store_true',
        help=('Check sequence IDs for duplicates.'))
    comparison_args.add_argument(
        '--summarize-reading-frame-lengths',
        action='store_true',
        help=('Report the length of the longest reading frame of '
              'each sequence. See "Translation Options" for controlling '
              'how reading frames are determined.'))
    comparison_args.add_argument(
        '-g', '--count-gaps',
        action='store_true',
        help=('Count gaps when calculating pairwise sequence distances. '
              'The default is to calculate (number of differences '
              'ignoring gaps / number of aligned sites ignoring sites '
              'with gaps) for each pairwise comparison. When this option '
              'is used, the distance is (number of differences including '
              'gap differences / total number of aligned sites).'))

    alignment_args = parser.add_argument_group(
        'Alignment Options',
        ('These options control if/how sequences are to be aligned prior '
         'to calculating distances.'))
    alignment_args.add_argument(
        '-a', '--aligned',
        action='store_true',
        help=('Treat input sequences as aligned. I.e., do not perform '
              'pairwise alignment before calculating distances between '
              'sequences (except when calculating distances for reverse '
              'and complemented sequences).'))
    alignment_args.add_argument(
        '--aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for pairwise '
              'alignments of sequences. '
              'The default is to look for muscle and then mafft in PATH, '
              'and if neither are found use the (slow) built-in '
              'function. Even if the `-a`/`--aligned` option is '
              'specified, the aligner will still be used for pairwise '
              'alignments when calculating distances of reverse and '
              'complemented sequences.'))
    alignment_args.add_argument(
        '--msa',
        action='store_true',
        help=('Perform a full multiple sequence alignment prior to '
              'comparing sequences. The default is to align each '
              'pair of sequences being compared. This option is '
              'overruled by the `-a`/`--aligned` option. '
              'If this option is used '
              'the resulting alignment is written to file.'))
    alignment_args.add_argument(
        '--msa-aligner',
        type=argparse_utils.arg_is_executable,
        help=('Path to alignment program executable to use for full '
              'multiple sequence alignment. '
              'The default is to look for mafft and then muscle in PATH, '
              'and if neither are found the program will exit with an '
              'error message. If you do not have mafft or muscle '
              'you cannot use this option. '
              'This option is only used if the `-a`/`--aligned` option '
              'is not specified, and the `--msa` option is specified.'))

    translation_args = parser.add_argument_group(
        'Translation Options',
        ('These options control translation from nucleotide to amino acid '
         'sequences.'))
    translation_args.add_argument(
        '--table',
        type=int,
        choices=list(range(1, 7)) + list(range(9, 17)) + list(range(21, 26)),
        default=1,
        help=('The translation table to use for any options associated '
              'with translating nucleotide sequences to amino acids. '
              'Option should be the integer that corresponds to the '
              'desired translation table according to NCBI '
              '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
              'The default is 1 (the "standard" code).'))
    translation_args.add_argument(
        '--allow-partial',
        action='store_true',
        default=False,
        help=('Allow partial reading frames at the beginning (no start '
              'codon) and end (no stop codon) of sequences.'))
    translation_args.add_argument(
        '--read-after-stop',
        action='store_true',
        default=False,
        help=('A new reading frame begins immediately after a stop codon. '
              'The default is to start reading frame at next start codon '
              'after a stop codon. This option might be useful for exons.'))

    data_args = parser.add_argument_group(
        'Data Options',
        ('Options specifying the input data type and format'))
    data_args.add_argument(
        '-d', '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    data_args.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file. Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(', '.join(
                  FILE_FORMATS.supported_formats))))

    output_args = parser.add_argument_group(
        'Output Options',
        'Options for controlling output of program')
    output_args.add_argument(
        '-o', '--output-dir',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the input file.'))

    messaging_args = parser.add_argument_group(
        'Messaging Options',
        ('These options control verbosity of messaging.'))
    messaging_args.add_argument(
        '--log-frequency',
        type=argparse_utils.arg_is_nonnegative_int,
        default=1000,
        help=('The frequency at which to log progress. Default is to log '
              'every 1000 sequence comparisons.'))
    messaging_args.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    messaging_args.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # `--debug` takes precedence over `--quiet` because it is applied last.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    def exit_with_usage(message):
        """Log `message` as an error, print the usage to stderr, and exit."""
        log.error(message)
        # `print_help` writes to the given stream and returns None, so its
        # return value must not be written out.
        parser.print_help(sys.stderr)
        sys.exit(1)

    ##########################################################################
    ## package imports

    # NOTE(review): these imports are deferred until the logging level has
    # been put in the environment, presumably so the imported modules pick
    # it up -- confirm before moving them to the top of the file.
    from seqsift.utils import GLOBAL_RNG, dataio, functions, alphabets
    from seqsift.seqops import seqsum, seqmod, seqstats
    from seqsift.utils.fileio import OpenFile

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    if args.num_samples > 0:
        # Compare against None so that an explicit `--seed 0` is honored.
        if args.seed is None:
            args.seed = random.randint(1, 999999999)
        GLOBAL_RNG.seed(args.seed)
        log.warning('Seed: {0}'.format(args.seed))

    ## get input file format
    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_file)
    if not args.input_format:
        exit_with_usage(
            "Could not determine input format.\n"
            "You must either provide the input format\n"
            "using the '--format' option or have a recognizable\n"
            "file extension on the input file name.\n"
            "Here are the supported file extensions:\n{0}".format(
                str(FILE_FORMATS)))

    aligner_tools = ['muscle', 'mafft']
    if args.aligner:
        aligner_tools = [args.aligner]
    full_aligner_tools = ['mafft', 'muscle']
    if args.msa_aligner:
        full_aligner_tools = [args.msa_aligner]

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.input_file)
    full_alignment_out_path = os.path.join(args.output_dir, 'seqvet-msa.txt')

    alphabet = alphabets.DnaAlphabet()
    if args.data_type in ['aa', 'protein']:
        alphabet = alphabets.ProteinAlphabet()

    if (args.summarize_reading_frame_lengths and
            (args.data_type not in ['dna', 'rna'])):
        exit_with_usage(
            "`--summarize-reading-frame-lengths` is only compatible "
            "with DNA or RNA.")
    if (args.compare_translated and (args.data_type not in ['dna', 'rna'])):
        exit_with_usage(
            "`--compare-translated` is only compatible with DNA or RNA.")

    ##########################################################################
    ## heavy lifting

    seqs = dataio.get_seq_iter([args.input_file],
                               format=args.input_format,
                               data_type=args.data_type)

    if args.summarize_reading_frame_lengths:
        log.info('Summarizing longest reading frame lengths...')
        # The sequences are used again below, so they are wrapped in a
        # BufferedIter before this first pass over them.
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        lengths = seqsum.summarize_longest_read_lengths(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        length_path = os.path.join(args.output_dir,
                                   'seqvet-reading-frame-lengths.txt')
        log.info('Writing longest reading frame lengths to file...')
        with OpenFile(length_path, 'w') as out:
            out.write('seq_id\tlrf\trev_comp_lrf\n')
            for (l, rc_l, seq_id) in lengths:
                out.write('{0}\t{1}\t{2}\n'.format(seq_id, l, rc_l))

    if args.compare_translated:
        log.info('Translating longest reading frames for distance '
                 'calculations...')
        seqs = seqmod.translate_longest_reading_frames(
            seqs,
            table=args.table,
            allow_partial=args.allow_partial,
            require_start_after_stop=(not args.read_after_stop))
        # Distances are now calculated on amino acid sequences.
        alphabet = alphabets.ProteinAlphabet()

    if args.check_ids:
        log.info('Checking sequence IDs...')
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        dups = seqstats.get_duplicate_ids(seqs)
        if len(dups) > 0:
            dup_path = functions.get_new_path(
                os.path.join(args.output_dir, 'seqvet-duplicate-ids.txt'))
            log.warning('Duplicate IDs found! Writing them to '
                        '{0}'.format(dup_path))
            with OpenFile(dup_path, 'w') as out:
                for dup in dups:
                    out.write('{0}\n'.format(dup))
        else:
            log.info('No duplicate sequence IDs were found.')

    log.info('Calculating pairwise distances...')
    distances, rev_comp_errors = seqsum.summarize_distances(
        seqs,
        sample_size=args.num_samples,
        per_site=True,
        aligned=args.aligned,
        ignore_gaps=(not args.count_gaps),
        alphabet=alphabet,
        do_full_alignment=args.msa,
        full_alignment_out_path=full_alignment_out_path,
        aligner_tools=aligner_tools,
        full_aligner_tools=full_aligner_tools,
        log_frequency=args.log_frequency)
    log.info('Done!')

    log.info('Writing mean distances to file...')
    distances = sorted(iteritems(distances),
                       key=lambda x: x[1].mean,
                       reverse=True)
    mean_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-mean-distances.txt'))
    with OpenFile(mean_path, 'w') as out:
        out.write('seq_id\tmean_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.mean))

    log.info('Writing max distances to file...')
    # Re-sorting the mean-ordered list keeps ties in mean order because
    # the sort is stable.
    distances = sorted(distances,
                       key=lambda x: x[1].maximum,
                       reverse=True)
    max_path = functions.get_new_path(
        os.path.join(args.output_dir, 'seqvet-max-distances.txt'))
    with OpenFile(max_path, 'w') as out:
        out.write('seq_id\tmax_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.maximum))

    if rev_comp_errors:
        rev_comp_errors = sorted(rev_comp_errors)
        # Report each unordered pair of sequences only once.
        rce_set = set()
        rce = []
        for (s1, s2, d, drc) in rev_comp_errors:
            pair = tuple(sorted([s1, s2]))
            if pair in rce_set:
                continue
            rce_set.add(pair)
            rce.append((pair[0], pair[1], d, drc))
        log.info('Writing potential reverse-complement errors to file...')
        path = functions.get_new_path(
            os.path.join(args.output_dir,
                         'seqvet-reverse-complement-warnings.txt'))
        with OpenFile(path, 'w') as out:
            out.write('seq1\tseq2\tdistance\trev_comp_distance\n')
            for (seq1, seq2, d, drc) in rce:
                out.write('{0}\t{1}\t{2}\t{3}\n'.format(seq1, seq2, d, drc))