def msa(exe, seq, outfile='', trimming=False, verbose=False):
    """
    Align a multiple sequence FASTA file with an external MSA program.

    The process exits with status 1 (after logging an error) when the
    sequence file does not exist, when ``exe`` is empty, or when ``exe``
    cannot be mapped to a supported aligner. An alignment (or trimmed
    alignment) file that already exists on disk is reused rather than
    regenerated.

    :param exe: str, path to the executable of a MSA program.
    :param seq: str, path to the multiple sequence file (expected to be in
        FASTA format).
    :param outfile: str, path to the aligned sequence output (FASTA) file.
        If empty, defaults to ``[basename].[aligner].fasta``, where
        ``basename`` is whatever ``basename()`` yields for the absolute
        sequence path and ``aligner`` is the name reported by ``_guess()``
        for the executable.
    :param trimming: bool, if True, also produce (or reuse)
        ``[basename of alignment].trimmed.fasta`` via ``trim()`` and return
        that path instead of the untrimmed alignment.
    :param verbose: bool, log at INFO level if True, otherwise only at
        ERROR level (default).
    :return: str, path to the aligned (and, if requested, trimmed)
        sequence output file.
    """

    logger.setLevel(logging.INFO if verbose else logging.ERROR)

    # Validate the inputs up front; every failure path terminates the process.
    if not os.path.isfile(seq):
        error('Sequence: {} is not a file or does not exist.'.format(seq))
        sys.exit(1)

    sequence = os.path.abspath(seq)

    if not exe:
        error('Invalid aligner executable (exe), empty string, sequence '
              'alignment aborted.')
        sys.exit(1)

    aligner, func = _guess(exe)
    if func is None:
        error('Invalid or unsupported aligner executable (exe): {}, '
              'alignment aborted.'.format(exe))
        sys.exit(1)

    if not outfile:
        outfile = '.'.join((basename(sequence), aligner, 'fasta'))

    # Only run the aligner when its output is not already on disk.
    if os.path.isfile(outfile):
        info('Found pre-existing alignment file.')
    else:
        info('Aligning sequence {} using {}.'.format(sequence,
                                                     aligner.upper()))
        outfile = func(exe, sequence, outfile)
        info('Successfully aligned sequence, alignment was saved to '
             '{}.'.format(outfile))

    if not trimming:
        return outfile

    clean = basename(outfile) + '.trimmed.fasta'
    if os.path.isfile(clean):
        info('Found pre-existing trimmed alignment.')
        return clean

    # trim() returns a pair; only its second item (the path) is needed here.
    _, outfile = trim(outfile, outfile=clean, verbose=verbose)
    return outfile
def test_trim_fmt(self):
    """trim() with fmt='phylip-relaxed' returns the expected sequences."""
    expected = {
        'A': 'ARGPSSSRILAILAVAFIL',
        'B': 'ARGPSTSRFLVILAVAFIL',
        'C': 'ARGPTNSRFLVILAVAFLL',
        'D': 'VRVPSTSRFLVILAVAFLL',
    }
    alignment_path = os.path.join(DATA, 'msa.phylip')

    # The first item returned by trim() is the mapping under test.
    records = trim(alignment_path, fmt='phylip-relaxed')[0]

    self.assertDictEqual(expected, records)
def test_trim_default(self):
    """trim() with default arguments returns the expected sequences."""
    expected = {
        'A': 'ARGPSSSRILAILAVAFIL',
        'B': 'ARGPSTSRFLVILAVAFIL',
        'C': 'ARGPTNSRFLVILAVAFLL',
        'D': 'VRVPSTSRFLVILAVAFLL',
    }
    alignment_path = os.path.join(DATA, 'msa.fa')

    # The first item returned by trim() is the mapping under test.
    records = trim(alignment_path)[0]

    self.assertDictEqual(expected, records)
def test_trim_outfile(self):
    """trim() with outfile= writes a FASTA file matching its return value."""
    expected = {
        'A': 'ARGPSSSRILAILAVAFIL',
        'B': 'ARGPSTSRFLVILAVAFIL',
        'C': 'ARGPTNSRFLVILAVAFLL',
        'D': 'VRVPSTSRFLVILAVAFLL',
    }
    source = os.path.join(DATA, 'msa.phylip')
    target = os.path.join(DATA, 'trimmed.msa.fa')

    in_memory, _written = trim(source, fmt='phylip-relaxed', outfile=target)

    # The file must exist and the returned mapping must match.
    self.assertTrue(os.path.isfile(target))
    self.assertDictEqual(expected, in_memory)

    # Re-read the written file as FASTA and compare its records as well.
    on_disk = {}
    for record in AlignIO.read(target, 'fasta'):
        on_disk[record.id] = str(record.seq)
    self.assertDictEqual(expected, on_disk)

    # NOTE(review): presumably picked up by the test case's cleanup hook to
    # delete the generated file -- confirm against tearDown.
    self.rm = target
def _sequencing(sequence, tree, aligner, ancestor, wd, asr_model, verbose):
    """
    Prepare the sequence input and load it.

    When a tree is given, the sequence file must be FASTA. It is aligned
    (unless all records already share one length), trimmed (unless every
    record consists only of characters in ``AMINO_ACIDS``), and passed to
    the ancestral reconstruction program; pre-existing intermediate files
    are reused. The resulting file, or the original ``sequence`` when no
    tree is given, is then handed to ``_load()``.

    The process exits with status 1 if the sequence file is not FASTA, if
    alignment is needed but no aligner was given, if no trimmed alignment
    could be obtained, or if no ancestral reconstruction program was given.

    :param sequence: str, path to a sequence data file.
    :param tree: str, path to a NEWICK tree file; a false value skips all
        preprocessing.
    :param aligner: str, path to the executable of an alignment program.
    :param ancestor: str, path to the executable of an ancestral states
        reconstruction program.
    :param wd: not used inside this function.
    :param asr_model: passed through unchanged to ``asr.asr()``.
    :param verbose: bool, passed through to ``msa.msa()`` and ``asr.asr()``.
    :return: tuple, the five values unpacked from ``_load()`` (tree, rate,
        records, aps, size) followed by the path that was loaded.
    """

    suffix = '.trimmed.fasta'

    def _reuse_or_trim(alignment):
        # Return the trimmed counterpart of alignment, creating it if needed.
        target = utilities.basename(alignment) + suffix
        if os.path.isfile(target):
            info('Using pre-existed trimmed alignment file.')
            return target
        _, target = utilities.trim(alignment, outfile=target)
        return target

    if tree:
        # NOTE(review): the result is discarded; presumably this call
        # validates or normalises the tree file -- confirm in utilities.
        utilities.Tree(tree, leave=True)

        residues = set(AMINO_ACIDS)
        lengths, clean = [], []

        with open(sequence) as handle:
            header = handle.readline().strip()
            if not header.startswith('>'):
                error('NEWICK format tree was provided, but the sequence file '
                      'was not in the FASTA format.')
                sys.exit(1)
            # Rewind so the parser also sees the record just peeked at.
            handle.seek(0)
            for record in SeqIO.parse(handle, 'fasta'):
                lengths.append(len(record.seq))
                clean.append(set(record.seq).issubset(residues))

        if len(set(lengths)) == 1:
            # Equal-length records are treated as an existing alignment.
            trimmed = sequence if all(clean) else _reuse_or_trim(sequence)
        elif aligner:
            aler, _ = msa._guess(aligner)
            outfile = utilities.basename(sequence) + '.{}.fasta'.format(aler)
            if os.path.isfile(outfile):
                info('Using pre-existed alignment file')
                trimmed = _reuse_or_trim(outfile)
            else:
                trimmed = msa.msa(aligner, sequence, verbose=verbose,
                                  outfile=outfile, trimming=True)
        else:
            error('FASTA format sequence file was provided, but no '
                  'alignment program was provided.')
            sys.exit(1)

        if not trimmed:
            sys.exit(1)
        if not ancestor:
            error('No ancestral reconstruction program was provided.')
            sys.exit(1)

        # The output name is derived from the alignment name without the
        # trimmed suffix.
        if trimmed.endswith(suffix):
            name = trimmed.replace(suffix, '')
        else:
            name = trimmed

        aser, _ = asr._guess(ancestor)
        outfile = '{}.{}.tsv'.format(utilities.basename(name), aser)
        if os.path.isfile(outfile):
            info('Using pre-existed ancestral states sequence file.')
            sequence = outfile
        else:
            sequence = asr.asr(ancestor, trimmed, tree, asr_model,
                               verbose=verbose, outfile=outfile)

    loaded_tree, rate, records, aps, size = _load(sequence)
    return loaded_tree, rate, records, aps, size, sequence