Esempio n. 1
0
def msa(exe, seq, outfile='', trimming=False, verbose=False):
    """
    Align a multiple sequence file with an external MSA program.

    An existing alignment (or trimmed alignment) file at the expected
    location is reused instead of being regenerated. The process is
    terminated with exit status 1 if the sequence file does not exist, if
    no executable was given, or if the executable is not recognised.

    :param exe: str, path to the executable of a MSA program.
    :param seq: str, path to the multiple sequence file (must be in FASTA
        format).
    :param outfile: str, path to the aligned sequence output (FASTA) file,
        default: [basename].[aligner].fasta, where basename is the filename
        of the sequence file without known FASTA file extension, aligner is
        the name of the aligner program in lowercase, and fasta is the
        extension for fasta format file.
    :param trimming: bool, trim gaps and ambiguous sites if True, otherwise,
        leave them untouched.
    :param verbose: bool, invoke verbose or silent process mode,
        default: False, silent mode.
    :return: str, path to the aligned (and, if requested, trimmed) sequence
        output file (in FASTA format).
    """

    logger.setLevel(logging.INFO if verbose else logging.ERROR)

    # Validate the inputs up front; each failure logs and aborts.
    if not os.path.isfile(seq):
        error('Sequence: {} is not a file or does not exist.'.format(seq))
        sys.exit(1)

    sequence = os.path.abspath(seq)

    if not exe:
        error('Invalid aligner executable (exe), empty string, sequence '
              'alignment aborted.')
        sys.exit(1)

    aligner, func = _guess(exe)
    if func is None:
        error('Invalid or unsupported aligner executable (exe): {}, '
              'alignment aborted.'.format(exe))
        sys.exit(1)

    if not outfile:
        outfile = '.'.join([basename(sequence), aligner, 'fasta'])

    # Only run the aligner when no alignment file is already in place.
    if os.path.isfile(outfile):
        info('Found pre-existing alignment file.')
    else:
        info('Aligning sequence {} using {}.'.format(sequence,
                                                     aligner.upper()))
        outfile = func(exe, sequence, outfile)
        info('Successfully aligned sequence, alignment was saved to '
             '{}.'.format(outfile))

    if not trimming:
        return outfile

    # Same reuse policy for the trimmed alignment.
    clean = ''.join([basename(outfile), '.trimmed.fasta'])
    if os.path.isfile(clean):
        info('Found pre-existing trimmed alignment.')
        return clean

    _, outfile = trim(outfile, outfile=clean, verbose=verbose)
    return outfile
Esempio n. 2
0
 def test_trim_fmt(self):
     # trim() is given an explicit 'phylip-relaxed' format for a .phylip
     # alignment; the first element of its result must equal the
     # expected id -> sequence mapping.
     expected = dict(
         A='ARGPSSSRILAILAVAFIL',
         B='ARGPSTSRFLVILAVAFIL',
         C='ARGPTNSRFLVILAVAFLL',
         D='VRVPSTSRFLVILAVAFLL',
     )
     alignment = os.path.join(DATA, 'msa.phylip')
     result = trim(alignment, fmt='phylip-relaxed')
     self.assertDictEqual(expected, result[0])
Esempio n. 3
0
 def test_trim_default(self):
     # trim() is called with only the alignment path (no format or output
     # file); the first element of its result must equal the expected
     # id -> sequence mapping.
     expected = dict(
         A='ARGPSSSRILAILAVAFIL',
         B='ARGPSTSRFLVILAVAFIL',
         C='ARGPTNSRFLVILAVAFLL',
         D='VRVPSTSRFLVILAVAFLL',
     )
     alignment = os.path.join(DATA, 'msa.fa')
     result = trim(alignment)
     self.assertDictEqual(expected, result[0])
Esempio n. 4
0
    def test_trim_outfile(self):
        # trim() is asked to write its result to a file; both the returned
        # mapping and the FASTA file written to disk must match the
        # expected id -> sequence mapping.
        source = os.path.join(DATA, 'msa.phylip')
        target = os.path.join(DATA, 'trimmed.msa.fa')
        expected = dict(
            A='ARGPSSSRILAILAVAFIL',
            B='ARGPSTSRFLVILAVAFIL',
            C='ARGPTNSRFLVILAVAFLL',
            D='VRVPSTSRFLVILAVAFLL',
        )

        returned, _ = trim(source, fmt='phylip-relaxed', outfile=target)
        self.assertTrue(os.path.isfile(target))
        self.assertDictEqual(expected, returned)

        # Read the written file back and compare it record by record.
        written = {}
        for record in AlignIO.read(target, 'fasta'):
            written[record.id] = str(record.seq)
        self.assertDictEqual(expected, written)

        # NOTE(review): presumably consumed by the test case's cleanup
        # hook to delete the generated file - confirm against tearDown.
        self.rm = target
Esempio n. 5
0
def _sequencing_trim(alignment):
    """
    Return the trimmed counterpart of an alignment, creating it if needed.

    :param alignment: str, path to an alignment file.
    :return: str, path to the trimmed alignment file; an existing
        [basename].trimmed.fasta file is reused, otherwise the path
        returned by ``utilities.trim`` is used.
    """

    trimmed = ''.join([utilities.basename(alignment), '.trimmed.fasta'])
    if os.path.isfile(trimmed):
        info('Using pre-existed trimmed alignment file.')
    else:
        _, trimmed = utilities.trim(alignment, outfile=trimmed)
    return trimmed


def _sequencing(sequence, tree, aligner, ancestor, wd, asr_model, verbose):
    """
    Prepare the sequence data file and load it.

    Without a tree, the sequence file is loaded as it is. With a tree, the
    sequence file must be in FASTA format: it is aligned when its records
    differ in length, trimmed when needed, and passed to the ancestral
    states reconstruction program; the resulting file is then loaded.
    Existing intermediate files are reused rather than regenerated. The
    process is terminated with exit status 1 whenever a required program
    is missing or the input is not usable.

    :param sequence: str, path to a sequence data file.
    :param tree: str, path to a NEWICK tree file, or an empty value if the
        sequence file should be loaded directly.
    :param aligner: str, path to the executable of an alignment program,
        only required if the sequences are not of equal length.
    :param ancestor: str, path to the executable of an ancestral states
        reconstruction program, required whenever a tree is provided.
    :param wd: not used by this function.
    :param asr_model: model argument forwarded unchanged to ``asr.asr``.
    :param verbose: bool, forwarded to the alignment and ancestral states
        reconstruction steps.
    :return: tuple, the five values returned by ``_load`` (tree, rate,
        records, aps, size) followed by the path of the file that was
        loaded.
    """

    if tree:
        # NOTE(review): the result is discarded, so this call presumably
        # only validates the tree file - confirm against utilities.Tree.
        utilities.Tree(tree, leave=True)
        residues, lengths, aa = set(AMINO_ACIDS), [], []

        with open(sequence) as handle:
            line = handle.readline().strip()
            if not line.startswith('>'):
                error('NEWICK format tree was provided, but the sequence file '
                      'was not in the FASTA format.')
                sys.exit(1)

            handle.seek(0)
            for record in SeqIO.parse(handle, 'fasta'):
                lengths.append(len(record.seq))
                aa.append(set(record.seq).issubset(residues))

        if len(set(lengths)) == 1:
            # Equal-length records: the input is treated as an alignment,
            # and only needs trimming if it holds non amino acid symbols.
            alignment = sequence
            trimmed = alignment if all(aa) else _sequencing_trim(alignment)
        else:
            if not aligner:
                error('FASTA format sequence file was provided, but no '
                      'alignment program was provided.')
                sys.exit(1)

            aler, _ = msa._guess(aligner)
            outfile = ''.join(
                [utilities.basename(sequence), '.{}.fasta'.format(aler)])
            if os.path.isfile(outfile):
                info('Using pre-existed alignment file')
                trimmed = _sequencing_trim(outfile)
            else:
                trimmed = msa.msa(aligner,
                                  sequence,
                                  verbose=verbose,
                                  outfile=outfile,
                                  trimming=True)

        if not trimmed:
            # Report the reason instead of exiting silently.
            error('No trimmed alignment file was generated, ancestral '
                  'states reconstruction aborted.')
            sys.exit(1)

        if not ancestor:
            error('No ancestral reconstruction program was provided.')
            sys.exit(1)

        # Strip the suffix only; str.replace would also remove the same
        # text if it appeared earlier in the path.
        suffix = '.trimmed.fasta'
        if trimmed.endswith(suffix):
            name = trimmed[:-len(suffix)]
        else:
            name = trimmed

        aser, _ = asr._guess(ancestor)
        outfile = '{}.{}.tsv'.format(utilities.basename(name), aser)
        if os.path.isfile(outfile):
            info('Using pre-existed ancestral states sequence file.')
            sequence = outfile
        else:
            sequence = asr.asr(ancestor,
                               trimmed,
                               tree,
                               asr_model,
                               verbose=verbose,
                               outfile=outfile)

    tree, rate, records, aps, size = _load(sequence)
    return tree, rate, records, aps, size, sequence