def mafft(fasta_file, output_dir, output_ext, seq_type, cpus, anysymbol):
    """Align the sequences in a fasta file with the external mafft program.

    Args:
        fasta_file: Path of the fasta file to align.
        output_dir: Directory the command is run in and where the alignment
            file is written.
        output_ext: Extension handed to util.file_name to build the name of
            the alignment file.
        seq_type: 'aa' selects mafft's --amino mode, after the input has been
            passed through bio.adjust_aa_seqs; any other value selects --nuc.
        cpus: Value passed to mafft's --thread option.
        anysymbol: When truthy, mafft's --anysymbol flag is added.

    Returns:
        The alignment file name as produced by util.file_name.

    Raises:
        subprocess.CalledProcessError: If mafft exits with a non-zero status.
    """
    is_amino = seq_type == 'aa'

    in_path = fasta_file
    if is_amino:
        in_path = bio.adjust_aa_seqs(fasta_file, output_dir)

    # Keep the command as an argv list (one element per argument) so that it
    # can be run without a shell: paths containing spaces or shell
    # metacharacters then reach mafft unchanged.
    cmd = [
        'mafft',
        '--amino' if is_amino else '--nuc',
        '--thread', str(cpus),
    ]

    # Add the flag exactly once, and only when requested, rather than
    # inserting empty placeholders or repeating it per strategy.
    if anysymbol:
        cmd.append('--anysymbol')

    # Large inputs let mafft pick its own strategy; smaller ones use the
    # iterative --genafpair method.
    if (bio.fasta_record_count(in_path) >= bio.SEQ_COUNT_CUTOFF
            or bio.longest_fasta_seq(in_path) >= bio.SEQ_LEN_CUTOFF):
        cmd.append('--auto')
    else:
        cmd += ['--genafpair', '--maxiterate', str(MAX_ITERATE)]

    cmd.append(in_path)

    aligned = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        # mafft writes the alignment to stdout; capture it and save it.
        result = subprocess.check_output(cmd)
        with open(aligned, 'wb') as out_file:
            out_file.write(result)

    return aligned
Esempio n. 2
0
def seq_too_long(fasta, seq_type):
    """Log a warning when the longest sequence in a fasta file is too long."""
    max_len = bio.longest_fasta_seq(fasta)

    # Nothing to report unless bio.seqs_too_long flags this length.
    if not bio.seqs_too_long(max_len, seq_type):
        return

    record_total = bio.fasta_record_count(fasta)
    message = """{} has {} sequences.
            The longest is {} characters.
            This is too long and may crash the alignment process.
            """.format(fasta, record_total, max_len)
    logging.warning(util.shorten(message))
def fa2tree(args):
    """Build a tree for each input fasta file.

    Each file in args.input_files is logged, converted to an absolute path
    and handed to one of the builders: the bootstrap builder when
    args.bootstrap is set, the "big" builder when the record count reaches
    bio.SEQ_COUNT_CUTOFF, and the default builder otherwise.
    """
    for input_file in args.input_files:
        logging.info('fa2tree input: {}'.format(input_file))
        full_path = abspath(input_file)

        # Pick the builder first, then call it once below. The record count
        # is only computed when bootstrapping is off.
        if args.bootstrap:
            build = fa2tree_bs
        elif bio.fasta_record_count(full_path) >= bio.SEQ_COUNT_CUTOFF:
            build = fa2tree_big
        else:
            build = fa2tree_default

        build(args, full_path)
Esempio n. 4
0
def fasta_to_tree(args):
    """Build a tree for each fasta file found in the input directory.

    Files come from util.get_input_files(args.input_dir, args.input_filter).
    With args.bootstrap set, a file is handed to fasta_to_tree_bs. Files
    whose record count reaches bio.SEQ_COUNT_CUTOFF are skipped. Every other
    file is aligned (mafft), cleaned (pxclsq) and passed to raxml.
    """
    input_files = util.get_input_files(args.input_dir, args.input_filter)

    for fasta in input_files:
        if args.bootstrap:
            fasta_to_tree_bs(args, fasta)
            continue

        # Large files are not processed on this path.
        if bio.fasta_record_count(fasta) >= bio.SEQ_COUNT_CUTOFF:
            continue

        # NOTE(review): mafft is called with five positional arguments here;
        # confirm this matches the signature of the mafft that is in scope.
        alignment = mafft(
            fasta, args.output_dir, args.seq_type, args.cpus, args.anysymbol)

        trimmed = pxclsq(
            alignment, args.output_dir, args.seq_type, args.min_occupancy,
            args.min_seq_len)

        raxml(trimmed, args.output_dir, args.seq_type, args.cpus, args.seed)
Esempio n. 5
0
def too_few_records(fasta):
    """Warn when a fasta file holds fewer than bio.MIN_SEQ records."""
    record_total = bio.fasta_record_count(fasta)
    if record_total < bio.MIN_SEQ:
        template = '"{}" has fewer than {} records, skipping.'
        logging.warning(template.format(fasta, bio.MIN_SEQ))