def mafft(fasta_file, output_dir, output_ext, seq_type, cpus, anysymbol):
    """Align sequences with MAFFT and save the alignment to a file.

    When ``seq_type`` is ``'aa'`` the input is first passed through
    ``bio.adjust_aa_seqs`` and the adjusted file is aligned instead.

    The alignment strategy depends on the input size: if the record count
    reaches ``bio.SEQ_COUNT_CUTOFF`` or the longest sequence reaches
    ``bio.SEQ_LEN_CUTOFF`` MAFFT is run with ``--auto``; otherwise it is run
    with ``--genafpair --maxiterate MAX_ITERATE``.

    Args:
        fasta_file: Path of the FASTA file to align.
        output_dir: Directory entered (via ``util.cd``) while MAFFT runs and
            the result is written.
        output_ext: Extension handed to ``util.file_name`` to derive the
            output file name.
        seq_type: ``'aa'`` selects ``--amino``; anything else ``--nuc``.
        cpus: Value passed to MAFFT's ``--thread`` option.
        anysymbol: When truthy, ``--anysymbol`` is added to the command.

    Returns:
        The output file name produced by ``util.file_name``.

    Raises:
        subprocess.CalledProcessError: If MAFFT exits with a non-zero status.
    """
    is_amino = seq_type == 'aa'
    in_path = fasta_file
    if is_amino:
        in_path = bio.adjust_aa_seqs(fasta_file, output_dir)

    # Build an argv list (one element per argument) so the command can run
    # without a shell; paths containing spaces or shell metacharacters are
    # then passed to MAFFT verbatim.
    cmd = ['mafft', '--amino' if is_amino else '--nuc', '--thread', str(cpus)]

    # Add the flag exactly once, and only when requested: an empty-string
    # placeholder would reach MAFFT as a bogus empty argument.
    if anysymbol:
        cmd.append('--anysymbol')

    if (bio.fasta_record_count(in_path) >= bio.SEQ_COUNT_CUTOFF
            or bio.longest_fasta_seq(in_path) >= bio.SEQ_LEN_CUTOFF):
        cmd.append('--auto')
    else:
        cmd += ['--genafpair', '--maxiterate', str(MAX_ITERATE)]

    cmd.append(in_path)

    aligned = util.file_name(fasta_file, output_ext)

    with util.cd(output_dir):
        # MAFFT writes the alignment to stdout; capture it as bytes.
        result = subprocess.check_output(cmd)
        with open(aligned, 'wb') as out_file:
            out_file.write(result)

    return aligned
def seq_too_long(fasta, seq_type):
    """Log a warning when the longest sequence in a FASTA file is too long.

    The longest sequence length is obtained from ``bio.longest_fasta_seq``
    and judged by ``bio.seqs_too_long`` for the given ``seq_type``. Nothing
    is logged when the check passes. The function has no explicit return
    value.
    """
    max_len = bio.longest_fasta_seq(fasta)
    if not bio.seqs_too_long(max_len, seq_type):
        return

    # Only count the records once we know a warning will be emitted.
    n_records = bio.fasta_record_count(fasta)
    template = (
        """{} has {} sequences. The longest is {} characters. """
        """This is too long and may crash the alignment process. """)
    message = util.shorten(template.format(fasta, n_records, max_len))
    logging.warning(message)
def fa2tree(args):
    """Build a tree for every FASTA file listed in ``args.input_files``.

    Each input is logged, converted to an absolute path and then handed to
    one of three builders: ``fa2tree_bs`` when ``args.bootstrap`` is set,
    ``fa2tree_big`` when the file holds at least ``bio.SEQ_COUNT_CUTOFF``
    records, and ``fa2tree_default`` otherwise.
    """
    for given_path in args.input_files:
        # The log line shows the path as supplied, before it is made absolute.
        logging.info('fa2tree input: {}'.format(given_path))
        fasta_path = abspath(given_path)

        # Pick the builder first, then invoke it once.
        if args.bootstrap:
            build_tree = fa2tree_bs
        elif bio.fasta_record_count(fasta_path) >= bio.SEQ_COUNT_CUTOFF:
            build_tree = fa2tree_big
        else:
            build_tree = fa2tree_default

        build_tree(args, fasta_path)
def fasta_to_tree(args):
    """Build trees from the FASTA files selected by the input arguments.

    Files come from ``util.get_input_files(args.input_dir,
    args.input_filter)``. With ``args.bootstrap`` set each file goes to
    ``fasta_to_tree_bs``. Files with at least ``bio.SEQ_COUNT_CUTOFF``
    records are passed over without any processing. Every other file is run
    through ``mafft``, then ``pxclsq``, then ``raxml``.
    """
    input_files = util.get_input_files(args.input_dir, args.input_filter)

    for fasta in input_files:
        if args.bootstrap:
            fasta_to_tree_bs(args, fasta)
            continue

        if bio.fasta_record_count(fasta) >= bio.SEQ_COUNT_CUTOFF:
            # This branch is a no-op: large inputs get no tree here.
            continue

        # NOTE(review): the mafft() defined in this file takes six positional
        # parameters (including output_ext); this call supplies five --
        # confirm which signature is intended.
        alignment = mafft(
            fasta,
            args.output_dir,
            args.seq_type,
            args.cpus,
            args.anysymbol)

        trimmed = pxclsq(
            alignment,
            args.output_dir,
            args.seq_type,
            args.min_occupancy,
            args.min_seq_len)

        raxml(
            trimmed,
            args.output_dir,
            args.seq_type,
            args.cpus,
            args.seed)
def too_few_records(fasta):
    """Log a warning when a FASTA file has fewer than ``bio.MIN_SEQ`` records.

    The function only logs; it has no explicit return value.
    """
    record_count = bio.fasta_record_count(fasta)
    if record_count >= bio.MIN_SEQ:
        return

    message = '"{}" has fewer than {} records, skipping.'.format(
        fasta, bio.MIN_SEQ)
    logging.warning(message)