def main():
    '''
    Script execution starts here.

    Loads the gene sequences and the MLST scheme, collects the allele sequences
    for each requested sequence type (ST), optionally aligns them gene-by-gene
    with Muscle, then concatenates each ST's genes and saves the result as
    FASTA to args.out.
    '''
    args = get_arguments()
    if args.align and not find_program('muscle'):
        quit_with_error(
            'Muscle must be installed to produce an aligned output.')

    # Re-key the loaded sequences by their fixed-up allele name so they can be
    # looked up using the allele names stored in the MLST scheme.
    loaded_seqs = dict(load_fasta_file(args.gene_seqs))
    # .items() (rather than the Python-2-only .iteritems()) runs on both
    # Python 2 and Python 3.
    gene_seqs = {fix_allele_name(name): seq
                 for name, seq in loaded_seqs.items()}

    mlst = MlstScheme(scheme_table=args.scheme)

    seqs_by_type_and_gene = {}  # key = ST, value = list of gene sequences
    for st_num in get_types(args, mlst):
        st_seqs = []
        for allele in mlst.type_to_alleles[st_num]:
            if allele not in gene_seqs:
                # NOTE(review): quit_with_error is assumed to terminate the
                # script; otherwise the lookup below raises KeyError.
                quit_with_error('Allele ' + allele +
                                ' not in gene sequence FASTA')
            st_seqs.append(gene_seqs[allele])
        seqs_by_type_and_gene[st_num] = st_seqs

    if args.align:
        seqs_by_type_and_gene = align_seqs(seqs_by_type_and_gene,
                                           args.muscle_args)

    # One concatenated sequence per ST, ordered by ST.
    cat_seqs = sorted((st, ''.join(seqs))
                      for st, seqs in seqs_by_type_and_gene.items())
    save_fasta(cat_seqs, args.out)
def align_seqs(seqs_by_type, muscle_args):
    '''
    Uses Muscle to align the sequences.  Alignments are performed independently
    for each gene (the sequences will be concatenated later).

    seqs_by_type: dictionary with ST as key and a list of gene sequences as
                  value.  The number of genes is taken from the first entry.
    muscle_args:  string of additional command line arguments for Muscle (it
                  is split on whitespace).

    Returns a dictionary with the same keys as seqs_by_type, where each value
    is the list of aligned gene sequences for that ST.  An empty input gives
    an empty dictionary without running Muscle.
    '''
    # Nothing to align.  This also avoids asking an empty dictionary for its
    # first value, which would raise StopIteration.
    if not seqs_by_type:
        return {}

    aligned_seqs_by_type = {st_num: [] for st_num in seqs_by_type}
    # .values()/.items() run on both Python 2 and Python 3, unlike the
    # Python-2-only .itervalues()/.iteritems().
    gene_count = len(next(iter(seqs_by_type.values())))
    command = ['muscle'] + muscle_args.split()

    for i in range(gene_count):
        # Build one FASTA record per ST for gene i, using the ST as the record
        # name so each aligned sequence can be matched back to its ST.
        # NOTE(review): add_line_breaks_to_sequence is presumed to wrap the
        # sequence at 60 characters and to end with a newline - confirm.
        muscle_input = ''.join(
            '>' + str(st_num) + '\n' + add_line_breaks_to_sequence(seqs[i], 60)
            for st_num, seqs in seqs_by_type.items())

        # universal_newlines=True gives text-mode pipes, so str input/output
        # works on Python 3 as well as Python 2.
        process = subprocess.Popen(command, stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   universal_newlines=True)
        muscle_output, err = process.communicate(input=muscle_input)

        if '*** ERROR ***' in err:
            muscle_error = err.split('*** ERROR ***')[1].strip()
            quit_with_error('Muscle alignment failed\n' + muscle_error)
        if 'Invalid command line option' in err:
            muscle_error = err.split('\n')[0].strip()
            quit_with_error('Muscle alignment failed\n' + muscle_error)

        # The record names in Muscle's output are the ST numbers written above.
        for st_num, seq in load_fasta_lines(muscle_output):
            aligned_seqs_by_type[int(st_num)].append(seq)

    return aligned_seqs_by_type
def align_seqs(seqs_by_type, muscle_args):
    """
    Uses Muscle to align the sequences.  Alignments are performed independently
    for each gene (the sequences will be concatenated later).

    seqs_by_type: dictionary with ST as key and a list of gene sequences as
                  value.  The number of genes is taken from the first entry.
    muscle_args:  string of additional command line arguments for Muscle (it
                  is split on whitespace).

    Returns a dictionary with the same keys as seqs_by_type, where each value
    is the list of aligned gene sequences for that ST.  An empty input gives
    an empty dictionary without running Muscle.
    """
    # Nothing to align.  This also avoids asking an empty dictionary for its
    # first value, which would raise StopIteration.
    if not seqs_by_type:
        return {}

    aligned_seqs_by_type = {st_num: [] for st_num in seqs_by_type}
    # .values()/.items() run on both Python 2 and Python 3, unlike the
    # Python-2-only .itervalues()/.iteritems().
    gene_count = len(next(iter(seqs_by_type.values())))
    command = ["muscle"] + muscle_args.split()

    for i in range(gene_count):
        # Build one FASTA record per ST for gene i, using the ST as the record
        # name so each aligned sequence can be matched back to its ST.
        # NOTE(review): add_line_breaks_to_sequence is presumed to wrap the
        # sequence at 60 characters and to end with a newline - confirm.
        muscle_input = "".join(
            ">" + str(st_num) + "\n" + add_line_breaks_to_sequence(seqs[i], 60)
            for st_num, seqs in seqs_by_type.items()
        )

        # universal_newlines=True gives text-mode pipes, so str input/output
        # works on Python 3 as well as Python 2.
        process = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        muscle_output, err = process.communicate(input=muscle_input)

        if "*** ERROR ***" in err:
            muscle_error = err.split("*** ERROR ***")[1].strip()
            quit_with_error("Muscle alignment failed\n" + muscle_error)
        if "Invalid command line option" in err:
            muscle_error = err.split("\n")[0].strip()
            quit_with_error("Muscle alignment failed\n" + muscle_error)

        # The record names in Muscle's output are the ST numbers written above.
        for st_num, seq in load_fasta_lines(muscle_output):
            aligned_seqs_by_type[int(st_num)].append(seq)

    return aligned_seqs_by_type
def main():
    """
    Script execution starts here.

    Loads the gene sequences and the MLST scheme, collects the allele sequences
    for each requested sequence type (ST), optionally aligns them gene-by-gene
    with Muscle, then concatenates each ST's genes and saves the result as
    FASTA to args.out.
    """
    args = get_arguments()
    if args.align and not find_program("muscle"):
        quit_with_error("Muscle must be installed to produce an aligned output.")

    # Re-key the loaded sequences by their fixed-up allele name so they can be
    # looked up using the allele names stored in the MLST scheme.
    loaded_seqs = dict(load_fasta_file(args.gene_seqs))
    # .items() (rather than the Python-2-only .iteritems()) runs on both
    # Python 2 and Python 3.
    gene_seqs = {fix_allele_name(name): seq for name, seq in loaded_seqs.items()}

    mlst = MlstScheme(scheme_table=args.scheme)

    seqs_by_type_and_gene = {}  # key = ST, value = list of gene sequences
    for st_num in get_types(args, mlst):
        st_seqs = []
        for allele in mlst.type_to_alleles[st_num]:
            if allele not in gene_seqs:
                # NOTE(review): quit_with_error is assumed to terminate the
                # script; otherwise the lookup below raises KeyError.
                quit_with_error("Allele " + allele + " not in gene sequence FASTA")
            st_seqs.append(gene_seqs[allele])
        seqs_by_type_and_gene[st_num] = st_seqs

    if args.align:
        seqs_by_type_and_gene = align_seqs(seqs_by_type_and_gene, args.muscle_args)

    # One concatenated sequence per ST, ordered by ST.
    cat_seqs = sorted(
        (st, "".join(seqs)) for st, seqs in seqs_by_type_and_gene.items()
    )
    save_fasta(cat_seqs, args.out)
def get_types(args, mlst):
    """
    This function returns a list of the sequence types to output based on the
    user's arguments.  It also checks for problems with the chosen types.

    args: parsed arguments; args.all selects every ST in the scheme, otherwise
          args.types is a comma-delimited string of ST numbers.
    mlst: the MLST scheme; mlst.type_to_alleles is keyed by ST.

    Returns a list of STs.  User-specified STs are returned sorted; with
    args.all the STs are returned in the scheme's own key order.  Any problem
    found is reported via quit_with_error (assumed not to return).
    """
    if args.all:
        # list() guarantees a real list even where keys() returns a view.
        types = list(mlst.type_to_alleles.keys())
        if not types:
            quit_with_error("The given MLST scheme has no sequence types.")
        return types

    # string_to_int evidently gives None for text that is not a valid integer.
    types = [string_to_int(x) for x in args.types.split(",")]

    # Validate before sorting: ordering None against integers is an error on
    # Python 3, which would hide the intended message behind a TypeError.
    if None in types:
        quit_with_error("One or more sequence types is incorrectly formatted.")
    types.sort()

    for st_num in types:
        if st_num not in mlst.type_to_alleles:
            quit_with_error(
                "Sequence type " + str(st_num) + " is not in the given MLST scheme."
            )
    if len(types) != len(set(types)):
        quit_with_error("Not all sequence types are unique (there are duplicates).")
    return types
def get_types(args, mlst):
    '''
    This function returns a list of the sequence types to output based on the
    user's arguments.  It also checks for problems with the chosen types.

    args: parsed arguments; args.all selects every ST in the scheme, otherwise
          args.types is a comma-delimited string of ST numbers.
    mlst: the MLST scheme; mlst.type_to_alleles is keyed by ST.

    Returns a list of STs.  User-specified STs are returned sorted; with
    args.all the STs are returned in the scheme's own key order.  Any problem
    found is reported via quit_with_error (assumed not to return).
    '''
    if args.all:
        # list() guarantees a real list even where keys() returns a view.
        types = list(mlst.type_to_alleles.keys())
        if not types:
            quit_with_error('The given MLST scheme has no sequence types.')
        return types

    # string_to_int evidently gives None for text that is not a valid integer.
    types = [string_to_int(x) for x in args.types.split(',')]

    # Validate before sorting: ordering None against integers is an error on
    # Python 3, which would hide the intended message behind a TypeError.
    if None in types:
        quit_with_error('One or more sequence types is incorrectly formatted.')
    types.sort()

    for st_num in types:
        if st_num not in mlst.type_to_alleles:
            quit_with_error('Sequence type ' + str(st_num) +
                            ' is not in the given MLST scheme.')
    if len(types) != len(set(types)):
        quit_with_error(
            'Not all sequence types are unique (there are duplicates).')
    return types