Esempio n. 1
0
def main():
    '''
    Script execution starts here.

    Reads the command-line arguments, loads the gene sequences and the MLST
    scheme they name, collects the gene sequences of every requested
    sequence type, optionally aligns them with Muscle, and hands one
    concatenated sequence per sequence type to save_fasta.
    '''
    args = get_arguments()
    if args.align and not find_program('muscle'):
        quit_with_error(
            'Muscle must be installed to produce an aligned output.')

    # Re-key the loaded sequences through fix_allele_name so that they can be
    # looked up by the allele names stored in the scheme.
    gene_seqs = dict(load_fasta_file(args.gene_seqs))
    gene_seqs = {
        fix_allele_name(name): seq
        for name, seq in gene_seqs.items()  # .items() works on Python 2 and 3
    }

    mlst = MlstScheme(scheme_table=args.scheme)
    seqs_by_type_and_gene = {}  # key = ST value = list of gene sequences
    for st_num in get_types(args, mlst):
        seqs_by_type_and_gene[st_num] = []
        for allele in mlst.type_to_alleles[st_num]:
            if allele not in gene_seqs:
                # NOTE(review): quit_with_error is expected to terminate the
                # script; otherwise the lookup below raises KeyError.
                quit_with_error('Allele ' + allele +
                                ' not in gene sequence FASTA')
            seqs_by_type_and_gene[st_num].append(gene_seqs[allele])

    if args.align:
        seqs_by_type_and_gene = align_seqs(seqs_by_type_and_gene,
                                           args.muscle_args)

    # One (ST, concatenated sequence) record per sequence type, ordered by ST.
    cat_seqs = sorted((st, ''.join(seqs))
                      for st, seqs in seqs_by_type_and_gene.items())
    save_fasta(cat_seqs, args.out)
Esempio n. 2
0
def align_seqs(seqs_by_type, muscle_args):
    '''
    Uses Muscle to align the sequences. Alignments are performed independently for each gene (the
    sequences will be concatenated later).

    seqs_by_type maps each sequence type to its list of gene sequences. The number of genes is
    taken from one entry, so every list is expected to have the same length and gene order. The
    keys must be integers: Muscle's output is matched back to them via int() of the FASTA header.
    muscle_args is a string of extra Muscle command-line options, split on whitespace.

    Returns a new dict with the same keys, each mapped to its list of aligned gene sequences.
    An empty input gives an empty dict. Any detected Muscle failure is reported through
    quit_with_error.
    '''
    if not seqs_by_type:
        return {}
    aligned_seqs_by_type = {st_num: [] for st_num in seqs_by_type}
    # next(iter(...)) and .items() work on both Python 2 and Python 3.
    gene_count = len(next(iter(seqs_by_type.values())))
    command = ['muscle'] + muscle_args.split()
    for i in range(gene_count):
        # One FASTA record per sequence type, holding only the i-th gene.
        muscle_input = ''.join(
            '>' + str(st_num) + '\n' + add_line_breaks_to_sequence(seqs[i], 60)
            for st_num, seqs in seqs_by_type.items())
        # Text-mode pipes so that str goes in and str comes out on Python 3.
        process = subprocess.Popen(command,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   universal_newlines=True)
        muscle_output, err = process.communicate(input=muscle_input)
        if '*** ERROR ***' in err:
            muscle_error = err.split('*** ERROR ***')[1].strip()
            quit_with_error('Muscle alignment failed\n' + muscle_error)
        if 'Invalid command line option' in err:
            muscle_error = err.split('\n')[0].strip()
            quit_with_error('Muscle alignment failed\n' + muscle_error)
        if process.returncode != 0:
            # Catch failures that do not print either marker above, so that a
            # crashed run cannot silently yield missing sequences.
            quit_with_error('Muscle alignment failed\n' + err.strip())
        aligned_seqs = load_fasta_lines(muscle_output)
        for st_num, seq in aligned_seqs:
            aligned_seqs_by_type[int(st_num)].append(seq)
    return aligned_seqs_by_type
Esempio n. 3
0
def align_seqs(seqs_by_type, muscle_args):
    """
    Uses Muscle to align the sequences. Alignments are performed independently for each gene (the
    sequences will be concatenated later).

    seqs_by_type maps each sequence type to its list of gene sequences. The gene count is read
    from one entry, so all lists are expected to have the same length and gene order. Keys must
    be integers because Muscle's output records are matched back to them with int(header).
    muscle_args is a string of additional Muscle options; it is split on whitespace.

    Returns a new dict keyed like seqs_by_type whose values are the aligned gene sequences.
    An empty seqs_by_type yields an empty dict. Detected Muscle failures go to quit_with_error.
    """
    if not seqs_by_type:
        return {}
    aligned_seqs_by_type = {st_num: [] for st_num in seqs_by_type}
    # Portable replacement for the Python 2-only itervalues().next().
    gene_count = len(next(iter(seqs_by_type.values())))
    command = ["muscle"] + muscle_args.split()
    for i in range(gene_count):
        # Build the FASTA input for gene i: one record per sequence type.
        fasta_chunks = []
        for st_num, seqs in seqs_by_type.items():
            fasta_chunks.append(">" + str(st_num) + "\n")
            fasta_chunks.append(add_line_breaks_to_sequence(seqs[i], 60))
        muscle_input = "".join(fasta_chunks)
        # universal_newlines=True puts the pipes in text mode, which Python 3 needs
        # in order to send and receive str rather than bytes.
        process = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        muscle_output, err = process.communicate(input=muscle_input)
        if "*** ERROR ***" in err:
            muscle_error = err.split("*** ERROR ***")[1].strip()
            quit_with_error("Muscle alignment failed\n" + muscle_error)
        if "Invalid command line option" in err:
            muscle_error = err.split("\n")[0].strip()
            quit_with_error("Muscle alignment failed\n" + muscle_error)
        if process.returncode != 0:
            # A non-zero exit without a recognised message would otherwise leave
            # some sequence types silently short of aligned genes.
            quit_with_error("Muscle alignment failed\n" + err.strip())
        aligned_seqs = load_fasta_lines(muscle_output)
        for st_num, seq in aligned_seqs:
            aligned_seqs_by_type[int(st_num)].append(seq)
    return aligned_seqs_by_type
Esempio n. 4
0
def main():
    """
    Script execution starts here.

    Parses the arguments, loads the gene sequences and the MLST scheme, gathers the gene
    sequences belonging to each requested sequence type, aligns them with Muscle when
    requested, and passes one concatenated sequence per sequence type to save_fasta.
    """
    args = get_arguments()
    if args.align and not find_program("muscle"):
        quit_with_error("Muscle must be installed to produce an aligned output.")

    # Sequences are re-keyed through fix_allele_name so the scheme's allele names find them.
    # .items() is used instead of the Python 2-only .iteritems() so this runs on Python 3 too.
    loaded_seqs = dict(load_fasta_file(args.gene_seqs))
    gene_seqs = {fix_allele_name(name): seq for name, seq in loaded_seqs.items()}

    mlst = MlstScheme(scheme_table=args.scheme)
    seqs_by_type_and_gene = {}  # key = ST value = list of gene sequences
    for st_num in get_types(args, mlst):
        seqs_by_type_and_gene[st_num] = []
        for allele in mlst.type_to_alleles[st_num]:
            if allele not in gene_seqs:
                # NOTE(review): relies on quit_with_error ending the script; if it returned,
                # the lookup on the next line would raise KeyError.
                quit_with_error("Allele " + allele + " not in gene sequence FASTA")
            seqs_by_type_and_gene[st_num].append(gene_seqs[allele])

    if args.align:
        seqs_by_type_and_gene = align_seqs(seqs_by_type_and_gene, args.muscle_args)

    # Concatenate each sequence type's genes and order the records by sequence type.
    cat_seqs = sorted((st, "".join(seqs)) for st, seqs in seqs_by_type_and_gene.items())
    save_fasta(cat_seqs, args.out)
Esempio n. 5
0
def get_types(args, mlst):
    """
    This function returns a list of the sequence types to output based on the user's arguments.
    It also checks for problems with the chosen types.

    When args.all is set, every sequence type in mlst.type_to_alleles is used; otherwise
    args.types is read as a comma-separated list. Each entry is converted with string_to_int.
    The result is returned sorted.

    Problems are reported through quit_with_error: a scheme with no sequence types, an entry
    that string_to_int turns into None, a sequence type missing from the scheme, and duplicates.
    """
    if args.all:
        types = mlst.type_to_alleles.keys()
        if not types:
            quit_with_error("The given MLST scheme has no sequence types.")
    else:
        types = args.types.split(",")
    types = [string_to_int(x) for x in types]
    # Validate before sorting: Python 3 cannot order None against ints, so sorting first
    # would raise TypeError instead of reporting the badly formatted input.
    if None in types:
        quit_with_error("One or more sequence types is incorrectly formatted.")
    types.sort()
    for st_num in types:
        if st_num not in mlst.type_to_alleles:
            quit_with_error("Sequence type " + str(st_num) + " is not in the given MLST scheme.")
    if len(types) != len(set(types)):
        quit_with_error("Not all sequence types are unique (there are duplicates).")
    return types
Esempio n. 6
0
def get_types(args, mlst):
    '''
    This function returns a list of the sequence types to output based on the user's arguments.
    It also checks for problems with the chosen types.

    With args.all set, all keys of mlst.type_to_alleles are used; otherwise args.types is split
    on commas. Every entry goes through string_to_int and the converted list is returned sorted.

    quit_with_error is called when the scheme has no sequence types, when string_to_int gives
    None for an entry, when a sequence type is not in the scheme, or when there are duplicates.
    '''
    if args.all:
        requested = mlst.type_to_alleles.keys()
        if not requested:
            quit_with_error('The given MLST scheme has no sequence types.')
    else:
        requested = args.types.split(',')
    types = [string_to_int(x) for x in requested]
    # The None check has to precede the sort: on Python 3, ordering None
    # against ints raises TypeError, which would hide the message below.
    if None in types:
        quit_with_error('One or more sequence types is incorrectly formatted.')
    types = sorted(types)
    for st_num in types:
        if st_num not in mlst.type_to_alleles:
            quit_with_error('Sequence type ' + str(st_num) +
                            ' is not in the given MLST scheme.')
    if len(types) != len(set(types)):
        quit_with_error(
            'Not all sequence types are unique (there are duplicates).')
    return types