def create_crosstable(sico_files, target_crosstable):
    """Create crosstable with vertically the orthologs, horizontally the genomes, and gene IDs at intersections.

    Record IDs are split on '|': the first field is used as the genome and the third field as the gene ID.
    After the genome columns every row carries the comma separated COGs found in the ortholog and the product
    reported by get_most_recent_gene_name.

    sico_files: paths of fasta files, one per ortholog; the file name up to its first '.' names the ortholog
    target_crosstable: path of the tab separated file that is written
    """
    with open(target_crosstable, mode='w') as write_handle:
        #Create dictionaries mapping genomes to gene IDs per sico file
        row_data = [(sico_file,
                     dict(itemgetter(0, 2)(fasta_record.id.split('|'))
                          for fasta_record in SeqIO.parse(sico_file, 'fasta')))
                    for sico_file in sico_files]

        #Retrieve unique genomes across all sico files, just to be safe
        genomes = sorted(set(genome for _, row in row_data for genome in row))
        genome_dicts = select_genomes_by_ids(genomes).values()

        #Write out header: empty corner cell, one column per genome, then the two trailing columns.
        #The header has to be terminated here, or the first ortholog would end up on the header line.
        write_handle.write('\t' + '\t'.join(genomes))
        write_handle.write('\tCOGs\tProduct\n')

        for sico_file, row in row_data:
            ortholog = os.path.split(sico_file)[1].split('.')[0]
            write_handle.write(ortholog + '\t')
            #Genomes without a gene in this ortholog get an empty cell
            write_handle.write('\t'.join(row.get(genome, '') for genome in genomes))

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(sico_file, 'fasta'))

            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))

            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            #New line
            write_handle.write('\n')
Example #2
    def __init__(self, alignment, genomes):
        """Keep the alignment and seed the values mapping with the codon count and the product name.

        alignment: indexable collection of sequences; its length is stored as the number of strains and the
            length of its first sequence as the sequence length
        genomes: passed on to get_most_recent_gene_name together with the alignment
        """
        strain_count = len(alignment)
        first_sequence_length = len(alignment[0])

        self.alignment = alignment
        self.nr_of_strains = strain_count
        self.sequence_lengths = first_sequence_length

        # Keys that were never assigned read as 0
        self.values = defaultdict(int)

        # The most basic calculation added to the output file: whole codons in the first sequence
        self.values[CODONS] = first_sequence_length // 3

        # Get the most recent gene name for the strains in a given clade_calcs instance
        product = get_most_recent_gene_name(genomes, self.alignment)
        self.values[PRODUCT] = product
 def _occurences_and_cogs(genome_ids, ortholog_files):
     """Generator that returns how many sequences exist per genome in each ortholog in order and which COGs occur.

     genome_ids: genome identifiers; the counts in every yielded item follow this order
     ortholog_files: paths of fasta files, one per ortholog

     Yields, per ortholog file, a tuple of: the sequence count per genome, the file name without its extension,
     the sorted COGs and the product reported by get_most_recent_gene_name.
     """
     genomes = select_genomes_by_ids(genome_ids).values()
     for fasta_file in ortholog_files:
         records = tuple(SeqIO.parse(fasta_file, 'fasta'))

         #The genome is the part of the record ID before the first '|'
         ids = [record.id.split('|')[0] for record in records]
         count_per_id = [ids.count(genome_id) for genome_id in genome_ids]
         cogs = sorted(find_cogs_in_sequence_records(records))
         ortholog_nr = os.path.splitext(os.path.split(fasta_file)[1])[0]
         for record in records:
             #SeqIO mucks up ids containing spaces, so we have to assign description as value for id
             #This has to happen after the genome IDs above were taken from the untouched ids
             record.id = record.description
         product = get_most_recent_gene_name(genomes, records)
         yield count_per_id, ortholog_nr, cogs, product
Example #4
def _phipack_for_all_orthologs(run_dir, aligned_files, stats_file):
    """Run PhiPack for every aligned ortholog file and write the values found to a tab separated stats file.

    run_dir: directory inside which a 'phipack' directory is created, which is handed to run_phipack
    aligned_files: paths of aligned fasta files, one per ortholog; genome IDs are read from the first file only
    stats_file: path of the file that is written: a header line followed by one line per ortholog

    NOTE(review): an earlier docstring spoke of filtering out orthologs showing recombination and returning two
    collections of aligned files, but nothing in this function classifies files or returns a value. Confirm
    against the callers whether that behaviour is still expected.
    """
    #Imported locally so this function does not depend on how the module names its logger
    import logging

    #NOTE(review): info level is presumed here - confirm against the module's other log calls
    logging.info('Running PhiPack for %i orthologs to find recombination', len(aligned_files))

    #Create separate directory for phipack related values
    phipack_dir = create_directory('phipack', inside_dir=run_dir)

    with open(stats_file, mode='w') as write_handle:
        #Header columns line up with the values written per ortholog below
        #NOTE(review): the 'Ortholog', 'Phi', 'NSS' and 'COGs' names were reconstructed from those values - confirm
        write_handle.write('\t'.join(['Ortholog',
                                      'Informative sites',
                                      'Phi',
                                      'Max Chi^2',
                                      'NSS',
                                      'COGs',
                                      'Product']) + '\n')

        #Retrieve unique genomes from first ortholog file
        genome_ids = set(fasta_record.id.split('|')[0] for fasta_record in SeqIO.parse(aligned_files[0], 'fasta'))
        genome_dicts = select_genomes_by_ids(genome_ids).values()

        #Write one line of values per ortholog file
        for ortholog_file in aligned_files:
            orth_name = os.path.split(ortholog_file)[1].split('.')[0]

            #Run PhiPack for this ortholog; the result is indexed by value name in the format string below
            phipack_values = run_phipack(phipack_dir, ortholog_file)

            #Write PhiPack values to line
            write_handle.write('{0}\t{1[PhiPack sites]}\t{1[Phi]}\t{1[Max Chi^2]}\t{1[NSS]}'.format(orth_name,
                                                                                                    phipack_values))

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(ortholog_file, 'fasta'))
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            #End line
            write_handle.write('\n')
Example #5
def create_crosstable(sico_files, target_crosstable):
    """Create crosstable with vertically the orthologs, horizontally the genomes, and gene IDs at intersections.

    Record IDs are split on '|': the first field is used as the genome and the third field as the gene ID.
    After the genome columns every row carries the comma separated COGs found in the ortholog and the product
    reported by get_most_recent_gene_name.

    sico_files: paths of fasta files, one per ortholog; the file name up to its first '.' names the ortholog
    target_crosstable: path of the tab separated file that is written
    """
    with open(target_crosstable, mode='w') as write_handle:
        #Create dictionaries mapping genomes to gene IDs per sico file
        row_data = [(sico_file,
                     dict(itemgetter(0, 2)(fasta_record.id.split('|'))
                          for fasta_record in SeqIO.parse(sico_file, 'fasta')))
                    for sico_file in sico_files]

        #Retrieve unique genomes across all sico files, just to be safe
        genomes = sorted(set(genome for _, row in row_data for genome in row))
        genome_dicts = select_genomes_by_ids(genomes).values()

        #Write out header: empty corner cell, one column per genome, then the two trailing columns.
        #The header has to be terminated here, or the first ortholog would end up on the header line.
        write_handle.write('\t' + '\t'.join(genomes))
        write_handle.write('\tCOGs\tProduct\n')

        for sico_file, row in row_data:
            ortholog = os.path.split(sico_file)[1].split('.')[0]
            write_handle.write(ortholog + '\t')
            #Genomes without a gene in this ortholog get an empty cell
            write_handle.write('\t'.join(
                row.get(genome, '') for genome in genomes))

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(sico_file, 'fasta'))

            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))

            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            #New line
            write_handle.write('\n')
Example #6
def calculate_tables(genome_ids_a, genome_ids_b, sico_files, oddeven=False):
    """Compute a spreadsheet of data points each for A and B based the SICO files, without duplicating computations.

    genome_ids_a, genome_ids_b: genome identifiers of the two clades; a sequence is assigned to a clade when the
        part of its record ID before the first '|' is among that clade's identifiers
    sico_files: paths of aligned fasta files, one per ortholog; the file name up to its first '.' names the ortholog
    oddeven: when true, repeat the calculations for alignments of only the odd and of only the even codons

    Returns the pair of tables for A and B as produced by _tables_for_split_alignments. With oddeven the pair holds
    the paths of two temporary .tsv files, to which the full, odd and even tables were handed via concatenate.
    """
    #Imported locally so this function does not depend on how the module names its logger
    #NOTE(review): info level is presumed for the log calls below - confirm against the module's other log calls
    import logging

    #Convert file names into identifiers while preserving filenames, as filenames are used both for BioPython & PhiPack
    orth_files = [(os.path.split(sico_file)[1].split('.')[0], sico_file)
                  for sico_file in sico_files]

    #Find PhiPack values for each sico file
    orth_phipack_values = _phipack_values_for_sicos(orth_files)

    #Convert list of sico files into ortholog name mapped to BioPython Alignment object
    sico_alignments = [(ortholog, AlignIO.read(sico_file, 'fasta'))
                       for ortholog, sico_file in orth_files]

    #Only retrieve genomes once which we'll use to link gene names to orthologs
    #Both clades are included, as the alignments searched for gene names hold sequences of A as well as B
    all_genome_ids = list(genome_ids_a) + list(genome_ids_b)
    genomes = select_genomes_by_ids(all_genome_ids).values()

    #For each ortholog, determine the newest gene name across taxa so unannotated taxa also get gene names
    ortholog_gene_names = dict(
        (ortholog, get_most_recent_gene_name(genomes, alignmnt))
        for ortholog, alignmnt in sico_alignments)

    #Split individual sico alignments into separate alignments for each of the clades per ortholog
    #These split alignments can later be reversed and/or subselections can be made to calculate for alternate alignments
    split_alignments = [
        (ortholog,
         MultipleSeqAlignment(seqr for seqr in alignmnt
                              if seqr.id.split('|')[0] in genome_ids_a),
         MultipleSeqAlignment(seqr for seqr in alignmnt
                              if seqr.id.split('|')[0] in genome_ids_b))
        for ortholog, alignmnt in sico_alignments]

    #Calculate tables for normal sico alignments
    logging.info('Starting calculations for full alignments')
    table_a, table_b = _tables_for_split_alignments(split_alignments, ortholog_gene_names, orth_phipack_values)

    if not oddeven:
        return table_a, table_b

    #As an alternate method of calculating number of substitutions for independent X-axis of eventual graph:
    #split each alignment for a and b into two further alignments of odd and even codons
    odd_even_split_orth_alignments = [
        (orthologname,
         _every_other_codon_alignments(alignment_x),
         _every_other_codon_alignments(alignment_y))
        for orthologname, alignment_x, alignment_y in split_alignments]

    def _tables_for_codon_subset(parity, position):
        """Calculate the tables for the 'odd' (position 0) or 'even' (position 1) codon alignments."""
        #Recover the requested alignment from each pair of odd & even alignments
        subset_alignments = [(orthologname, odd_even_x[position], odd_even_y[position])
                             for orthologname, odd_even_x, odd_even_y in odd_even_split_orth_alignments]

        #Create files for all these codon alignments, so we can run PhiPack for them
        alignments_dir = tempfile.mkdtemp(prefix=parity + '_codon_alignments_')
        subset_files = dict(
            (ortholog, os.path.join(alignments_dir, ortholog + '.ffn'))
            for ortholog, _, _ in subset_alignments)
        for ortholog, subset_x, subset_y in subset_alignments:
            AlignIO.write([subset_x, subset_y], subset_files[ortholog], 'fasta')
        phipack_vals = _phipack_values_for_sicos(subset_files.items())

        logging.info('Starting calculations for %s alignments', parity)
        return _tables_for_split_alignments(subset_alignments, ortholog_gene_names, phipack_vals)

    #Odd alignments are the first of each pair, even alignments the second
    table_a_odd, table_b_odd = _tables_for_codon_subset('odd', 0)
    table_a_even, table_b_even = _tables_for_codon_subset('even', 1)

    #Concatenate tables and return their values
    #mkstemp hands back an open file descriptor next to the path: close it, as only the path is used from here
    handle_a, table_a_full = tempfile.mkstemp(suffix='.tsv', prefix='table_a_full_')
    os.close(handle_a)
    handle_b, table_b_full = tempfile.mkstemp(suffix='.tsv', prefix='table_b_full_')
    os.close(handle_b)
    concatenate(table_a_full, [table_a, table_a_odd, table_a_even])
    concatenate(table_b_full, [table_b, table_b_odd, table_b_even])
    return table_a_full, table_b_full
Example #7
def calculate_tables(genome_ids_a, genome_ids_b, sico_files, oddeven=False):
    """Compute a spreadsheet of data points each for A and B based the SICO files, without duplicating computations.

    genome_ids_a, genome_ids_b: genome identifiers of the two clades; a sequence is assigned to a clade when the
        part of its record ID before the first '|' is among that clade's identifiers
    sico_files: paths of aligned fasta files, one per ortholog; the file name up to its first '.' names the ortholog
    oddeven: when true, repeat the calculations for alignments of only the odd and of only the even codons

    Returns the pair of tables for A and B as produced by _tables_for_split_alignments. With oddeven the pair holds
    the paths of two temporary .tsv files, to which the full, odd and even tables were handed via concatenate.
    """
    #Imported locally so this function does not depend on how the module names its logger
    #NOTE(review): info level is presumed for the log calls below - confirm against the module's other log calls
    import logging

    #Convert file names into identifiers while preserving filenames, as filenames are used both for BioPython & PhiPack
    orth_files = [(os.path.split(sico_file)[1].split('.')[0], sico_file) for sico_file in sico_files]

    #Find PhiPack values for each sico file
    orth_phipack_values = _phipack_values_for_sicos(orth_files)

    #Convert list of sico files into ortholog name mapped to BioPython Alignment object
    sico_alignments = [(ortholog, AlignIO.read(sico_file, 'fasta'))
                       for ortholog, sico_file in orth_files]

    #Only retrieve genomes once which we'll use to link gene names to orthologs
    #Both clades are included, as the alignments searched for gene names hold sequences of A as well as B
    all_genome_ids = list(genome_ids_a) + list(genome_ids_b)
    genomes = select_genomes_by_ids(all_genome_ids).values()

    #For each ortholog, determine the newest gene name across taxa so unannotated taxa also get gene names
    ortholog_gene_names = dict((ortholog, get_most_recent_gene_name(genomes, alignmnt))
                               for ortholog, alignmnt in sico_alignments)

    #Split individual sico alignments into separate alignments for each of the clades per ortholog
    #These split alignments can later be reversed and/or subselections can be made to calculate for alternate alignments
    split_alignments = [(ortholog,
                         MultipleSeqAlignment(seqr for seqr in alignmnt if seqr.id.split('|')[0] in genome_ids_a),
                         MultipleSeqAlignment(seqr for seqr in alignmnt if seqr.id.split('|')[0] in genome_ids_b))
                        for ortholog, alignmnt in sico_alignments]

    #Calculate tables for normal sico alignments
    logging.info('Starting calculations for full alignments')
    table_a, table_b = _tables_for_split_alignments(split_alignments, ortholog_gene_names, orth_phipack_values)

    if not oddeven:
        return table_a, table_b

    #As an alternate method of calculating number of substitutions for independent X-axis of eventual graph:
    #split each alignment for a and b into two further alignments of odd and even codons
    odd_even_split_orth_alignments = [(orthologname,
                                       _every_other_codon_alignments(alignment_x),
                                       _every_other_codon_alignments(alignment_y))
                                      for orthologname, alignment_x, alignment_y in split_alignments]

    def _tables_for_codon_subset(parity, position):
        """Calculate the tables for the 'odd' (position 0) or 'even' (position 1) codon alignments."""
        #Recover the requested alignment from each pair of odd & even alignments
        subset_alignments = [(orthologname, odd_even_x[position], odd_even_y[position])
                             for orthologname, odd_even_x, odd_even_y in odd_even_split_orth_alignments]

        #Create files for all these codon alignments, so we can run PhiPack for them
        alignments_dir = tempfile.mkdtemp(prefix=parity + '_codon_alignments_')
        subset_files = dict((ortholog, os.path.join(alignments_dir, ortholog + '.ffn'))
                            for ortholog, _, _ in subset_alignments)
        for ortholog, subset_x, subset_y in subset_alignments:
            AlignIO.write([subset_x, subset_y], subset_files[ortholog], 'fasta')
        phipack_vals = _phipack_values_for_sicos(subset_files.items())

        logging.info('Starting calculations for %s alignments', parity)
        return _tables_for_split_alignments(subset_alignments, ortholog_gene_names, phipack_vals)

    #Odd alignments are the first of each pair, even alignments the second
    table_a_odd, table_b_odd = _tables_for_codon_subset('odd', 0)
    table_a_even, table_b_even = _tables_for_codon_subset('even', 1)

    #Concatenate tables and return their values
    #mkstemp hands back an open file descriptor next to the path: close it, as only the path is used from here
    handle_a, table_a_full = tempfile.mkstemp(suffix='.tsv', prefix='table_a_full_')
    os.close(handle_a)
    handle_b, table_b_full = tempfile.mkstemp(suffix='.tsv', prefix='table_b_full_')
    os.close(handle_b)
    concatenate(table_a_full, [table_a, table_a_odd, table_a_even])
    concatenate(table_b_full, [table_b, table_b_odd, table_b_even])
    return table_a_full, table_b_full