def create_crosstable(sico_files, target_crosstable):
    """Create crosstable with vertically the orthologs, horizontally the genomes, and gene IDs at intersections.

    Every row starts with the ortholog name (the SICO file name up to its first dot), followed by one gene ID
    cell per genome (empty when the genome has no record in that file), the comma separated COGs and the product.
    All cells are tab separated, and every row is terminated by a newline.

    sico_files -- paths of SICO fasta files; record IDs are split on '|', with field 0 used as genome and
                  field 2 used as gene ID
    target_crosstable -- path of the file to write the crosstable to
    """
    with open(target_crosstable, mode='w') as write_handle:
        #Create dictionaries mapping genomes to gene IDs per sico file
        row_data = [(sico_file, dict(itemgetter(0, 2)(fasta_record.id.split('|'))
                                     for fasta_record in SeqIO.parse(sico_file, 'fasta')))
                    for sico_file in sico_files]

        #Retrieve unique genomes across all sico files, just to be safe
        genomes = sorted(set(genome for _, row in row_data for genome in row))
        genome_dicts = select_genomes_by_ids(genomes).values()

        #Write out header: empty corner cell, one column per genome, and the two columns appended to every row below
        write_handle.write('\t' + '\t'.join(genomes))
        write_handle.write('\tCOGs\tProduct\n')

        #Write out values to file
        for sico_file, row in row_data:
            ortholog = os.path.split(sico_file)[1].split('.')[0]
            write_handle.write(ortholog + '\t')
            write_handle.write('\t'.join(row.get(genome, '') for genome in genomes))

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(sico_file, 'fasta'))

            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))

            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            #New line, so every ortholog ends up on a row of its own
            write_handle.write('\n')
def _extract_cog_digits_and_letters(clade_calcs):
    '''Add the COG digits and letters to the clade_calcs.values dictionary for all strains in clade_calcs.alignment.

    Each COG found in clade_calcs.alignment is split by regular expression into its 'COG<digits>' part and its
    trailing capital letters. COGs that do not match the pattern are skipped. Both parts are stored as comma
    separated strings under the COG_DIGITS and COG_LETTERS keys; the letters entry can contain empty items, as
    the letters part of the pattern is optional.
    '''
    cog_digits = []
    cog_letters = []
    for cog in find_cogs_in_sequence_records(clade_calcs.alignment):
        # Match digits and letters separately
        matchobj = re.match('(COG[0-9]+)([A-Z]*)', cog)
        if matchobj:
            cog_digits.append(matchobj.group(1))
            cog_letters.append(matchobj.group(2))
    # Join the found digits and letters using a comma
    clade_calcs.values[COG_DIGITS] = ','.join(cog_digits)
    clade_calcs.values[COG_LETTERS] = ','.join(cog_letters)
# Example no. 3
 def _occurences_and_cogs(genome_ids, ortholog_files):
     """Generator that returns how many sequences exist per genome in each ortholog in order and which COGs occur.

     genome_ids -- genome identifiers; the counts in every yielded item follow the order of this sequence
     ortholog_files -- paths of the ortholog fasta files; field 0 of each record ID split on '|' is the genome

     Yields one tuple per ortholog file: (count per genome id, file name without directory and extension,
     sorted COGs, product name).
     """
     genomes = select_genomes_by_ids(genome_ids).values()
     for fasta_file in ortholog_files:
         records = tuple(SeqIO.parse(fasta_file, 'fasta'))
         #Count the genomes before the record ids are overwritten with the descriptions below
         ids = [record.id.split('|')[0] for record in records]
         count_per_id = [ids.count(genome_id) for genome_id in genome_ids]
         cogs = sorted(find_cogs_in_sequence_records(records))
         ortholog_nr = os.path.splitext(os.path.split(fasta_file)[1])[0]
         for record in records:
             #SeqIO mucks up ids containing spaces, so we have to assign description as value for id
             record.id = record.description
         product = get_most_recent_gene_name(genomes, records)
         yield count_per_id, ortholog_nr, cogs, product
def _group_cog_issues(sico_files):
    """Find issues with COG assignments within SICO files by looking at COG conflicts, transferable and missing COGs.

    sico_files -- paths of the SICO fasta files to inspect

    Returns a tuple of three collections:
    cog_conflicts -- dictionary mapping SICO file to its COGs, when more than one real COG remains
    cog_transferable -- dictionary mapping SICO file to the single COG found, when other records lack a COG
    cog_missing -- list of SICO files for which no record has a COG
    """
    cog_conflicts = {}
    cog_transferable = {}
    cog_missing = []
    for sico_file in sico_files:
        #include_none=True presumably adds None to the result for records without a COG -- TODO confirm
        cogs = find_cogs_in_sequence_records(SeqIO.parse(sico_file, 'fasta'), include_none=True)
        if 0 == len(cogs):
            #Nothing was found at all, so there is nothing to compare or report for this file
            #NOTE(review): confirm that such files should be skipped rather than reported as missing
            continue
        if 1 == len(cogs):
            #A single value means all records agree; when that value is None the COG is missing everywhere
            if None in cogs:
                cog_missing.append(sico_file)
            continue
        if 1 < len(cogs):
            if None in cogs:
                #Disregard records without COG; a single remaining COG can be transferred to those records
                cogs.remove(None)
                if len(cogs) == 1:
                    cog_transferable[sico_file] = cogs.pop()
                    continue
            #More than one real COG remains, meaning the records disagree
            cog_conflicts[sico_file] = cogs
    return cog_conflicts, cog_transferable, cog_missing
def _phipack_for_all_orthologs(run_dir, aligned_files, stats_file):
    """Run PhiPack for all aligned ortholog files and write the resulting values to a tab separated stats file.

    run_dir -- directory inside which a 'phipack' directory is created for PhiPack related values
    aligned_files -- paths of the aligned ortholog fasta files; the genomes are read from the first file only
    stats_file -- path of the file to write one row per ortholog to: ortholog name, the 'PhiPack sites', 'Phi',
                  'Max Chi^2' and 'NSS' values returned by run_phipack, the COGs and the product

    NOTE(review): an earlier version of this docstring promised two returned collections of aligned files, one
    without and one with recombination. This function contains no code to classify files, so nothing is returned;
    confirm that no caller relies on a return value.
    """
    #Imported locally, so this function does not depend on a module level logger
    import logging
    logging.getLogger(__name__).info('Running PhiPack for %i orthologs to find recombination', len(aligned_files))

    #Create separate directory for phipack related values
    phipack_dir = create_directory('phipack', inside_dir=run_dir)

    with open(stats_file, mode='w') as write_handle:
        #Header columns line up with the cells written per ortholog below
        write_handle.write('\t'.join(['Ortholog',
                                      'Informative sites',
                                      'Phi',
                                      'Max Chi^2',
                                      'NSS',
                                      'COGs',
                                      'Product']) + '\n')

        #Without ortholog files there are no genomes to retrieve and no rows to write
        if not aligned_files:
            return

        #Retrieve unique genomes from first ortholog file
        genome_ids = set(fasta_record.id.split('|')[0] for fasta_record in SeqIO.parse(aligned_files[0], 'fasta'))
        genome_dicts = select_genomes_by_ids(genome_ids).values()

        for ortholog_file in aligned_files:
            orth_name = os.path.split(ortholog_file)[1].split('.')[0]

            #Run PhiPack on this ortholog to retrieve the values reported below
            phipack_values = run_phipack(phipack_dir, ortholog_file)

            #Write PhiPack values to line
            write_handle.write('{0}\t{1[PhiPack sites]}\t{1[Phi]}\t{1[Max Chi^2]}\t{1[NSS]}'.format(orth_name,
                                                                                               phipack_values))

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(ortholog_file, 'fasta'))
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            #End line
            write_handle.write('\n')
def _perform_calculations(alignment, codeml_values):
    """Perform actual calculations on the alignment to determine pN, pS, SFS & the number of ignored cases per SICO.

    alignment -- sequence alignment supporting len(), alignment[0], column slicing (alignment[:, start:stop]) and
                 iteration over records that have a .seq attribute
    codeml_values -- passed on unchanged to _compute_values_from_statistics

    Codons are skipped when they contain anything but ACGT (in either case) or when any strain has a stop codon.
    Codons with more than one polymorphic site are only counted. The remaining single site polymorphisms are added
    to the synonymous, four fold degenerate synonymous or non synonymous site frequency spectrum (SFS).

    Returns the dictionary created by _compute_values_from_statistics, extended with the number of codons, the
    number of multiple site polymorphisms, the number of complex codons, and the COG digits & letters.
    """
    synonymous_sfs = {}
    four_fold_syn_sfs = {}
    non_synonymous_sfs = {}
    four_fold_synonymous_sites = 0
    mixed_synonymous_polymorphisms = 0
    multiple_site_polymorphisms = 0

    #The only characters allowed within a codon; gaps and unresolved bases fall outside of this set
    resolved_bases = frozenset('ACGTactg')

    def _update_sfs_with_local_sfs(sfs, local_sfs):
        """Add values from local_sfs to gene-wide sfs"""
        for maf, count in local_sfs.items():
            sfs[maf] = sfs.get(maf, 0) + count

    #Calculate sequence_lengths here so we can handle alignments that are not multiples of three
    sequence_lengths = len(alignment[0]) - len(alignment[0]) % 3

    #Split into codon_alignments
    for index in range(0, sequence_lengths, 3):
        codon_alignment = alignment[:, index:index + 3]

        #Get string representations of codons for simplicity
        codons = [str(seqr.seq) for seqr in codon_alignment]

        #As per AEW: ignore codons with gaps, and codons with unresolved bases: Basically anything but ACGT
        if not resolved_bases.issuperset(''.join(codons)):
            continue

        #Skip codons where any of the alignment codons is a stopcodon, same as in codeml
        if any(codon in BACTERIAL_CODON_TABLE.stop_codons for codon in codons):
            continue

        #Retrieve translations of codons now that inconclusive & stop-codons have been removed
        translations = [BACTERIAL_CODON_TABLE.forward_table.get(codon) for codon in codons]

        #Count unique translations across strains
        translation_usage = dict((aa, translations.count(aa)) for aa in set(translations))

        #Mutations are synonymous when all codons encode the same AA, and there are no skipped codons
        synonymous = len(translation_usage) == 1 and len(translations) == len(codons)

        #Count occurrences of distinct nucleotides across strains, for each of the three sites within the codon
        site_usages = []
        for position in range(3):
            site = list(codon_alignment[:, position])
            site_usages.append(dict((nucl, site.count(nucl)) for nucl in set(site)))

        #Sites are polymorphic if they contain more than one nucleotide
        polymorphisms = tuple(1 < len(site_usage) for site_usage in site_usages)
        site3_polymorphic = polymorphisms[2]

        #Continue with next codon if none of the sites is polymorphic
        if not any(polymorphisms):
            #But do increase the number of 4-fold synonymous sites if the pattern matches
            codon = codons[0]
            if re.match(FOUR_FOLD_DEGENERATE_PATTERN, codon):
                #Increase by one, as this site is four fold degenerate, even if it is not polymorphic
                four_fold_synonymous_sites += 1
            continue

        #Skip multiple site polymorphisms, but do keep a count of how many we encounter
        #At least one site is polymorphic here, so anything other than exactly one means multiple sites
        if polymorphisms.count(True) != 1:
            multiple_site_polymorphisms += 1
            continue

        #Determine which site_usage is the single site polymorphism
        polymorph_site_usage = site_usages[polymorphisms.index(True)]

        #Find the 'reference' nucleotide as (one of) the most occurring occupations in this site, so we can -1 later
        #Convert to list explicitly, as dictionary views on Python 3 have no count method
        psu_values = list(polymorph_site_usage.values())
        reference_allele_count = max(psu_values)

        #Calculate the local site frequency spectrum, to be added to the gene-wide SFS later
        #We'll be using Site Frequency Spectrum to calculate the number of synonymous and non synonymous polymorphisms
        #Note: this requires a complete SFS across synonymous & non_synonymous polymorphisms, be careful when updating
        local_sfs = dict((ntimes, psu_values.count(ntimes)) for ntimes in set(psu_values))
        #Deduct one for the reference_allele_count, which should not count towards the SFS
        local_sfs[reference_allele_count] -= 1
        #Remove empty value as possible result of the above decrement operation
        if local_sfs[reference_allele_count] == 0:
            del local_sfs[reference_allele_count]

        if synonymous:
            #If all polymorphisms encode for the same AA, we have multiple synonymous polymorphisms, where:
            #2 nucleotides = 1 polymorphism, 3 nucleotides = 2 polymorphisms, 4 nucleotides = 3 polymorphisms

            #Update synonymous SFS by adding values from local SFS
            _update_sfs_with_local_sfs(synonymous_sfs, local_sfs)

            #Codon is four fold degenerate if it matches FOUR_FOLD_DEGENERATE_PATTERN
            if site3_polymorphic:
                codon = codons[0]
                if re.match(FOUR_FOLD_DEGENERATE_PATTERN, codon):
                    #Update four fold degenerate SFS by adding values from local SFS
                    _update_sfs_with_local_sfs(four_fold_syn_sfs, local_sfs)
                    #Increase the number of four_fold synonymous sites here as well
                    four_fold_synonymous_sites += 1
        else:  #not synonymous
            if len(polymorph_site_usage) == len(translation_usage):
                #If all polymorphisms encode for different AA, we have multiple non-synonymous polymorphisms, where:
                #2 nucleotides = 1 polymorphism, 3 nucleotides = 2 polymorphisms, 4 nucleotides = 3 polymorphisms

                #Update non synonymous SFS by adding values from local SFS
                _update_sfs_with_local_sfs(non_synonymous_sfs, local_sfs)
            else:
                #Some, but not all polymorphisms encode for different AA, making it unclear how this should be scored
                mixed_synonymous_polymorphisms += 1

    #Compute combined values from the above counted statistics
    computed_values = _compute_values_from_statistics(
        len(alignment), sequence_lengths, codeml_values, synonymous_sfs,
        non_synonymous_sfs, four_fold_syn_sfs, four_fold_synonymous_sites)

    #Miscellaneous additional values
    computed_values['codons'] = sequence_lengths // 3
    computed_values['multiple site polymorphisms'] = multiple_site_polymorphisms
    computed_values[
        'complex codons (with both synonymous and non-synonymous polymorphisms segregating)'] = mixed_synonymous_polymorphisms

    #Add COGs to output file in split columns
    cog_digits = []
    cog_letters = []
    for cog in find_cogs_in_sequence_records(alignment):
        #Match digits and letters separately; COGs that do not match the pattern are skipped
        matchobj = re.match('(COG[0-9]+)([A-Z]*)', cog)
        if matchobj:
            cog_digits.append(matchobj.group(1))
            cog_letters.append(matchobj.group(2))
    computed_values['cog digits'] = ','.join(cog_digits)
    computed_values['cog letters'] = ','.join(cog_letters)

    return computed_values
