def create_crosstable(sico_files, target_crosstable):
    """Create crosstable with vertically the orthologs, horizontally the genomes, and gene IDs at intersections."""
    with open(target_crosstable, mode='w') as write_handle:
        #Create dictionaries mapping genomes to gene IDs per sico file
        row_data = [(sico_file, dict(itemgetter(0, 2)(fasta_record.id.split('|'))
                         for fasta_record in SeqIO.parse(sico_file, 'fasta')))
                    for sico_file in sico_files]

        #Retrieve unique genomes across all sico files, just to be safe
        genomes = sorted(set(key for row in row_data for key in row[1].keys()))
        genome_dicts = select_genomes_by_ids(genomes).values()

        #Write out values to file
        write_handle.write('\t' + '\t'.join(genomes))
        write_handle.write('\tCOGs\tProduct\n')
        for sico_file, row in row_data:
            ortholog = os.path.split(sico_file)[1].split('.')[0]
            write_handle.write(ortholog + '\t')
            write_handle.write('\t'.join(row.get(genome, '') for genome in genomes))

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(sico_file, 'fasta'))

            #COGs
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))

            #Product
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            #New line
            write_handle.write('\n')
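# Hedged usage sketch for create_crosstable; the glob pattern and the
# '|'-delimited fasta header layout ('genome|gene|gene_id|...') implied by the
# parsing above are assumptions, not confirmed conventions of this project.
def _demo_create_crosstable():
    """Write a crosstable for all SICO files in a hypothetical directory."""
    from glob import glob
    sico_files = sorted(glob('sico_files/*.ffn'))  # hypothetical input files
    create_crosstable(sico_files, 'crosstable.tsv')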
def _extract_cog_digits_and_letters(clade_calcs):
    '''Add the COG digits and letters to the clade_calcs.values dictionary for all strains in clade_calcs.alignment.'''
    cog_digits = []
    cog_letters = []
    for cog in find_cogs_in_sequence_records(clade_calcs.alignment):
        # Match digits and letters separately
        matchobj = re.match('(COG[0-9]+)([A-Z]*)', cog)
        if matchobj:
            cog_digits.append(matchobj.groups()[0])
            cog_letters.append(matchobj.groups()[1])
    # Join the found digits and letters using a comma
    clade_calcs.values[COG_DIGITS] = ','.join(cog_digits)
    clade_calcs.values[COG_LETTERS] = ','.join(cog_letters)
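# Minimal sketch of the regex split used above; 'COG0001E' is a made-up COG id
# chosen only to show how the digits and functional-category letters separate.
def _demo_cog_regex():
    """Demonstrate splitting a COG id into its digit and letter parts."""
    matchobj = re.match('(COG[0-9]+)([A-Z]*)', 'COG0001E')
    assert matchobj.groups() == ('COG0001', 'E')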
def _occurences_and_cogs(genome_ids, ortholog_files):
    """Generator that yields, per ortholog, the sequence count per genome (in genome_ids order), the ortholog
    number, the COGs that occur and the gene product."""
    genomes = select_genomes_by_ids(genome_ids).values()
    for fasta_file in ortholog_files:
        records = tuple(SeqIO.parse(fasta_file, 'fasta'))
        ids = [record.id.split('|')[0] for record in records]
        count_per_id = [ids.count(genome_id) for genome_id in genome_ids]
        cogs = sorted(find_cogs_in_sequence_records(records))
        ortholog_nr = os.path.splitext(os.path.split(fasta_file)[1])[0]
        for record in records:
            #SeqIO mangles ids containing spaces, so assign the full description as the id
            record.id = record.description
        product = get_most_recent_gene_name(genomes, records)
        yield count_per_id, ortholog_nr, cogs, product
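# Hedged usage sketch for _occurences_and_cogs; the genome ids and file paths
# are hypothetical. Each yielded tuple maps onto one row of a per-ortholog report.
def _demo_occurences_and_cogs():
    """Print one tab-separated report line per ortholog file."""
    from glob import glob
    genome_ids = ['58017', '58531']  # hypothetical genome project ids
    for counts, ortholog_nr, cogs, product in \
            _occurences_and_cogs(genome_ids, sorted(glob('sicos/*.ffn'))):
        print '\t'.join([ortholog_nr] + map(str, counts) + [','.join(cogs), product])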
def _group_cog_issues(sico_files):
    """Find issues with COG assignments within SICO files by looking at COG conflicts, transferable and missing COGs."""
    cog_conflicts = {}
    cog_transferable = {}
    cog_missing = []
    for sico_file in sico_files:
        cogs = find_cogs_in_sequence_records(SeqIO.parse(sico_file, 'fasta'), include_none=True)
        if 0 == len(cogs):
            cog_missing.append(sico_file)
            continue
        if 1 < len(cogs):
            if None in cogs:
                cogs.remove(None)
                if len(cogs) == 1:
                    cog_transferable[sico_file] = cogs.pop()
                    continue
            cog_conflicts[sico_file] = cogs
    return cog_conflicts, cog_transferable, cog_missing
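# Hedged usage sketch for _group_cog_issues; the logging calls mirror the
# log.info usage elsewhere in this module, and sico_files is a hypothetical
# list of SICO fasta file paths.
def _demo_group_cog_issues(sico_files):
    """Log every category of COG assignment issue found across sico_files."""
    cog_conflicts, cog_transferable, cog_missing = _group_cog_issues(sico_files)
    for sico_file, cogs in cog_conflicts.iteritems():
        log.warning('Conflicting COGs in %s: %s', sico_file, ', '.join(map(str, cogs)))
    for sico_file, cog in cog_transferable.iteritems():
        log.info('COG %s is transferable to unannotated records in %s', cog, sico_file)
    if cog_missing:
        log.warning('%i SICO files lack any COG annotation', len(cog_missing))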
def _phipack_for_all_orthologs(run_dir, aligned_files, stats_file):
    """Filter aligned fasta files where there is evidence of recombination when inspecting PhiPack values.
    Return two collections of aligned files, the first without recombination, the second with recombination."""

    log.info('Running PhiPack for %i orthologs to find recombination', len(aligned_files))

    #Create separate directory for phipack related values
    phipack_dir = create_directory('phipack', inside_dir=run_dir)

    with open(stats_file, mode='w') as write_handle:
        write_handle.write('\t'.join(['Ortholog',
                                      'Informative sites',
                                      'Phi',
                                      'Max Chi^2',
                                      'NSS',
                                      'COGs',
                                      'Product']) + '\n')

        #Retrieve unique genomes from first ortholog file
        genome_ids = set(fasta_record.id.split('|')[0] for fasta_record in SeqIO.parse(aligned_files[0], 'fasta'))
        genome_dicts = select_genomes_by_ids(genome_ids).values()

        #Write a line of PhiPack statistics, COGs and product for each ortholog file
        for ortholog_file in aligned_files:
            orth_name = os.path.split(ortholog_file)[1].split('.')[0]

            #Run PhiPack on this ortholog to obtain its recombination statistics
            phipack_values = run_phipack(phipack_dir, ortholog_file)

            #Write PhiPack values to line
            write_handle.write('{0}\t{1[PhiPack sites]}\t{1[Phi]}\t{1[Max Chi^2]}\t{1[NSS]}'.format(orth_name,
                                                                                                    phipack_values))

            #Parse sequence records again, but now to retrieve cogs and products
            seq_records = list(SeqIO.parse(ortholog_file, 'fasta'))
            #COGs
            cogs = find_cogs_in_sequence_records(seq_records)
            write_handle.write('\t' + ','.join(cogs))
            #Product
            product = get_most_recent_gene_name(genome_dicts, seq_records)
            write_handle.write('\t' + product)

            #End line
            write_handle.write('\n')
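# Hedged usage sketch for _phipack_for_all_orthologs; the run directory and
# alignment paths are hypothetical.
def _demo_phipack_stats():
    """Run PhiPack over all aligned orthologs and write a statistics table."""
    from glob import glob
    aligned_files = sorted(glob('alignments/*.ffn'))  # hypothetical alignments
    _phipack_for_all_orthologs('run_dir', aligned_files, 'phipack_stats.tsv')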
def _perform_calculations(alignment, codeml_values):
    """Perform actual calculations on the alignment to determine pN, pS, SFS & the number of ignored cases per SICO."""
    synonymous_sfs = {}
    four_fold_syn_sfs = {}
    non_synonymous_sfs = {}
    four_fold_synonymous_sites = 0
    mixed_synonymous_polymorphisms = 0
    multiple_site_polymorphisms = 0

    #Calculate sequence_lengths here so we can handle alignments that are not multiples of three
    sequence_lengths = len(alignment[0]) - len(alignment[0]) % 3
    #Split into codon_alignments
    codon_alignments = (alignment[:, index:index + 3] for index in range(0, sequence_lengths, 3))
    for codon_alignment in codon_alignments:
        #Get string representations of codons for simplicity
        codons = [str(seqr.seq) for seqr in codon_alignment]

        #As per AEW: ignore codons with gaps, and codons with unresolved bases: Basically anything but ACGT
        if 0 < len(''.join(codons).translate(None, 'ACGTactg')):
            continue

        #Skip codons where any of the alignment codons is a stopcodon, same as in codeml
        if any(codon in BACTERIAL_CODON_TABLE.stop_codons for codon in codons):
            continue

        #Retrieve translations of codons now that inconclusive & stop-codons have been removed
        translations = [BACTERIAL_CODON_TABLE.forward_table.get(codon) for codon in codons]

        #Count unique translations across strains
        translation_usage = dict((aa, translations.count(aa)) for aa in set(translations))

        #Mutations are synonymous when all codons encode the same AA, and there are no skipped codons
        synonymous = len(translation_usage) == 1 and len(translations) == len(codon_alignment)

        #Retrieve nucleotides per site within the codon
        site1 = [nucl for nucl in codon_alignment[:, 0]]
        site2 = [nucl for nucl in codon_alignment[:, 1]]
        site3 = [nucl for nucl in codon_alignment[:, 2]]

        #Count occurrences of distinct nucleotides across strains
        site1_usage = dict((nucl, site1.count(nucl)) for nucl in set(site1))
        site2_usage = dict((nucl, site2.count(nucl)) for nucl in set(site2))
        site3_usage = dict((nucl, site3.count(nucl)) for nucl in set(site3))

        #Sites are polymorphic if they contain more than one nucleotide
        site1_polymorphic = 1 < len(site1_usage)
        site2_polymorphic = 1 < len(site2_usage)
        site3_polymorphic = 1 < len(site3_usage)
        polymorphisms = site1_polymorphic, site2_polymorphic, site3_polymorphic

        #Continue with next codon if none of the sites is polymorphic
        if not any(polymorphisms):
            #But do increase the number of 4-fold synonymous sites if the pattern matches
            codon = codons[0]
            if re.match(FOUR_FOLD_DEGENERATE_PATTERN, codon):
                #Increase by one, as this site is four-fold degenerate, even if it is not polymorphic
                four_fold_synonymous_sites += 1
            continue

        #Determine if only one site is polymorphic by using boolean xor and not all
        single_site_polymorphism = site1_polymorphic ^ site2_polymorphic ^ site3_polymorphic and not all(polymorphisms)

        #Skip multiple site polymorphisms, but do keep a count of how many we encounter
        if not single_site_polymorphism:
            multiple_site_polymorphisms += 1
            continue

        #Determine which site_usage is the single site polymorphism
        polymorph_site_usage = site1_usage if site1_polymorphic else site2_usage if site2_polymorphic else site3_usage

        #Find the 'reference' nucleotide as (one of) the most occurring occupations in this site, so we can -1 later
        psu_values = polymorph_site_usage.values()
        reference_allele_count = max(psu_values)

        #Calculate the local site frequency spectrum, to be added to the gene-wide SFS later
        #We'll be using Site Frequency Spectrum to calculate the number of synonymous and non synonymous polymorphisms
        #Note: this requires a complete SFS across synonymous & non_synonymous polymorphisms, be careful when updating
        local_sfs = dict((ntimes, psu_values.count(ntimes)) for ntimes in set(psu_values))
        #Deduct one for the reference_allele_count, which should not count towards the SFS
        local_sfs[reference_allele_count] = local_sfs[reference_allele_count] - 1
        #Remove empty value as possible result of the above decrement operation
        if local_sfs[reference_allele_count] == 0:
            del local_sfs[reference_allele_count]

        def _update_sfs_with_local_sfs(sfs, local_sfs):
            """Add values from local_sfs to gene-wide sfs"""
            for maf, count in local_sfs.iteritems():
                prev_occupations = sfs.get(maf, 0)
                sfs[maf] = prev_occupations + count

        if synonymous:
            #If all polymorphisms encode for the same AA, we have multiple synonymous polymorphisms, where:
            #2 nucleotides = 1 polymorphism, 3 nucleotides = 2 polymorphisms, 4 nucleotides = 3 polymorphisms

            #Update synonymous SFS by adding values from local SFS
            _update_sfs_with_local_sfs(synonymous_sfs, local_sfs)

            #Codon is four fold degenerate if it matches FOUR_FOLD_DEGENERATE_PATTERN
            if site3_polymorphic:
                codon = codons[0]
                if re.match(FOUR_FOLD_DEGENERATE_PATTERN, codon):
                    #Update four fold degenerate SFS by adding values from local SFS
                    _update_sfs_with_local_sfs(four_fold_syn_sfs, local_sfs)
                    #Increase the number of four_fold synonymous sites here as well
                    four_fold_synonymous_sites += 1
        else:  #not synonymous
            if len(polymorph_site_usage) == len(translation_usage):
                #If all polymorphisms encode for different AA, we have multiple non-synonymous polymorphisms, where:
                #2 nucleotides = 1 polymorphism, 3 nucleotides = 2 polymorphisms, 4 nucleotides = 3 polymorphisms

                #Update non synonymous SFS by adding values from local SFS
                _update_sfs_with_local_sfs(non_synonymous_sfs, local_sfs)
            else:
                #Some, but not all polymorphisms encode for different AA, making it unclear how this should be scored
                mixed_synonymous_polymorphisms += 1

    #Compute combined values from the above counted statistics
    computed_values = _compute_values_from_statistics(len(alignment), sequence_lengths, codeml_values,
                                                      synonymous_sfs, non_synonymous_sfs, four_fold_syn_sfs,
                                                      four_fold_synonymous_sites)

    #Miscellaneous additional values
    computed_values['codons'] = sequence_lengths // 3
    computed_values['multiple site polymorphisms'] = multiple_site_polymorphisms
    computed_values['complex codons (with both synonymous and non-synonymous polymorphisms segregating)'] = mixed_synonymous_polymorphisms

    #Add COGs to output file in split columns
    cog_digits = []
    cog_letters = []
    for cog in find_cogs_in_sequence_records(alignment):
        matchobj = re.match('(COG[0-9]+)([A-Z]*)', cog)
        if matchobj:
            cog_digits.append(matchobj.groups()[0])
            cog_letters.append(matchobj.groups()[1])
    computed_values['cog digits'] = ','.join(cog_digits)
    computed_values['cog letters'] = ','.join(cog_letters)

    return computed_values
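# Hedged worked example of the local-SFS step inside _perform_calculations,
# using made-up nucleotide counts for a single polymorphic site.
def _demo_local_sfs():
    """Show how a site observed as {'A': 3, 'G': 1} becomes a local SFS of {1: 1}."""
    polymorph_site_usage = {'A': 3, 'G': 1}  # hypothetical site across 4 strains
    psu_values = polymorph_site_usage.values()
    reference_allele_count = max(psu_values)  # 3: the majority 'reference' allele
    local_sfs = dict((ntimes, psu_values.count(ntimes)) for ntimes in set(psu_values))
    #Deduct one for the reference allele, which does not count towards the SFS
    local_sfs[reference_allele_count] -= 1
    if local_sfs[reference_allele_count] == 0:
        del local_sfs[reference_allele_count]
    assert local_sfs == {1: 1}  # one minor allele observed once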