def createRandomizedSeqs_CDS_with_3UTR(
        cds,
        newShuffleIds,
        shuffleType=db.Sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation,
        taxId=None):
    """Produce one randomized version of the sequence of `cds` per shuffle id.

    The CDS part is randomized with SynonymousCodonPermutingRandomization
    (built from the translation table of `cds`) and the UTR part with
    NucleotidePermutationRandomization; `shuffleType` selects the class that
    combines the two.

    Args:
        cds: object providing getTranslationTable(), getTaxId(), sequence(),
            CDSlength() and getProtId().
        newShuffleIds: iterable; one randomized sequence is generated per
            element. The element values themselves are not used here.
        shuffleType: one of the three db.Sources values handled below.
        taxId: forwarded only to the "Including_Next_CDS" randomizers.

    Returns:
        A list with the randomized sequences, in generation order.

    Raises:
        Exception: if `shuffleType` is not recognized, or if a randomization
            reports fewer than 500 possible permutations. Any exception raised
            by randomize() is printed and then re-raised unchanged.
    """
    sources = db.Sources
    codonShuffler = SynonymousCodonPermutingRandomization(
        cds.getTranslationTable())
    utrShuffler = NucleotidePermutationRandomization()

    # Select the combined randomizer that matches the requested shuffle type.
    if shuffleType == sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation:
        shuffler = CDSand3UTRRandomization(codonShuffler, utrShuffler)
    elif shuffleType == sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation_Including_Next_CDS:
        shuffler = CDSand3UTRRandomizationIncludingNextCDS(
            codonShuffler, utrShuffler, taxId=taxId, constantOverlaps=False)
    elif shuffleType == sources.ShuffleCDS_synon_perm_and_3UTR_nucleotide_permutation_Including_Next_CDS_Constant_Overlaps:
        shuffler = CDSand3UTRRandomizationIncludingNextCDS(
            codonShuffler, utrShuffler, taxId=taxId, constantOverlaps=True)
    else:
        raise Exception("Unknown shuffleType={}".format(shuffleType))

    # NOTE(review): the results of the first and third calls are never used in
    # this function; the calls are kept in case they have side effects
    # (e.g. populating a cache) -- confirm before removing.
    getGenomeModelFromCache(cds.getTaxId())
    nativeSeq = cds.sequence()
    cds.CDSlength()

    shuffledSeqs = []
    for _ in newShuffleIds:
        try:
            permutationsCount, identity, shuffled = shuffler.randomize(
                nativeSeq, cds.getProtId())
        except Exception as err:
            # Show the error next to the surrounding output, then propagate.
            print(err)
            raise

        assert 0.0 < identity <= 1.0

        if identity > 0.95:
            warning = (
                "Warning: Identity of randomized sequence is high - %.3g%% (length=%d nt, total permutations=%.2g)"
                % (identity * 100.0, len(shuffled), permutationsCount))
            print(warning)

        if permutationsCount < 500:
            raise Exception(
                "Low number of possible permutations %.2g (length=%d nt, identity=%.3g%%)"
                % (permutationsCount, len(shuffled), identity * 100.0))

        shuffledSeqs.append(shuffled)

    return shuffledSeqs
def getIdentifiersConversionTableUsingGff3():
    """Return the table mapping gene identifiers to protein ids.

    The table lives in the module-level dict `altIdentifiers`. If it is already
    populated it is returned as-is; otherwise it is filled by iterating over
    SpeciesCDSSource(taxId) and mapping each CDS's gene id, plus every
    equivalent identifier reported by the genome model, to the protein id.

    `taxId` is not a parameter; it is read from the enclosing (module) scope.

    Returns:
        The (shared, mutable) `altIdentifiers` dict.
    """
    global altIdentifiers

    if altIdentifiers:
        return altIdentifiers

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        geneId = CDSHelper(taxId, protId).getGeneId()

        # findEquivalentIdentifiers() may return None when nothing is known
        # about the identifier; treat that as "no alternatives".
        for alt in gm.findEquivalentIdentifiers(geneId) or ():
            altIdentifiers[alt] = protId
        altIdentifiers[geneId] = protId

    # Return the freshly built table so the first call behaves like later
    # (cached) calls instead of returning None.
    return altIdentifiers
# Example #3
# 0
def testCDSand3UTRRandomizationIncludingNextCDS(
        taxId: int = 511145,
        geneticCode: int = 11,
        constantOverlaps: bool = False) -> int:
    """Exercise CDSand3UTRRandomizationIncludingNextCDS on every CDS of a genome.

    For each protein id yielded by the genome model's allCDSSource(), the
    sequence is loaded and randomize() is called 20 times. Each successful
    result must have a randomized sequence (element 2) of the same length as
    the native sequence. Summary counters are printed at the end.

    Args:
        taxId: taxonomy id of the genome to iterate over.
        geneticCode: passed to SynonymousCodonPermutingRandomization.
        constantOverlaps: passed to the randomizer under test.

    Returns:
        Always 0.

    Raises:
        AssertionError: if a randomized sequence has a different length than
            the native one.
    """
    from data_helpers import SpeciesCDSSource
    from genome_model import getGenomeModelFromCache

    randomizer = CDSand3UTRRandomizationIncludingNextCDS(
        SynonymousCodonPermutingRandomization(geneticCode=geneticCode),
        NucleotidePermutationRandomization(),
        taxId,
        constantOverlaps=constantOverlaps)

    numOK = 0
    numNotOK = 0          # sequence-loading failures + randomize() failures
    numRandomizeErrors = 0  # randomize() failures only
    numSkipped = 0        # never incremented here; reported as-is

    for protId in getGenomeModelFromCache(taxId).allCDSSource():
        try:
            nativeSeq = CDSHelper(taxId, protId).sequence()
        except Exception:
            numNotOK += 1
            continue

        for _ in range(20):
            try:
                result = randomizer.randomize(nativeSeq, protId)
            except Exception as err:
                print("Caught exception during call to randomize(), protId={}!"
                      .format(protId))
                print(err)
                numNotOK += 1
                numRandomizeErrors += 1
                continue

            # Report proteins whose first result element is below 1e5.
            if result[0] < 1e5:
                print(protId)

            lengthsMatch = len(result[2]) == len(nativeSeq)
            if not lengthsMatch:
                # Dump the offending result and repeat the call before failing
                # (presumably a convenient spot for a debugger breakpoint).
                print(result)
                randomizer.randomize(nativeSeq, protId)
            assert lengthsMatch

        # NOTE(review): counted once per loaded CDS, even if some or all of
        # the randomize() calls above raised -- confirm this is intended.
        numOK += 1

    print("OK: {}, NotOK: {}, Skipped: {}, Total: {}".format(
        numOK, numNotOK, numSkipped, numOK + numNotOK + numSkipped))
    print("randomize exception: {}".format(numRandomizeErrors))

    return 0
def processGenome(args, taxId):
    """Collect the region around each CDS start for one genome and write a CSV.

    For every protein id in SpeciesCDSSource(taxId) the matching genome-model
    feature is looked up, its 5' flanking region is queried, and a short
    sequence window ending with the first 3 nucleotides of the CDS is
    extracted. Genes without a flanking region, or whose window does not end
    in 'TG', are skipped. The remaining windows (minus the last 3 nucleotides)
    are passed to calculateaSDEnergies(), and one line per gene is written to
    the file named by outputData.format(taxId):

        protId,geneId,strand,flankingLength,windowSeq,aSDvalue

    Args:
        args: opaque; forwarded unchanged to calculateaSDEnergies().
        taxId: taxonomy id of the genome to process.

    Returns:
        None. Results are written to disk and a summary is printed.
    """
    totalProteinsProcessed = 0
    totalSkipped = 0

    seqsForWriting = []
    recordsForWriting = {}

    gm = getGenomeModelFromCache(taxId)

    for protId in SpeciesCDSSource(taxId):
        cds = CDSHelper(taxId, protId)
        totalProteinsProcessed += 1

        geneId = cds.getGeneId()

        # feature is indexed as (molecule key, feature object) below.
        feature = gm.findFeatureById(protId)
        strand = feature[1].data['strand']
        molecule = gm.moleculeModels[feature[0]]

        # The flanking-region lookup is the same for both strands; only the
        # arithmetic on its result differs.
        otherFeature = molecule.find5PrimeFlankingRegion(feature[1])
        if otherFeature is None:
            totalSkipped += 1
            continue

        assert otherFeature['downstream-feature'].begin <= otherFeature['downstream-feature'].end

        # NOTE(review): the names say "3UTR" although the region is obtained
        # from find5PrimeFlankingRegion() -- kept as in the original.
        if strand == '+':
            flanking3UTRRegionLengthNt = (otherFeature['curr-feature'].begin
                                          - otherFeature['downstream-feature'].end)
            # include the first 3 nucleotides of the CDS
            threePrimeUTRCoords = (feature[1].begin - 20, feature[1].begin + 2, False)
        else:
            flanking3UTRRegionLengthNt = (otherFeature['downstream-feature'].begin
                                          - otherFeature['curr-feature'].end)
            # include the first 3 nucleotides of the CDS
            threePrimeUTRCoords = (feature[1].end - 3, feature[1].end + 20, True)

        threePrimeUTR = molecule.getSequence(*threePrimeUTRCoords)

        # A strongly negative length means the neighbouring features overlap;
        # such genes are reported but still processed.
        if flanking3UTRRegionLengthNt < -50:
            print("Warning: found gene with apparent long overlap: {},{},{},{},{}".format(
                protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq))

        # The window is expected to end with the start codon (xTG); skip the
        # gene when it does not.
        if threePrimeUTR.seq[-2:] != 'TG':
            print("Warning: skipping gene without start codon at the correct place: {},{},{},{},{}".format(
                protId, geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq))
            totalSkipped += 1
            continue

        # All done - keep the record and the window without its last 3 nt
        recordsForWriting[protId] = (geneId, strand, flanking3UTRRegionLengthNt, threePrimeUTR.seq)
        seqsForWriting.append(
            SeqRecord(Seq(threePrimeUTR.seq[:-3], NucleotideAlphabet), id=protId))

    aSD = calculateaSDEnergies(seqsForWriting, args, taxId)
    print(len(aSD))

    with open(outputData.format(taxId), 'wt') as fout:
        for protId, record in recordsForWriting.items():
            # Genes with no computed value get None in the last column.
            aSDval = aSD.get(protId, None)
            vals = (protId,) + record + (aSDval,)
            fout.write("{},{},{},{},{},{}\n".format(*vals))

    print("Processed {} coding sequences for taxid {}".format(totalProteinsProcessed, taxId))
    print("Skipped {} coding sequences".format(totalSkipped))
    def readFile(self):
        """Load operon membership from the ODB4 tab-separated file into self.geneInfo.

        Each data row (the first row is treated as a header) describes one
        operon. For every gene of an operon whose strand can be determined
        unambiguously, self.geneInfo gets an entry

            getGeneIdentifier(gene[, taxId]) -> (position, operon size, operon name)

        Rows are restricted to self.taxIdFilter when it is not None; in that
        case the taxId is not passed to getGeneIdentifier().
        """
        self.geneInfo = {}

        with open(ODB4_csv_path) as csvfile:
            for lineIdx, row in enumerate(csv.reader(csvfile, delimiter='\t')):
                if lineIdx == 0:
                    continue  # skip header line

                taxId = int(row[ODB4_format.TaxId])
                if self.taxIdFilter is not None and taxId != self.taxIdFilter:
                    continue  # row belongs to a species we were not asked for

                gm = getGenomeModelFromCache(taxId)
                operonName = row[ODB4_format.OperonName]
                genesInOrder = row[ODB4_format.GenesOrder].split(',')
                operonSize = len(genesInOrder)

                # ODB4 lists genes relative to the positive strand, even for
                # operons transcribed from the negative strand, so the strand
                # of the operon is needed to recover the real gene order.
                # Collect the strand of every gene that can be resolved to a
                # genome-model feature.
                strands = []
                for gid in genesInOrder:
                    idents = gm.findEquivalentIdentifiers(gid)
                    if idents is None and gid[:2] == "HP" and gid[:3] != "HP_":
                        # Workaround for H. pylori: retry with "HP_" prefix
                        idents = gm.findEquivalentIdentifiers("HP_" + gid[2:])
                    if idents is None:
                        continue

                    # Use the first equivalent identifier that has a feature.
                    for ident in idents:
                        feature = gm.findFeatureById(ident)
                        if feature is not None:
                            strands.append(feature[1].data['strand'])
                            break

                if not strands:  # no strand found: the operon cannot be used
                    print("Missing info for {}".format(operonName))
                    continue

                if not allItemsAreEqual(strands):  # genes disagree on strand
                    print("Conflicting info for {}".format(operonName))
                    continue

                strand = strands[0]
                assert strand in ('+', '-')

                # Positions are reversed for non-'+' operons, except for
                # taxId 169963 (workaround for Listeria).
                reverseOrder = strand != '+' and taxId != 169963

                # Store position information for every gene in this operon
                for pos, gene in enumerate(genesInOrder):
                    if gene[:2] == "HP" and gene[:3] != "HP_":
                        gene = "HP_" + gene[2:]  # Workaround for H. pylori

                    position = operonSize - pos - 1 if reverseOrder else pos
                    geneData = (position, operonSize, operonName)

                    if self.taxIdFilter is None:
                        # No filter defined; store the taxId with the entry
                        key = self.getGeneIdentifier(gene, taxId=taxId)
                    else:
                        # Single species; no need to store taxId with entry
                        key = self.getGeneIdentifier(gene)
                    self.geneInfo[key] = geneData