Example no. 1
 def getFunctionality(self):
     # each value of Bond_Dict holds a bonding-descriptor entry first,
     # followed by the bonding sites that carry that descriptor
     bonding_sites = list(self.Bond_Dict.values())
     # flatten the per-descriptor site lists, dropping the descriptor itself
     bonding_sites_flat = flatten_list([x[1:] for x in bonding_sites])
     totF = len(bonding_sites_flat)   # total functionality
     noTypeBond = len(bonding_sites)  # number of distinct bonding descriptors
     return totF, noTypeBond, [
         x[0].getCompleteSymbol() for x in bonding_sites
     ], [len(x) - 1 for x in bonding_sites]
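All of these snippets lean on a flatten_list helper that is never shown. Judging from the call sites (the result must support len(), and flatten_list(toks, str) further below passes a mapping function), a minimal sketch consistent with that usage could be:

from collections.abc import Iterable

def flatten_list(nested, fn=None):
    # recursively flatten nested iterables into a plain list, optionally
    # mapping each leaf through fn (e.g. str); strings count as leaves
    flat = []
    for item in nested:
        if isinstance(item, Iterable) and not isinstance(item, str):
            flat.extend(flatten_list(item, fn))
        else:
            flat.append(fn(item) if fn is not None else item)
    return flat

For example, flatten_list([[1, [2]], [3]]) returns [1, 2, 3].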
Example no. 2
import operator
from collections import defaultdict

import pandas as pd


def frequency_dictionary_create(tknz_texts):
    # count every token across all tokenized texts
    dict_frequency = defaultdict(int)
    for token in flatten_list(tknz_texts):
        dict_frequency[token] += 1

    # sort the dictionary by value (token count), descending
    sort_dict = sorted(dict_frequency.items(),
                       key=operator.itemgetter(1),
                       reverse=True)
    dict_frequency_df = pd.DataFrame(sort_dict, columns=['token', 'quantity'])

    # relative frequency of each token
    dict_frequency_df['freq'] = dict_frequency_df['quantity'] / sum(
        dict_frequency_df['quantity'])
    # sort_values returns a new frame, so assign the result
    dict_frequency_df = dict_frequency_df.sort_values('freq', ascending=False)
    return dict_frequency_df
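A quick usage sketch (the toy tokenized corpus is hypothetical):

texts = [['a', 'b'], ['a', 'c']]
df = frequency_dictionary_create(texts)
print(df)
# roughly:
#   token  quantity  freq
# 0     a         2  0.50
# 1     b         1  0.25
# 2     c         1  0.25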
Example no. 3
import re

import numpy as np
import pandas as pd
from pymystem3 import Mystem


def bigrams_dictionary_create(tknz_texts):
    m = Mystem()

    # build the token frequency dataframe:
    dfr = frequency_dictionary_create(tknz_texts)
    # list of (token, part-of-speech) tuples, one sublist per text
    ws_sbj = []
    for tx in tknz_texts:
        temp_ws_sbj = []
        for anlys_dict in m.analyze(" ".join(tx)):
            try:
                # keep only the Latin POS letters from the grammeme string,
                # e.g. 'S,жен,неод=им,ед' -> 'S'
                sbj = re.sub('[^A-Z]', '', anlys_dict['analysis'][0]['gr'])
                w = anlys_dict['text']
                temp_ws_sbj.append((w, sbj))
            except (KeyError, IndexError):
                # token without morphological analysis (punctuation, digits, ...)
                print("Exception:", anlys_dict)
        ws_sbj.append(temp_ws_sbj)

    # keep only adjectives (A) and nouns (S):
    ws_sbj_sa = [[t for t in x if t[1] in ['A', 'S']] for x in ws_sbj]

    # drop empty lists, if any:
    ws_sbj_sa = [x for x in ws_sbj_sa if x != []]

    # bigram candidates: AS, SS (possibly worth promoting to a parameter):
    bigrams_candidate = []
    for q_list in ws_sbj_sa:
        bigrams_candidate.append([
            x for x in sliceArray(q_list, length=2, stride=2)
            if ''.join([x[0][1], x[1][1]]) in ['AS', 'SS']
        ])

    bigrams_candidate = [x for x in bigrams_candidate if x != []]
    bigrams_candidate_sep = [[(''.join([x[0][0], x[1][0]]), x[0][0], x[1][0])
                              for x in bg] for bg in bigrams_candidate]

    # flatten the list of bigrams
    bigrams_candidate_sep = flatten_list(bigrams_candidate_sep)

    # build a pandas dataframe from the bigram candidates
    bigrams_candidate_df = pd.DataFrame(bigrams_candidate_sep,
                                        columns=['bigrams', 'w1', 'w2'])

    # compute frequencies for the bigrams and their tokens;
    # first, bigram frequency:
    bgms_cand_freq = bigrams_candidate_df[['bigrams', 'w1'
                                           ]].groupby('bigrams',
                                                      as_index=False).count()
    bgms_cand_freq.rename(columns={'w1': 'quantity'}, inplace=True)
    bgms_cand_freq['freq'] = bgms_cand_freq['quantity'] / sum(
        bgms_cand_freq['quantity'])

    # bring the component words back in:
    bgms_freq_words = pd.merge(bgms_cand_freq,
                               bigrams_candidate_df,
                               how='left',
                               on='bigrams',
                               copy=False)
    bgms_freq_words.drop_duplicates(inplace=True)

    # join the token frequency dictionary onto the bigram dictionary
    dfr_w1 = dfr.rename(columns={'freq': 'w1_freq', 'token': 'w1'})
    bigrams_est = pd.merge(bgms_freq_words, dfr_w1[['w1', 'w1_freq']], on='w1')

    dfr_w2 = dfr.rename(columns={'freq': 'w2_freq', 'token': 'w2'})
    bigrams_est = pd.merge(bigrams_est, dfr_w2[['w2', 'w2_freq']], on='w2')
    bigrams_est.rename(columns={'freq': 'bigrms_freq'}, inplace=True)

    # everything is now ready to score how much each bigram looks like a
    # stable collocation:
    # number of corpus words involved in building bigrams (2 words per bigram)
    n = 2 * sum(bigrams_est['quantity'])

    # mutual-information estimate for the words forming each bigram:
    bigrams_est['estimate'] = np.log(
        (n * bigrams_est['bigrms_freq']**3) /
        (bigrams_est['w1_freq'] * bigrams_est['w2_freq']))
    bigrams_est_sort_df = bigrams_est.sort_values('estimate', ascending=False)
    return bigrams_est_sort_df
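sliceArray is another helper that is not shown; from the call sliceArray(q_list, length=2, stride=2) it evidently yields fixed-length windows over a list. A minimal sketch under that assumption:

def sliceArray(seq, length=2, stride=2):
    # fixed-length windows over seq, advancing by stride;
    # a trailing partial window is dropped
    return [seq[i:i + length] for i in range(0, len(seq) - length + 1, stride)]

With stride=2 the windows do not overlap, so only bigrams starting at even offsets are considered. The score log(n * f_bg**3 / (f_w1 * f_w2)) computed above is the cubed-frequency variant of pointwise mutual information (often called MI3); cubing the bigram frequency damps plain PMI's bias toward very rare pairs.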
 def writeStandard(self, noBondDesc=False):
     # write the string with the backbone as the main chain;
     # the first two bonding sites found serve as the chain ends
     if noBondDesc:
         self.noWriteBondDesc = True
     ## get all the bonding sites
     # one flattened list of bonding sites across all BigSMILES bonding descriptors
     bonding_sites = flatten_list([x[1:] for x in self.Bond_Dict.values()])
     # with fewer than two bonding sites there is no backbone to trace, so fall
     # back to a plain write; otherwise take the first two sites as the chain ends
     if len(bonding_sites) == 0:
         smilesStr = self.write()
         self.noWriteBondDesc = False
         return smilesStr
     elif len(bonding_sites) == 1:
         source = bonding_sites[0]
         smilesStr = self.write(source)
         self.noWriteBondDesc = False
         return smilesStr
     else:
         source = bonding_sites[0]
         target = bonding_sites[1]
     
     # get the backbone, defined as the shortest path between the two ends
     path = nx.shortest_path(self.G, source=source, target=target)
     
     for i in range(1,len(path)-1):
         atom = path[i]
         next_atom = path[i+1]
         prev_atom = path[i-1]
         L = self.G.nodes[atom]['neighList']
         
         # first rotate the list L so that the previous atom is at the front
         rotCount = 0
         while L[0] != prev_atom:
             L = deque(L)
             L.rotate(-1)
             rotCount += 1
         L = list(L)
         
         # then check whether next_atom needs to be swapped to the end
         swapCount = 0
         if L.index(next_atom) == len(L) - 1:
             # next_atom is already last in the list, i.e. on the main chain
             # rather than on a branch; nothing to change
             pass
         else:
             # swap next_atom with the last atom in the list
             L[L.index(next_atom)] = L[-1]
             L[-1] = next_atom
             self.G.nodes[atom]['neighList'] = L
             swapCount += 1
         
         # change chirality accordingly
         if self.G.nodes[atom]['chiral'] == '':
             # no chirality specified, whew, nothing to change
             pass
         else:
             # the parity of (rotations + swaps) decides whether '@' and '@@' flip
             nchiral = len(self.G.nodes[atom]['chiral']) - 1
             nchiral = ((nchiral + rotCount + swapCount) % 2) + 1
             self.G.nodes[atom]['chiral'] = '@' * nchiral
     
     # done rearranging the neighbour lists; writing from source now follows the backbone path

     # now deal with the loops:
     # a BFS tree from source ensures all loops are cut on chains other than the main one
     self.T = nx.bfs_tree(self.G, source, reverse=True).to_undirected()
     
     # edges of G that are absent from the BFS tree are ring-closure bonds
     self.ringDict = dict()
     self.usedRingID = [False] * 100
     for edge in self.G.edges():
         if tuple(edge) not in self.T.edges():
             self.ringDict[edge] = -1
     
     
     smilesStr = self.writeComponents(source)
     self.noWriteBondDesc = False
     
     return smilesStr
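The ring bookkeeping above uses a standard graph fact: the edges missing from any spanning (BFS) tree are exactly the cycle-closing edges. A small self-contained illustration with networkx (the toy graph is hypothetical):

import networkx as nx

# toy molecule graph: a triangle (one ring) plus a one-node tail
G = nx.Graph([(0, 1), (1, 2), (2, 0), (2, 3)])
T = nx.bfs_tree(G, 0, reverse=True).to_undirected()

# edges of G absent from the BFS tree close the ring
ring_edges = [e for e in G.edges() if tuple(e) not in T.edges()]
print(ring_edges)  # [(1, 2)]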
class BigSmilesPattern(SmilesPattern):

#### DEFINITIONS of patterns involved in BigSMILES_Bond ####
    _BigSmilesBondChar = "$<>"
    # a bonding descriptor: one of $, <, >, optionally followed by an id
    # (a single digit, or '%' plus two digits)
    _BondDesc = Word(_BigSmilesBondChar, exact=1).setResultsName('BigSMILES_Bondtype') + \
                ((Word(nums, exact=1).setResultsName('BigSMILES_Bondid') |
                  Literal('%') + Word(nums, exact=2).setResultsName('BigSMILES_Bondid')))*(0, 1)
#    _ladderBondDesc = Word(_bigsmilesBondChar,exact=1).setResultsName('BigSMILES_outerBondtype') + \
#                      '[' + _bondDesc + ']' + \
#                      (Word(nums,exact=1).setResultsName('BigSMILES_outerbondid') | \
#                      Literal('%')+Word(nums,exact=2).setResultsName('BigSMILES_outerbondid') ) 
                      
#    _bigsmilesBond = _ladderBondDesc.setResultsName('BigSMILES_ladderBond') | _bondDesc.setResultsName('BigSMILES_Bond')
    _BigSmilesBond = (Literal('[') + _BondDesc.setResultsName('BigSMILES_Bond') + Literal(']') )

#### DEFINITIONS of patterns involved in Augmented_SMILES ####
    
    # redefinition for the elements used in parsing of Augmented SMILES strings 
    _AugmentedSmilesChar = SmilesPattern._smilesChar | _BigSmilesBond
    
    _AugmentedBranchContent = _AugmentedSmilesChar*(1,None)
    _AugmentedBranchContent.setParseAction(lambda toks: ''.join(toks))
    
    _AugmentedBranch = nestedExpr('(',')',content=_AugmentedBranchContent)
    _AugmentedBranch.setParseAction(lambda toks: '('+''.join(flatten_list(toks,str))+')')
    
    # _AugmentedSmilesElement explicitly used in Augmented_SMILES()
    _AugmentedSmilesElement = _AugmentedSmilesChar | _AugmentedBranch.setResultsName('branch')
    _AugmentedSmilesElement.addParseAction(SmilesPattern.addRawStr)

    
    
#### DEFINITIONS of stochastic object
    _StoObjSepChar = ",;"
    # a terminal bond descriptor: brackets enclosing an optional bond descriptor
    # (_opener and _closer are used explicitly in BigSMILES_StoObj())
    _TerminalBond = (Literal('[') + (_BondDesc*(0, 1)).setResultsName('BigSMILES_Bond') + Literal(']')).setResultsName('BigSMILES_terminalBond')
    _opener = StringStart() + _TerminalBond
    _opener.setParseAction(SmilesPattern.addRawStr)
    _closer = _TerminalBond + StringEnd()
    _closer.setParseAction(SmilesPattern.addRawStr)
    _StoObjSep = Word(_StoObjSepChar, exact=1)
    
    # a stochastic object: arbitrary non-curly content inside nested '{...}'
    printableExceptCurly = printables.replace('{', '').replace('}', '')
    _StoObjContent = Word(printableExceptCurly)
    _StoObjContent.setParseAction(lambda toks: ''.join(toks))
    _StoObj = nestedExpr('{', '}', content=_StoObjContent)
    _StoObj.setParseAction(lambda toks: '{' + ''.join(flatten_list(toks, str)) + '}')
    
    
    def separateList(toks):
        # split the parsed token list into repeat units and end groups:
        # commas separate units; a semicolon separates the two lists
        L = [x for x in toks if x != ',']
        if 'endGrp' not in toks.keys():
            toks['repUnit'] = L
            toks['endGrp'] = list()
        else:
            n = L.index(';')
            toks['repUnit'] = L[:n]
            toks['endGrp'] = L[n + 1:]
        toks['rawStr'] = ''.join(toks)
        return toks
    
    printableExceptSemicolon = printables.replace(';', '')
    printableExceptCommaSemicolon = printableExceptSemicolon.replace(',', '')
    _StoObjUnit = Word(printableExceptCommaSemicolon)
    _StoObjList = _StoObjUnit + ("," + _StoObjUnit)*(0,None)
    _StoObjLists = _StoObjList.setResultsName('repUnit') + \
                   (Literal(';') + _StoObjList.setResultsName('endGrp'))*(0,1)
    _StoObjLists.setParseAction(separateList)
    
    
    
#### DEFINITIONS of patterns involved in BigSMILES() ####
    _BigSmilesChar = SmilesPattern._smilesChar | _BigSmilesBond | _StoObj.setResultsName('BigSMILES_StoObj')
    
    _BigSmilesBranchContent = _BigSmilesChar*(1,None)
    _BigSmilesBranchContent.setParseAction(lambda toks: ''.join(toks))
    
    _BigSmilesBranch = nestedExpr('(',')',content=_BigSmilesBranchContent)
    _BigSmilesBranch.setParseAction(lambda toks: '('+''.join(flatten_list(toks,str))+')')
    
    _BigSmilesElement = _BigSmilesChar | _BigSmilesBranch.setResultsName('branch')
    _BigSmilesElement.addParseAction(SmilesPattern.addRawStr)
    
    

    
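A small smoke test of the patterns above; the input strings are hypothetical, and the result-name lookups assume standard pyparsing behaviour (names set on sub-expressions remain accessible from the top-level ParseResults):

res = BigSmilesPattern._BigSmilesBond.parseString('[$1]')
print(res['BigSMILES_Bondtype'])  # '$'
print(res['BigSMILES_Bondid'])    # '1'

res = BigSmilesPattern._StoObjLists.parseString('CC,OO;[H]')
print(res['repUnit'])  # ['CC', 'OO']
print(res['endGrp'])   # ['[H]']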