def getFunctionality(self):
    # collect the lists of bonding sites for each BigSMILES bonding descriptor
    bonding_sites = [x for x in self.Bond_Dict.values()]
    bonding_sites_flat = flatten_list([x[1:] for x in self.Bond_Dict.values()])
    # total functionality and number of distinct bonding descriptor types
    totF = len(bonding_sites_flat)
    noTypeBond = len(bonding_sites)
    return totF, noTypeBond, \
        [x[0].getCompleteSymbol() for x in bonding_sites], \
        [len(x) - 1 for x in bonding_sites]
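# Illustrative sketch of what getFunctionality returns. The _StubDescriptor class and the
# example Bond_Dict contents below are assumptions made purely for demonstration; the real
# entries are produced by the BigSMILES parser elsewhere in this module.
#
#   class _StubDescriptor:
#       def __init__(self, symbol): self.symbol = symbol
#       def getCompleteSymbol(self): return self.symbol
#
#   # each Bond_Dict value is assumed to look like [descriptor, site, site, ...]
#   mol.Bond_Dict = {'$1': [_StubDescriptor('[$1]'), 3, 7],
#                    '<2': [_StubDescriptor('[<2]'), 9]}
#   mol.getFunctionality()   # -> (3, 2, ['[$1]', '[<2]'], [2, 1])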
def frequency_dictionary_create(tknz_texts):
    dict_frequency = defaultdict(int)
    tokens = [i for i in flatten_list(tknz_texts)]
    for token in tokens:
        dict_frequency[token] += 1
    # sort the dictionary by value (token count), most frequent first
    sort_dict = sorted(dict_frequency.items(),
                       key=operator.itemgetter(1),
                       reverse=True)
    dict_frequency_df = pd.DataFrame(sort_dict, columns=['token', 'quantity'])
    dict_frequency_df['freq'] = dict_frequency_df['quantity'] / sum(
        dict_frequency_df['quantity'])
    # sort_values returns a new frame, so keep the result explicitly
    dict_frequency_df = dict_frequency_df.sort_values('freq', ascending=False)
    return dict_frequency_df
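# Minimal usage sketch for frequency_dictionary_create; the tiny tokenized corpus below is a
# made-up placeholder, not real data.
def _demo_frequency_dictionary():
    sample_texts = [['хороший', 'сервис'], ['хороший', 'отель']]
    dfr = frequency_dictionary_create(sample_texts)
    # expected columns: 'token', 'quantity', 'freq';
    # 'хороший' appears twice, so its quantity is 2 and its freq is 0.5
    return dfr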
def bigrams_dictionary_create(tknz_texts):
    m = Mystem()
    # build the token frequency dictionary dataframe:
    dfr = frequency_dictionary_create(tknz_texts)
    # list of (token, part-of-speech) tuples
    ws_sbj = []
    for tx in tknz_texts:
        temp_ws_sbj = []
        for anlys_dict in m.analyze(" ".join(tx)):
            try:
                sbj = re.sub('[^A-Z]', '', anlys_dict['analysis'][0]['gr'])
                w = anlys_dict['text']
                temp_ws_sbj.append((w, sbj))
            except Exception:
                # w and sbj may not be bound yet, so only report the offending entry
                print("Exception:", anlys_dict)
        ws_sbj.append(temp_ws_sbj)
    # keep only adjectives (A) and nouns (S):
    ws_sbj_sa = [[t for t in x if t[1] in ['A', 'S']] for x in ws_sbj]
    # drop empty lists, if any:
    ws_sbj_sa = [x for x in ws_sbj_sa if x != []]
    # bigram candidates: AS, SS (possibly should be moved into parameters):
    bigrams_candidate = []
    for q_list in ws_sbj_sa:
        bigrams_candidate.append([
            x for x in sliceArray(q_list, length=2, stride=2)
            if ''.join([x[0][1], x[1][1]]) in ['AS', 'SS']
        ])
    bigrams_candidate = [x for x in bigrams_candidate if x != []]
    bigrams_candidate_sep = [[(''.join([x[0][0], x[1][0]]), x[0][0], x[1][0])
                              for x in bg] for bg in bigrams_candidate]
    # flatten the list of bigrams
    flatit = flatten_list(bigrams_candidate_sep)
    bigrams_candidate_sep = [x for x in flatit]
    # build a pandas dataframe from the bigram candidates
    bigrams_candidate_df = pd.DataFrame(bigrams_candidate_sep,
                                        columns=['bigrams', 'w1', 'w2'])
    # count the frequency of the bigrams and of their tokens
    # bigram frequency:
    bgms_cand_freq = bigrams_candidate_df[['bigrams', 'w1']].groupby(
        'bigrams', as_index=False).count()
    bgms_cand_freq.rename(columns={'w1': 'quantity'}, inplace=True)
    bgms_cand_freq['freq'] = bgms_cand_freq['quantity'] / sum(
        bgms_cand_freq['quantity'])
    # bring the words back:
    bgms_freq_words = pd.merge(bgms_cand_freq,
                               bigrams_candidate_df,
                               how='left',
                               on='bigrams',
                               copy=False)
    bgms_freq_words.drop_duplicates(inplace=True)
    # join the token frequency dictionary with the bigram dictionary
    dfr_w1 = dfr.rename(columns={'freq': 'w1_freq', 'token': 'w1'})
    bigrams_est = pd.merge(bgms_freq_words, dfr_w1[['w1', 'w1_freq']], on='w1')
    dfr_w2 = dfr.rename(columns={'freq': 'w2_freq', 'token': 'w2'})
    bigrams_est = pd.merge(bigrams_est, dfr_w2[['w2', 'w2_freq']], on='w2')
    bigrams_est.rename(columns={'freq': 'bigrms_freq'}, inplace=True)
    # everything is now ready to estimate how likely each bigram is a stable collocation:
    # number of corpus words taking part in the bigrams (each bigram has 2 words)
    n = 2 * sum(bigrams_est['quantity'])
    # mutual-information estimate for the words that form each bigram:
    bigrams_est['estimate'] = np.log(
        (n * bigrams_est['bigrms_freq']**3) /
        (bigrams_est['w1_freq'] * bigrams_est['w2_freq']))
    bigrams_est_sort_df = bigrams_est.sort_values('estimate', ascending=False)
    return bigrams_est_sort_df
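# Worked example of the collocation score above (illustrative numbers only):
# with n = 1000 words in bigrams, a bigram frequency of 0.01 and token frequencies
# of 0.02 and 0.03, the estimate is
#   log(1000 * 0.01**3 / (0.02 * 0.03)) = log(1.667) ≈ 0.51,
# so bigrams whose joint frequency is high relative to their parts score higher.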
def writeStandard(self, noBondDesc=False):
    # write the string with the backbone (between the two chain ends) as the main chain
    if noBondDesc:
        self.noWriteBondDesc = True
    ## get all the binding sites
    # get lists of binding sites corresponding to different BigSMILES bonding descriptors
    bonding_sites = flatten_list([x[1:] for x in self.Bond_Dict.values()])
    # exit if there are fewer than two bonding sites
    #if len(bonding_sites) < 2:
    #    print('Error in writing standardized repeat unit: expected at least 2 bonding sites but '+str(len(bonding_sites))+' found')
    #    return None
    # choose the first two as the ends
    if len(bonding_sites) == 0:
        smilesStr = self.write()
        self.noWriteBondDesc = False
        return smilesStr
    elif len(bonding_sites) == 1:
        source = bonding_sites[0]
        smilesStr = self.write(source)
        self.noWriteBondDesc = False
        return smilesStr
    else:
        source = bonding_sites[0]
        target = bonding_sites[1]

    # get the backbone (defined as the shortest path between the two ends)
    path = nx.shortest_path(self.G, source=source, target=target)
    #print(len(path))
    #print(path)
    #G_copy = self.G.copy()  # don't bother to copy

    for i in range(1, len(path) - 1):
        atom = path[i]
        next_atom = path[i + 1]
        prev_atom = path[i - 1]
        L = self.G.nodes[atom]['neighList']
        # first rotate the list L so that the previous atom is at the front
        rotCount = 0
        while L[0] != prev_atom:
            L = deque(L)
            L.rotate(-1)
            rotCount += 1
            L = list(L)
        # then check if next_atom needs to be swapped to the end
        swapCount = 0
        if L.index(next_atom) == len(L) - 1:
            #print('no swap on atom '+str(atom))
            # no need to change things around, as next_atom is already at the end of the list,
            # i.e. on the main chain and not on a branch
            pass
        else:
            #print('swapped bonds on atom '+str(atom))
            # need to swap next_atom with the last atom
            L[L.index(next_atom)] = L[-1]
            L[-1] = next_atom
            self.G.nodes[atom]['neighList'] = L
            swapCount += 1
        #print(L)
        # change chirality accordingly
        if self.G.nodes[atom]['chiral'] == '':
            # no chirality specified, nothing to change
            pass
        else:
            nchiral = len(self.G.nodes[atom]['chiral']) - 1
            nchiral = ((nchiral + rotCount + swapCount) % 2) + 1
            self.G.nodes[atom]['chiral'] = '@' * nchiral

    # done changing the graph; starting from source, writeLinear will give the chain according to path
    # now deal with the loops
    # get the bfs tree from source; this ensures all loops are cut on chains other than the main desired one
    self.T = nx.bfs_tree(self.G, source, reverse=True).to_undirected()
    self.ringDict = dict()
    self.usedRingID = [False] * 100
    for edge in self.G.edges():
        if not tuple(edge) in self.T.edges():
            self.ringDict[edge] = -1
    #print(self.ringDict)
    #print(self.T.edges())
    #smilesStr = self.writeLinear((None,source))
    smilesStr = self.writeComponents(source)
    self.noWriteBondDesc = False
    return smilesStr
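# Hedged usage sketch (the molecule construction happens elsewhere in this module; 'unit'
# stands for an already-parsed repeat unit object and is an assumption for illustration):
#
#   s = unit.writeStandard()                       # SMILES with the backbone written as the main chain
#   s_plain = unit.writeStandard(noBondDesc=True)  # same traversal, with bonding descriptors
#                                                  # suppressed via the noWriteBondDesc flag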
class BigSmilesPattern(SmilesPattern):
    #### DEFINITIONS of patterns involved in BigSMILES_Bond ####
    _BigSmilesBondChar = "$<>"
    _BondDesc = Word(_BigSmilesBondChar, exact=1).setResultsName('BigSMILES_Bondtype') + \
        ((Word(nums, exact=1).setResultsName('BigSMILES_Bondid') |
          Literal('%') + Word(nums, exact=2).setResultsName('BigSMILES_Bondid')))*(0, 1)
    # _ladderBondDesc = Word(_bigsmilesBondChar,exact=1).setResultsName('BigSMILES_outerBondtype') + \
    #                   '[' + _bondDesc + ']' + \
    #                   (Word(nums,exact=1).setResultsName('BigSMILES_outerbondid') | \
    #                    Literal('%')+Word(nums,exact=2).setResultsName('BigSMILES_outerbondid') )
    # _bigsmilesBond = _ladderBondDesc.setResultsName('BigSMILES_ladderBond') | _bondDesc.setResultsName('BigSMILES_Bond')
    _BigSmilesBond = (Literal('[') + _BondDesc.setResultsName('BigSMILES_Bond') + Literal(']'))

    #### DEFINITIONS of patterns involved in Augmented_SMILES ####
    # redefinition for the elements used in parsing of Augmented SMILES strings
    _AugmentedSmilesChar = SmilesPattern._smilesChar | _BigSmilesBond
    _AugmentedBranchContent = _AugmentedSmilesChar*(1, None)
    _AugmentedBranchContent.setParseAction(lambda toks: ''.join(toks))
    _AugmentedBranch = nestedExpr('(', ')', content=_AugmentedBranchContent)
    #_AugmentedBranch.setParseAction(lambda toks: '('+''.join([str(item) for sublist in toks for item in sublist])+')')
    _AugmentedBranch.setParseAction(lambda toks: '(' + ''.join(flatten_list(toks, str)) + ')')
    # _AugmentedSmilesElement explicitly used in Augmented_SMILES()
    _AugmentedSmilesElement = _AugmentedSmilesChar | _AugmentedBranch.setResultsName('branch')
    _AugmentedSmilesElement.addParseAction(SmilesPattern.addRawStr)

    #### DEFINITIONS of stochastic object ####
    _StoObjSepChar = ",;"
    #_BracketedBond = (Literal('[') + _BigSmilesBond + Literal(']')).setResultsName('BigSMILES_bracketedBond')
    _TerminalBond = (Literal('[') + (_BondDesc*(0, 1)).setResultsName('BigSMILES_Bond') +
                     Literal(']')).setResultsName('BigSMILES_terminalBond')
    _opener = StringStart() + _TerminalBond
    _opener.setParseAction(SmilesPattern.addRawStr)
    _closer = _TerminalBond + StringEnd()
    _closer.setParseAction(SmilesPattern.addRawStr)
    _StoObjSep = Word(',;', exact=1)

    printableExceptCurly = printables.replace('{', '').replace('}', '')
    _StoObjContent = Word(printableExceptCurly)  # .setResultsName('StoObjCont')
    _StoObjContent.setParseAction(lambda toks: ''.join(toks))
    _StoObj = nestedExpr('{', '}', content=_StoObjContent)
    _StoObj.setParseAction(lambda toks: '{' + ''.join(flatten_list(toks, str)) + '}')
    #_StoObjDummy = (Literal('{') + Word(nums).setResultsName('StoObjId') + Literal('}')).setResultsName('BigSMILES_StoObj')

    def separateList(toks):
        L = [x for x in toks if x != ',']
        if not 'endGrp' in toks.keys():
            toks['repUnit'] = L
            toks['endGrp'] = list()
        else:
            n = L.index(';')
            toks['repUnit'] = L[:n]
            toks['endGrp'] = L[n+1:]
        toks['rawStr'] = ''.join(toks)
        return toks

    printableExceptSemicolon = printables.replace(';', '')
    printableExceptCommaSemicolon = printableExceptSemicolon.replace(',', '')
    _StoObjUnit = Word(printableExceptCommaSemicolon)
    _StoObjList = _StoObjUnit + ("," + _StoObjUnit)*(0, None)
    _StoObjLists = _StoObjList.setResultsName('repUnit') + \
        (Literal(';') + _StoObjList.setResultsName('endGrp'))*(0, 1)
    _StoObjLists.setParseAction(separateList)

    #### DEFINITIONS of patterns involved in BigSMILES() ####
    _BigSmilesChar = SmilesPattern._smilesChar | _BigSmilesBond | _StoObj.setResultsName('BigSMILES_StoObj')
    _BigSmilesBranchContent = _BigSmilesChar*(1, None)
    _BigSmilesBranchContent.setParseAction(lambda toks: ''.join(toks))
    _BigSmilesBranch = nestedExpr('(', ')', content=_BigSmilesBranchContent)
    _BigSmilesBranch.setParseAction(lambda toks: '(' + ''.join(flatten_list(toks, str)) + ')')
    _BigSmilesElement = _BigSmilesChar | _BigSmilesBranch.setResultsName('branch')
    _BigSmilesElement.addParseAction(SmilesPattern.addRawStr)

    # additional definition of augmented SMILES for parsing the entire augmented SMILES segments (BigSMILES Chain Objects)
    # _augBranchContent = (SmilesPattern._smilesChar | _bigsmilesBond)*(1,None)
    # _augBranch = nestedExpr('(',')',content=_augBranchContent)
    # _augBranch.setParseAction(lambda toks: '('+''.join([str(item) for sublist in toks for item in sublist])+')')
    # _augSmilesElement = SmilesPattern._smilesChar | _augBranch
    # _bigsmilesChar = _augSmilesElement | _bigsmilesBond
    # _bigsmileschainObj = _bigsmilesChar*(1,None)
    # _bigsmileschainObj.setParseAction(lambda toks: ''.join(toks))

    # bracketed bonds and definition for starting/ending patterns for stochastic objects
    # _opener, _closer, _stoObjExactContent explicitly used in BigSMILES_StoObj()
    # _bracketedBond = (Literal('[') + _bigsmilesBond + Literal(']')).setResultsName('BigSMILES_bracketedBond')
    # _opener = StringStart()+(_bracketedBond)*(0,1)
    # _opener.setParseAction(SmilesPattern.addRawStr)
    # _closer = (_bracketedBond)*(0,1)+StringEnd()
    # _closer.setParseAction(SmilesPattern.addRawStr)
    # _stoObjSep = Word(',;',exact=1)
    # _stoObjContent = (_AugmentedSmilesChar | _stoObjSep | _bracketedBond)*(1,None)
    # _stoObjContent.setParseAction(lambda toks: ''.join(toks))
    # _stoObj = nestedExpr('{','}',content=_stoObjContent)
    #_stoObj.setParseAction(lambda toks: '{'+''.join([str(item) for sublist in toks for item in sublist])+'}')
    # _stoObj.setParseAction(lambda toks: '{'+''.join(flatten_list(toks,str))+'}')
    # _bigsmilesElement = _bigsmileschainObj.setResultsName('Augmented_SMILES') | \
    #                     _stoObj.setResultsName('BigSMILES_StoObj')
    # _bigsmilesElement.setParseAction(SmilesPattern.addRawStr)

    def separateList(toks):
        L = [x for x in toks if x != ',']
        if not 'endGrp' in toks.keys():
            toks['repUnit'] = L
            toks['endGrp'] = list()
        else:
            n = L.index(';')
            toks['repUnit'] = L[:n]
            toks['endGrp'] = L[n+1:]
        toks['rawStr'] = ''.join(toks)
        return toks
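    # Illustrative behaviour of separateList (the unit strings 'A', 'B', 'C' below are
    # placeholders, not real repeat units): when _StoObjLists matches 'A,B;C', the parse
    # action drops the commas, splits the remaining tokens at ';', and sets
    #   toks['repUnit'] -> ['A', 'B'], toks['endGrp'] -> ['C'], toks['rawStr'] -> 'A,B;C'.
    # With no ';' present (e.g. 'A,B'), everything goes into repUnit and endGrp stays empty.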