def preProcess(self):
    """Prepare DB connection, dump indexes and the in-memory verb-stem cache.

    Called once before iteration over documents starts.
    """
    # DB connection; NOTE(review): host/port/credentials should be moved to a
    # config and obtained through the accessor, as the original comment intended
    self.dbConnection = pymysql.connect(host='localhost', port=3306, user='******', passwd='', charset='utf8', db='wikiparse')
    # cursor used for all queries below
    self.dbCursor = self.dbConnection.cursor()
    # part-of-speech tagger
    self.posTagger = POSTagger()
    # redirects index
    self.redirects = self.accessor.getIndex(RedirectsIndex)
    # index of article texts stripped of wiki markup
    self.plainTextIndex = self.accessor.getIndex(WikiPlainTextIndex)
    self.clear()
    # map: verb normal form (stem) -> DB id
    self.stems = {}
    # token splitter
    self.wordSplitter = TokenSplitter()
    # SQL templates
    self.addStemQuery = "INSERT INTO verbs(stem) VALUES (%s)"
    self.getStemIdQuery = "SELECT id FROM verbs WHERE stem LIKE %s"
    self.insertVerbToDocQuery = "INSERT INTO verb_to_doc(doc_id,verb_id,is_ambig,position,score) VALUES "
    self.queryElement = "(%s, %s, %s, %s, %s)"
    # preload stems that are already stored in the DB
    self.dbCursor.execute("SELECT * FROM verbs ORDER BY id")
    for stem in self.dbCursor.fetchall():
        self.stems[stem[1]] = stem[0]
def __init__(self, directory=None, file=None, text=None, clearWrap=True):
    """Load the text to analyze, either from *directory*/*file* or from *text*.

    File contents are read as UTF-8 and passed through TextCleaner; a bare
    *text* string is used as-is. With neither source, an empty text is used.
    """
    if file:
        cleaner = TextCleaner(directory, clearWrap)
        self.directory = directory
        self.file = file
        full_path = directory + self.file
        with codecs.open(full_path, 'r', "utf-8") as source:
            raw_lines = source.readlines()
        self.text = cleaner.clean(raw_lines)
    elif text:
        self.text = text
    else:
        print('There is no text or file to parse')
        self.text = ''
    self.tokenSplitter = TokenSplitter()
    self.posTagger = POSTagger()
def __init__(self, accessor, headerIndexPrefix):
    """Wire up the header index, token splitter and POS tagger.

    Also loads the fragment configuration from the accessor's directory.
    """
    self.accessor = accessor
    self.prefix = headerIndexPrefix
    self.headerIndex = HeadersFileIndex(accessor, headerIndexPrefix)
    self.tokenSplitter = TokenSplitter()
    self.posTagger = POSTagger()
    FragmentConfig(accessor.directory)
def __init__(self, accessor, headerIndexPrefix=None, configuration=None):
    """Wire up indexes and tokenization helpers.

    *configuration*, when given, is used directly as the fragment-type to
    headers mapping; otherwise it is loaded via FragmentConfig.
    """
    self.accessor = accessor
    self.prefix = headerIndexPrefix
    self.headerIndex = HeadersFileIndex(accessor, '')
    self.tokenSplitter = TokenSplitter()
    self.posTagger = POSTagger()
    if configuration:
        self.fragmentTypesToHeaders = configuration
        return
    FragmentConfig(accessor.directory)
    self.fragmentTypesToHeaders = FragmentConfig.fragmentTypesToHeaders
def __init__(self, accessor, prefix):
    """Build matchers/splitters and precompute verb-histogram norms per type."""
    self.tokenSplitter = TokenSplitter()
    self.posTagger = POSTagger()
    # NOTE(review): two separate FormalLanguagesMatcher instances are kept
    # (flSelector and formalLanguagesMatcher), mirroring the original design
    self.flSelector = FormalLanguagesMatcher()
    self.defisWordsBuilder = DefisWordsBuilder()
    self.initialsWordsBuilder = InitialsWordsBuilder()
    self.formalLanguagesMatcher = FormalLanguagesMatcher()
    self.headersMatcher = HeaderMatcher()
    self.sentenceSplitter = SentenceSplitter()
    self.posListIndex = POSListIndex(accessor, prefix)
    self.collocatonsGrammars = CollocationGrammars(accessor, prefix)
    self.fragmentTypes = self.collocatonsGrammars.getFunctionalTypes()
    self.verbs = self.posListIndex.getVerbsHistsForAllTypes()
    self.patternMatcher = PatternMatcher()
    self.sq = lambda x: x * x
    # Euclidean norm of each fragment type's verb histogram
    self.sums = {fType: self.module(hist) for fType, hist in self.verbs.items()}
class TextStat:
    """Sliding-window part-of-speech statistics over a text.

    The text is tokenized and POS-tagged once; `getSlice` then slides a
    window over the filtered tokens and records, at each position, counts
    of verbs/nouns/adjectives/function words, sentence punctuation, and
    the number of unique stems per category.
    """

    # pymorphy-style POS tag groups per statistic category
    _VERB_TAGS = ('VERB', 'INFN', 'PRTF', 'PRTS', 'GRND')
    _NOUN_TAGS = ('NOUN', 'NPRO')
    _ADJ_TAGS = ('ADJF', 'ADJS', 'COMP', 'ADVB', 'PRED')
    _FUNC_TAGS = ('PREP', 'CONJ', 'PRCL', 'INTJ')
    _SERIES_KEYS = ("DOTS", "COMMAS", "NOUNS", "VERBS", "ADJS", "FUNC",
                    "UNIQUE_NOUNS", "UNIQUE_VERBS", "UNIQUE_ADJS", "UNIQUE_FUNC")

    def __init__(self, directory=None, file=None, text=None, clearWrap=True):
        """Load text from *directory*/*file* (cleaned by TextCleaner) or from *text*."""
        if file:
            cleaner = TextCleaner(directory, clearWrap)
            self.directory = directory
            self.file = file
            with codecs.open(directory + self.file, 'r', "utf-8") as myfile:
                self.text = myfile.readlines()
            self.text = cleaner.clean(self.text)
        elif text:
            self.text = text
        else:
            print('There is no text or file to parse')
            self.text = ''
        self.tokenSplitter = TokenSplitter()
        self.posTagger = POSTagger()

    def getStem(self, token):
        """Return the best normal form of *token*, with 'ё' folded to 'е'."""
        return token.POS[0]['normalForm'].replace('ё', 'е')

    def addToken(self, token):
        """Account *token* into the current window counters."""
        if token.tokenType == TYPE_SIGN:
            if token.token in ";,":
                self.sumCommas += 1
            if token.token in ".?!":
                self.sumDots += 1
            return
        pos = token.getBestPOS()
        if pos in self._VERB_TAGS:
            self.sumVerb += 1
            self.setVerb[self.getStem(token)] += 1
        if pos in self._NOUN_TAGS:
            self.sumNoun += 1
            self.setNoun[self.getStem(token)] += 1
        if pos in self._ADJ_TAGS:
            self.sumAdj += 1
            self.setAdj[self.getStem(token)] += 1
        if pos in self._FUNC_TAGS:
            self.sumFunc += 1
            self.setFunc[self.getStem(token)] += 1

    def removeToken(self, token):
        """Remove *token*'s contribution when it slides out of the window."""
        if token.tokenType == TYPE_SIGN:
            if token.token in ";,":
                self.sumCommas -= 1
            if token.token in ".?!":
                self.sumDots -= 1
            return
        pos = token.getBestPOS()
        # Counter subtraction drops entries that reach zero, which keeps
        # the UNIQUE_* statistics (len of the Counter) correct.
        if pos in self._VERB_TAGS:
            self.sumVerb -= 1
            self.setVerb -= Counter({self.getStem(token): 1})
        if pos in self._NOUN_TAGS:
            self.sumNoun -= 1
            self.setNoun -= Counter({self.getStem(token): 1})
        if pos in self._ADJ_TAGS:
            self.sumAdj -= 1
            self.setAdj -= Counter({self.getStem(token): 1})
        if pos in self._FUNC_TAGS:
            self.sumFunc -= 1
            self.setFunc -= Counter({self.getStem(token): 1})

    def prepareText(self, text):
        """Tokenize and POS-tag *text*; return only tokens relevant for stats.

        BUG FIX: the original ignored the *text* argument and always read
        self.text; all callers in this file pass self.text, so behavior is
        preserved while the signature becomes honest.
        """
        self.tokenSplitter.split(text)
        tokens = self.tokenSplitter.getTokenArray()
        self.posTagger.posTagging(tokens)
        return self.clearTokens(tokens)

    def clearTokens(self, tokens):
        """Keep sentence punctuation and all-Cyrillic tokens with a POS tag."""
        parsedTokens = []
        signs = ";,.!?"
        for token in tokens:
            if token.tokenType == TYPE_SIGN:
                if token.token in signs:
                    parsedTokens.append(token)
            if (token.tokenType in [TYPE_TOKEN, TYPE_WORD]
                    and token.allCyr() and token.getBestPOS()):
                parsedTokens.append(token)
        return parsedTokens

    def _appendSlice(self, res):
        """Record the current counter values into every series of *res*."""
        res["DOTS"].append(self.sumDots)
        res["COMMAS"].append(self.sumCommas)
        res["NOUNS"].append(self.sumNoun)
        res["VERBS"].append(self.sumVerb)
        res["ADJS"].append(self.sumAdj)
        res["FUNC"].append(self.sumFunc)
        res["UNIQUE_NOUNS"].append(len(self.setNoun))
        res["UNIQUE_VERBS"].append(len(self.setVerb))
        res["UNIQUE_ADJS"].append(len(self.setAdj))
        res["UNIQUE_FUNC"].append(len(self.setFunc))

    def getSlice(self, parsedTokens, size):
        """Slide a *size*-token window over *parsedTokens*; return per-position series."""
        res = {key: [] for key in self._SERIES_KEYS}
        self.setNoun = Counter()
        self.setAdj = Counter()
        self.setVerb = Counter()
        self.setFunc = Counter()
        self.sumNoun = self.sumVerb = self.sumAdj = self.sumFunc = 0
        self.sumDots = self.sumCommas = 0
        sliceRecorded = False
        # BUG FIX: the original iterated range(0, len(parsedTokens) - 1) and
        # silently dropped the final token from every statistic.
        for tokenId in range(len(parsedTokens)):
            self.addToken(parsedTokens[tokenId])
            sliceRecorded = False
            if tokenId >= size:
                # the token leaving the window
                self.removeToken(parsedTokens[tokenId - size])
            if tokenId >= size - 1:
                self._appendSlice(res)
                sliceRecorded = True
        # Texts shorter than the window still produce one slice.
        if not sliceRecorded:
            self._appendSlice(res)
        return res

    def buildPOSStat(self):
        """Return statistics for a single window covering the whole text."""
        parsedTokens = self.prepareText(self.text)
        return self.getSlice(parsedTokens, len(parsedTokens))

    def buildPOSSurface(self, minWindowSize=10, maxWindowSize=1000, step=5,
                        saveToFile=True):
        """Compute slices for a range of window sizes ("surface").

        When *saveToFile* is true, pickles the result next to the source
        file as '<file>-surface.pcl'; otherwise returns the pickled bytes.
        """
        parsedTokens = self.prepareText(self.text)
        self.data = {key: {} for key in self._SERIES_KEYS}
        for windowSize in range(minWindowSize, maxWindowSize, step):
            surfaceSlice = self.getSlice(parsedTokens, windowSize)
            for key in surfaceSlice:
                self.data[key][windowSize] = surfaceSlice[key]
        if saveToFile:
            with open(self.directory + self.file + '-surface.pcl', 'wb') as f:
                pickle.dump(self.data, f, pickle.HIGHEST_PROTOCOL)
        else:
            return pickle.dumps(self.data, pickle.HIGHEST_PROTOCOL)
class VerbListBuilder(WikiIterator):
    """Builds a verb index over Wikipedia articles, stored in MySQL.

    For every non-redirect article, collects the verb tokens and inserts
    (doc_id, verb_id, is_ambig, position, score) rows into verb_to_doc.
    """

    # POS-detection precision threshold (kept from the original; not read here)
    __TRESHOLD = 0.0005

    def __init__(self, accessor, docIds=None, prefix=''):
        """Initialize the iterator.

        accessor -- object holding the configuration for accessing the
                    Wikipedia dump indexes (see wiki_accessor module)
        docIds   -- optional subset of document ids to process
        prefix   -- prefix of the index files (or index tables)
        """
        super(VerbListBuilder, self).__init__(accessor, 1000, docIds, prefix)

    def processSave(self, articlesCount):
        """Periodic save hook (every N records); rows are committed per document."""
        pass

    def preProcess(self):
        """Prepare DB connection, dump indexes and the in-memory stem cache."""
        # DB connection; NOTE(review): connection parameters should move to a
        # config and be obtained through the accessor
        self.dbConnection = pymysql.connect(host='localhost', port=3306, user='******', passwd='', charset='utf8', db='wikiparse')
        # cursor used for all queries below
        self.dbCursor = self.dbConnection.cursor()
        # part-of-speech tagger
        self.posTagger = POSTagger()
        # redirects index
        self.redirects = self.accessor.getIndex(RedirectsIndex)
        # index of article texts stripped of wiki markup
        self.plainTextIndex = self.accessor.getIndex(WikiPlainTextIndex)
        self.clear()
        # map: verb normal form (stem) -> DB id
        self.stems = {}
        # token splitter
        self.wordSplitter = TokenSplitter()
        # SQL templates
        self.addStemQuery = "INSERT INTO verbs(stem) VALUES (%s)"
        self.getStemIdQuery = "SELECT id FROM verbs WHERE stem LIKE %s"
        self.insertVerbToDocQuery = "INSERT INTO verb_to_doc(doc_id,verb_id,is_ambig,position,score) VALUES "
        self.queryElement = "(%s, %s, %s, %s, %s)"
        # preload stems that are already stored in the DB
        self.dbCursor.execute("SELECT * FROM verbs ORDER BY id")
        for stem in self.dbCursor.fetchall():
            self.stems[stem[1]] = stem[0]

    def postProcess(self):
        """Hook executed after the whole build completes."""
        pass

    def clear(self):
        """Clear the index (no-op)."""
        pass

    def getStemId(self, stem):
        """Return the DB id of verb normal form *stem*, inserting it if new."""
        stem = stem.replace("ё", "е")
        stem_id = self.stems.get(stem, None)
        if not stem_id:
            # execute() args must be a sequence; the original passed a bare
            # parenthesized string
            self.dbCursor.execute(self.addStemQuery, (stem,))
            self.dbConnection.commit()
            self.dbCursor.execute(self.getStemIdQuery, (stem,))
            row = self.dbCursor.fetchone()
            if not row:
                print(stem)
            # BUG FIX: the original cached row[0] but returned the raw row
            # tuple for freshly inserted stems, while returning an int for
            # cached ones; now the int id is returned in both cases.
            stem_id = row[0]
            self.stems[stem] = stem_id
        return stem_id

    def processDocument(self, docId):
        """Extract verbs from one article and store them in the DB."""
        # skip redirect pages
        if self.redirects.isRedirect(docId):
            return
        # fetch the wiki-markup-free text
        cleanText = self.plainTextIndex.getTextById(docId)
        if cleanText is None:
            return
        verbs = []
        # split the text into tokens and tag them with parts of speech
        self.wordSplitter.split(cleanText)
        tokens = self.wordSplitter.getTokenArray()
        self.posTagger.posTagging(tokens)
        # collect tokens that have at least one verb reading
        for token in tokens:
            if token.tokenType == TYPE_TOKEN and not token.hasDigits():
                isVerb = False
                isOnlyVerb = True
                normalForm = None
                verbScore = 0
                for res in token.POS:
                    if res['POS'] == 'VERB':
                        isVerb = True
                        normalForm = res['normalForm']
                        verbScore = res['score']
                    else:
                        # at least one non-verb reading exists
                        isOnlyVerb = False
                if isVerb:
                    # NOTE(review): is_ambig is True when the token is
                    # unambiguously a verb — the flag name looks inverted;
                    # behavior kept, confirm against consumers of verb_to_doc.
                    verbs.append({
                        "stem": self.getStemId(normalForm),
                        "is_ambig": isOnlyVerb,
                        "pos": token.tokenNum,
                        "score": verbScore,
                    })
        # build one multi-row INSERT covering every collected verb
        query = []
        params = []
        for verb in verbs:
            query.append(self.queryElement)
            params.extend((docId, verb["stem"], verb["is_ambig"],
                           verb["pos"], verb["score"]))
        if query:
            self.dbCursor.execute(self.insertVerbToDocQuery + ",".join(query), params)
            self.dbConnection.commit()
class TextFragmentator:
    """Assigns functional fragment types to sentences of a text.

    Two signals are combined per sentence and fragment type: the weight of
    matched collocation-pattern grammars, and the cosine similarity between
    the sentence's verb histogram and the type's reference verb histogram.
    """

    def __init__(self, accessor, prefix):
        """Build matchers/splitters and precompute verb-histogram norms per type."""
        self.tokenSplitter = TokenSplitter()
        self.posTagger = POSTagger()
        # NOTE(review): two FormalLanguagesMatcher instances are kept, as in
        # the original (flSelector and formalLanguagesMatcher)
        self.flSelector = FormalLanguagesMatcher()
        self.defisWordsBuilder = DefisWordsBuilder()
        self.initialsWordsBuilder = InitialsWordsBuilder()
        self.formalLanguagesMatcher = FormalLanguagesMatcher()
        self.headersMatcher = HeaderMatcher()
        self.sentenceSplitter = SentenceSplitter()
        self.posListIndex = POSListIndex(accessor, prefix)
        self.collocatonsGrammars = CollocationGrammars(accessor, prefix)
        self.fragmentTypes = self.collocatonsGrammars.getFunctionalTypes()
        self.verbs = self.posListIndex.getVerbsHistsForAllTypes()
        self.patternMatcher = PatternMatcher()
        self.sq = lambda x: x * x
        # Euclidean norm of each type's verb histogram (cosine denominator)
        self.sums = {}
        for fType in self.verbs:
            self.sums[fType] = self.module(self.verbs[fType])

    def module(self, data):
        """Euclidean norm of a histogram's values."""
        return sqrt(sum(map(self.sq, data.values())))

    def findPatterns(self, fType, sentence):
        """Match *fType* collocation grammars in *sentence*; return summed weight.

        Sets the sentence's *fType* flag when at least one pattern matches.
        """
        patternsWeight = 0
        if not sentence.internalTokens:
            return patternsWeight
        patterns = self.collocatonsGrammars.getGrammars(fType, border=0.00005)
        for pattern in patterns:
            # NOTE(review): this keeps only patterns with freq > total_freq,
            # which looks unusual for a frequency ratio; behavior kept.
            if pattern['freq'] / pattern['total_freq'] <= 1:
                continue
            self.patternMatcher.setParameters(pattern['grammar'], fType)
            self.patternMatcher.combineTokens(sentence.internalTokens, False)
            if len(self.patternMatcher.newTokens) > 0:
                sentence.setFlag(fType, True)
                patternsWeight += pattern['freq'] / pattern['total_freq']
        return patternsWeight

    def estimateLexicalSimilarity(self, fType, hists):
        """Cosine similarity between sentence verb histogram and the type's one."""
        typeVerbs = self.verbs[fType]
        # NOTE(review): assumes typeVerbs yields a count for every verb in
        # hists (e.g. a Counter/defaultdict) — confirm the index's type.
        est = 0
        for v in hists:
            est += hists[v] * typeVerbs[v]
        module = self.module(hists)
        if module == 0:
            return 0
        est /= self.sums[fType] * module
        return est

    def estimate(self, fType, sentence, hists):
        """Return the pattern and lexical scores of *sentence* for *fType*."""
        return {
            'pattern': self.findPatterns(fType, sentence),
            'lexical': self.estimateLexicalSimilarity(fType, hists),
        }

    def genFragments(self, text, border=0.1):
        """Split *text* into sentences and estimate a functional type for each.

        Returns the token list with 'functionalType' and 'ft_estimations'
        additional info attached to each sentence token.
        """
        # NOTE(review): elsewhere in this file tokens are fetched via
        # getTokenArray() after split(); confirm split() returns the list.
        tokens = self.tokenSplitter.split(text)
        self.newTokens = []
        self.posTagger.posTagging(tokens)
        self.defisWordsBuilder.combineTokens(tokens)
        self.initialsWordsBuilder.combineTokens(tokens)
        self.headersMatcher.combineTokens(tokens)
        self.formalLanguagesMatcher.combineTokens(tokens)
        self.sentenceSplitter.combineTokens(tokens)
        # BUG FIX: np.float was removed in NumPy 1.24; it was an alias of the
        # builtin float, so dtype=float is the exact replacement.
        shape = (len(tokens), len(self.fragmentTypes))
        lexicalEstimations = np.zeros(shape, dtype=float)
        patternEstimations = np.zeros(shape, dtype=float)
        for ind in range(len(tokens)):
            if not tokens[ind].internalTokens:
                hists = calcHist([tokens[ind]])['VERB']
            else:
                hists = calcHist(tokens[ind].internalTokens)['VERB']
            for fTypeInd in range(len(self.fragmentTypes)):
                fType = self.fragmentTypes[fTypeInd]
                oneEstimation = self.estimate(fType, tokens[ind], hists)
                lexicalEstimations[ind][fTypeInd] = oneEstimation['lexical']
                patternEstimations[ind][fTypeInd] = oneEstimation['pattern']
        # NOTE(review): np.amax can be 0 here, which yields NaNs via 0/0;
        # original behavior kept.
        lexicalEstimations = lexicalEstimations / np.amax(lexicalEstimations)
        patternEstimations = patternEstimations / np.amax(patternEstimations)
        totalEstimations = patternEstimations + 0.5 * lexicalEstimations
        totalEstimations = totalEstimations / np.amax(totalEstimations)
        orderedLexicalTypes = np.argsort(lexicalEstimations, axis=1)
        orderedPatternTypes = np.argsort(patternEstimations, axis=1)
        orderedTotalTypes = np.argsort(totalEstimations, axis=1)
        self.combineTokens(
            tokens, {
                'lexical': {'values': lexicalEstimations, "order": orderedLexicalTypes},
                'pattern': {'values': patternEstimations, "order": orderedPatternTypes},
                'total': {'values': totalEstimations, "order": orderedTotalTypes},
            }, border)
        return tokens

    def calcType(self, border, estC, estL=None, estR=None):
        """Vote for a fragment type from the current (estC) and neighbor
        (estL/estR) estimations; return the winning type or None.

        Each estimation entry has 'order' (type indexes sorted ascending by
        score) and 'value' (score per type index).
        """
        votes = Counter()
        weight = Counter()
        for parameter in range(len(estC)):
            estCBest = estC[parameter]['order'][-1]
            estC2Best = estC[parameter]['order'][-2]
            if estC[parameter]['value'][estCBest] > border:
                # strongest signal: best type of the current sentence
                votes[self.fragmentTypes[estCBest]] += 6
                weight[self.fragmentTypes[estCBest]] += estC[parameter]['value'][estCBest]
                # runner-up gets 4 votes when close to the best, else 2
                if estC[parameter]['value'][estC2Best] > 0.7 * estC[parameter]['value'][estCBest]:
                    votes[self.fragmentTypes[estC2Best]] += 4
                else:
                    votes[self.fragmentTypes[estC2Best]] += 2
                weight[self.fragmentTypes[estC2Best]] += estC[parameter]['value'][estC2Best]
            if estL:
                estLBest = estL[parameter]['order'][-1]
                if estL[parameter]['value'][estLBest] > border:
                    votes[self.fragmentTypes[estLBest]] += 2
            if estR:
                estRBest = estR[parameter]['order'][-1]
                if estR[parameter]['value'][estRBest] > border:
                    votes[self.fragmentTypes[estRBest]] += 2
        commons = votes.most_common(2)
        commonWeights = weight.most_common(2)
        if len(commonWeights) == 0:
            return None
        if len(commons) == 0 or commons[0][1] == 0:
            return None
        # accept the leader when it is unique, clearly heavier, or strictly
        # ahead in votes
        if (len(commons) == 1
                or (len(commonWeights) > 1
                    and commonWeights[1][1] < 0.9 * commonWeights[0][1])
                or commons[1][1] != commons[0][1]):
            return commons[0][0]
        return None

    def __getOneEstimation(self, estimations, indToken):
        """Bundle the three criteria's order/value rows for one token."""
        return [
            {
                'order': estimations['lexical']['order'][indToken],
                'value': estimations['lexical']['values'][indToken]
            },
            {
                'order': estimations['pattern']['order'][indToken],
                'value': estimations['pattern']['values'][indToken]
            },
            {
                'order': estimations['total']['order'][indToken],
                'value': estimations['total']['values'][indToken]
            },
        ]

    def __convertEstimationToNativeFormat(self, estimation):
        """Convert an estimation bundle to [(criteria, [(type, score), ...]), ...]."""
        res = [['lexical', []], ['pattern', []], ['total', []]]
        for criteria in range(len(res)):
            for fTypeInd in estimation[criteria]['order']:
                res[criteria][1].append(
                    (self.fragmentTypes[fTypeInd],
                     estimation[criteria]['value'][fTypeInd]))
        return res

    def combineTokens(self, tokens, estimations, border):
        """Assign a functional type (or None) to every token via calcType."""
        tokenTypes = []
        for indToken in range(len(tokens)):
            estC = self.__getOneEstimation(estimations, indToken)
            estL = None
            estR = None
            if indToken > 0:
                estL = self.__getOneEstimation(estimations, indToken - 1)
            if indToken < len(tokens) - 1:
                # BUG FIX: the right neighbor was computed with indToken - 1
                # (a copy-paste of the left-neighbor line); it must look one
                # token ahead.
                estR = self.__getOneEstimation(estimations, indToken + 1)
            tokenTypes.append(self.calcType(border, estC, estL, estR))
            tokens[indToken].setAdditionalInfo('functionalType',
                                               tokenTypes[indToken])
            if tokenTypes[indToken]:
                self.newTokens.append(tokens[indToken])
            tokens[indToken].setAdditionalInfo(
                'ft_estimations', self.__convertEstimationToNativeFormat(estC))


if __name__ == '__main__':
    # Small demo: tokenize a sample passage and run the header/list matchers.
    from pytextutils.token_splitter import TokenSplitter, POSTagger
    text = ''' (1) замена ИГ на ИГ, которой соответствует вышестоящий концепт: губернатор — глава, область — регион, дума — парламент; (2) замена имени экземпляра (имени собственного или названиядескрипции) на ИГ, которой соответствует родительский концепт экземпляра: МЧС — министерство, Приморский край — край; (3) замена ИГ с семантикой базового концепта на ИГ с семантикой аспекта: компания — ритейлер; (4) замена-трансформация: администрация края — краевая администрация, губернатор Приморья — приморский губернатор; (5) «метонимическая» замена: адвокат X-а (реляционное имя) — адвокат (род занятий); вариант такой замены — подмена референта, например: Московская область — Подмосковье; (6) синонимическая и квазисинонимическая замена: адвокат (род занятий) — правозащитник, мэр — градоначальник, городской голова; (7) ассоциативная замена: премьер-министр — политик, министр — чиновник, федеральное агентство — ведомство. '''
    ts = TokenSplitter()
    ts.split(text)
    tokens = ts.getTokenArray()
    print(len(tokens))
    POSTagger().posTagging(tokens)
    hm = HeaderMatcher()
    hm.combineTokens(tokens)
    lm = ListMatcher()
    lm.combineTokens(tokens)
    print(len(tokens))