import pickle

import pymysql

# NOTE: this extract starts mid-file, so the original import block is missing.
# The paths below follow the pywikiaccessor package layout used in __main__
# and are assumptions, not verbatim from the source.
from pywikiaccessor.wiki_accessor import WikiAccessor
from pywikiaccessor.wiki_iterator import WikiIterator
from pywikiaccessor.redirects_index import RedirectsIndex
from pywikiaccessor.document_type import DocumentTypeIndex
from pywikiheaders.headers_extractor import HeadersExtractor


class HeadersFileBuilder(WikiIterator):
    """Collects section headers from wiki articles and saves the resulting
    indexes as pickle files."""

    def __init__(self, accessor, docIds=None, prefix=''):
        super(HeadersFileBuilder, self).__init__(accessor, 1000, docIds, prefix)

    def processSave(self, articlesCount):
        pass

    def preProcess(self):
        self.redirects = self.accessor.getIndex(RedirectsIndex)
        self.doctypeIndex = DocumentTypeIndex(self.accessor)
        self.headersExtractor = HeadersExtractor(self.getHeaderId)
        self.clear()
        self.headersToIds = {}      # header text -> numeric id
        self.idsToHeaders = []      # numeric id -> header text
        self.headerDocuments = {}   # header id -> ids of documents containing it
        self.documentHeaders = {}   # document id -> extracted header records

    def postProcess(self):
        # Persist all four structures; HIGHEST_PROTOCOL keeps the files compact.
        with open(self.getFullFileName('HeadersToIds.pcl'), 'wb') as f:
            pickle.dump(self.headersToIds, f, pickle.HIGHEST_PROTOCOL)
        with open(self.getFullFileName('IdsToHeaders.pcl'), 'wb') as f:
            pickle.dump(self.idsToHeaders, f, pickle.HIGHEST_PROTOCOL)
        with open(self.getFullFileName('HeaderDocuments.pcl'), 'wb') as f:
            pickle.dump(self.headerDocuments, f, pickle.HIGHEST_PROTOCOL)
        with open(self.getFullFileName('DocumentHeaders.pcl'), 'wb') as f:
            pickle.dump(self.documentHeaders, f, pickle.HIGHEST_PROTOCOL)

    def clear(self):
        pass

    def getHeaderId(self, header):
        # Normalize the header: unify "ё"/"е", drop backslashes, trim spaces.
        header = header.replace("ё", "е")
        header = header.replace("\\", "").strip()
        # Membership test, not a truthiness test: id 0 is a valid id.
        if header not in self.headersToIds:
            self.headersToIds[header] = len(self.idsToHeaders)
            self.idsToHeaders.append(header)
        return self.headersToIds[header]

    def processDocument(self, docId):
        # Skip redirect and service pages.
        if self.redirects.isRedirect(docId):
            return
        if self.doctypeIndex.isDocType(docId, 'wiki_stuff'):
            return
        headers = self.headersExtractor.getHeadersForDoc(
            docId, self.wikiIndex.getTextArticleById(docId))
        self.documentHeaders[docId] = headers
        for h in headers:
            self.headerDocuments.setdefault(h['header'], []).append(docId)
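
# Usage sketch for HeadersFileBuilder, assuming build() has already produced
# the pickle files above. The helper name printDocsForHeader is hypothetical;
# it only illustrates how the pickled structures fit together: headersToIds
# maps normalized header text to a numeric id, and headerDocuments maps that
# id to the documents containing the header.
def printDocsForHeader(accessor, headerText, prefix=''):
    builder = HeadersFileBuilder(accessor, prefix=prefix)
    with open(builder.getFullFileName('HeadersToIds.pcl'), 'rb') as f:
        headersToIds = pickle.load(f)
    with open(builder.getFullFileName('HeaderDocuments.pcl'), 'rb') as f:
        headerDocuments = pickle.load(f)
    # Header text is stored normalized ("ё" -> "е", backslashes removed).
    headerId = headersToIds.get(headerText)
    if headerId is None:
        print("Unknown header: " + headerText)
        return
    for docId in headerDocuments.get(headerId, []):
        print(docId)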
class HeadersDBBuilder(WikiIterator):
    """Extracts section headers from wiki articles and stores them in the
    wikiparse MySQL database (tables `headers` and `header_to_doc`)."""

    def __init__(self, accessor, docIds=None):
        super(HeadersDBBuilder, self).__init__(accessor, 1000, docIds)

    def processSave(self, articlesCount):
        pass

    def preProcess(self):
        self.dbConnection = pymysql.connect(host='localhost', port=3306,
                                            user='******', passwd='',
                                            charset='utf8', db='wikiparse')
        self.dbCursor = self.dbConnection.cursor()
        self.redirects = self.accessor.getIndex(RedirectsIndex)
        self.doctypeIndex = DocumentTypeIndex(self.accessor)
        self.headersExtractor = HeadersExtractor(self.getHeaderId)
        self.clear()
        self.headers = {}
        self.addHeaderQuery = "INSERT INTO headers(text) VALUES (%s)"
        self.getHeaderIdQuery = "SELECT id FROM headers WHERE text LIKE %s"
        self.insertHeaderToDocQuery = ("INSERT INTO header_to_doc"
                                       "(doc_id,header_id,pos_start,pos_end,type)"
                                       " VALUES ")
        self.isDocAlreadySaveQuery = ("SELECT count(id) as cnt FROM `header_to_doc`"
                                      " WHERE doc_id = %s GROUP BY doc_id")
        self.queryElement = "(%s, %s, %s, %s, %s)"
        # Cache all known headers (text -> id) to avoid one query per header.
        self.dbCursor.execute("SELECT * FROM headers ORDER BY id")
        for element in self.dbCursor.fetchall():
            self.headers[element[1]] = element[0]

    def postProcess(self):
        pass

    def clear(self):
        pass

    # Check whether the document has already been processed.
    def isDocAlreadySave(self, docId):
        self.dbCursor.execute(self.isDocAlreadySaveQuery, (docId,))
        count = self.dbCursor.fetchone()
        if not count:
            return False
        return count[0] > 0

    # Return the id of a header, inserting it into the database if needed.
    def getHeaderId(self, header):
        header = header.replace("ё", "е")
        header = header.replace("\\", "").strip()
        header_id = self.headers.get(header, None)
        if header_id is None:
            self.dbCursor.execute(self.addHeaderQuery, (header,))
            self.dbConnection.commit()
            self.dbCursor.execute(self.getHeaderIdQuery, (header,))
            row = self.dbCursor.fetchone()
            if not row:
                # The header could not be re-read after the insert.
                print(header)
                return None
            header_id = row[0]
            self.headers[header] = header_id
        return header_id

    # Process one document.
    def processDocument(self, docId):
        # Skip redirect pages.
        if self.redirects.isRedirect(docId):
            return
        # Skip service pages.
        if self.doctypeIndex.isDocType(docId, 'wiki_stuff'):
            return
        # Skip documents that are already stored.
        if self.isDocAlreadySave(docId):
            return
        # Fetch the article text.
        text = self.wikiIndex.getTextArticleById(docId)
        # Extract header records from the text.
        headers = self.headersExtractor.getHeadersForDoc(docId, text)
        # Build one batched INSERT covering all headers of the document.
        query = []
        params = []
        for i in range(len(headers)):
            query.append(self.queryElement)
            params.append(docId)
            params.append(headers[i]["header"])
            params.append(headers[i]["position_start"])
            if i != len(headers) - 1:
                # A header's span ends where the next header matches.
                params.append(headers[i + 1]["position_match"])
            else:
                # The last header runs to the end of the article.
                params.append(len(text))
            params.append(headers[i]["type"])
        # Execute the batched INSERT.
        if len(query) > 0:
            self.dbCursor.execute(self.insertHeaderToDocQuery + ",".join(query),
                                  params)
            self.dbConnection.commit()
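
# The builder above assumes the `headers` and `header_to_doc` tables already
# exist in the wikiparse database. The original DDL is not part of this file;
# the helper below is a hedged sketch that derives a plausible schema from the
# queries used above (column names come from the INSERT/SELECT statements;
# column types and sizes are assumptions).
def createHeaderTables(dbConnection):
    cursor = dbConnection.cursor()
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS headers ("
        " id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,"
        " text VARCHAR(255) NOT NULL"
        ") DEFAULT CHARSET=utf8")
    cursor.execute(
        "CREATE TABLE IF NOT EXISTS header_to_doc ("
        " id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,"
        " doc_id INT NOT NULL,"
        " header_id INT NOT NULL,"
        " pos_start INT NOT NULL,"
        " pos_end INT NOT NULL,"
        " type VARCHAR(32)"
        ") DEFAULT CHARSET=utf8")
    dbConnection.commit()


# Only the tail of the next class survives in this extract. The class header
# and the first lines of getAllStat below are a hedged reconstruction,
# inferred from the surviving lines and from the commented-out usage in
# __main__ (HeadersDBIndex(accessor); getAllStat(docIds) returning dicts with
# 'id', 'text' and 'cnt'). The SQL text in particular is an assumption.
class HeadersDBIndex:
    def __init__(self, accessor):
        self.accessor = accessor
        self.dbConnection = pymysql.connect(host='localhost', port=3306,
                                            user='******', passwd='',
                                            charset='utf8', db='wikiparse')
        self.dbCursor = self.dbConnection.cursor()

    def getAllStat(self, docIds):
        # Assumed query: per-header usage counts across the given documents.
        placeholders = ",".join(["%s"] * len(docIds))
        self.dbCursor.execute(
            "SELECT h.id, h.text, COUNT(*) AS cnt FROM headers h"
            " JOIN header_to_doc hd ON hd.header_id = h.id"
            " WHERE hd.doc_id IN (" + placeholders + ")"
            " GROUP BY h.id, h.text ORDER BY cnt DESC",
            list(docIds))
        res = []
        for element in self.dbCursor.fetchall():
            res.append({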
                'id': element[0],
                'text': element[1],
                'cnt': element[2]
            })
        return res


if __name__ == "__main__":
    # regex1 = re.compile('\n[ \t]*==([^=]*)==[ \t\r]*\n')
    # text = " kdkd\n == kdkd==\n"
    # match = regex1.search(text)
    # print(match.end())
    from pywikiaccessor.title_index import TitleIndex

    directory = "C:\\WORK\\science\\onpositive_data\\python\\"
    accessor = WikiAccessor(directory)
    docTypesIndex = DocumentTypeIndex(accessor)
    docIds = docTypesIndex.getDocsOfType("substance")
    titleIndex = accessor.getIndex(TitleIndex)
    for docId in docIds:
        print(titleIndex.getTitleById(docId))
    doc_id = titleIndex.getIdByTitle("ALCAM")
    print(docTypesIndex.getDocTypeById(doc_id))
    # hb = HeadersDBBuilder(accessor, list(docIds))
    # hb.build()
    # hb.preProcess()
    # hb.processDocument(doc_id)
    # hi = HeadersDBIndex(accessor)
    # hi.getCountHeadersForDoc(docIds)
    # stat = hi.getAllStat(docIds)
    # for s in stat:
    #     print(s['text'] + ": " + str(s['cnt']))