def preProcess(self):
        """Initialize indexes and in-memory header structures.

        NOTE(review): orphaned fragment — the enclosing class header is not
        present in this chunk (indentation is also inconsistent: def at
        column 0, body at column 8). Appears to be a stray duplicate of
        HeadersFileBuilder.preProcess further down in the file.
        """
        self.redirects = self.accessor.getIndex(RedirectsIndex)
        self.doctypeIndex = DocumentTypeIndex(self.accessor)
        self.headersExtractor = HeadersExtractor(self.getHeaderId)

        self.clear()
        self.headersToIds = {}      # header text -> numeric id
        self.idsToHeaders = []      # numeric id -> header text
        self.headerDocuments = {}   # header -> doc ids (presumably; verify against extractor)
        self.documentHeaders = {}   # doc id -> extracted header records
Example #2
0
    def preProcess(self):
        """Open the MySQL connection and preload the header cache.

        NOTE(review): orphaned fragment — stray duplicate of
        HeadersDBBuilder.preProcess further down in the file.
        """
        # NOTE(review): hard-coded localhost credentials (empty password) —
        # should come from configuration, not source.
        self.dbConnection = pymysql.connect(host='localhost', port=3306, user='******', passwd='',charset='utf8', db='wikiparse')
        self.dbCursor = self.dbConnection.cursor()
        self.redirects = self.accessor.getIndex(RedirectsIndex) 
        self.doctypeIndex = DocumentTypeIndex(self.accessor)
        self.headersExtractor = HeadersExtractor(self.getHeaderId)
 
        self.clear()
        self.headers = {}  # header text -> DB id, preloaded from the table below
        self.addHeaderQuery = "INSERT INTO headers(text) VALUES (%s)"
        self.getHeaderIdQuery = "SELECT id FROM headers WHERE text LIKE %s"
        self.insertHeaderToDocQuery = "INSERT INTO header_to_doc(doc_id,header_id,pos_start,pos_end,type) VALUES "
        self.isDocAlreadySaveQuery = "SELECT count(id) as cnt FROM `header_to_doc` WHERE doc_id = %s group by doc_id"

        self.queryElement = "(%s, %s, %s, %s, %s)"
        
        # Preload existing headers: element[0] is id, element[1] is text
        # (presumably — confirm against the `headers` table schema).
        self.dbCursor.execute("SELECT * FROM headers ORDER BY id")
        for element in self.dbCursor.fetchall():
            self.headers[element[1]] = element[0] 
class HeadersFileBuilder(WikiIterator):
    """Collect section headers from wiki documents and pickle the results.

    Builds four parallel structures: header text -> id, id -> header text,
    header -> list of document ids, and document id -> extracted header
    records. Everything is written to disk in one shot in postProcess().
    """

    def __init__(self, accessor, docIds=None, prefix=''):
        super(HeadersFileBuilder, self).__init__(accessor, 1000, docIds,
                                                 prefix)

    def processSave(self, articlesCount):
        # Nothing to flush incrementally; all output happens in postProcess.
        pass

    def preProcess(self):
        self.redirects = self.accessor.getIndex(RedirectsIndex)
        self.doctypeIndex = DocumentTypeIndex(self.accessor)
        self.headersExtractor = HeadersExtractor(self.getHeaderId)

        self.clear()
        self.headersToIds = {}      # header text -> numeric id
        self.idsToHeaders = []      # numeric id -> header text
        self.headerDocuments = {}   # header -> doc ids containing it
        self.documentHeaders = {}   # doc id -> extracted header records

    def postProcess(self):
        # Persist all accumulated structures as pickles.
        with open(self.getFullFileName('HeadersToIds.pcl'), 'wb') as f:
            pickle.dump(self.headersToIds, f, pickle.HIGHEST_PROTOCOL)
        with open(self.getFullFileName('IdsToHeaders.pcl'), 'wb') as f:
            pickle.dump(self.idsToHeaders, f, pickle.HIGHEST_PROTOCOL)
        with open(self.getFullFileName('HeaderDocuments.pcl'), 'wb') as f:
            pickle.dump(self.headerDocuments, f, pickle.HIGHEST_PROTOCOL)
        with open(self.getFullFileName('DocumentHeaders.pcl'), 'wb') as f:
            pickle.dump(self.documentHeaders, f, pickle.HIGHEST_PROTOCOL)

    def clear(self):
        pass

    def getHeaderId(self, header):
        """Return a stable numeric id for *header*, registering it if new."""
        header = header.replace("ё", "е")
        header = header.replace("\\", "").strip()
        # BUG FIX: the original used `if not self.headersToIds.get(header,
        # None)`, which is also true for the header whose id is 0, so that
        # header was re-appended to idsToHeaders on every lookup.
        if header not in self.headersToIds:
            self.headersToIds[header] = len(self.idsToHeaders)
            self.idsToHeaders.append(header)
        return self.headersToIds[header]

    def processDocument(self, docId):
        """Extract headers from one document and index them both ways."""
        # Redirect pages and service pages carry no useful headers.
        if self.redirects.isRedirect(docId):
            return
        if self.doctypeIndex.isDocType(docId, 'wiki_stuff'):
            return
        headers = self.headersExtractor.getHeadersForDoc(
            docId, self.wikiIndex.getTextArticleById(docId))
        self.documentHeaders[docId] = headers
        for h in headers:
            # setdefault avoids the falsy-value pitfall of a .get() check.
            self.headerDocuments.setdefault(h['header'], []).append(docId)
class HeadersDBBuilder(WikiIterator):
    """Extract section headers from wiki documents and store them in MySQL.

    Uses two tables: ``headers`` (id, text) and ``header_to_doc``
    (doc_id, header_id, pos_start, pos_end, type). Already-saved
    documents are skipped, so the build is restartable.
    """

    def __init__(self, accessor, docIds=None):
        super(HeadersDBBuilder, self).__init__(accessor, 1000, docIds)

    def processSave(self, articlesCount):
        # Rows are committed per document in processDocument; nothing to do.
        pass

    def preProcess(self):
        # NOTE(review): hard-coded localhost credentials (empty password) —
        # should come from configuration, not source.
        self.dbConnection = pymysql.connect(host='localhost',
                                            port=3306,
                                            user='******',
                                            passwd='',
                                            charset='utf8',
                                            db='wikiparse')
        self.dbCursor = self.dbConnection.cursor()
        self.redirects = self.accessor.getIndex(RedirectsIndex)
        self.doctypeIndex = DocumentTypeIndex(self.accessor)
        self.headersExtractor = HeadersExtractor(self.getHeaderId)

        self.clear()
        self.headers = {}  # header text -> DB id, preloaded below
        self.addHeaderQuery = "INSERT INTO headers(text) VALUES (%s)"
        self.getHeaderIdQuery = "SELECT id FROM headers WHERE text LIKE %s"
        self.insertHeaderToDocQuery = "INSERT INTO header_to_doc(doc_id,header_id,pos_start,pos_end,type) VALUES "
        self.isDocAlreadySaveQuery = "SELECT count(id) as cnt FROM `header_to_doc` WHERE doc_id = %s group by doc_id"

        self.queryElement = "(%s, %s, %s, %s, %s)"

        # Preload existing headers: element[0] is id, element[1] is text
        # (presumably — confirm against the `headers` table schema).
        self.dbCursor.execute("SELECT * FROM headers ORDER BY id")
        for element in self.dbCursor.fetchall():
            self.headers[element[1]] = element[0]

    def postProcess(self):
        pass

    def clear(self):
        pass

    # Check whether the document was processed earlier.
    def isDocAlreadySave(self, docId):
        """Return True if header_to_doc already holds rows for docId."""
        # (docId,) — a proper one-element parameter tuple; the original
        # passed a bare value and relied on pymysql's lenient handling.
        self.dbCursor.execute(self.isDocAlreadySaveQuery, (docId,))
        count = self.dbCursor.fetchone()
        if not count:
            return False
        return count[0] > 0

    # Return (or create) the database identifier of a header.
    def getHeaderId(self, header):
        """Return the DB id for *header*, inserting a new row if needed."""
        header = header.replace("ё", "е")
        header = header.replace("\\", "").strip()
        # `is None` rather than a falsy check: a legitimate id must not be
        # mistaken for "missing".
        header_id = self.headers.get(header, None)
        if header_id is None:
            self.dbCursor.execute(self.addHeaderQuery, (header,))
            self.dbConnection.commit()
            self.dbCursor.execute(self.getHeaderIdQuery, (header,))
            row = self.dbCursor.fetchone()
            if not row:
                # Insert+lookup failed; keep the original best-effort
                # diagnostic print instead of raising.
                print(header)
                return None
            # BUG FIX: the original returned the raw fetchone() tuple here
            # while the cached path returned a plain int; normalize to int.
            header_id = row[0]
            self.headers[header] = header_id
        return header_id

    # Process a single document.
    def processDocument(self, docId):
        """Extract headers from one document and insert them into the DB."""
        # Redirect pages are not processed.
        if self.redirects.isRedirect(docId):
            return
        # Service pages are not processed.
        if self.doctypeIndex.isDocType(docId, 'wiki_stuff'):
            return
        # Already-saved documents are not processed.
        if self.isDocAlreadySave(docId):
            return
        # Fetch the article text and extract header records from it.
        text = self.wikiIndex.getTextArticleById(docId)
        headers = self.headersExtractor.getHeadersForDoc(docId, text)

        # Build one parameterized multi-row INSERT.
        query = []
        params = []
        # BUG FIX: the original looped over range(0, len(headers) - 1),
        # silently dropping the last header and leaving the len(text)
        # branch below unreachable.
        for i in range(len(headers)):
            query.append(self.queryElement)
            params.append(docId)
            params.append(headers[i]["header"])
            params.append(headers[i]["position_start"])
            # A section ends where the next one starts; the last section
            # runs to the end of the article text.
            if i != len(headers) - 1:
                params.append(headers[i + 1]["position_match"])
            else:
                params.append(len(text))
            params.append(headers[i]["type"])
        # Execute the query only if there is something to insert.
        if query:
            self.dbCursor.execute(
                self.insertHeaderToDocQuery + ",".join(query), params)
            self.dbConnection.commit()
                'id': element[0],
                'text': element[1],
                'cnt': element[2]
            })
        return res


if __name__ == "__main__":
    # Ad-hoc manual test driver; not part of the library API.
    #regex1 = re.compile('\n[ \t]*==([^=]*)==[ \t\r]*\n')
    #text = " kdkd\n == kdkd==\n"
    #match = regex1.search(text)
    #print(match.end())
    from pywikiaccessor.title_index import TitleIndex
    # NOTE(review): hard-coded local Windows path — adjust per machine.
    directory = "C:\\WORK\\science\\onpositive_data\\python\\"
    accessor = WikiAccessor(directory)
    docTypesIndex = DocumentTypeIndex(accessor)
    # All documents classified as "substance"; print each title.
    docIds = docTypesIndex.getDocsOfType("substance")
    titleIndex = accessor.getIndex(TitleIndex)
    for docId in docIds:
        print(titleIndex.getTitleById(docId))
    # Spot-check a single known article by title.
    doc_id = titleIndex.getIdByTitle("ALCAM")
    print(docTypesIndex.getDocTypeById(doc_id))
#hb = HeadersDBBuilder(accessor,list(docIds))
#hb.build()
#hb.preProcess()
#hb.processDocument(doc_id)
#hi = HeadersDBIndex(accessor)
#hi.getCountHeadersForDoc(docIds)
#stat = hi.getAllStat(docIds)
#for s in stat:
#    print (s['text']+": "+str(s['cnt']))