def __init__(self, accessor, headerIndexPrefix):
    """Set up the header index and the text-processing helpers.

    Args:
        accessor: object handed to ``HeadersFileIndex``; its ``directory``
            attribute is passed to ``FragmentConfig``.
        headerIndexPrefix: prefix forwarded to ``HeadersFileIndex`` and
            kept on the instance as ``self.prefix``.
    """
    self.accessor = accessor
    self.prefix = headerIndexPrefix
    self.headerIndex = HeadersFileIndex(self.accessor, self.prefix)
    self.tokenSplitter = TokenSplitter()
    self.posTagger = POSTagger()
    # The instance is discarded: the call is made only for whatever
    # FragmentConfig's constructor does.
    # NOTE(review): presumably it loads class-level configuration from
    # the directory - confirm against FragmentConfig.
    FragmentConfig(self.accessor.directory)
def __init__(self, accessor, headerIndexPrefix=None, configuration=None):
    """Set up the header index, text helpers and fragment configuration.

    Args:
        accessor: object handed to ``HeadersFileIndex``; its ``directory``
            attribute is passed to ``FragmentConfig`` when no
            configuration is supplied.
        headerIndexPrefix: stored as ``self.prefix``.
        configuration: value to use as ``self.fragmentTypesToHeaders``.
            Any falsy value (``None``, an empty mapping, ...) selects
            ``FragmentConfig.fragmentTypesToHeaders`` instead.
    """
    self.accessor = accessor
    # NOTE(review): the index is opened with an empty prefix, not with
    # headerIndexPrefix - confirm this is intentional.
    self.headerIndex = HeadersFileIndex(accessor, '')
    self.prefix = headerIndexPrefix
    self.tokenSplitter = TokenSplitter()
    self.posTagger = POSTagger()

    if not configuration:
        # The instance is discarded; the class attribute is read right
        # after the call.
        # NOTE(review): presumably the constructor fills
        # FragmentConfig.fragmentTypesToHeaders - confirm.
        FragmentConfig(accessor.directory)
        configuration = FragmentConfig.fragmentTypesToHeaders
    self.fragmentTypesToHeaders = configuration
def buildHeaders(categories, prefix, directory="C:\\[Study]\\Diploma\\wiki_indexes\\"):
    """Build the headers file for ``categories`` and dump header statistics.

    The articles of ``categories`` are fetched, a headers file is built
    for them, and every entry returned by ``HeadersFileIndex.getAllStat``
    is printed and written to ``headers.txt`` as ``"<text>: <cnt>"``.

    Args:
        categories: categories passed to ``getArticles``.
        prefix: prefix passed to ``HeadersFileBuilder`` and
            ``HeadersFileIndex``.
        directory: working directory given to ``WikiAccessor``;
            ``headers.txt`` is created in it. The file name is appended
            by plain concatenation, so the value must end with a path
            separator. Defaults to the previously hard-coded location.
    """
    accessor = WikiAccessor(directory)
    pages = getArticles(categories, accessor)
    print(len(pages))

    HeadersFileBuilder(accessor, list(pages), prefix).build()
    stat = HeadersFileIndex(accessor, prefix).getAllStat()

    # The context manager closes the file; no explicit close() is needed.
    with codecs.open(directory + 'headers.txt', 'w', 'utf-8') as f:
        for item in stat:
            # Output stops at the first entry that occurs exactly once.
            # NOTE(review): presumably getAllStat() is ordered by
            # descending count, making this a cut-off - confirm.
            if item['cnt'] == 1:
                break
            line = item['text'] + ": " + str(item['cnt'])
            print(line)
            f.write(line + '\n')
class HeadersExtractor:
    """Looks up the headers used by the pages of wiki categories.

    The accessor and both indexes are class attributes: they are created
    once, when the class body is executed, and are shared by all
    instances.
    """

    directory = "C:\\[Study]\\Diploma\\wiki_indexes\\"
    accessor = wiki_accessor.WikiAccessor(directory)
    bld = CategoryIndex(accessor)
    hid = HeadersFileIndex(accessor)

    def _pagesByHeader(self, categories):
        """Map each header id to the pages (in visiting order) that use it.

        A page is listed once per matching entry of ``headersByDoc``, and
        once per category it is reached through.
        """
        grouped = {}
        for category in categories:
            for page in self.bld.getDirectPages(category):
                for entry in self.hid.headersByDoc(page):
                    grouped.setdefault(entry['header'], []).append(page)
        return grouped

    def getCategoryHeaders(self, categoryId):
        """Return the set of header ids found in the category's pages."""
        return {
            entry['header']
            for page in self.bld.getDirectPages(categoryId)
            for entry in self.hid.headersByDoc(page)
        }

    def getHeadersForTree(self, categories):
        """Return ``{'id', 'text'}`` dicts for every header of ``categories``.

        Each header id appears once, however many categories or pages
        contain it.
        """
        headerIds = set()
        for category in categories:
            headerIds.update(self.getCategoryHeaders(category))
        return [
            {'id': headerId, 'text': self.hid.headerText(headerId)}
            for headerId in headerIds
        ]

    def getCategoryHeaders_better(self, categoryId):
        """Return a dict mapping header id to the list of pages using it."""
        return self._pagesByHeader([categoryId])

    def getHeadersForTree_better(self, categories):
        """Return one dict per header found in the pages of ``categories``.

        Each dict holds the header ``id``, its ``text``, the list of
        pages in ``docs`` and the length of that list in ``amount``.
        """
        return [
            {
                'id': headerId,
                'text': self.hid.headerText(headerId),
                'amount': len(pages),
                'docs': pages,
            }
            for headerId, pages in self._pagesByHeader(categories).items()
        ]
class AbstractFragmentIterator(metaclass=ABCMeta):
    """Skeleton of a pass over all documents of every fragment type.

    ``build`` drives the traversal of
    ``FragmentConfig.fragmentTypesToHeaders``; subclasses supply the
    actual work through the abstract hooks.
    """

    def __init__(self, accessor, headerIndexPrefix):
        """Set up the header index and the text-processing helpers.

        Args:
            accessor: object handed to ``HeadersFileIndex``; its
                ``directory`` attribute is passed to ``FragmentConfig``.
            headerIndexPrefix: prefix forwarded to ``HeadersFileIndex``
                and kept on the instance as ``self.prefix``.
        """
        self.accessor = accessor
        self.prefix = headerIndexPrefix
        self.headerIndex = HeadersFileIndex(self.accessor, self.prefix)
        self.tokenSplitter = TokenSplitter()
        self.posTagger = POSTagger()
        # The instance is discarded; build() reads the class attribute
        # FragmentConfig.fragmentTypesToHeaders.
        # NOTE(review): presumably the constructor fills it - confirm.
        FragmentConfig(self.accessor.directory)

    @abstractmethod
    def preProcess(self):
        """Hook called once, before the first fragment type."""

    @abstractmethod
    def postProcess(self):
        """Hook called once, after the last fragment type."""

    @abstractmethod
    def processFragmentStart(self, fType):
        """Hook called before the documents of ``fType`` are visited."""

    @abstractmethod
    def processFragmentEnd(self, fType):
        """Hook called after the documents of ``fType`` were visited."""

    @abstractmethod
    def processDocument(self, fType, headerId, docId):
        """Hook called for each document listed under a header of ``fType``."""

    def build(self):
        """Visit every document of every fragment type, invoking the hooks.

        For each fragment type, the documents of each configured header
        are passed to ``processDocument``. Progress is printed every 100
        documents and once more when the fragment type is finished.
        """
        self.preProcess()
        for fType in FragmentConfig.fragmentTypesToHeaders:
            print("Process " + fType)
            self.processFragmentStart(fType)

            processed = 0
            for header in FragmentConfig.fragmentTypesToHeaders[fType]:
                headerId = self.headerIndex.headerId(header)
                for docId in self.headerIndex.documentsByHeader(header):
                    self.processDocument(fType, headerId, docId)
                    processed += 1
                    if processed % 100 == 0:
                        print("\tProcess " + str(processed) + " documents")

            # Final total for this fragment type.
            print("\tProcess " + str(processed) + " documents")
            self.processFragmentEnd(fType)
        self.postProcess()