Ejemplo n.º 1
0
 def __init__(self, accessor, headerIndexPrefix):
     """Set up the header index and the text-processing helpers.

     Args:
         accessor: Stored as ``self.accessor``. It is handed to
             ``HeadersFileIndex`` and its ``directory`` attribute is
             passed to ``FragmentConfig``.
         headerIndexPrefix: Prefix forwarded to ``HeadersFileIndex`` and
             kept as ``self.prefix``.
     """
     self.accessor, self.prefix = accessor, headerIndexPrefix
     self.headerIndex = HeadersFileIndex(accessor, headerIndexPrefix)
     self.tokenSplitter = TokenSplitter()
     self.posTagger = POSTagger()
     # The created FragmentConfig instance is discarded on purpose;
     # presumably the constructor loads configuration as a side effect
     # -- verify against FragmentConfig.
     FragmentConfig(accessor.directory)
Ejemplo n.º 2
0
 def __init__(self, accessor, headerIndexPrefix=None, configuration=None):
     """Set up the header index, text helpers and fragment configuration.

     Args:
         accessor: Stored as ``self.accessor`` and handed to
             ``HeadersFileIndex``; its ``directory`` attribute is used
             when the default configuration has to be loaded.
         headerIndexPrefix: Kept as ``self.prefix`` only. The header
             index itself is always opened with an empty prefix.
         configuration: Optional value stored as
             ``self.fragmentTypesToHeaders``. Any falsy value (``None``,
             empty mapping, ...) selects
             ``FragmentConfig.fragmentTypesToHeaders`` instead.
     """
     self.accessor, self.prefix = accessor, headerIndexPrefix
     self.headerIndex = HeadersFileIndex(accessor, '')
     self.tokenSplitter = TokenSplitter()
     self.posTagger = POSTagger()
     if not configuration:
         # No usable configuration supplied: instantiate FragmentConfig
         # (the instance is discarded) and read the class-level mapping.
         FragmentConfig(accessor.directory)
         configuration = FragmentConfig.fragmentTypesToHeaders
     self.fragmentTypesToHeaders = configuration
Ejemplo n.º 3
0
def buildHeaders(categories, prefix,
                 directory="C:\\[Study]\\Diploma\\wiki_indexes\\"):
    """Build the headers file for the given categories and dump its stats.

    The number of collected pages is printed, the headers file is built,
    and every statistics entry is both printed and written to
    ``headers.txt`` inside ``directory`` as ``"<text>: <cnt>"``.

    Args:
        categories: Categories passed to ``getArticles`` to select pages.
        prefix: Prefix passed to ``HeadersFileBuilder`` and
            ``HeadersFileIndex``.
        directory: Directory handed to ``WikiAccessor`` and used as the
            location of ``headers.txt``. It is concatenated with the file
            name as-is, so it must end with a path separator. Defaults to
            the previously hard-coded location.
    """
    accessor = WikiAccessor(directory)
    pages = getArticles(categories, accessor)
    print(len(pages))

    HeadersFileBuilder(accessor, list(pages), prefix).build()
    stat = HeadersFileIndex(accessor, prefix).getAllStat()

    # The ``with`` block closes the file; no explicit close() is needed.
    with codecs.open(directory + 'headers.txt', 'w', 'utf-8') as f:
        for item in stat:
            if item['cnt'] == 1:
                # Stop at the first header that occurs only once.
                # Presumably ``stat`` is sorted by descending count, so
                # everything after it is a singleton too -- TODO confirm.
                break
            line = item['text'] + ": " + str(item['cnt'])
            print(line)
            f.write(line + '\n')
Ejemplo n.º 4
0
class HeadersExtractor:
    """Collect section headers used by the pages of wiki categories.

    The index objects below are class attributes: they are created once,
    when the class body is executed, and shared by all instances.
    """

    directory = "C:\\[Study]\\Diploma\\wiki_indexes\\"
    accessor = wiki_accessor.WikiAccessor(directory)
    bld = CategoryIndex(accessor)
    hid = HeadersFileIndex(accessor)

    def _groupPagesByHeader(self, categories):
        """Map each header id to the list of pages that contain it.

        Pages are taken from the direct pages of every category in
        ``categories``, in iteration order; a page is appended once per
        matching entry returned by ``headersByDoc``.
        """
        pagesByHeader = {}
        for category in categories:
            for page in self.bld.getDirectPages(category):
                for h in self.hid.headersByDoc(page):
                    pagesByHeader.setdefault(h['header'], []).append(page)
        return pagesByHeader

    def getCategoryHeaders(self, categoryId):
        """Return the set of header ids found in the category's direct pages."""
        return {
            h['header']
            for page in self.bld.getDirectPages(categoryId)
            for h in self.hid.headersByDoc(page)
        }

    def getHeadersForTree(self, categories):
        """Return ``[{'id': ..., 'text': ...}, ...]`` for all given categories.

        Each header id appears once, even if several categories share it.
        """
        allheaders = set()
        for category in categories:
            allheaders.update(self.getCategoryHeaders(category))
        return [
            {'id': h, 'text': self.hid.headerText(h)}
            for h in allheaders
        ]

    def getCategoryHeaders_better(self, categoryId):
        """Return ``{header id: [pages]}`` for the category's direct pages."""
        return self._groupPagesByHeader([categoryId])

    def getHeadersForTree_better(self, categories):
        """Return one dict per header with its text, page count and pages.

        Every entry has the keys ``'id'``, ``'text'``, ``'amount'`` (number
        of collected pages) and ``'docs'`` (the collected pages).
        """
        return [
            {
                'id': headerId,
                'text': self.hid.headerText(headerId),
                'amount': len(docs),
                'docs': docs,
            }
            for headerId, docs in self._groupPagesByHeader(categories).items()
        ]
Ejemplo n.º 5
0
class AbstractFragmentIterator(metaclass=ABCMeta):
    """Skeleton for a pass over all documents of all fragment types.

    ``build`` drives the traversal over
    ``FragmentConfig.fragmentTypesToHeaders`` and calls the abstract hooks
    below; subclasses supply the actual processing.
    """

    def __init__(self, accessor, headerIndexPrefix):
        """Create the header index and the text-processing helpers.

        Args:
            accessor: Stored as ``self.accessor``; passed to
                ``HeadersFileIndex`` and its ``directory`` attribute is
                passed to ``FragmentConfig``.
            headerIndexPrefix: Prefix for ``HeadersFileIndex``; also kept
                as ``self.prefix``.
        """
        self.accessor, self.prefix = accessor, headerIndexPrefix
        self.headerIndex = HeadersFileIndex(accessor, headerIndexPrefix)
        self.tokenSplitter = TokenSplitter()
        self.posTagger = POSTagger()
        # Instance is discarded; ``build`` later reads the class attribute
        # ``FragmentConfig.fragmentTypesToHeaders``.
        FragmentConfig(accessor.directory)

    @abstractmethod
    def preProcess(self):
        """Hook called once by ``build`` before any fragment type."""

    @abstractmethod
    def postProcess(self):
        """Hook called once by ``build`` after all fragment types."""

    @abstractmethod
    def processFragmentStart(self, fType):
        """Hook called before the documents of fragment type ``fType``."""

    @abstractmethod
    def processFragmentEnd(self, fType):
        """Hook called after the documents of fragment type ``fType``."""

    @abstractmethod
    def processDocument(self, fType, headerId, docId):
        """Hook called for each document found under a header of ``fType``."""

    def build(self):
        """Visit every configured fragment type, header and document.

        For each fragment type the start hook runs, then
        ``processDocument`` is called for every document of every header
        listed for that type, then the end hook runs. Progress is printed
        after every 100 documents and once more when the type is finished.
        """
        self.preProcess()
        for fType in FragmentConfig.fragmentTypesToHeaders:
            print("Process " + fType)
            self.processFragmentStart(fType)
            seen = 0  # documents handled for the current fragment type
            for header in FragmentConfig.fragmentTypesToHeaders[fType]:
                headerId = self.headerIndex.headerId(header)
                for docId in self.headerIndex.documentsByHeader(header):
                    self.processDocument(fType, headerId, docId)
                    seen += 1
                    if not seen % 100:
                        print("\tProcess {} documents".format(seen))
            print("\tProcess {} documents".format(seen))
            self.processFragmentEnd(fType)
        self.postProcess()