Example #1
0
 def __init__(self, domain, threads, mode='DEV'):
     """Initialise builder state and load the per-domain score files.

     Args:
         domain: Domain name; used to build the paths under ``domains/``.
         threads: Value stored as ``MAX_THREADS``.
         mode: Run mode, ``'DEV'`` by default.
     """
     self.domain, self.mode = domain, mode
     self.MAX_THREADS = threads
     self.buildQueue = []
     # Both flags start out set.
     self.readSemaphore = self.invertedIndexSemaphore = True

     # Common "domains/<domain>/<domain>" prefix shared by both score files.
     stem = 'domains/' + self.domain + '/' + self.domain

     self.hubAuthFile = stem + "_HubAuth.json"
     self.hubAuthScores = FileIO.readJsonFile(self.hubAuthFile)

     self.pageRankFile = stem + "_pageRank.json"
     self.pageRanks = FileIO.readJsonFile(self.pageRankFile)
Example #2
0
    def build(self):
        """Load the domain's index file and register every document in it.

        Reads ``domains/<domain>/<domain>_index.txt`` via
        ``FileIO.readJsonFile`` and, for each URL entry, passes the document
        to ``addDocumentToCollection`` and its body to ``buildInvertedIndex``.
        A ``None`` title is replaced with ``'No Title'`` before use.

        In ``'DEV'`` mode processing stops after five entries.

        Raises:
            KeyError: If an entry lacks an expected field, or if its URL or
                title is missing from ``pageRanks`` / ``hubAuthScores``.
        """
        filePath = 'domains/' + self.domain + '/' + self.domain + "_index.txt"
        rawData = FileIO.readJsonFile(filePath)

        for count, (entry, doc) in enumerate(rawData.items(), start=1):
            if doc['title'] is None:
                doc['title'] = 'No Title'

            # NOTE(review): page rank is looked up by URL while hub/authority
            # scores are looked up by title -- confirm this is intended.
            self.addDocumentToCollection(
                url=entry,
                title=doc['title'],
                body=doc['body'],
                description=doc['description'],
                pageRank=self.pageRanks[entry],
                hub=self.hubAuthScores[doc['title']][0],
                authority=self.hubAuthScores[doc['title']][1])
            self.buildInvertedIndex(doc['body'], entry)

            # Keep development runs short.
            if self.mode == 'DEV' and count >= 5:
                break
Example #3
0
 def runParser(self):
     """Extract data for crawled links not yet in the index file.

     Logs an error and returns ``self`` when the crawled file is missing
     or yields no links. Otherwise every link absent from the index is
     passed to ``extractData``, stored with the next ``DataParser.docId``,
     and the index file is cleared and rewritten.
     """
     if not os.path.isfile(self.crawledFile):
         log('error', 'No crawled file.')
         return self

     self.links = FileIO.fileToSet(self.crawledFile)
     if not self.links:
         log('error', 'Crawled file is empty')
         return self

     index = FileIO.readJsonFile(self.indexFile)
     for url in self.links:
         if url in index:
             # Already parsed on a previous run.
             continue
         extracted = extractData(url)
         index[url] = dict(
             docId=DataParser.docId,
             title=extracted['title'],
             body=extracted['body'],
         )
         DataParser.docId += 1

     # Clear the old contents before writing the merged index back.
     FileIO.deleteFileContents(self.indexFile)
     FileIO.writeJsonFile(index, self.indexFile)