def __init__(self, domain, threads, mode='DEV'):
    self.domain = domain
    self.mode = mode
    self.MAX_THREADS = threads
    self.buildQueue = []
    self.readSemaphore = True
    self.invertedIndexSemaphore = True
    self.hubAuthFile = 'domains/' + self.domain + '/' + self.domain + '_HubAuth.json'
    self.hubAuthScores = FileIO.readJsonFile(self.hubAuthFile)
    self.pageRankFile = 'domains/' + self.domain + '/' + self.domain + '_pageRank.json'
    self.pageRanks = FileIO.readJsonFile(self.pageRankFile)
def build(self):
    filePath = 'domains/' + self.domain + '/' + self.domain + '_index.txt'
    rawData = FileIO.readJsonFile(filePath)
    count = 0
    for entry in rawData.keys():
        count += 1
        doc = rawData[entry]
        if doc['title'] is None:
            doc['title'] = 'No Title'
        # Register the document with its ranking signals, then index its body text.
        self.addDocumentToCollection(
            url=entry,
            title=doc['title'],
            body=doc['body'],
            description=doc['description'],
            pageRank=self.pageRanks[entry],
            hub=self.hubAuthScores[doc['title']][0],
            authority=self.hubAuthScores[doc['title']][1])
        self.buildInvertedIndex(doc['body'], entry)
        # In DEV mode, only process a handful of documents to keep iteration fast.
        if self.mode == 'DEV' and count >= 5:
            break
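# build() defers the term-level work to self.buildInvertedIndex, which is not shown in this
# section. The standalone function below is only a sketch of what that step typically does,
# assuming a plain token -> {url: term frequency} mapping; the project's actual method may
# tokenize and score differently.
import re
from collections import defaultdict


def sketch_build_inverted_index(body, url, inverted_index=None):
    # Map each lowercase token in `body` to the number of times it occurs on `url`.
    if inverted_index is None:
        inverted_index = defaultdict(dict)
    for token in re.findall(r'[a-z0-9]+', body.lower()):
        postings = inverted_index[token]
        postings[url] = postings.get(url, 0) + 1
    return inverted_index


# Example: sketch_build_inverted_index('to be or not to be', 'https://example.com/hamlet')
# yields {'to': {...: 2}, 'be': {...: 2}, 'or': {...: 1}, 'not': {...: 1}}.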
def runParser(self):
    if not os.path.isfile(self.crawledFile):
        log('error', 'No crawled file.')
        return self
    self.links = FileIO.fileToSet(self.crawledFile)
    if not self.links:
        log('error', 'Crawled file is empty.')
        return self
    data = FileIO.readJsonFile(self.indexFile)
    for link in self.links:
        # Only parse pages that have not been indexed yet.
        if link not in data:
            obj = extractData(link)
            data[link] = {
                'docId': DataParser.docId,
                'title': obj['title'],
                'body': obj['body']
            }
            DataParser.docId += 1
    # Rewrite the index file with the merged (previously indexed + newly parsed) documents.
    FileIO.deleteFileContents(self.indexFile)
    FileIO.writeJsonFile(data, self.indexFile)
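# The methods above rely on a FileIO helper (readJsonFile, writeJsonFile, fileToSet,
# deleteFileContents) whose implementation is not part of this section. The class below is a
# minimal sketch of that interface, assuming plain JSON files and a one-URL-per-line crawled
# file; the project's real FileIO may behave differently.
import json
import os


class FileIOSketch:
    @staticmethod
    def readJsonFile(path):
        # Parse `path` as JSON, returning an empty dict for a missing or empty file.
        if not os.path.isfile(path) or os.path.getsize(path) == 0:
            return {}
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def writeJsonFile(data, path):
        # Serialize `data` to `path` as pretty-printed JSON.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    @staticmethod
    def fileToSet(path):
        # Read one entry per line (e.g. crawled URLs) into a set, skipping blank lines.
        with open(path, 'r', encoding='utf-8') as f:
            return {line.strip() for line in f if line.strip()}

    @staticmethod
    def deleteFileContents(path):
        # Truncate the file to zero bytes without removing it.
        open(path, 'w').close()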