def runParser(self):
    # Threaded variant: workers drain self.linksList in parallel.
    # Validate the inputs before wiping the old index, so a missing or
    # empty crawled file doesn't destroy an existing index.
    if not os.path.isfile(self.crawledFile):
        log('error', 'No crawled file.')
        return self
    self.links = FileIO.fileToSet(self.crawledFile)
    self.linksList = list(self.links)  # shared work list consumed by the workers
    if not self.links:
        log('error', 'Crawled file is empty.')
        return self
    FileIO.deleteFileContents(self.indexFile)

    # Spin up MAX_THREADS workers, then wait for all of them to finish.
    threadPool = [
        Thread(name='parser_' + str(i), target=self.parserWorker)
        for i in range(self.MAX_THREADS)
    ]
    for thread in threadPool:
        thread.start()
    for thread in threadPool:
        thread.join()
    self.saveLinkGraphs()
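# Sketch only: a plausible parserWorker for the threaded runParser above.
# Each worker pops links from the shared self.linksList until it is empty.
# self.listLock (a threading.Lock created in __init__) and self.parsePage
# are assumptions; the repo's real worker may differ.
def parserWorker(self):
    while True:
        with self.listLock:
            if not self.linksList:
                return            # work list drained; lets join() return
            link = self.linksList.pop()
        self.parsePage(link)      # hypothetical per-page parse/index step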
def runSpider(self, iterations):
    startTime = time.time()
    for i in range(iterations):
        # Each iteration crawls one frontier: everything currently queued.
        self.queue = FileIO.fileToSet(self.queueFile)
        self.crawled = FileIO.fileToSet(self.crawledFile)
        newLinks = set()
        newCrawledLinks = set()
        while self.queue:
            nextLink = self.queue.pop()
            res = self.crawlPage(nextLink)
            newCrawledLinks.add(nextLink)
            newLinks |= res
        # The links discovered this pass become the next frontier; the
        # links just visited are added to the crawled file.
        FileIO.deleteFileContents(self.queueFile)
        FileIO.setToFile(newLinks, self.queueFile)
        FileIO.setToFile(newCrawledLinks, self.crawledFile)
    FileIO.writeJsonFile(self.outlinkGraph.nodes, self.outlinkGraphFile)
    FileIO.writeJsonFile(self.inlinkGraph.nodes, self.inlinkGraphFile)
    log('time', 'Crawler for ' + self.siteName + ' finished. Runtime: '
        + str(time.time() - startTime) + ' seconds. Total links crawled: '
        + str(self.numCrawled))
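# Sketch of crawlPage as runSpider calls it: fetch one page, record the
# link-graph edges, and return the set of links discovered on that page.
# findNewLinksHTML and the graphs' addEdge method are assumptions inferred
# from the surrounding calls, not confirmed against the repo.
def crawlPage(self, link):
    try:
        res = requests.get(link, timeout=10)
    except requests.RequestException:
        return set()
    newLinks = self.findNewLinksHTML(res)         # hypothetical HTML link extractor
    for outlink in newLinks:
        self.outlinkGraph.addEdge(link, outlink)  # hypothetical graph API
        self.inlinkGraph.addEdge(outlink, link)
    self.numCrawled += 1
    return newLinks - self.crawled                # don't requeue crawled pages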
def runSitemapCrawler(self):
    startTime = time.time()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0.3987.149 Mobile Safari/537.36'
    }
    xmlQueue = {self.sitemapURL}   # nested sitemap files still to fetch
    htmlQueue = set()              # page URLs collected from the sitemaps
    log('sitemap', 'Crawling XML sitemap for ' + self.siteName)
    while xmlQueue:
        nextParse = requests.get(xmlQueue.pop(), headers=headers)
        for link in self.findNewLinksXML(nextParse):
            # Follow nested sitemap files, skipping archives; anything
            # that isn't XML is a page URL.
            if '.xml' in link:
                if 'archive' not in link:
                    xmlQueue.add(link)
            else:
                htmlQueue.add(link)
    FileIO.deleteFileContents(self.crawledFile)
    FileIO.setToFile(htmlQueue, self.crawledFile)
    log('time', 'Finished crawling XML sitemap for ' + self.siteName
        + ' in ' + str(time.time() - startTime) + ' seconds')
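# Sketch of findNewLinksXML under the usual sitemap schema: both
# sitemap-index and urlset files wrap every URL in a <loc> element, so a
# regex over the response body recovers nested sitemap URLs and page URLs
# alike. The repo's real parser may use a proper XML library instead.
import re

def findNewLinksXML(self, response):
    return set(re.findall(r'<loc>(.*?)</loc>', response.text))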
def runParser(self):
    # Sequential variant: builds the docId -> title/body index directly.
    if not os.path.isfile(self.crawledFile):
        log('error', 'No crawled file.')
        return self
    self.links = FileIO.fileToSet(self.crawledFile)
    if not self.links:
        log('error', 'Crawled file is empty.')
        return self
    # Extend the existing index: only links not yet indexed are fetched.
    data = FileIO.readJsonFile(self.indexFile)
    for link in self.links:
        if link not in data:
            obj = extractData(link)
            data[link] = {
                'docId': DataParser.docId,
                'title': obj['title'],
                'body': obj['body'],
            }
            DataParser.docId += 1
    # Rewrite the index file with the merged data.
    FileIO.deleteFileContents(self.indexFile)
    FileIO.writeJsonFile(data, self.indexFile)
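# Sketch of the FileIO helpers, inferred purely from their call sites in
# the methods above; the repo's actual implementation may differ. Note
# that setToFile must append (not truncate) for runSpider's crawled-file
# bookkeeping to accumulate across iterations.
import json
import os

class FileIO:

    @staticmethod
    def fileToSet(path):
        # One link per line; a missing file reads as an empty set.
        if not os.path.isfile(path):
            return set()
        with open(path) as f:
            return {line.strip() for line in f if line.strip()}

    @staticmethod
    def setToFile(links, path):
        with open(path, 'a') as f:
            f.writelines(link + '\n' for link in links)

    @staticmethod
    def deleteFileContents(path):
        open(path, 'w').close()   # truncate in place

    @staticmethod
    def readJsonFile(path):
        if not os.path.isfile(path) or os.path.getsize(path) == 0:
            return {}
        with open(path) as f:
            return json.load(f)

    @staticmethod
    def writeJsonFile(data, path):
        with open(path, 'w') as f:
            json.dump(data, f)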