import os

def scrapeURL(self, url):
    """Fetch a URL (or a local sample file in offline mode) and process the raw data."""
    returnArr = []
    self.siteUrl = url
    # Initialize so the check below is safe even if the online fetch fails.
    self.scrapedRawData = None

    if config.get('scraper_mode') == 'online':
        headersArr = {}
        scrapedRawData = Network.fetch(url, headersArr)
        if scrapedRawData['code'] == 200:
            # Remember the headers we sent so follow-up requests can reuse
            # them, with the current page as the referer.
            self.siteHeaders = scrapedRawData['headers']['requested']
            self.siteHeaders['referer'] = self.siteUrl
            self.scrapedRawData = scrapedRawData['body']
    elif config.get('scraper_mode') == 'offline':
        # Resolve the sample_data directory relative to this file.
        filePath = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(filePath, 'sample_data', url)) as file:
            self.scrapedRawData = file.read()

    if self.scrapedRawData is not None:
        returnArr = self.processRawData()

    return returnArr
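A minimal usage sketch, assuming scrapeURL lives on a scraper class instantiated elsewhere in the project (the class name Scraper and the example URL/filename are assumptions for illustration):

# Hypothetical usage; 'Scraper' is an assumed wrapper class for scrapeURL.
scraper = Scraper()
# Online mode: the argument is the page URL to fetch.
result = scraper.scrapeURL('https://example.com/listing')
# Offline mode: the argument is a filename under sample_data/, e.g.:
# result = scraper.scrapeURL('listing.html')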
import traceback

import requests

def fetch(url, headersArr):
    """GET a URL, returning the status code, request/response headers, and body."""
    # config and UserAgent are provided elsewhere in the project.
    returnArr = {"code": 0}
    try:
        # If the caller supplied no headers, pick a random user agent.
        if len(headersArr) == 0:
            useragent = UserAgent()
            headersArr.update(useragent.getRandom())

        if config.get('proxy_enabled') is True:
            proxies = {'https': config.get('proxy_url_ip')}
            response = requests.get(url, headers=headersArr, proxies=proxies)
        else:
            response = requests.get(url, headers=headersArr)

        returnArr = {
            "code": response.status_code,
            "headers": {
                "requested": headersArr,
                "received": response.headers,
            },
            "body": response.text,
        }
    except Exception:
        traceback.print_exc()
    return returnArr
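A quick sketch of calling fetch directly, matching how scrapeURL above uses it (the example URL is illustrative; an empty headers dict lets fetch pick a random user agent):

# Hypothetical call; fetch() fills the empty dict with a random user agent.
result = Network.fetch('https://example.com', {})
if result['code'] == 200:
    print(result['headers']['received'].get('Content-Type'))
    print(result['body'][:200])  # first 200 characters of the page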