def fetch(self):
    """Retrieve the resource for ``self.job`` and record text results on it.

    The job URL is passed through ``urlClean`` and written back to
    ``self.job.url``. The request is then routed to exactly one retriever:

    * ``extractGoogleDriveFolder`` when the lower-cased netloc contains
      ``drive.google.com``,
    * ``retreiveGoogleDoc`` when ``gdp.isGdocUrl`` flags the URL,
    * ``retreiveGoogleFile`` when ``gdp.isGFileUrl`` flags the URL,
    * ``retreivePlainResource`` otherwise.

    If the response contains both ``'title'`` and ``'contents'``, those
    values are copied onto the job, its mimetype is set to ``text/html``,
    it is marked as text, and its state becomes ``'complete'``.

    Returns:
        The response produced by whichever retriever handled the job.
    """
    self.job.url = WebMirror.util.urlFuncs.urlClean(self.job.url)

    split_url = urllib.parse.urlsplit(self.job.url.lower())
    host = split_url.netloc

    # Both classifiers are consulted up front, before any branch is chosen.
    is_gdoc, gdoc_url = gdp.isGdocUrl(self.job.url)
    is_gfile, _gfile_url = gdp.isGFileUrl(self.job.url)

    if 'drive.google.com' in host:
        self.log.info("Google Drive content!")
        response = self.extractGoogleDriveFolder(self.job)
    elif is_gdoc:
        self.log.info("Google Docs content!")
        response = self.retreiveGoogleDoc(self.job, gdoc_url)
    elif is_gfile:
        self.log.info("Google File content!")
        # NOTE(review): this forwards the URL returned by isGdocUrl(), not the
        # value returned by isGFileUrl() (which is unused) -- confirm intended.
        response = self.retreiveGoogleFile(self.job, gdoc_url)
    else:
        response = self.retreivePlainResource(self.job)

    # Anything lacking a title/contents pair is handed back untouched.
    if 'title' not in response or 'contents' not in response:
        return response

    self.job.title = response['title']
    self.job.content = response['contents']
    self.job.mimetype = 'text/html'
    self.job.is_text = True
    self.job.state = 'complete'
    return response
def processNewUrl(self, url, baseUrl=None, istext=True):
    """Validate a candidate link, completing it from ``baseUrl`` if needed.

    Args:
        url: The link to validate.
        baseUrl: Optional URL the link is resolved against when ``url`` does
            not already start with ``http``.
        istext: When true, the URL must also pass ``self.checkDomain``.

    Returns:
        The (possibly completed) URL, or ``None`` when ``url`` does not start
        with ``http``, no ``baseUrl`` was supplied, and
        ``self.ignoreBadLinks`` is set.

    Raises:
        ValueError: If the link cannot be completed and bad links are not
            ignored; if a Google Docs/File link is not in its trimmed form;
            if the URL still does not start with ``http`` after resolution;
            if ``istext`` is set and ``self.checkDomain`` rejects the URL; or
            if the URL contains ``/view/export?format=zip``.
    """
    if not url.lower().startswith("http"):
        if baseUrl:
            # Resolve against the base so the link keeps its own
            # netloc/path/query and only the missing parts (notably the
            # scheme) are borrowed from baseUrl. Rebuilding the URL from
            # baseUrl's own components would throw the link away entirely.
            url = urllib.parse.urljoin(baseUrl, url)
        elif self.ignoreBadLinks:
            self.log.error("Skipping a malformed URL!")
            self.log.error("Bad URL: '%s'", url)
            return
        else:
            raise ValueError("Url isn't a url: '%s'" % url)

    # isGdocUrl()/isGFileUrl() are unpacked as (flag, url) pairs elsewhere,
    # so a bare truthiness test on the return value would always be true.
    # Only the flag decides whether the trimmed-form check applies.
    is_gdoc, _ = gdp.isGdocUrl(url)
    is_gfile, _ = gdp.isGFileUrl(url)
    if (is_gdoc or is_gfile) and gdp.trimGDocUrl(url) != url:
        raise ValueError("Invalid link crept through! Link: '%s'" % url)

    if not url.lower().startswith('http'):
        raise ValueError("Failure adding scheme to URL: '%s'" % url)

    if not self.checkDomain(url) and istext:
        raise ValueError("Invalid url somehow got through: '%s'" % url)

    if '/view/export?format=zip' in url:
        raise ValueError("Wat?")

    return url
def dispatchUrlRequest(self, url, pageDistance):
    """Fetch ``url`` with the matching retriever and process the response.

    The URL is cleaned with ``urlClean``; a leading ``//`` is rewritten to
    ``http://``. The request then goes to the Google Drive folder extractor
    (netloc contains ``drive.google.com``), the Google Docs retriever, the
    Google File retriever, or the plain-resource retriever.

    A response containing both ``"title"`` and ``"contents"`` is written to
    the database through ``updateDbEntry`` as ``text/html`` text. The
    response is then always passed to ``processResponse`` together with
    ``pageDistance``.
    """
    url = TextScrape.urlFuncs.urlClean(url)

    # Scheme-relative links ("//host/path") have shown up a few times;
    # give them an explicit http scheme.
    if url.startswith("//"):
        url = "http:" + url

    host = urllib.parse.urlsplit(url.lower()).netloc

    gdoc_flag, gdoc_url = gdp.isGdocUrl(url)
    gfile_flag, _unused_file_url = gdp.isGFileUrl(url)

    if "drive.google.com" in host:
        self.log.info("Google Drive content!")
        response = self.extractGoogleDriveFolder(url)
    elif gdoc_flag:
        self.log.info("Google Docs content!")
        response = self.retreiveGoogleDoc(gdoc_url)
    elif gfile_flag:
        self.log.info("Google File content!")
        # NOTE(review): the value forwarded here comes from isGdocUrl(); the
        # one returned by isGFileUrl() is never used -- confirm intended.
        response = self.retreiveGoogleFile(gdoc_url)
    else:
        response = self.retreivePlainResource(url)

    has_page_text = "title" in response and "contents" in response
    if has_page_text:
        entry = {
            "url": url,
            "title": response["title"],
            "contents": response["contents"],
            "mimetype": "text/html",
            "dlstate": 2,
            "istext": True,
        }
        self.updateDbEntry(**entry)

    self.processResponse(response, pageDistance)
def dispatchUrlRequest(self, url, pageDistance):
    """Route ``url`` to the right retriever, store text pages, process the result.

    Steps, in order:

    1. Clean the URL with ``urlClean`` and turn a leading ``//`` into
       ``http://``.
    2. Choose a retriever: Google Drive folder (netloc contains
       ``drive.google.com``), Google Doc, Google File, or plain resource.
    3. If the response has both ``'title'`` and ``'contents'``, call
       ``updateDbEntry`` with them, mimetype ``text/html``, ``dlstate=2``
       and ``istext=True``.
    4. Pass the response and ``pageDistance`` to ``processResponse``.
    """
    cleaned = TextScrape.urlFuncs.urlClean(url)

    # Snip off leading slashes that have shown up a few times.
    url = 'http://' + cleaned[2:] if cleaned.startswith("//") else cleaned

    parts = urllib.parse.urlsplit(url.lower())

    doc_match, doc_url = gdp.isGdocUrl(url)
    file_match, _file_url = gdp.isGFileUrl(url)

    if 'drive.google.com' in parts.netloc:
        self.log.info("Google Drive content!")
        response = self.extractGoogleDriveFolder(url)
    elif doc_match:
        self.log.info("Google Docs content!")
        response = self.retreiveGoogleDoc(doc_url)
    elif file_match:
        self.log.info("Google File content!")
        # NOTE(review): receives the URL reported by isGdocUrl() rather than
        # the value from isGFileUrl(), which goes unused -- confirm intended.
        response = self.retreiveGoogleFile(doc_url)
    else:
        response = self.retreivePlainResource(url)

    if 'title' in response and 'contents' in response:
        self.updateDbEntry(
            url=url,
            title=response['title'],
            contents=response['contents'],
            mimetype='text/html',
            dlstate=2,
            istext=True,
        )

    self.processResponse(response, pageDistance)