def get_document_characteristics(self, doc):
    """Enrich *doc* with metadata obtained from an HTTP HEAD request.

    Issues a HEAD request on ``doc['href']`` to collect the resource
    size, last-modification date and etag.  Only documents whose
    ``domain`` is listed in ``self.domain_filter`` are queried.

    :param dict doc: record describing the document; must contain at
        least the keys ``domain`` and ``href``.
    :return dict: the same *doc*, updated in place with
        ``last-modified``, ``size`` and ``etag`` when the server
        provides them.
    """
    # Guard clause: skip documents outside the configured domains.
    if doc["domain"] not in self.domain_filter:
        return doc
    if self.console:
        print(".", end="", flush=True)
    self.logger.debug(f"get_document_characteristics {str(doc)}")
    try:
        headreq = requests.head(
            doc['href'], proxies=helpers.get_proxies(self.proxy))
        # requests.head does not follow redirects by default; follow a
        # single redirection, permanent or temporary (the original code
        # only handled 301 and treated 302/307/308 as errors).
        # Assume only one redirection happens...
        if headreq.status_code in (301, 302, 307, 308):
            doc['href'] = headreq.headers['Location']
            headreq = requests.head(
                doc['href'], proxies=helpers.get_proxies(self.proxy))
        if headreq.status_code == 200:
            headers = headreq.headers
            # These headers are optional; indexing them directly used to
            # raise KeyError on the first missing one, losing the
            # remaining metadata and logging a spurious error.
            if "Last-Modified" in headers:
                doc["last-modified"] = headers["Last-Modified"]
            if "Content-Length" in headers:
                doc["size"] = int(headers["Content-Length"])
            if "Etag" in headers:
                doc["etag"] = headers["Etag"]
        else:
            self.logger.error(
                f"Error {headreq.status_code} - URL={doc['href']}\n")
    except Exception as ex:
        # Best-effort enrichment: log and return the doc as-is.
        self.logger.error(
            f"Error HEAD request {doc['href']} exception {str(ex)}")
    return doc
def download(self, docinfo):
    """Download the document from the IHE website to its local path.

    The target file lives under the directory of the domain the
    document belongs to, as computed by ``self.document_path``.

    :param dict docinfo: record describing the document; its ``href``
        key gives the remote URL.
    :return: whatever ``helpers.download`` returns.
    """
    target = self.document_path(docinfo, createpath=True)
    proxies = helpers.get_proxies(self.proxy)
    return helpers.download(docinfo["href"], target, proxies=proxies)
def load_ihe_page(self, webpage=IHE_TF_URL):
    """Load an IHE HTML page, find the PDF documents it links to and
    classify them.

    :param str webpage: URL of the IHE page to scan (defaults to
        ``IHE_TF_URL``).
    :return bool: True on success, False when the page could not be
        retrieved (connection error or non-200 HTTP status).
    """
    unsorted_docs = {}
    retcode = True
    try:
        req = requests.get(webpage, proxies=helpers.get_proxies(self.proxy))
        # The penultimate URL path component names the document class.
        doc_class = webpage.split('/')[-2]
        if req.status_code == 200:
            soup = BeautifulSoup(req.text, "html5lib")
            links = list(
                filter(lambda x: x.get("href"), soup.find_all("a")))
            pdf_list = list(
                filter(lambda x: x.get("href").endswith(".pdf"), links))
            self.logger.info("Get information about documents")
            for link in pdf_list:
                docinfo = self.get_infos(link.text, link.get("href"),
                                         doc_class)
                unsorted_docs[docinfo["filename"]] = docinfo
            self.logger.info(
                f"\n{len(unsorted_docs)} documents found in IHE website : {doc_class}"
            )
            self.classify(unsorted_docs)
            self.logger.debug(f"IHE category keys {self.doc.keys()}")
        else:
            # A non-200 status used to be silently ignored while the
            # method still reported success; surface it and fail.
            self.logger.error(
                f"Error {req.status_code} while loading IHE page {webpage}")
            retcode = False
    except requests.exceptions.ConnectionError as conn_err:
        self.logger.error(
            f"Error while loading remote IHE page {conn_err.errno}:{conn_err.strerror}"
        )
        retcode = False
    return retcode
def set_proxy(self, proxy):
    """Update the proxy configuration used by this watchdog.

    :param proxy: new proxy specification; converted through
        ``helpers.get_proxies`` before being stored in ``self.proxies``.
    """
    self.logger.debug(f"Update proxy in network watchdog {proxy}")
    proxies = helpers.get_proxies(proxy)
    self.proxies = proxies
def __init__(self, url, proxy, delay):
    """Initialise the instance.

    :param url: URL this object will work against.
    :param proxy: proxy specification; converted through
        ``helpers.get_proxies`` and stored in ``self.proxies``.
    :param delay: delay value kept as-is in ``self.delay``
        (unit defined by the caller — confirm against usage).
    """
    super().__init__(None)
    self.url = url
    self.delay = delay
    self.proxies = helpers.get_proxies(proxy)