コード例 #1
0
ファイル: sync.py プロジェクト: flrt/ihe-tf-sync
    def get_document_characteristics(self, doc):
        """
        Enrich a document record with metadata read from a HEAD request.

        Nothing is requested when the document's domain is not listed in
        ``self.domain_filter``. Otherwise the "last-modified", "size" and
        "etag" keys are filled from whichever response headers are present.
        A permanent redirect (301 or 308) is followed once, and
        ``doc["href"]`` is updated to the new location.

        Errors are logged, never raised: on failure the record is returned
        without the metadata keys.

        :param dict doc: document record; the "domain" and "href" keys are read
        :return dict: the same ``doc`` object, updated in place when possible
        """
        # Imported locally so this method stays self-contained.
        from urllib.parse import urljoin

        if doc["domain"] not in self.domain_filter:
            return doc

        if self.console:
            print(".", end="", flush=True)

        # get more info with a HEAD request
        self.logger.debug(f"get_document_characteristics {str(doc)}")
        try:
            proxies = helpers.get_proxies(self.proxy)
            headreq = requests.head(doc['href'], proxies=proxies)

            if headreq.status_code in (301, 308):
                # Permanent redirect: remember the new location.
                # Assume only one redirection happens...
                location = headreq.headers.get('Location')
                if location:
                    # Location may be a relative reference; resolve it
                    # against the URL that was just requested.
                    doc['href'] = urljoin(doc['href'], location)
                    headreq = requests.head(doc['href'], proxies=proxies)

            if headreq.status_code == 200:
                headers = headreq.headers
                missing = []

                last_modified = headers.get("Last-Modified")
                if last_modified is not None:
                    doc["last-modified"] = last_modified
                else:
                    missing.append("Last-Modified")

                # Content-Length is absent on chunked responses; only store
                # a size when the header holds a plain non-negative integer.
                length = headers.get("Content-Length")
                if length is not None and length.strip().isdigit():
                    doc["size"] = int(length)
                else:
                    missing.append("Content-Length")

                etag = headers.get("Etag")
                if etag is not None:
                    doc["etag"] = etag
                else:
                    missing.append("Etag")

                if missing:
                    self.logger.warning(
                        f"Missing or invalid headers {missing} - URL={doc['href']}")
            else:
                self.logger.error(
                    f"Error {headreq.status_code} - URL={doc['href']}\n")
        except Exception as ex:
            # Best effort: a failing HEAD request must not abort the caller.
            self.logger.error(
                f"Error HEAD request {doc['href']} exception {str(ex)}")

        return doc
コード例 #2
0
ファイル: sync.py プロジェクト: flrt/ihe-tf-sync
    def download(self, docinfo):
        """
        Fetch a document and store it locally.

        The destination path is obtained from ``self.document_path`` (called
        with ``createpath=True``); the transfer itself is delegated to
        ``helpers.download`` using the proxies derived from ``self.proxy``.

        :param dict docinfo: record describing the document; its "href" key
            is used as the download URL
        :return: whatever ``helpers.download`` returns
        """
        destination = self.document_path(docinfo, createpath=True)
        source_url = docinfo["href"]
        proxy_map = helpers.get_proxies(self.proxy)
        return helpers.download(source_url, destination, proxies=proxy_map)
コード例 #3
0
ファイル: sync.py プロジェクト: flrt/ihe-tf-sync
    def load_ihe_page(self, webpage=IHE_TF_URL):
        """
        Load an IHE HTML page, collect the PDF documents it links to and
        classify them.

        Every ``<a>`` element whose ``href`` ends with ".pdf" is turned into
        a document record with ``self.get_infos`` and indexed by its
        "filename" key; the resulting dict is handed to ``self.classify``.

        :param str webpage: URL of the page to load; its second-to-last
            "/"-separated segment is used as the document class
        :return bool: True when the page was fetched (HTTP 200) and
            processed, False on a connection error or any other HTTP status
        """

        unsorted_docs = {}
        try:
            req = requests.get(webpage,
                               proxies=helpers.get_proxies(self.proxy))
            doc_class = webpage.split('/')[-2]

            if req.status_code != 200:
                # A failed fetch is not "0 documents found": report it and
                # leave the current classification untouched, exactly as
                # for a connection error.
                self.logger.error(
                    f"Error {req.status_code} while loading remote IHE page {webpage}"
                )
                return False

            soup = BeautifulSoup(req.text, "html5lib")
            pdf_list = [
                anchor for anchor in soup.find_all("a")
                if anchor.get("href") and anchor.get("href").endswith(".pdf")
            ]

            self.logger.info("Get information about documents")
            for link in pdf_list:
                docinfo = self.get_infos(link.text, link.get("href"),
                                         doc_class)
                unsorted_docs[docinfo["filename"]] = docinfo

            self.logger.info(
                f"\n{len(unsorted_docs)} documents found in IHE website : {doc_class}"
            )
            self.classify(unsorted_docs)
            self.logger.debug(f"IHE category keys {self.doc.keys()}")
        except requests.exceptions.ConnectionError as conn_err:
            # requests does not normally populate errno/strerror on this
            # exception, so log the exception text itself.
            self.logger.error(
                f"Error while loading remote IHE page {webpage}: {conn_err}"
            )
            return False
        return True
コード例 #4
0
 def set_proxy(self, proxy):
     """
     Replace the proxies used by this object.

     :param proxy: proxy setting passed to ``helpers.get_proxies``; the
         result is stored in ``self.proxies``
     """
     self.logger.debug(f"Update proxy in network watchdog {proxy}")
     resolved_proxies = helpers.get_proxies(proxy)
     self.proxies = resolved_proxies
コード例 #5
0
 def __init__(self, url, proxy, delay):
     """
     Initialise the instance; the parent initialiser is called with ``None``.

     :param url: stored as ``self.url``
     :param proxy: proxy setting converted with ``helpers.get_proxies`` and
         stored as ``self.proxies``
     :param delay: stored as ``self.delay`` (unit not visible here —
         TODO confirm against the code that reads it)
     """
     super().__init__(None)
     self.url = url
     proxy_map = helpers.get_proxies(proxy)
     self.proxies = proxy_map
     self.delay = delay