Example No. 1
    def _down_image(self, img, proxy=None):
        """
        Downloads only one image. Helper method for load_images.

        :author: Sebastian
        :raises NameError: If the image cannot be fetched because no known attribute containing a link to it
        exists, or no attribute holds a link that satisfies the urlPattern.
        :param img: The img tag object.
        :param proxy: The proxy that is to be used to download the image. Defaults to None, to download it directly.
        :return: A Response object with the response status and the image to store.
        """

        attributes = ['src', 'data-full-size', 'data-original', 'data']
        tag = None
        for attr in attributes:
            if attr in img.attrs and proxy_util.url_specification.match(
                    img[attr]):
                tag = img[attr]
            elif attr in img.attrs and _starts_with_slashes(img[attr]):
                tag = "http:{}".format(img[attr])
        if tag is None:
            msg = "Thread-{}: An image did not have an html specification url: {}".format(
                self.threadID, img)
            logger(msg)
            raise NameError(msg)
        logger("Thread-{}: Trying to download image: {}".format(
            self.threadID, tag))
        if self.robot_check and not self.bot_parser.can_fetch("*", self.url):
            text = "Thread-{}: Not allowed to fetch image file specified by url:{} because of robots.txt"\
                .format(self.threadID, self.url)
            logger(text)
            raise urllib.error.URLError(text)

        try:
            """
            First we try to get the image with a proxy. If that fails we try it without proxy.
            """
            if proxy:
                res = requests.get(tag,
                                   stream=True,
                                   proxies={"http": "http://" + proxy},
                                   headers=header)
                return res

            res = requests.get(tag, stream=True, headers=header)
        except (ConnectionRefusedError, MaxRetryError) as con:
            logger(
                "Thread-{} Could not request image due to: {}\ntrying without proxy."
                .format(self.threadID, con))
            res = requests.get(tag, stream=True, headers=header)
        except ConnectionResetError as reset:
            raise reset

        return res
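A minimal usage sketch for this helper, assuming a load_images-style caller on the same class that walks the img tags of the already downloaded page and assuming `from bs4 import BeautifulSoup`; the loop body and file name are illustrative, not taken from the original module:

    def load_images(self, proxy=None):
        # Hypothetical caller: try every image of the page and skip those
        # without a usable source attribute.
        soup = BeautifulSoup(self.html, "lxml")
        for img in soup.find_all(['img', 'amp-img']):
            try:
                res = self._down_image(img, proxy=proxy)
            except NameError:
                continue  # no attribute with a valid link, see the docstring
            if res.status_code == 200:
                # stream=True was set, so the body can be consumed in chunks
                with open("image.jpg", "wb") as out:  # fixed name just for illustration
                    for chunk in res.iter_content(chunk_size=8192):
                        out.write(chunk)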
Example No. 2
    def initialize(proxy=None, proxy_location=None, thread_id=404):
        """
        Helper method that initializes the PhantomJS Headless browser and sets the proxy.

        :author: Sebastian
        :param proxy: The proxy to set.
        :param proxy_location: A location for a proxy. If no proxy is specified it is fetched from that location.
        :param thread_id: The thread that started the initialization process. Only for logging purposes.
        :return: The PhantomJS driver object.
        """
        logger("Thread-{}: Initialize Phantom with proxy:{} and location: {}".
               format(thread_id, proxy, proxy_location))
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 " \
                                                    "(KHTML, like Gecko) Chrome/15.0.87"

        if proxy is not None:
            service_args = [
                '--proxy={}'.format(proxy),
                '--proxy-type=http',
            ]
        elif proxy_location is not None:
            try:
                logger("Thread-{}: retrieve proxy for location: {}".format(
                    thread_id, proxy_location))
                new_proxy = proxy_util.get_one_proxy(proxy_location)

            except RuntimeError:
                logger(
                    "Thread-{}: Restarted proxy retrieval with new event loop".
                    format(thread_id))
                asyncio.set_event_loop(asyncio.new_event_loop())
                new_proxy = proxy_util.get_one_proxy(proxy_location)

            service_args = [
                '--proxy={}'.format(new_proxy),
                '--proxy-type=http',
            ]
        else:
            service_args = []
            logger(
                "Thread-{}: Neither proxy nor location are set, doing things locally"
                .format(thread_id))
        dcap["acceptSslCerts"] = True
        phantom = webdriver.PhantomJS(js_path,
                                      desired_capabilities=dcap,
                                      service_args=service_args)
        max_wait = 35

        phantom.set_window_size(1024, 768)
        phantom.set_page_load_timeout(max_wait)
        phantom.set_script_timeout(max_wait)
        return phantom
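A short usage sketch; the proxy address and URL are illustrative:

    # Fetch a page through the headless browser and always release the process.
    phantom = initialize(proxy="127.0.0.1:8080", thread_id=1)
    try:
        phantom.get("http://example.com")
        html = phantom.page_source
    finally:
        phantom.quit()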
Example No. 3
def is_correct_html(html, t_id=None, url=None):
    """
    Applies heuristics to filter "incorrect" HTML. Checks the character length of the HTML, as proxies sometimes send
    back empty HTML documents. For short HTML documents it additionally checks for keywords like Error or Denied.

    :author: Sebastian
    :param html: The HTML as String.
    :param t_id: Optional parameter used only for logging in case this method is called by a DownloadThread to
    associate the text in the log with the thread.
    :param url: The url that the html was retrieved from. Used as an exception to the length rule if the url is
    proxy_util.ip_check_url ("http://httpbin.org/ip").
    :return: True if the heuristics conclude it is a real HTML without errors, otherwise False.
    """

    keywords = [
        "Error", "ERROR", "Denied", "Authentication Required", "Authenticate"
    ]
    if html is None:
        logger(
            "Thread-{} Failed the HTML correctness check on the 1st condition: HTML is None"
            .format(t_id))
        return False
    if url is not None and url.find("httpbin.org") != -1:
        logger(
            "Thread-{} HTML correctness check succeeded, because URL is {}!".
            format(t_id, url))
        return True
    if len(html) < 300:
        logger(
            "Thread-{} Failed the HTML correctness check on the 2nd condition: HTML is too small"
            .format(t_id))
        return False
    if len(html) < 1000:
        for key in keywords:
            if key in html:
                logger(
                    "Thread-{} Failed the HTML correctness check on the 3rd condition for keyword '{}'"
                    .format(t_id, key))
                return False
    logger(
        "Thread-{} HTML correctness check succeeded. HTML seems valid!".format(
            t_id))
    return True
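A usage sketch combining the check with a plain requests download; URL and thread id are placeholders:

    import requests

    res = requests.get("http://example.com", timeout=10)
    if is_correct_html(res.text, t_id=1, url="http://example.com"):
        html = res.text   # heuristics passed, keep the document
    else:
        html = None       # e.g. retry the download with a different proxy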
Example No. 4
 def scroll(phantom):
     try:
         pause = 0.2
         start_time = time.time()
         last_height = phantom.execute_script(
             "return document.body.scrollHeight")
         # only load for a maximum of 5 seconds
         while time.time() - start_time < 5:
             phantom.execute_script(
                 "window.scrollTo(0, document.body.scrollHeight);")
             time.sleep(pause)
             new_height = phantom.execute_script(
                 "return document.body.scrollHeight")
             if new_height == last_height:
                 break
             last_height = new_height
     except WebDriverException as e:
         logger(
             "Could not scroll down due to javascript security policy forbidding the use of eval: {}"
             .format(e.msg))
         raise e
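A sketch of how scroll is typically combined with the initialize helper from Example No. 2 to pull in lazily loaded content; the URL is a placeholder:

    phantom = initialize(thread_id=2)
    try:
        phantom.get("http://example.com/article")
        scroll(phantom)              # trigger infinite scroll / lazy loading
        html = phantom.page_source   # now includes the scrolled-in parts
    finally:
        phantom.quit()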
Example No. 5
def get_from_ipfs(timestamp, file_path=None):
    """
    Get data from IPFS. The data on IPFS is identified by the hash (timestamp variable).
    We collect the data using the IPFS API. IPFS has to be installed and a daemon process of IPFS needs to be
    running for this functionality to work. If the data is not present on IPFS it raises a ValueError.


    :author: Sebastian
    :raises ValueError: A ValueError is raised whenever the process fails due to an incorrectly formatted hash or a
    hash that is not retrievable by ipfs within the timeout of 5 seconds. Whenever this error is raised we assume the
    data is currently not present on IPFS.
    :param file_path: If the file to retrieve should be stored in a specific location it can be specified via this
    parameter.
    :param timestamp: The hash describing the data on IPFS.
    :return: Returns the path to the locally stored data collected from IPFS.
    """
    if file_path:
        path = file_path + timestamp
    else:
        path = proxy_util.base_path + timestamp
    cur_dir = os.getcwd()
    os.chdir(proxy_util.base_path)
    logger("Trying to fetch the File from IPFS: {}".format(timestamp))
    try:
        ipfs_Client.get(timestamp, timeout=5)
    except ReadTimeout:
        logger("Could not fetch file from IPFS, the file probably does not exist.")
        raise ValueError
    except HTTPError:
        logger(
            "Could not fetch file from IPFS, the hash was of the wrong format. Length: {}"
            .format(len(timestamp)))
        raise ValueError
    finally:
        # Restore the previous working directory even if the retrieval failed.
        os.chdir(cur_dir)
    return path
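Usage sketch; the hash is a placeholder, and ValueError signals that the data is not on IPFS:

    try:
        local_path = get_from_ipfs("QmPlaceholderHash")
    except ValueError:
        local_path = None  # data is currently not present on IPFS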
Example No. 6
    def _download_html_backup(self):

        logger("Thread-{}Trying the backup without phantomJS".format(
            self.threadID))
        try:
            """
            First we try to get the image with a proxy. If that fails we try it without proxy.
            """
            if self.proxy is not None:
                res = requests.get(self.url,
                                   stream=True,
                                   proxies={"http": "http://" + self.proxy},
                                   headers=header)
                return res

            res = requests.get(self.url, stream=True, headers=header)
        except (ConnectionRefusedError, MaxRetryError) as con:
            logger(
                "Thread-{} Could not request page due to: {}\ntrying without proxy."
                .format(self.threadID, con))
            try:
                res = requests.get(self.url, stream=True, headers=header)
            except OSError:
                return False
        except OSError:
            return False
        if res.status_code == 200:
            logger(
                "Thread-{} successfully requested page without phantomJS. Status: {}, html:\n{}"
                .format(self.threadID, res.status_code, res.text))
            self.html = res.text
            return True
        return False
Example No. 7
    def _make_pdf(self):
        """
        Creates a pdf file from the preprocessed html with the images embedded in it.

        """
        # TODO (done): preserve links
        html_path = "{}pdf_source.html".format(self.path)
        pdf_path = "{}{}.pdf".format(self.storage_path, self.ipfs_hash)
        #if not os.path.exists(pdf_path):  # Always create pdf even if overwrite is necessary
        soup = BeautifulSoup(self.html, "lxml")
        for img in soup.find_all(['amp-img', 'img']):
            if "ipfs-src" in img.attrs:
                for key in self.images:
                    if img["ipfs-src"] == self.images[key]["hash"]:
                        img["src"] = self.images[key]["filename"]

        with open(html_path, "w") as html_file:
            html_file.write(str(soup.find("html")).replace("noscript", "div"))
        # PDF is written to the basepath of the application (usually app/pdf/)
        pdfkit.from_file(html_path, pdf_path)
        logger(
            "Thread-{}: Created PDF file from the preprocessed html with image sources replaced: {}"
            .format(self.threadID, pdf_path))
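For reference, the underlying pdfkit call in isolation: pdfkit wraps the external wkhtmltopdf binary, which must be installed separately (file names are illustrative):

    import pdfkit

    # Convert a local HTML file into a PDF.
    pdfkit.from_file("pdf_source.html", "article.pdf")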
Example No. 8
def preprocess_doc(html_text):
    """
    Preprocessing of an html text as a String is done here. Tags that are advertisement and that do not describe the
    content are removed at first. The encoding is detected and next the html is parsed and preprocessed using the
    readability-lxml Document class to clean the content (text and images embedded in the text).
    An HTML string is returned together with the title of the website.

    :author: Sebastian
    :param html_text: html document in string format to preprocess.
    :returns: The preprocessed html as a String and the title in case the caller needs it.
    """
    # remove some common advertisement tags beforehand
    bs = BeautifulSoup(html_text, "lxml")
    for tag_desc in negative_tags:
        for tag in bs.findAll(
                attrs={'class': re.compile(r".*\b{}\b.*".format(tag_desc))}):
            tag.extract()
    doc = Document(str(bs.html),
                   negative_keywords=negative_classes,
                   positive_keywords=positive_classes)
    try:
        # Detect the encoding of the html, if not detectable use utf-8 as default.
        encoding = chardet.detect(doc.content().encode()).get('encoding')
        title = doc.title()
    except (TypeError, IndexError) as e:
        logger("Encountered {} setting encoding to utf-8.".format(str(e)))
        encoding = "utf-8"
        title = bs.title.getText()
    if not encoding:
        logger("Using default encoding utf-8")
        encoding = 'utf-8'
        title = bs.title.getText()
    doc.encoding = encoding

    head = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1' \
           '-transitional.dtd">\n' + '<head>\n' + \
           '<meta http-equiv="Content-Type" content="text/html" ' \
           'charset="' + encoding + '">\n' + '</head>\n' + '<body>\n' \
           + '<h1>' + title.split(sep='|')[0] + '</h1>'

    # doc.summary() returns a document starting with "<html><body>"; strip those
    # first 12 characters, since we prepend our own head above.
    text = head + doc.summary()[12:]

    # sometimes some tags get messed up and need to be translated back
    text = text.replace("&lt;", "<").replace("&gt;", ">")
    logger(
        'Preprocessing done. Type of text is: {}, length of text is {}'.format(
            type(text), len(text)))
    return text, title
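A usage sketch for the preprocessing step; URL and file name are placeholders:

    import requests

    raw_html = requests.get("http://example.com/article", timeout=10).text
    text, title = preprocess_doc(raw_html)
    with open("preprocessed.html", "w") as f:
        f.write(text)   # cleaned article body with our minimal head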
Example No. 9
def add_to_ipfs(fname):
    """
        Helper method that submits a file to IPFS and returns the resulting hash,
        that describes the address of the file on IPFS.

        :author: Sebastian
        :param fname: The path to the File to get the hash for.
        :return: Returns the Hash of the file.
    """
    if not os.path.isdir(fname):
        # TODO: only submit ZIP not the whole structure /home/seb...
        # os.chdir(fname.rpartition("/")[0])
        # os.chdir(fname)
        res = ipfs_Client.add(fname, recursive=False)
        if type(res) is list:
            logger("IPFS result from list: " + str(res[0]))
            return res[0]['Hash']

        logger("IPFS result: " + str(res))
        return res['Hash']
    else:
        res = ipfs_Client.add(fname, recursive=False)[0]
        logger("IPFS result for directory: " + str(res))
        return res['Hash']
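A round-trip sketch together with get_from_ipfs from Example No. 5; the file name is illustrative:

    ipfs_hash = add_to_ipfs("article.zip")   # submit and obtain the address
    local_copy = get_from_ipfs(ipfs_hash)    # fetch it back, see Example No. 5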
Example No. 10
    def handle_submission(self):
        """
        Handles the submission of the hash to OriginStamp to create the actual timestamp.
        The title that is submitted to OriginStamp contains the URL in Memento format.
            (For reference see: http://timetravel.mementoweb.org/about/)
        Handles PNG and PDF creation and storage. Sets location to Germany if no proxy was used.

        :author: Sebastian
        """
        logger("Thread-{} submit hash to originstamp.".format(self.threadID))
        if self.prox_loc is None:
            # TODO: define default location as constant
            self.prox_loc = "DE"
        self.originstamp_result = submit(
            self.ipfs_hash,
            title="StampTheWeb decentralized timestamp of article {} at "
            "{} from location {}".format(
                self.url,
                datetime.utcnow().strftime("%Y%m%d%H%M"), self.prox_loc))
        logger("Thread-{}: Originstamp result: {}".format(
            self.threadID, str(self.originstamp_result.text)))
        if self.originstamp_result.status_code != 200:
            msg = "Thread-{} Originstamp submission returned {} and failed for some reason: {}"\
                .format(self.threadID, str(self.originstamp_result.status_code), self.originstamp_result.text)
            self.error = HTTPError(msg)
            # self.originstamp_result = self.originstamp_result.json()
            logger(msg)
            raise self.error
        else:
            self._take_screenshot()
            self._make_pdf()

            # TODO: in the new OriginStamp API there will be no error on a second submit - no harm done though
            if "errors" in self.originstamp_result.text:
                logger(
                    "Thread-{} submitted hash to originstamp but the content has not changed. A timestamp "
                    "exists already.".format(self.threadID))
                # hash already submitted
                self.already_submitted = True
                history = get_originstamp_history(self.ipfs_hash)
                if history.status_code == 200:
                    self.originstamp_result = history.json()
                    self.originstamp_result["created_at"] = self._format_date(
                        self.originstamp_result["date_created"] / 1000)

            else:
                logger(
                    "Thread-{} successfully submitted hash to originstamp and created a new timestamp."
                    .format(self.threadID))
                self.originstamp_result = self.originstamp_result.json()
                # TODO: format of timestamp? Not a unix timestamp, three 0s at the end
                self.originstamp_result["created_at"] = self._format_date(
                    self.originstamp_result["date_created"] / 1000)
                logger(
                    "Thread-{} returned the following originstamp Result: {}".
                    format(self.threadID,
                           self.originstamp_result["created_at"]))
                # Only add content to warc for new or changed content -> only for new timestamps
                if self.warc:
                    self._add_to_warc()
Example No. 11
    def __init__(self,
                 thread_id,
                 url=None,
                 proxy=None,
                 prox_loc=None,
                 basepath='app/pdf/',
                 html=None,
                 robot_check=False,
                 create_warc=True):
        """
        Default constructor for the DownloadThread class, that initializes the creation of a new download job in a
        separate thread.

        :author: Sebastian
        :param thread_id: The ID of this thread.
        :param url: The URL that is to be downloaded in this job.
        :param proxy: The proxy to use when downloading from the before specified URL.
        :param prox_loc: The proxy location.
        :param basepath: The base path to store the temporary files in.
        :param html: Defaults to None and only needs to be specified if the HTML was supplied as user input by the
        StampTheWeb extension.
        :param robot_check: Boolean value that indicates whether the downloader should honour the robots.txt of
        the given website or not.
        :param create_warc: This boolean parameter specifies whether or not a warc should be created for this
        download job.
        """
        threading.Thread.__init__(self)
        logger("Starting Thread-{}".format(str(thread_id)))

        self.url, self.html, self.robot_check, self.threadID = url, html, robot_check, thread_id
        self.proxy, self.prox_loc, self.warc = proxy, prox_loc, create_warc
        self.storage_path, self.images = basepath, dict()
        self.path = "{}temporary".format(basepath)
        self.ipfs_hash, self.title, self.originstamp_result = None, None, None
        self.error, self.screenshot, self.already_submitted = None, dict(), False
        if self.robot_check:
            url_parser = urlparse(self.url)
            self.bot_parser = RobotFileParser()
            # RobotFileParser.set_url returns None, so it must not be chained
            self.bot_parser.set_url(
                "{url.scheme}://{url.netloc}/robots.txt".format(
                    url=url_parser))
            self.bot_parser.read()

        if self.html is None:
            self._proxy_setup()
            self.extension_triggered = False

            self.phantom = self.initialize(self.proxy, self.prox_loc,
                                           self.threadID)
        else:
            self.extension_triggered = True
            self.phantom = self.initialize(thread_id=self.threadID)
            logger("Thread{} was extension triggered!".format(self.threadID))

        # create temporary storage folder
        if not os.path.exists(self.path):
            try:
                os.mkdir(self.path)
            except FileNotFoundError:
                # should only be thrown and caught in testing mode!
                logger("Thread-{}: Path not found: {}".format(
                    self.threadID, self.path))
                if app.config["TESTING"]:
                    self.path = os.path.abspath(
                        os.path.expanduser("~/")) + "/testing-stw/temporary"
                    logger("Thread-{}: Testing, so new path is: {}".format(
                        self.threadID, self.path))
                else:
                    self.path = "{}/StampTheWeb/{}temporary".format(
                        os.path.abspath(os.path.expanduser("~/")),
                        self.storage_path)
                    if not os.path.exists(self.path.rpartition("/")[0]):
                        os.mkdir(self.path.rpartition("/")[0])

                if not os.path.exists(self.path):
                    os.mkdir(self.path)

        self.path = "{}/{}/".format(self.path, str(thread_id))
        logger("Initialized a new Thread: {} with proxy {} and location {}".
               format(str(self.threadID), self.proxy, self.prox_loc))

        # remove temporary folder with thread id as name and recreate
        if os.path.exists(self.path):
            shutil.rmtree(self.path)
        os.mkdir(self.path)
Example No. 12
    def run(self):
        """
        Run the initialized thread and start the download job. Afterwards submit the hash to originstamp to create a
        lasting and verifiable timestamp. If HTML is not allowed to be retrieved by the crawler raise URLError.

        :author: Sebastian
        :raises URLError: Is raised if HTML retrieval is forbidden by robots.txt.
                ValueError: Is raised if URL was unreachable due to heuristics failure.
        """
        logger("Started Thread-{}: {}".format(self.threadID, self))
        if self.robot_check and not self.bot_parser.can_fetch("*", self.url):
            self.error = urllib.error.URLError(
                "Not allowed to fetch root html file specified by url:{} because of "
                "robots.txt".format(self.url))
            logger("Thread-{}: {}".format(self.threadID, self.error))
            raise self.error
        try:
            self.download()
        except ValueError as e:
            self.error = e
            logger("Thread-{}: {}".format(self.threadID, self.error))
            raise e
        except (RuntimeError, ConnectionResetError, TimeoutException,
                HTTPError) as e:
            # Give it another try
            logger("Thread-{}: {} ---  We're giving it a second try".format(
                self.threadID, e))
            try:
                if self.prox_loc is None:
                    self.download()
                elif not proxy_util.is_proxy_alive(self.proxy, 3):
                    self._get_one_proxy()
                    self.download()
                else:
                    raise ValueError(
                        "Thread-{} Proxy checked after failed retrieval, but proxy is alive"
                        .format(self.threadID))
            except (RuntimeError, ConnectionResetError, TimeoutException,
                    HTTPError, ValueError) as e:
                self.error = e
                logger(
                    "Thread-{} Gave it a second try, still didn't work. URL unreachable because of {}"
                    .format(self.threadID, e))
                self.phantom.quit()
                raise e
        # submit the hash to originstamp to create a lasting timestamp.
        if self.error is None:
            logger("Thread-{}: Encountered no errors, going for submission".
                   format(self.threadID))
            self.handle_submission()
        self.phantom.quit()
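Putting Examples No. 11 and No. 12 together, a sketch of driving a complete download job; the constructor arguments are illustrative:

    thread = DownloadThread(thread_id=1,
                            url="http://example.com/article",
                            prox_loc="US")
    thread.start()
    thread.join()
    if thread.error is None:
        print(thread.ipfs_hash, thread.originstamp_result)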