Example #1
0
class PagesChecker(object):
    """Watch a list of URLs and report pages whose content changed.

    One thread is started per URL. Each thread periodically downloads the
    page, compares it with the copy kept in a JSON state file under
    ``datapath`` and, when the difference exceeds ``max_change_ratio`` (or
    when no copy has been stored yet), stores the new copy and calls
    ``onchange(url)``.

    The state file of a URL is a JSON object with the keys ``url``,
    ``created_at``, ``html``, ``last_check`` and ``nb_change``.

    The worker threads are regular (non-daemon) threads: call ``stop()`` so
    that they terminate and the process can exit.
    """

    def __init__(self,
                 urls,
                 onchange=None):
        """Store the configuration and make sure the data directory exists.

        Args:
            urls: URLs to watch; the checkers are only started by ``run()``.
            onchange: optional callable invoked as ``onchange(url)`` each
                time a change is recorded for ``url``.

        Raises:
            FileExistsError: if ``datapath`` exists but is not a directory.
        """
        ########################################################################
        #HYPERPARAMETERS
        self.datapath = "./data"
        self.periode = 10 * 60  #seconds between two checks (10 minutes)
        self.deltastart = 3  #start thread with a delta time of 3 sec
        self.max_change_ratio = 0.10
        self.request_timeout = 30  #seconds before a download is abandoned
        ########################################################################

        super(PagesChecker, self).__init__()
        self.isrunning = False
        self.urls = urls
        self.onchange = onchange
        self.threads = []
        # One stop event per entry of self.threads (same index).
        self._stop_events = []
        self.sm = Similarity()

        # exist_ok avoids the check-then-create race; a regular file sitting
        # at datapath still raises FileExistsError, as mkdir did before.
        os.makedirs(self.datapath, mode=0o755, exist_ok=True)

    def run(self):
        """Start one checker thread per configured URL.

        The starts are spaced by ``deltastart`` seconds, so this call blocks
        for roughly ``len(urls) * deltastart`` seconds.
        """
        self.isrunning = True
        # addNewChecker() re-appends every URL it starts, so begin from an
        # empty list to keep urls and threads index-aligned.
        pending = self.urls
        self.urls = []
        for url in pending:
            self.addNewChecker(url)
            time.sleep(self.deltastart)

    def stop(self):
        """Ask every checker thread to terminate.

        The URL list is kept so that ``run()`` can start the checkers again.
        """
        self.isrunning = False
        for event in self._stop_events:
            event.set()
        # Forget the stopped threads so that a later run() rebuilds the
        # thread list aligned with the URL list.
        self.threads = []
        self._stop_events = []

    def state(self):
        """Print how many checker threads are currently alive."""
        alive = sum(1 for thread in self.threads if thread.is_alive())
        print("Got " + str(alive) + " running !")

    def getFileNameFromUrl(self, url):
        """Return the path of the state file associated with ``url``.

        The scheme is dropped, ``/`` becomes ``-``, every occurrence of
        ``.html``, ``.php`` and ``.js`` is removed and the remaining dots
        become ``_``.
        """
        name = url.replace("https://", "").replace("http://", "")
        name = name.replace("/", "-")
        for extension in (".html", ".php", ".js"):
            name = name.replace(extension, "")
        name = name.replace(".", "_")
        return self.datapath + "/" + name

    def formatDate(self):
        """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
        return '{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())

    def saveState(self, url, data):
        """Overwrite the state file of ``url`` with ``data`` encoded as JSON."""
        # json.dumps emits ASCII only by default, so the encoding choice does
        # not change the bytes written.
        with open(self.getFileNameFromUrl(url), "w", encoding="utf-8") as f:
            f.write(json.dumps(data))

    def strIsSame(self, html1, html2):
        """Return False when the lengths differ, else ``sm.isSimilar(...)``.

        NOTE(review): the meaning of ``isSimilar`` is defined by the
        ``Similarity`` class, which is not part of this block.
        """
        if len(html1) != len(html2):
            return False
        return self.sm.isSimilar(html1, html2)

    def addNewChecker(self, url):
        """Create the state file of ``url`` if missing and start its thread."""
        if not os.path.exists(self.getFileNameFromUrl(url)):
            self.saveState(
                url, {
                    "url": url,
                    "created_at": self.formatDate(),
                    "html": "",
                    "last_check": "0",
                    "nb_change": 0
                })
        stop_event = threading.Event()
        t = threading.Thread(target=self.worker, args=(url, stop_event))
        self.threads.append(t)
        self._stop_events.append(stop_event)
        self.urls.append(url)
        t.start()

    def stopChecker(self, url):
        """Stop and forget the first checker registered for ``url``.

        Unknown URLs are ignored. A URL that is listed but has no thread
        yet (``run()`` not called) is simply removed from the list.
        """
        for index, known in enumerate(self.urls):
            if known == url:
                # threading.Thread has no stop() method: the worker is
                # signalled through its event instead.
                if index < len(self._stop_events):
                    self._stop_events.pop(index).set()
                if index < len(self.threads):
                    self.threads.pop(index)
                self.urls.pop(index)
                break

    def worker(self, url, stop_event=None):
        """Check ``url`` every ``periode`` seconds until asked to stop.

        Args:
            url: page to watch; its state file must already exist.
            stop_event: ``threading.Event`` that ends the loop once set.
                Without one the loop never ends.
        """
        if stop_event is None:
            stop_event = threading.Event()
        while True:
            try:
                self._checkOnce(url)
            except requests.RequestException as err:
                # A network failure must not kill the checker: report it and
                # try again at the next period.
                print("[" + self.formatDate() + "] check of " + url +
                      " failed: " + str(err))
            # wait() returns True as soon as the event is set, so a stop
            # request does not have to sit out the whole period.
            if stop_event.wait(self.periode):
                break

    def _checkOnce(self, url):
        """Download ``url`` once, record a change if any, save the state."""
        response = requests.get(url, timeout=self.request_timeout)
        # Pages that are not valid UTF-8 are still compared instead of
        # raising UnicodeDecodeError.
        html = response.content.decode("utf-8", errors="replace")

        with open(self.getFileNameFromUrl(url), "r", encoding="utf-8") as f:
            data = json.load(f)

        previous = data["html"]
        change_ratio = self._changeRatio(html, previous)

        if change_ratio > self.max_change_ratio or previous == "":
            data["nb_change"] += 1
            data["html"] = html

            if self.onchange is not None:
                self.onchange(url)

        data["last_check"] = self.formatDate()
        self.saveState(url, data)

    def _changeRatio(self, new_html, old_html):
        """Return the share of differing characters between two pages.

        Characters are compared position by position, ignoring case; the
        characters of the longer text that have no counterpart count as
        differences. The count is divided by the mean length of both texts.
        Two empty texts give ``0.0``.
        """
        total_length = len(new_html) + len(old_html)
        if total_length == 0:
            return 0.0
        distance = sum(1 for new_char, old_char in zip(new_html, old_html)
                       if new_char.lower() != old_char.lower())
        # zip() stops at the shorter text: count the unmatched tail too.
        distance += abs(len(new_html) - len(old_html))
        return distance * 2 / total_length