Code example #1
    def get_next_links(self):
        """
        Yields unscraped URLs from the scraper database, filtered by the
        whitelist and ignorelist patterns.
        """
        def regexp(expr, item):
            # SQLite ships no REGEXP implementation, so provide one in Python.
            reg = re.compile(expr)
            return reg.search(item) is not None

        db = get_db()
        db.create_function("REGEXP", 2, regexp)

        try:
            while True:
                cursor = db.cursor()
                whitelist = '|'.join(self.whitelist)
                ignorelist = '|'.join(self.ignorelist)

                urls = cursor.execute(
                    """
                    SELECT url FROM urls WHERE scraped_at IS NULL AND url REGEXP ? AND url NOT REGEXP ?;
                    """, (whitelist, ignorelist)).fetchmany(100)

                if not urls:
                    break  # nothing left to scrape

                for url in urls:
                    yield url[0]
        except Exception as error:
            print("Error while fetching url from scraper database:", error)
        finally:
            # Close once the generator is done, not inside the loop:
            # closing per URL (as the original did) breaks every later query.
            db.close()
Code example #2
    def on_success(self, url):
        """
        Marks a URL as scraped by stamping it with today's date.
        """
        db = get_db()
        print('Done with', url)
        print("------------------")

        try:
            cursor = db.cursor()
            cursor.execute(
                """
                UPDATE urls SET scraped_at = ? WHERE url = ?;
                """, (date.today(), url))  # stored as an ISO 'YYYY-MM-DD' string
            db.commit()
        except Exception as error:
            print("Error while updating url:", error)
        finally:
            db.close()
Code example #3
    def scrap_in_future(self, urls):
        """
        Inserts links to scrape later into the scraper database.
        """
        db = get_db()

        try:
            cursor = db.cursor()
            cursor.executemany(
                """
                INSERT OR IGNORE INTO urls (url, scraped_at) VALUES (?, null);
                """, [(url,) for url in urls])  # one parameter tuple per row
            db.commit()
        except Exception as error:
            print("Error while inserting urls to scrape:", error)
        finally:
            db.close()
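For reference, scrap_in_future takes a plain iterable of URL strings; the method wraps each one into the single-element tuple that executemany expects. A quick usage sketch, where scraper is a hypothetical instance of the class these methods belong to:

    scraper.scrap_in_future([
        "https://example.com/page-1",
        "https://example.com/page-2",
    ])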
Code example #4
    def is_scraped(self, url):
        """
        Returns True if the URL has already been scraped.
        """
        db = get_db()
        already_scraped = False

        try:
            cursor = db.cursor()
            available = cursor.execute(
                """
                SELECT url FROM urls WHERE url = ? AND scraped_at IS NOT NULL;
                """, (url, )).fetchone()

            if available and available[0]:
                already_scraped = True

        except Exception as error:
            # Use a comma rather than "+": concatenating a str with an
            # exception object raises a TypeError of its own.
            print("Error while checking if url is already scraped:", error)
        finally:
            db.close()

        return already_scraped
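Taken together, the four methods support a simple crawl loop. A minimal sketch, where fetch_and_extract_links is a hypothetical function that downloads a page and returns the links found on it:

    def crawl(scraper):
        # get_next_links re-queries the database in batches of 100, so URLs
        # marked via on_success drop out of later batches.
        for url in scraper.get_next_links():
            if scraper.is_scraped(url):
                continue  # another worker may have handled it in the meantime

            links = fetch_and_extract_links(url)  # hypothetical fetch/parse step
            scraper.scrap_in_future(links)        # queue newly discovered links
            scraper.on_success(url)               # stamp this URL as scraped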