import re
from datetime import date  # required by the helpers below


def get_next_links(self):
    """Yields unscraped links from the scraper database in batches of 100."""
    def regexp(expr, item):
        # SQLite ships without a REGEXP implementation; this callback
        # backs the REGEXP operator used in the query below.
        reg = re.compile(expr)
        return reg.search(item) is not None

    db = get_db()
    db.create_function("REGEXP", 2, regexp)
    whitelist = '|'.join(self.whitelist)
    ignorelist = '|'.join(self.ignorelist)
    try:
        while True:
            cursor = db.cursor()
            # Each pass re-queries the table; rows drop out of the result
            # once on_success() stamps their scraped_at column.
            urls = cursor.execute(
                """
                SELECT url FROM urls
                WHERE scraped_at IS NULL
                AND url REGEXP ?
                AND url NOT REGEXP ?;
                """, (whitelist, ignorelist)).fetchmany(100)
            if not urls:
                break  # nothing left to scrape; stop the generator
            for url in urls:
                yield url[0]
    except Exception as error:
        print("Error while fetching url from scraper database: ", error)
    finally:
        db.close()  # close once, after iteration finishes
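# A minimal standalone sketch (not part of the scraper) of the REGEXP hook
# get_next_links() registers above: SQLite has no built-in REGEXP operator,
# so Connection.create_function() maps the keyword onto a Python callback.
# The in-memory table and patterns here are illustrative assumptions.
def _regexp_demo():
    import re
    import sqlite3
    conn = sqlite3.connect(":memory:")
    conn.create_function(
        "REGEXP", 2, lambda expr, item: re.search(expr, item) is not None)
    conn.execute("CREATE TABLE urls (url TEXT)")
    conn.executemany("INSERT INTO urls VALUES (?)",
                     [("https://example.com/blog/post-1",),
                      ("https://example.com/login",)])
    rows = conn.execute(
        "SELECT url FROM urls WHERE url REGEXP ? AND url NOT REGEXP ?",
        (r"example\.com", "login")).fetchall()
    print(rows)  # [('https://example.com/blog/post-1',)]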
def on_success(self, url):
    """Marks a URL as scraped so later fetches skip it."""
    db = get_db()
    print('Done with', url)
    print("------------------")
    try:
        cursor = db.cursor()
        cursor.execute(
            """
            UPDATE urls SET scraped_at = ? WHERE url = ?;
            """, (date.today(), url))
    except Exception as error:
        print("Error while updating url: ", error)
    finally:
        db.commit()
        db.close()
def scrap_in_future(self, urls):
    """Queues newly discovered links in the scraper database."""
    db = get_db()
    try:
        cursor = db.cursor()
        # executemany() needs one parameter tuple per row, so wrap each
        # URL in a 1-tuple; INSERT OR IGNORE skips already-queued URLs.
        cursor.executemany(
            """
            INSERT OR IGNORE INTO urls (url, scraped_at)
            VALUES (?, null);
            """, [(url,) for url in urls])
    except Exception as error:
        print("Error while inserting scraped url: ", error)
    finally:
        db.commit()
        db.close()
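# get_db() is not defined in this section. Below is a hedged sketch of what
# it and the urls table presumably look like, inferred from the queries
# above: url needs a UNIQUE constraint for INSERT OR IGNORE to deduplicate,
# and scraped_at must be nullable so unscraped rows can be selected. The
# sqlite3 backend and the DATABASE_PATH name are assumptions.
import sqlite3

DATABASE_PATH = "scraper.db"  # assumed database location


def get_db():
    """Sketch: opens a connection and ensures the urls table exists."""
    db = sqlite3.connect(DATABASE_PATH)
    db.execute(
        """
        CREATE TABLE IF NOT EXISTS urls (
            url TEXT PRIMARY KEY,  -- UNIQUE, so INSERT OR IGNORE dedupes
            scraped_at DATE        -- null until the page is scraped
        );
        """)
    return db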
def is_scraped(self, url):
    """Returns True if the URL has already been scraped."""
    db = get_db()
    already_scraped = False
    try:
        cursor = db.cursor()
        available = cursor.execute(
            """
            SELECT url FROM urls
            WHERE url = ? AND scraped_at IS NOT NULL;
            """, (url, )).fetchone()
        if available and available[0]:
            already_scraped = True
    except Exception as error:
        print("Error while checking if url is already scraped: ", error)
    finally:
        db.close()
    return already_scraped
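# A sketch of how the four methods above compose into a crawl loop. The
# run() driver and fetch_and_extract_links() helper are hypothetical
# stand-ins, not part of the original scraper.
def fetch_and_extract_links(url):
    """Hypothetical stand-in for the real HTTP fetch + link extraction."""
    return []


def run(scraper):
    for url in scraper.get_next_links():
        if scraper.is_scraped(url):
            continue  # already handled, e.g. by a concurrent worker
        links = fetch_and_extract_links(url)
        scraper.scrap_in_future(links)  # queue newly discovered links
        scraper.on_success(url)         # stamp scraped_at so it is skipped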