def _collect_meta_from_db(self, db_file):
     """Load every row of the ``raw_html`` table into ``self.tasks``.

     Each row is appended as a ``(tid, html)`` tuple. Once the connection
     block has been left, the current length of ``self.tasks`` is logged.

     :param db_file: value passed straight to ``connect_sqlite``
         (presumably the path of the SQLite file -- confirm against helper).
     """
     query = "SELECT tid, html FROM raw_html;"
     with connect_sqlite(db_file) as connection:
         cur = connection.cursor()
         cur.execute(query)
         rows = cur.fetchall()
         # Unpacking in the loop header keeps the two-column shape check.
         for tid, html in rows:
             self.tasks.append((tid, html))
     message = "collected {} tasks".format(len(self.tasks))
     self.logger.info(message)
 def _collect_meta_from_db(self, db_file):
     """Append one ``(url, response_body)`` task per ``subreddit`` row.

     The rows are read with a single ``fetchall`` and appended to
     ``self.tasks`` in the order the database returns them; the final size
     of ``self.tasks`` is then written to ``self.logger``.

     :param db_file: value passed straight to ``connect_sqlite``.
     """
     with connect_sqlite(db_file) as db:
         reader = db.cursor()
         reader.execute("SELECT url, response_body FROM subreddit;")
         for url, body in reader.fetchall():
             task = (url, body)
             self.tasks.append(task)
     collected = len(self.tasks)
     self.logger.info("collected {} tasks".format(collected))
Esempio n. 3
0
 def _collect_meta_from_db(self, dbfile):
     """Collect CNET forum discussion pages from the ``cnet`` table.

     Selects ``(response_body, url)`` for every row whose ``response_url``
     starts with ``https://www.cnet.com/forums/discussions/`` and appends
     each row, unchanged, to ``self.tasks``.

     Any exception raised while querying or appending is printed with
     ``traceback.print_exc`` and not re-raised. ``self.all_loaded`` is set
     to ``True`` and the task count is logged whether or not that happened.

     :param dbfile: value passed straight to ``connect_sqlite``.
     """
     query = (
         "select response_body, url from cnet "
         "where response_url like 'https://www.cnet.com/forums/discussions/%';"
     )
     with connect_sqlite(dbfile) as conn:
         cur = conn.cursor()
         try:
             cur.execute(query)
             rows = cur.fetchall()
             for row in rows:
                 self.tasks.append(row)
         except Exception:
             # Best effort: report the failure, then fall through so the
             # "loaded" flag below is still raised.
             traceback.print_exc()
     self.all_loaded = True
     collected = len(self.tasks)
     self.logger.info("collected {} tasks".format(collected))
 def _collect_meta_from_db(self, db_file):
     """Queue every ``raw_html`` row as a ``{"tid": ..., "html": ...}`` task.

     Rows are pushed onto ``self.tasks`` with ``put``. After the connection
     block is left, ``self.all_loaded.value`` is set to ``True``, the number
     of queued rows is stored in ``self.total_tasks`` and that number is
     logged.

     :param db_file: value passed straight to ``connect_sqlite``.
     """
     queued = 0
     with connect_sqlite(db_file) as conn:
         cur = conn.cursor()
         cur.execute("SELECT tid, html FROM raw_html;")
         # enumerate(start=1) leaves ``queued`` equal to the row count, and
         # at 0 when the table is empty.
         for queued, (tid, html) in enumerate(cur.fetchall(), start=1):
             self.tasks.put({"tid": tid, "html": html})
     # Flag first, counter second -- same order as before, in case another
     # reader of these attributes depends on it.
     self.all_loaded.value = True
     self.total_tasks = queued
     self.logger.info("collected {} tasks".format(self.total_tasks))
Esempio n. 5
0
 def _collect_meta_from_db(self, dbfile):
     """Queue the non-listing pages stored in the ``self.reddit_type`` table.

     Selects ``(response_body, url)`` for every row whose ``response_url``
     does not match ``https://www.reddit.com/r/%/?count=%``. Rows are read
     in chunks of 1000 with ``fetchmany`` and each row is pushed, unchanged,
     onto ``self.tasks`` with ``put``. ``self.total_tasks`` is increased by
     the size of every chunk read.

     Any exception raised while querying or queueing is printed with
     ``traceback.print_exc`` and not re-raised. ``self.all_loaded`` is set
     to ``True`` and the running total is logged whether or not that
     happened.

     :param dbfile: value passed straight to ``connect_sqlite``.
     """
     chunk_size = 1000
     # NOTE(review): the table name is interpolated into the statement; SQL
     # identifiers cannot be bound as parameters, so ``self.reddit_type``
     # must never come from untrusted input -- confirm where it is set.
     sql = "select response_body, url from {} where response_url not like 'https://www.reddit.com/r/%/?count=%';".format(
         self.reddit_type)
     with connect_sqlite(dbfile) as conn:
         cursor = conn.cursor()
         try:
             cursor.execute(sql)
             batch = cursor.fetchmany(chunk_size)
             while batch:
                 self.total_tasks += len(batch)
                 for row in batch:
                     self.tasks.put(row)
                 batch = cursor.fetchmany(chunk_size)
         except Exception:
             # Best effort: report the failure, then fall through so the
             # "loaded" flag below is still raised.
             traceback.print_exc()
     self.all_loaded = True
     # ``self.tasks`` is filled through ``put()``, i.e. it is queue-like, and
     # the standard queue types do not support ``len()``. Report the counter
     # maintained above instead of ``len(self.tasks)``.
     self.logger.info("collected {} tasks".format(self.total_tasks))