def add_user(user, dburi=None, session=None, update=False):
    """Insert (or optionally replace) a user row and queue it for extraction.

    Parameters
    ----------
    user : dict
        Raw user attributes; trimmed via ``trim_user`` before storage.
    dburi : str, optional
        Database URI used to build a session when ``session`` is not given.
    session : Session, optional
        Existing SQLAlchemy session to reuse.
    update : bool
        When True, an existing row with the same id is replaced.
        When False, an existing row makes this call a no-op.
    """
    if not session:
        session = make_session(dburi)
    user = trim_user(user)
    # BUG FIX: the original tested the Query object itself (always truthy)
    # instead of fetching a row, so the "already exists" branch always ran.
    olduser = session.query(User).filter(User.id == user['id']).first()
    if olduser:
        if not update:
            return
        session.delete(olduser)
    nuser = User()
    for key, value in user.items():
        setattr(nuser, key, value)
    user = nuser
    # BUG FIX: the new row must be added regardless of ``update``; the
    # original only added it when updating, silently dropping new users.
    session.add(user)
    logger.debug('Adding entry')
    entry = session.query(ExtractorEntry).filter(
        ExtractorEntry.user == user.id).first()
    if not entry:
        entry = ExtractorEntry(user=user.id)
        session.add(entry)
    logger.debug(entry.pending)
    # (Re)queue the user: mark pending and rewind the follower cursor.
    entry.pending = True
    entry.cursor = -1
    session.commit()
    session.close()
def list_users(ctx, db):
    """Print every stored user's screen name followed by all attributes."""
    session = make_session('sqlite:///{}'.format(db))
    for account in session.query(User):
        print(account.screen_name)
        for attr in account.__dict__:
            print('\t{}: {}'.format(attr, getattr(account, attr)))
def list_users(ctx, db):
    """Dump all users in the database, one attribute per line."""
    dburl = 'sqlite:///{}'.format(db)
    session = make_session(dburl)
    users = session.query(User)
    for user in users:
        print(user.screen_name)
        for field in user.__dict__:
            value = getattr(user, field)
            print('\t{}: {}'.format(field, value))
def download_entry(wq, entry_id, dburi=None, recursive=False):
    """Look up one ExtractorEntry by id and download its user's followers.

    Parameters
    ----------
    wq : worker queue used by ``download_user`` for API calls.
    entry_id : primary key of the ExtractorEntry to process.
    dburi : str, optional
        Database URI passed to ``make_session``.
    recursive : bool
        Forwarded to ``download_user``.

    Raises
    ------
    Exception
        If no session could be created or the entry id is unknown.
    """
    session = make_session(dburi)
    if not session:
        raise Exception("Provide dburi or session")
    logger.info("Downloading entry: %s (%s)" % (entry_id, type(entry_id)))
    entry = session.query(ExtractorEntry).filter(
        ExtractorEntry.id == entry_id).first()
    # BUG FIX: a missing entry previously crashed with AttributeError on
    # ``entry.user``; fail with a clear message instead.
    if entry is None:
        session.close()
        raise Exception("No entry with id %s" % entry_id)
    user = session.query(User).filter(User.id == entry.user).first()
    try:
        download_user(wq, session, user, entry, recursive)
    finally:
        # BUG FIX: release the session even when the download raises.
        session.close()
def consume_queue():
    """Drain ``ids_queue``, hydrating users via the API and persisting them.

    Runs until a ``None`` sentinel is read from ``ids_queue``.  Shares the
    ``collected``/``lastid`` counters with other threads under ``statslock``
    and serializes batch commits with ``db_lock``.

    NOTE(review): relies on module globals (``dburl``, ``wq``, ``utils``,
    the locks) being initialized before this runs — confirm against caller.
    """
    global dburl, collected, ids_queue, lastid
    local_collected = 0
    logging.debug('Consuming!')
    session = make_session(dburl)
    # iter(callable, sentinel): keep calling ids_queue.get until it
    # returns None, then stop.
    q_iter = iter(ids_queue.get, None)
    for user in utils.get_users(wq, q_iter):
        dbuser = User(**user)
        session.add(dbuser)
        local_collected += 1
        # Update shared progress counters atomically.
        with statslock:
            collected += 1
            lastid = user['id']
        # Commit in batches of 100 to bound transaction size.
        if local_collected % 100 == 0:
            with db_lock:
                session.commit()
    # Flush any remainder smaller than a full batch.
    session.commit()
    logger.debug('Done consuming')
def pending_entries(dburi):
    """Yield ids of pending, non-busy ExtractorEntry rows until none remain.

    Each yielded entry is marked busy first so concurrent consumers do not
    pick it up.  While no pending entry is available but other entries are
    still busy, polls once per second, since a busy entry's completion may
    produce new pending work.

    Parameters
    ----------
    dburi : str
        Database URI passed to ``make_session``.

    Yields
    ------
    int
        The id of an ExtractorEntry now marked busy.
    """
    session = make_session(dburi)
    while True:
        # BUG FIX: ``first()`` returns None when no row matches; the
        # original unpacked the result directly and raised TypeError
        # instead of reaching the wait/exit logic below.
        row = session.query(User, ExtractorEntry).\
            filter(ExtractorEntry.user == User.id).\
            filter(ExtractorEntry.pending == True).\
            filter(ExtractorEntry.busy == False).\
            order_by(User.followers_count).first()
        if row:
            candidate, entry = row
            entry.busy = True
            session.add(entry)
            session.commit()
            yield int(entry.id)
            continue
        # Nothing claimable, but in-flight entries may still add work.
        if session.query(ExtractorEntry).\
                filter(ExtractorEntry.busy == True).count() > 0:
            time.sleep(1)
            continue
        logger.info("No more pending entries")
        break
    session.close()
def extract(wq, recursive=False, user=None, initfile=None, dburi=None,
            extractor_name=None):
    """Seed the database with the requested users and download their followers.

    Sources of users, in priority order: a single ``user``, then a CSV-ish
    ``initfile`` (first comma-separated field per line), else whatever is
    already pending in the database from a previous session.  Only users not
    already stored are fetched from the API.  Downloads then run in parallel
    over the pending-entry generator, with a tqdm progress bar.
    """
    signal.signal(signal.SIGINT, signal_handler)
    if not dburi:
        dburi = 'sqlite:///%s.db' % extractor_name
    session = make_session(dburi)
    # Clear stale busy flags left over from a previous (crashed) run.
    session.query(ExtractorEntry).update({ExtractorEntry.busy: False})
    session.commit()
    if not (user or initfile):
        logger.info('Using pending users from last session')
    else:
        screen_names = []
        user_ids = []
        if user:
            classify_user(user, screen_names, user_ids)
        elif initfile:
            logger.info("No user. I will open %s" % initfile)
            with open(initfile, 'r') as f:
                for line in f:
                    # Only the first comma-separated field is the user.
                    user = line.strip().split(',')[0]
                    classify_user(user, screen_names, user_ids)

        def missing_user(ix, column=User.screen_name):
            # True when no stored user matches ``ix`` on ``column``.
            res = session.query(User).filter(column == ix).count() == 0
            if res:
                logger.info("Missing user %s. Count: %s" % (ix, res))
            return res

        # Keep only users we do not already have in the database.
        screen_names = list(filter(missing_user, screen_names))
        user_ids = list(
            filter(partial(missing_user, column=User.id_str), user_ids))
        nusers = []
        logger.info("Missing user ids: %s" % user_ids)
        logger.info("Missing screen names: %s" % screen_names)
        if screen_names:
            nusers = list(get_users(wq, screen_names, by_name=True))
        if user_ids:
            nusers += list(get_users(wq, user_ids, by_name=False))
        for i in nusers:
            add_user(dburi=dburi, user=i)
    total_users = session.query(sqlalchemy.func.count(User.id)).scalar()
    logger.info('Total users: {}'.format(total_users))
    # Bind the worker queue and dburi; parallel() supplies each entry id.
    de = partial(download_entry, wq, dburi=dburi)
    pending = pending_entries(dburi)
    session.close()
    with tqdm(parallel(de, pending), desc='Downloading users',
              total=total_users) as tq:
        for i in tq:
            tq.write('Got {}'.format(i))
            logger.info("Got %s" % i)
def extract(wq, recursive=False, user=None, initfile=None, dburi=None,
            extractor_name=None):
    """Crawl follower ids for the given user(s), resuming pending entries.

    Sources of users, in priority order: a single ``user``, then an
    ``initfile`` (first comma-separated field per line), else pending
    entries from a previous session.  Loops over pending ExtractorEntry
    rows (largest follower counts first per the stored ordering), paging
    through the followers/ids API with the persisted cursor.
    """
    signal.signal(signal.SIGINT, signal_handler)
    w = wq.next()
    if not dburi:
        dburi = 'sqlite:///%s.db' % extractor_name
    session = make_session(dburi)

    screen_names = []
    user_ids = []

    def classify_user(id_or_name):
        # BUG FIX: the original closed over the outer ``user`` variable and
        # ignored its argument, misclassifying ids read from the init file.
        try:
            int(id_or_name)
            user_ids.append(id_or_name)
            logger.info("Added user id")
        except ValueError:
            logger.info("Added screen_name")
            screen_names.append(id_or_name.split('@')[-1])

    if user:
        classify_user(user)
    elif initfile:
        logger.info("No user. I will open %s" % initfile)
        with open(initfile, 'r') as f:
            for line in f:
                classify_user(line.strip().split(',')[0])
    else:
        logger.info('Using pending users from last session')

    nusers = list(get_users(wq, screen_names, by_name=True))
    if user_ids:
        nusers += list(get_users(wq, user_ids, by_name=False))
    for i in nusers:
        add_user(session, i, enqueue=True)

    total_users = session.query(sqlalchemy.func.count(User.id)).scalar()
    logger.info('Total users: {}'.format(total_users))

    def pending_entries():
        # Count of entries still queued for follower extraction.
        pending = session.query(ExtractorEntry).filter(
            ExtractorEntry.pending == True).count()
        logger.info('Pending: {}'.format(pending))
        return pending

    while pending_entries() > 0:
        logger.info("Using account: %s" % w.name)
        # BUG FIX: ``first()`` may return None; guard before unpacking
        # instead of raising TypeError.
        row = session.query(User, ExtractorEntry).\
            filter(ExtractorEntry.user == User.id).\
            filter(ExtractorEntry.pending == True).\
            order_by(User.followers_count).first()
        if not row:
            break
        candidate, entry = row
        pending = True
        cursor = entry.cursor
        uid = candidate.id
        uobject = session.query(User).filter(User.id == uid).first()
        name = uobject.screen_name if uobject else None

        logger.info("#" * 20)
        logger.info("Getting %s - %s" % (uid, name))
        logger.info("Cursor %s" % cursor)
        logger.info("Pending: %s/%s" % (session.query(ExtractorEntry).filter(
            ExtractorEntry.pending == True).count(), total_users))

        try:
            resp = wq.followers.ids(user_id=uid, cursor=cursor)
        except TwitterHTTPError as ex:
            if ex.e.code in (401, ):
                logger.info('Not authorized for user: {}'.format(uid))
                resp = {}
            else:
                # BUG FIX: any other HTTP error previously fell through
                # with ``resp`` undefined (NameError); re-raise it.
                raise

        if 'ids' in resp:
            logger.info("New followers: %s" % len(resp['ids']))
            if recursive:
                # BUG FIX: the original referenced an undefined name
                # (``newuser``) instead of the fetched user objects.
                for newuser in get_users(wq, resp):
                    add_user(session, newuser, enqueue=True)
            for i in resp['ids']:
                existing_user = session.query(Following).\
                    filter(Following.isfollowed == uid).\
                    filter(Following.follower == i).first()
                now = int(time.time())
                if existing_user:
                    # Refresh the timestamp on an already-known edge.
                    existing_user.created_at_stamp = now
                else:
                    f = Following(isfollowed=uid, follower=i,
                                  created_at_stamp=now)
                    session.add(f)

            total_followers = candidate.followers_count
            fetched_followers = session.query(Following).filter(
                Following.isfollowed == uid).count()
            logger.info("Fetched: %s/%s followers" % (fetched_followers,
                                                      total_followers))
            cursor = resp["next_cursor"]
            if cursor > 0:
                # More pages remain; keep the entry pending with the
                # cursor advanced.
                pending = True
                logger.info("Getting more followers for %s" % uid)
            else:
                logger.info("Done getting followers for %s" % uid)
                cursor = -1
                pending = False
        else:
            logger.info("Error with id %s %s" % (uid, resp))
            pending = False

        entry.pending = pending
        entry.cursor = cursor
        logger.debug('Entry: {} - {}'.format(entry.user, entry.pending))

        session.add(candidate)
        session.commit()
        sys.stdout.flush()
def reset_extractor(ctx):
    """Mark every pending ExtractorEntry as not pending.

    Reads the database URI from the click context (``ctx.obj['DBURI']``)
    and persists the change.
    """
    db = ctx.obj['DBURI']
    session = make_session(db)
    session.query(ExtractorEntry).filter(
        ExtractorEntry.pending == True).update({'pending': False})
    # BUG FIX: the bulk update was never committed, so the reset was lost
    # when the session was discarded.
    session.commit()
    session.close()
def extractor(ctx, db):
    """Store the database URI and a live session on the click context."""
    dburi = db if '://' in db else 'sqlite:///{}'.format(db)
    ctx.obj['DBURI'] = dburi
    ctx.obj['SESSION'] = make_session(dburi)
def reset_extractor(ctx):
    """Clear the pending flag on all ExtractorEntry rows and commit."""
    # NOTE(review): the original built a TwitterQueue from credentials here
    # but never used it; that work (and its failure modes) is dropped.
    db = ctx.obj['DBURI']
    session = make_session(db)
    session.query(ExtractorEntry).filter(
        ExtractorEntry.pending == True).update({'pending': False})
    # BUG FIX: commit the bulk update so the reset actually persists.
    session.commit()
    session.close()