Ejemplo n.º 1
0
def add_user(user, dburi=None, session=None, update=False):
    """Store a user and flag it as pending for extraction.

    The incoming user is first passed through ``trim_user``. If a ``User``
    row with the same id already exists it is left untouched unless
    ``update`` is true, in which case it is deleted and replaced. Whenever a
    user is written, its ``ExtractorEntry`` is created if missing, marked as
    pending and its cursor is reset to -1.

    Args:
        user: User data handed to ``trim_user``; the trimmed result must
            support ``['id']`` and ``.items()``.
        dburi: Database URI used to create a session when ``session`` is not
            given.
        session: Session to use instead of creating one. It is closed after
            a write, exactly as the session created here would be.
        update: Replace the stored user when it already exists.
    """
    owns_session = not session
    if owns_session:
        session = make_session(dburi)

    user = trim_user(user)
    existing = session.query(User).filter(User.id == user['id'])
    # A Query object is always truthy, so existence has to be checked by
    # actually fetching a row.
    if existing.first() is not None:
        if not update:
            # Release the session only if it was created here; a
            # caller-provided session is left open on this path.
            if owns_session:
                session.close()
            return
        existing.delete()

    nuser = User()
    for key, value in user.items():
        setattr(nuser, key, value)

    session.add(nuser)
    logger.debug('Adding entry')
    entry = session.query(ExtractorEntry).filter(
        ExtractorEntry.user == nuser.id).first()
    if not entry:
        entry = ExtractorEntry(user=nuser.id)
        session.add(entry)
    logger.debug(entry.pending)
    entry.pending = True
    entry.cursor = -1
    session.commit()
    session.close()
Ejemplo n.º 2
0
def list_users(ctx, db):
    """Print every stored user followed by all of its instance attributes.

    Args:
        ctx: Command context; not used by this function.
        db: Path of the SQLite database, appended to ``sqlite:///``.
    """
    dburl = 'sqlite:///{}'.format(db)
    session = make_session(dburl)
    try:
        for i in session.query(User):
            print(i.screen_name)
            # Dump whatever is currently set on the instance's __dict__.
            for j in i.__dict__:
                print('\t{}: {}'.format(j, getattr(i, j)))
    finally:
        # Release the session even if printing fails part-way through.
        session.close()
Ejemplo n.º 3
0
def list_users(ctx, db):
    """Print each stored user's screen name and its instance attributes.

    Args:
        ctx: Command context; not used by this function.
        db: Path of the SQLite database, appended to ``sqlite:///``.
    """
    dburl = 'sqlite:///{}'.format(db)
    session = make_session(dburl)
    try:
        for i in session.query(User):
            print(i.screen_name)
            # One indented "name: value" line per key in the instance dict.
            for j in i.__dict__:
                print('\t{}: {}'.format(j, getattr(i, j)))
    finally:
        # Always hand the session back, including when an error interrupts
        # the listing.
        session.close()
Ejemplo n.º 4
0
def download_entry(wq, entry_id, dburi=None, recursive=False):
    """Process a single extractor entry.

    Looks up the ``ExtractorEntry`` with the given id and the ``User`` it
    points to, then hands both to ``download_user`` together with a session.

    Args:
        wq: Passed through to ``download_user``.
        entry_id: Id of the ``ExtractorEntry`` to process.
        dburi: Database URI used to create the session.
        recursive: Passed through to ``download_user``.

    Raises:
        Exception: If ``make_session`` returns a falsy value.
    """
    session = make_session(dburi)
    if not session:
        raise Exception("Provide dburi or session")
    try:
        logger.info("Downloading entry: %s (%s)" % (entry_id, type(entry_id)))
        entry = session.query(ExtractorEntry).filter(
            ExtractorEntry.id == entry_id).first()
        user = session.query(User).filter(User.id == entry.user).first()
        download_user(wq, session, user, entry, recursive)
    finally:
        # Close the session even when the lookup or the download fails.
        session.close()
Ejemplo n.º 5
0
 def consume_queue():
     """Store the users resolved from the ids queue.

     Ids are read from ``ids_queue`` until it yields ``None``. Each user
     returned by ``utils.get_users`` is added to a fresh session, the shared
     ``collected`` / ``lastid`` counters are updated under ``statslock``, and
     the session is committed under ``db_lock`` after every 100 users, plus
     once more at the end.
     """
     global dburl, collected, ids_queue, lastid
     logging.debug('Consuming!')
     session = make_session(dburl)
     id_stream = iter(ids_queue.get, None)
     fetched = utils.get_users(wq, id_stream)
     for seen, user in enumerate(fetched, start=1):
         session.add(User(**user))
         with statslock:
             collected += 1
             lastid = user['id']
         if seen % 100 != 0:
             continue
         with db_lock:
             session.commit()
     session.commit()
     logger.debug('Done consuming')
Ejemplo n.º 6
0
 def consume_queue():
     """Drain the ids queue and persist the corresponding users.

     Reads ids from ``ids_queue`` until a ``None`` item is received, turns
     each result of ``utils.get_users`` into a ``User`` and adds it to a new
     session. ``collected`` and ``lastid`` are updated while holding
     ``statslock``; every 100th user triggers a commit while holding
     ``db_lock``, and a final commit follows the loop.
     """
     global dburl, collected, ids_queue, lastid
     logging.debug('Consuming!')
     session = make_session(dburl)
     queued_ids = iter(ids_queue.get, None)
     for position, user in enumerate(utils.get_users(wq, queued_ids), 1):
         session.add(User(**user))
         with statslock:
             collected += 1
             lastid = user['id']
         batch_full = position % 100 == 0
         if batch_full:
             with db_lock:
                 session.commit()
     session.commit()
     logger.debug('Done consuming')
Ejemplo n.º 7
0
def pending_entries(dburi):
    """Yield the ids of extractor entries that are waiting to be processed.

    Repeatedly picks the pending, non-busy entry whose user has the lowest
    ``followers_count``, marks it busy, commits and yields its id. When no
    such entry is left but some entries are still busy, it sleeps for a
    second and looks again. The generator ends once nothing is pending and
    nothing is busy.

    Args:
        dburi: Database URI used to create the session.

    Yields:
        int: Id of the next ``ExtractorEntry`` to process.
    """
    session = make_session(dburi)
    try:
        while True:
            row = session.query(User, ExtractorEntry).\
                filter(ExtractorEntry.user == User.id).\
                filter(ExtractorEntry.pending == True).\
                filter(ExtractorEntry.busy == False).\
                order_by(User.followers_count).first()
            # ``first()`` returns None when nothing matches, so the row must
            # be checked before it is unpacked.
            if row is not None:
                _, entry = row
                entry.busy = True
                session.add(entry)
                session.commit()
                yield int(entry.id)
                continue
            busy_count = session.query(ExtractorEntry).\
                filter(ExtractorEntry.busy == True).count()
            if busy_count > 0:
                # Some entries are still marked busy; check again shortly.
                time.sleep(1)
                continue
            logger.info("No more pending entries")
            break
    finally:
        # Also runs when the consumer stops iterating early.
        session.close()
Ejemplo n.º 8
0
def extract(wq,
            recursive=False,
            user=None,
            initfile=None,
            dburi=None,
            extractor_name=None):
    """Seed the database with the requested users and process pending entries.

    Clears the busy flag on every ``ExtractorEntry``, adds the users named by
    ``user`` or listed in ``initfile`` that are not stored yet, and then runs
    ``download_entry`` over everything ``pending_entries`` yields, reporting
    progress through ``tqdm`` and the logger.

    Args:
        wq: Passed to ``get_users`` and ``download_entry``.
        recursive: Forwarded to ``download_entry``.
        user: A single user identifier, classified by ``classify_user``.
        initfile: Path of a text file; the first comma-separated field of
            each line is classified as a user. Only read when ``user`` is
            not given.
        dburi: Database URI. Defaults to ``sqlite:///<extractor_name>.db``.
        extractor_name: Used to build the default database URI.
    """
    signal.signal(signal.SIGINT, signal_handler)

    if not dburi:
        dburi = 'sqlite:///%s.db' % extractor_name

    session = make_session(dburi)
    # Start with no entry marked as busy.
    session.query(ExtractorEntry).update({ExtractorEntry.busy: False})
    session.commit()

    if not (user or initfile):
        logger.info('Using pending users from last session')
    else:
        screen_names = []
        user_ids = []
        if user:
            classify_user(user, screen_names, user_ids)
        elif initfile:
            logger.info("No user. I will open %s" % initfile)
            with open(initfile, 'r') as f:
                for line in f:
                    user = line.strip().split(',')[0]
                    classify_user(user, screen_names, user_ids)

        def missing_user(ix, column=User.screen_name):
            # True when no stored user has ``ix`` in the given column.
            res = session.query(User).filter(column == ix).count() == 0
            if res:
                logger.info("Missing user %s. Count: %s" % (ix, res))
            return res

        # Only users that are not stored yet need to be fetched.
        screen_names = list(filter(missing_user, screen_names))
        user_ids = list(
            filter(partial(missing_user, column=User.id_str), user_ids))
        nusers = []
        logger.info("Missing user ids: %s" % user_ids)
        logger.info("Missing screen names: %s" % screen_names)
        if screen_names:
            nusers = list(get_users(wq, screen_names, by_name=True))
        if user_ids:
            nusers += list(get_users(wq, user_ids, by_name=False))

        for i in nusers:
            add_user(dburi=dburi, user=i)

    total_users = session.query(sqlalchemy.func.count(User.id)).scalar()
    logger.info('Total users: {}'.format(total_users))

    # Forward ``recursive`` so the caller's choice reaches download_entry.
    de = partial(download_entry, wq, dburi=dburi, recursive=recursive)
    pending = pending_entries(dburi)
    session.close()

    with tqdm(parallel(de, pending),
              desc='Downloading users',
              total=total_users) as tq:
        for i in tq:
            tq.write('Got {}'.format(i))
            logger.info("Got %s" % i)
Ejemplo n.º 9
0
def extract(wq, recursive=False, user=None, initfile=None, dburi=None, extractor_name=None):
    """Fetch and store follower ids for every pending user.

    Adds the users named by ``user`` or listed in ``initfile``, then loops
    while any ``ExtractorEntry`` is pending: it takes the pending user with
    the lowest ``followers_count``, requests one page of follower ids from
    the entry's cursor, records each follower as a ``Following`` row and
    stores the next cursor (or marks the entry as done).

    Args:
        wq: Queue providing ``next()`` and the ``followers.ids`` call; also
            passed to ``get_users``.
        recursive: Also add every fetched follower as a user to extract.
        user: A single user id or screen name (optionally ``@``-prefixed).
        initfile: Path of a text file; the first comma-separated field of
            each line is treated like ``user``. Only read when ``user`` is
            not given.
        dburi: Database URI. Defaults to ``sqlite:///<extractor_name>.db``.
        extractor_name: Used to build the default database URI.

    Raises:
        TwitterHTTPError: For any HTTP error other than 401 while fetching
            follower ids.
    """
    signal.signal(signal.SIGINT, signal_handler)

    w = wq.next()
    if not dburi:
        dburi = 'sqlite:///%s.db' % extractor_name

    session = make_session(dburi)

    screen_names = []
    user_ids = []

    def classify_user(id_or_name):
        # Values that parse as integers are user ids; anything else is a
        # screen name, with any leading '@' part dropped.
        try:
            int(id_or_name)
            user_ids.append(id_or_name)
            logger.info("Added user id")
        except ValueError:
            logger.info("Added screen_name")
            screen_names.append(id_or_name.split('@')[-1])

    if user:
        classify_user(user)

    elif initfile:
        logger.info("No user. I will open %s" % initfile)
        with open(initfile, 'r') as f:
            for line in f:
                user = line.strip().split(',')[0]
                classify_user(user)
    else:
        logger.info('Using pending users from last session')


    nusers = list(get_users(wq, screen_names, by_name=True))
    if user_ids:
        nusers += list(get_users(wq, user_ids, by_name=False))

    for i in nusers:
        add_user(session, i, enqueue=True)

    total_users = session.query(sqlalchemy.func.count(User.id)).scalar()
    logging.info('Total users: {}'.format(total_users))
    def pending_entries():
        pending = session.query(ExtractorEntry).filter(ExtractorEntry.pending == True).count()
        logging.info('Pending: {}'.format(pending))
        return pending

    while pending_entries() > 0:
        logger.info("Using account: %s" % w.name)
        row = session.query(User, ExtractorEntry).\
              filter(ExtractorEntry.user == User.id).\
              filter(ExtractorEntry.pending == True).\
              order_by(User.followers_count).first()
        # ``first()`` returns None when the join matches nothing; it must be
        # checked before unpacking.
        if row is None:
            break
        candidate, entry = row
        pending = True
        cursor = entry.cursor
        uid = candidate.id
        uobject = session.query(User).filter(User.id==uid).first()
        name = uobject.screen_name if uobject else None

        logger.info("#"*20)
        logger.info("Getting %s - %s" % (uid, name))
        logger.info("Cursor %s" % cursor)
        logger.info("Pending: %s/%s" % (session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).count(), total_users))
        try:
            resp = wq.followers.ids(user_id=uid, cursor=cursor)
        except TwitterHTTPError as ex:
            # Only "not authorized" is handled here. Any other error is
            # re-raised so that ``resp`` is never left unbound or holding the
            # previous user's response.
            if ex.e.code not in (401, ):
                raise
            logger.info('Not authorized for user: {}'.format(uid))
            resp = {}
        if 'ids' in resp:
            logger.info("New followers: %s" % len(resp['ids']))
            if recursive:
                # NOTE(review): get_users is given id lists elsewhere in this
                # function, so pass the ids rather than the whole response.
                newusers = get_users(wq, resp['ids'])
                for newuser in newusers:
                    add_user(session, newuser, enqueue=True)
            for i in resp['ids']:
                existing_user = session.query(Following).\
                                filter(Following.isfollowed==uid).\
                                filter(Following.follower==i).first()
                now = int(time.time())
                if existing_user:
                    # Already known relation: just refresh its timestamp.
                    existing_user.created_at_stamp = now
                else:
                    f = Following(isfollowed=uid,
                                  follower=i,
                                  created_at_stamp=now)
                    session.add(f)

            total_followers = candidate.followers_count
            fetched_followers = session.query(Following).filter(Following.isfollowed==uid).count()
            logger.info("Fetched: %s/%s followers" % (fetched_followers,
                                                      total_followers))
            cursor = resp["next_cursor"]
            if cursor > 0:
                pending = True
                logger.info("Getting more followers for %s" % uid)
            else:
                logger.info("Done getting followers for %s" % uid)
                cursor = -1
                pending = False
        else:
            logger.info("Error with id %s %s" % (uid, resp))
            pending = False

        entry.pending = pending
        entry.cursor = cursor
        logging.debug('Entry: {} - {}'.format(entry.user, entry.pending))

        session.add(candidate)
        session.commit()

        sys.stdout.flush()
Ejemplo n.º 10
0
def reset_extractor(ctx):
    """Mark every pending extractor entry as no longer pending.

    Args:
        ctx: Command context whose ``obj['DBURI']`` holds the database URI.
    """
    db = ctx.obj['DBURI']
    session = make_session(db)
    try:
        session.query(ExtractorEntry).filter(
            ExtractorEntry.pending == True).update({'pending': False})
        # Persist the bulk update; nothing else commits this session.
        session.commit()
    finally:
        session.close()
Ejemplo n.º 11
0
def extractor(ctx, db):
    """Record the database URI and a session for it on the command context.

    Args:
        ctx: Context object; its ``obj`` mapping receives the ``'DBURI'``
            and ``'SESSION'`` entries.
        db: A database URI, or a value without ``'://'`` that is appended
            to ``sqlite:///``.
    """
    dburi = db if '://' in db else 'sqlite:///{}'.format(db)
    ctx.obj['DBURI'] = dburi
    ctx.obj['SESSION'] = make_session(dburi)
Ejemplo n.º 12
0
def reset_extractor(ctx):
    """Mark every pending extractor entry as no longer pending.

    Args:
        ctx: Command context whose ``obj['DBURI']`` holds the database URI.
    """
    # NOTE(review): this queue is not used below; confirm whether building it
    # is needed before removing the call.
    wq = crawlers.TwitterQueue.from_credentials(bconf.CREDENTIALS)
    db = ctx.obj['DBURI']
    session = make_session(db)
    try:
        session.query(ExtractorEntry).filter(ExtractorEntry.pending==True).update({'pending':False})
        # Persist the bulk update; nothing else commits this session.
        session.commit()
    finally:
        session.close()
Ejemplo n.º 13
0
def extractor(ctx, db):
    """Store the resolved database URI and a new session in ``ctx.obj``.

    Args:
        ctx: Context object whose ``obj`` mapping is filled with ``'DBURI'``
            and ``'SESSION'``.
        db: Either a database URI or a value lacking ``'://'``, which is
            formatted into a ``sqlite:///`` URI.
    """
    has_scheme = '://' in db
    dburi = db if has_scheme else 'sqlite:///{}'.format(db)
    ctx.obj.__setitem__('DBURI', dburi) if False else None
    ctx.obj['DBURI'] = dburi
    ctx.obj['SESSION'] = make_session(dburi)