Example #1
    def _get_label_id(self, name):
        """
        Get or create a database label object, return the ID.
        """
        print('Label name to add: {}'.format(name), flush=True)
        db_session = worker.get_session()
        redis = worker.get_redis()
        label = db_session.query(Label).filter_by(name=name.lower().strip()).first()

        if label:
            print('Found label: {}'.format(label.id), flush=True)
            return label.id
        else:
            label = Label(name=name.lower().strip())
            db_session.add(label)

            try:
                db_session.commit()
            except IntegrityError:
                db_session.rollback()
                raise
            except AssertionError:
                db_session.rollback()
                raise ValueError(
                    'Label "{}" contains non-alphanumeric character'
                    .format(name)
                )

            redis.publish('label', json.dumps(label.as_dict()))
            print('Created label: {}'.format(label.id), flush=True)
            return label.id
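# A hedged refinement of the get-or-create pattern above (a sketch, not the
# project's code): two workers can race on the unique label name, so
# re-querying after IntegrityError is often preferable to propagating it.
# `db_session` and `label_cls` stand in for the session and Label model used
# above.
from sqlalchemy.exc import IntegrityError

def get_or_create_label_id(db_session, label_cls, name):
    """Return the id of the label named `name`, creating it if needed."""
    name = name.lower().strip()
    label = db_session.query(label_cls).filter_by(name=name).first()
    if label is None:
        label = label_cls(name=name)
        db_session.add(label)
        try:
            db_session.commit()
        except IntegrityError:
            # Another worker won the race on the unique name; use its row.
            db_session.rollback()
            label = db_session.query(label_cls).filter_by(name=name).one()
    return label.id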
Example #2
def test_site(site_id, tracker_id, request_timeout=10):
    """
    Perform positive and negative tests of a site.

    Positive test: check_username() returns True for an existing username.
    Negative test: check_username() returns False for a non-existent username.

    Site is valid if:

        positive result = 'f' (found)
        negative result = 'n' (not found)
    """
    worker.start_job()
    redis = worker.get_redis()
    db_session = worker.get_session()
    site = db_session.query(Site).get(site_id)

    # Do positive test.
    result_pos_id = check_username(username=site.test_username_pos,
                                   site_id=site_id,
                                   category_id=None,
                                   total=2,
                                   tracker_id=tracker_id + '-1',
                                   request_timeout=request_timeout,
                                   test=True)

    result_pos = db_session.query(Result).get(result_pos_id)

    # Do negative test.
    result_neg_id = check_username(username=site.test_username_neg,
                                   site_id=site_id,
                                   category_id=None,
                                   total=2,
                                   tracker_id=tracker_id + '-2',
                                   request_timeout=request_timeout,
                                   test=True)

    result_neg = db_session.query(Result).get(result_neg_id)

    # Update site with test results
    site.test_result_pos = result_pos
    site.test_result_neg = result_neg

    # Set site validity based on the results of both tests.
    if result_pos.status == 'f' and result_neg.status == 'n':
        site.valid = True
    else:
        site.valid = False

    site.tested_at = datetime.utcnow()
    db_session.commit()

    # Send redis notification
    msg = {
        'tracker_id': tracker_id,
        'status': 'tested',
        'site': site.as_dict(),
        'resource': None,
    }
    redis.publish('site', json.dumps(msg))
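# The validity rule above, isolated for clarity: a site is valid only when
# the positive test finds the username ('f') and the negative test does not
# ('n'). Self-contained sketch.
def site_is_valid(pos_status, neg_status):
    return pos_status == 'f' and neg_status == 'n'

assert site_is_valid('f', 'n')
assert not site_is_valid('f', 'f')  # site reports everything as found
assert not site_is_valid('n', 'n')  # positive test missed a real user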
Example #3
def scrape_profile(site, usernames, stub=False, labels=None):
    """ Scrape a Twitter or Instagram account. """
    # Avoid the mutable default argument: normalise labels here instead.
    labels = labels if labels is not None else {}

    redis = worker.get_redis()
    worker.start_job()

    try:
        if site == 'twitter':
            profiles = scrape_twitter_account(usernames, stub, labels)
        elif site == 'instagram':
            profiles = []
            for username in usernames:
                profile = scrape_instagram_account(username, stub)
                profiles.append(profile)
        else:
            raise ScrapeException('No scraper exists for site: {}'.format(site))

        for profile in profiles:
            redis.publish('profile', json.dumps(profile))

        worker.finish_job()

    except requests.exceptions.HTTPError as he:
        response = he.response
        message = {
            'usernames': usernames,
            'site': site,
            'code': response.status_code
        }

        if response.status_code == 404:
            message['error'] = 'Does not exist on {}.'.format(site)
        else:
            message['error'] = 'Cannot communicate with {} ({})' \
                               .format(site, response.status_code)

        message_str = json.dumps(message)
        redis.publish('profile', message_str)
        sys.stderr.write('{}\n'.format(message_str))
        sys.stderr.write('{}\n'.format(response.text))

    except ScrapeException as se:
        message = {
            'usernames': usernames,
            'site': site,
            'error': se.message,
        }
        redis.publish('profile', json.dumps(message))

    except Exception:
        message = {
            'usernames': usernames,
            'site': site,
            'error': 'Unknown error while fetching profile.',
        }
        redis.publish('profile', json.dumps(message))
        raise
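# The if/elif dispatch above generalizes to a registry, which preserves the
# ScrapeException behavior for unknown sites. Sketch only: the class and the
# lambdas are stand-ins for the project's ScrapeException and the
# scrape_twitter_account / scrape_instagram_account callables.
class ScrapeException(Exception):
    """Stand-in for the project's exception type."""

SCRAPERS = {
    'twitter': lambda usernames, stub, labels: [],    # stand-in scraper
    'instagram': lambda usernames, stub, labels: [],  # stand-in scraper
}

def dispatch_scrape(site, usernames, stub=False, labels=None):
    try:
        scraper = SCRAPERS[site]
    except KeyError:
        raise ScrapeException('No scraper exists for site: {}'.format(site))
    return scraper(usernames, stub, labels or {})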
Example #4
def scrape_avatar(id_, site, url):
    """
    Get a Twitter avatar from ``url`` and save it to the Profile identified
    by ``id_``.
    """

    worker.start_job()
    redis = worker.get_redis()
    db_session = worker.get_session()
    avatar = None
    profile = db_session.query(Profile).filter(Profile.id == id_).first()

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    if site == 'twitter':
        # Twitter points you to a scaled image by default, but we can
        # get the original resolution by removing "_normal" from the URL.
        #
        # See: https://dev.twitter.com/overview/general/user-profile-images-and-banners
        url = url.replace('_normal', '')

    # Update Avatar if it's already stored in the db
    for profile_avatar in profile.avatars:
        if profile_avatar.upstream_url == url:
            profile_avatar.end_date = datetime.today()
            avatar = profile_avatar
            break

    # Otherwise, scrape the new Avatar and append to the profile
    if avatar is None:

        response = requests.get(url)
        response.raise_for_status()

        if 'content-type' in response.headers:
            mime = response.headers['content-type']
        else:
            mime = 'application/octet-stream'

        image = response.content
        avatar = Avatar(url, mime, image)
        profile.avatars.append(avatar)
        profile.current_avatar = avatar

    db_session.commit()
    worker.finish_job()

    redis.publish('avatar', json.dumps({
        'id': id_,
        'thumb_url': '/api/file/' + str(avatar.thumb_file.id),
        'url': '/api/file/' + str(avatar.file.id),
    }))
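# A quick, self-contained illustration of the "_normal" rewrite above; the
# URL is fabricated, but the suffix convention is documented by Twitter for
# profile images.
url = 'https://pbs.twimg.com/profile_images/123456/avatar_normal.png'
assert url.replace('_normal', '') == \
    'https://pbs.twimg.com/profile_images/123456/avatar.png'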
Example #5
def create_archive(username, category_id, tracker_id, user_id):
    """
    Archive summary of results in the database and store
    a zip archive in the data directory.
    """

    redis = worker.get_redis()
    worker.start_job()
    db_session = worker.get_session()
    found_count = 0
    not_found_count = 0
    error_count = 0

    results = (db_session.query(Result)
               .options(subqueryload(Result.image_file))
               .filter(Result.tracker_id == tracker_id)
               .all())
    site_count = len(results)

    # Generate zip file
    filename = re.sub(r'[\W_]+', '', username)  # Strip non-alphanumeric chars
    zip_file_id = create_zip(filename, results, user_id)

    for result in results:
        if result.status == 'e':
            error_count += 1
        elif result.status == 'f':
            found_count += 1
        elif result.status == 'n':
            not_found_count += 1

    archive = Archive(tracker_id=tracker_id,
                      username=username,
                      category_id=category_id,
                      site_count=site_count,
                      found_count=found_count,
                      not_found_count=not_found_count,
                      error_count=error_count,
                      zip_file_id=zip_file_id,
                      user_id=user_id)

    # Write to db
    db_session.add(archive)
    db_session.commit()

    # Publish
    message = {
        'id': archive.id,
        'name': archive.username,
        'status': 'created',
        'archive': archive.as_dict(),
    }
    redis.publish('archive', json.dumps(message))
    worker.finish_job()
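# The status tally above expressed with collections.Counter; an equivalent,
# self-contained sketch using example Result.status values.
from collections import Counter

statuses = ['f', 'n', 'e', 'f']      # e.g. [r.status for r in results]
counts = Counter(statuses)
found_count = counts['f']            # 2
not_found_count = counts['n']        # 1
error_count = counts['e']            # 1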
Example #6
def scrape_avatar(id_, site, url):
    '''
    Get a Twitter avatar from ``url`` and save it to the Profile identified
    by ``id_``.
    '''

    worker.start_job()
    redis = worker.get_redis()
    db_session = worker.get_session()
    avatar = None
    profile = db_session.query(Profile).filter(Profile.id == id_).first()

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    if site == 'twitter':
        # Twitter points you to a scaled image by default, but we can
        # get the original resolution by removing "_normal" from the URL.
        #
        # See: https://dev.twitter.com/overview/general/user-profile-images-and-banners
        url = url.replace('_normal', '')

    # Update Avatar if it's already stored in the db
    for profile_avatar in profile.avatars:
        if profile_avatar.upstream_url == url:
            profile_avatar.end_date = datetime.today()
            avatar = profile_avatar
            break

    # Otherwise, scrape the new Avatar and append to the profile
    if avatar is None:

        response = requests.get(url)
        response.raise_for_status()

        if 'content-type' in response.headers:
            mime = response.headers['content-type']
        else:
            mime = 'application/octet-stream'

        image = response.content
        avatar = Avatar(url, mime, image)
        profile.avatars.append(avatar)

    db_session.commit()
    worker.finish_job()

    redis.publish('avatar', json.dumps({
        'id': id_,
        'thumb_url': '/api/file/' + str(avatar.thumb_file.id),
        'url': '/api/file/' + str(avatar.file.id),
    }))
Example #7
    def delete(self, id_):
        '''
        Delete the note identified by `id`.

        **Example Response**

        .. sourcecode:: json

            {
                "message": "note `12` deleted",
            }

        :<header Content-Type: application/json
        :<header X-Auth: the client's auth token

        :>header Content-Type: application/json
        :>json str message: the API response message
        :status 202: deleted
        :status 400: invalid request body
        :status 401: authentication required
        :status 404: note does not exist
        '''

        # Get note.

        redis = worker.get_redis()
        id_ = get_int_arg('id_', id_)
        note = g.db.query(ProfileNote).filter(ProfileNote.id == id_).first()

        if note is None:
            raise NotFound("Note `%s` does not exist." % id_)

        # Delete note
        g.db.delete(note)
        try:
            g.db.commit()
        except DBAPIError as e:
            raise BadRequest('Database error: {}'.format(e))

        message = 'Note `{}` deleted'.format(note.id)
        redis.publish('profile_notes',
                      json.dumps({
                          'id': id_,
                          'status': 'deleted',
                      }))

        response = jsonify(message=message)
        response.status_code = 202

        return response
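# A hedged client-side call for the DELETE handler above. The host and token
# are placeholders; the route shape and 202 status come from the docstring.
import requests

resp = requests.delete('https://quickpin/api/note/12',  # hypothetical host
                       headers={'X-Auth': 'auth-token-here'})
print(resp.status_code)  # 202 => "Note `12` deleted"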
Example #8
def check_username(username,
                   site_id,
                   category_id,
                   total,
                   tracker_id,
                   request_timeout=10,
                   test=False):
    """
    Check if `username` exists on the specified site.
    """

    worker.start_job()
    redis = worker.get_redis()
    db_session = worker.get_session()

    # Look up the site.
    site = db_session.query(Site).get(site_id)

    # Check the site with a Splash request.
    splash_result = _splash_username_request(username, site, request_timeout)
    image_file = _save_image(db_session, splash_result)

    # Save result to DB.
    result = Result(tracker_id=tracker_id,
                    site_name=splash_result['site']['name'],
                    site_url=splash_result['url'],
                    status=splash_result['status'],
                    image_file_id=image_file.id,
                    error=splash_result['error'])
    db_session.add(result)
    db_session.commit()

    if not test:
        # Notify clients of the result.
        current = redis.incr(tracker_id)
        result_dict = result.as_dict()
        result_dict['current'] = current
        # result_dict['image_file_url'] = image_file.url()
        # result_dict['image_name'] = image_file.name
        result_dict['total'] = total
        redis.publish('result', json.dumps(result_dict))

        # If this username search is complete, then queue an archive job.
        if current == total:
            app.queue.schedule_archive(username, category_id, tracker_id)

    worker.finish_job()

    return result.id
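# A minimal sketch of the progress-tracking pattern above: each completed
# check INCRs a per-tracker counter and publishes current/total so clients
# can render progress. Assumes a local Redis; the tracker id is fabricated.
import json
import redis

r = redis.StrictRedis()
tracker_id, total = 'tracker.abc123', 10
current = r.incr(tracker_id)
r.publish('result', json.dumps({'current': current, 'total': total}))
if current == total:
    pass  # this is the point where the archive job would be queued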
Example #9
def scrape_profile(site, username):
    ''' Scrape a Twitter or Instagram account. '''

    redis = worker.get_redis()
    worker.start_job()

    try:
        if site == 'twitter':
            profile = scrape_twitter_account(username)
        elif site == 'instagram':
            profile = scrape_instagram_account(username)
        else:
            raise ScrapeException('No scraper exists for site: {}'.format(site))

        redis.publish('profile', json.dumps(profile))
        worker.finish_job()

    except requests.exceptions.HTTPError as he:
        response = he.response
        message = {'username': username, 'site': site, 'code': response.status_code}

        if response.status_code == 404:
            message['error'] = 'Does not exist on {}.'.format(site)
        else:
            message['error'] = 'Cannot communicate with {} ({})' \
                               .format(site, response.status_code)

        redis.publish('profile', json.dumps(message))

    except ScrapeException as se:
        message = {
            'username': username,
            'site': site,
            'error': se.message,
        }
        redis.publish('profile', json.dumps(message))

    except Exception:
        message = {
            'username': username,
            'site': site,
            'error': 'Unknown error while fetching profile.',
        }
        redis.publish('profile', json.dumps(message))
        raise
Example #10
def scrape_twitter_posts(id_, recent):
    '''
    Fetch tweets for the user identified by `id_`.
    Checks tweets already stored in the db and fetches only older or newer
    tweets, depending on the boolean argument `recent`; e.g. recent=True
    fetches recent tweets that are not already stored in the db.
    The number of tweets to fetch is configured in the Admin.
    '''
    db = worker.get_session()
    #max_results = _get_max_posts(db)['twitter']
    max_results = get_config(db, 'max_posts_twitter', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException('Value of max_posts_twitter must be an integer')

    worker.start_job(total=max_results)
    redis = worker.get_redis()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    results = 0
    max_id = None
    more_results = True
    count = 200

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get posts currently stored in db for this profile.
    post_query = db.query(Post) \
                        .filter(Post.author_id == id_) \
                        .order_by(Post.upstream_created.desc())

    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    params = {'count': count, 'user_id': author.upstream_id}

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            since_id = post_query[0].upstream_id
            params['since_id'] = str(since_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    post_ids = list()

    while more_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )
        response.raise_for_status()

        tweets = response.json()

        # The API signals the end of the timeline by returning fewer tweets
        # than requested (including none at all).
        if len(tweets) < count:
            more_results = False

        for tweet in tweets:
            # Twitter API result set includes the tweet with the max_id/since_id
            # so ignore it.
            if tweet['id_str'] != max_id:
                post = Post(
                    author,
                    tweet['id_str'],
                    dateutil.parser.parse(tweet['created_at']),
                    tweet['text']
                )

                if tweet['lang'] is not None:
                    post.language = tweet['lang']

                if tweet['coordinates'] is not None:
                    # GeoJSON points are ordered [longitude, latitude].
                    post.longitude, post.latitude = \
                        tweet['coordinates']['coordinates']

                place = tweet['place']

                if place is not None:
                    # Set longitude/latitude to the center of the bounding polygon.
                    total_lon = 0
                    total_lat = 0
                    num_coords = 0

                    for lon, lat in place['bounding_box']['coordinates'][0]:
                        total_lon += lon
                        total_lat += lat
                        num_coords += 1

                    post.longitude = total_lon / num_coords
                    post.latitude = total_lat / num_coords

                    # Set location to string identifying the place.
                    post.location = '{}, {}'.format(
                        place['full_name'],
                        place['country']
                    )

                db.add(post)
                db.flush()
                post_ids.append(post.id)
                # Set the max_id to the last tweet to get the next set of
                # results
                max_id = tweet['id_str']
                params['max_id'] = max_id
                results += 1
                worker.update_job(current=results)

                if results == max_results:
                    more_results = False
                    break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
Example #11
    def post(self):
        '''
        Create profile notes.

        **Example Request**

        .. sourcecode:: json

            {
                "notes": [
                    {
                        "category": "user annotation",
                        "body": "this profile belongs to an interesting network",
                        "profile_id": "25 ",
                    },
                    {
                        "category": "user annotation",
                        "body": "this user does not exist anymore.",
                        "profile_id": "10",
                    },
                    ...
                ]
            }

        **Example Response**

        .. sourcecode:: json

            {
                "message": "2 profile notes created."
            }

        :<header Content-Type: application/json
        :<header X-Auth: the client's auth token
        :<json list notes: a list of notes to create
        :<json str notes[n].category: the user-defined category of this note
        :<json str notes[n].body: the note
        :<json str notes[n].profile_id: the unique id of the profile this note
            belongs to

        :>header Content-Type: application/json
        :>json str message: api response message

        :status 202: created
        :status 400: invalid request body
        :status 401: authentication required
        '''

        request_json = request.get_json()
        redis = worker.get_redis()
        notes = list()
        profiles = list()

        required_fields = ['category', 'body', 'profile_id']

        # Validate input
        if 'notes' not in request_json:
            raise BadRequest('`notes` is required.')

        for note_json in request_json['notes']:
            request_fields = note_json.keys()
            # Check necessary fields are present
            missing_fields = set(required_fields) - set(request_fields)
            if len(missing_fields) > 0:
                raise BadRequest('All notes require: {}.'.format(
                    ','.join(required_fields)))
            # Check fields aren't empty
            for field in required_fields:
                if str(note_json[field]).strip() == '':
                    raise BadRequest(
                        '{} is required and must not be empty.'.format(field))

            # Check the profile exists
            profile = g.db.query(Profile).filter(
                Profile.id == note_json['profile_id']).first()
            if profile is None:
                raise BadRequest('Profile `{}` does not exist.'.format(
                    note_json['profile_id']))
            profiles.append(profile)

        # Create notes
        for note_json in request_json['notes']:
            try:
                note = ProfileNote(
                    category=note_json['category'].lower().strip(),
                    body=note_json['body'].strip(),
                    profile_id=note_json['profile_id'],
                )
                g.db.add(note)
                g.db.flush()
                notes.append(note)
            except Exception:
                g.db.rollback()
                raise BadRequest('Notes could not be saved')

        # Save notes
        g.db.commit()

        # Publish SSEs
        for note in notes:
            redis.publish('profile_notes', json.dumps(note.as_dict()))

        message = '{} profile notes created.'.format(len(notes))
        response = jsonify(message=message)
        response.status_code = 202

        return response
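# A hedged client-side call for the POST handler above, using the request
# shape from the docstring. The host and token are placeholders.
import requests

payload = {'notes': [{'category': 'user annotation',
                      'body': 'this profile belongs to an interesting network',
                      'profile_id': '25'}]}
resp = requests.post('https://quickpin/api/note',  # hypothetical route
                     json=payload,
                     headers={'X-Auth': 'auth-token-here'})
print(resp.status_code)  # expect 202 on success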
Example #12
    def put(self, id_):
        '''
        Update the note identified by `id`.

        **Example Request**

        .. sourcecode:: json

            {
                "category": "user annotation",
                "body": "This profile belongs to two interesting networks",
                "profile_id": "25"
            }

        **Example Response**

        .. sourcecode:: json

            {
                "id": "2",
                "category": "user annotation",
                "body": "This profile belongs to an interesting network",
                "profile_id": "25 ",
                "created_at": "2015-12-14T16:23:18.101558",
                "url": "https://quickpin/api/note/2",
            }

        :<header Content-Type: application/json
        :<header X-Auth: the client's auth token
        :>header Content-Type: application/json
        :>json int id: unique identifier for the note
        :>json str category: the user-defined category of this note
        :>json str body: the note
        :>json str profile_id: the unique id of the profile this note belongs to
        :>json str created_at: the iso-formatted creation time of the note
        :>json str url: API endpoint URL for this note object

        :status 202: created
        :status 400: invalid request body
        :status 401: authentication required
        '''

        # Get note.
        id_ = get_int_arg('id_', id_)
        note = g.db.query(ProfileNote).filter(ProfileNote.id == id_).first()

        if note is None:
            raise NotFound("Note '%s' does not exist." % id_)

        redis = worker.get_redis()
        request_json = request.get_json()

        # Validate data and set attributes
        if 'category' in request_json:
            if request_json['category'].strip() != '':
                note.category = request_json['category'].lower().strip()

        if 'body' in request_json:
            if request_json['body'].strip() != '':
                note.body = request_json['body'].strip()
            else:
                raise BadRequest('Attribute "body" cannot be an empty string')

        # Save the updated note
        try:
            g.db.commit()
        except DBAPIError:
            g.db.rollback()
            raise BadRequest('Could not update note.')

        # Generate SSE
        redis.publish('profile_notes', json.dumps(note.as_dict()))
        response = note.as_dict()
        response['url'] = url_for('ProfileNoteView:get', id_=note.id)

        # Send response.
        return jsonify(**response)
Example #13
    def put(self, id_):
        '''
        Update the profile identified by `id` with submitted data.
        The following attributes are modifiable:
           * is_interesting
           * labels

        **Example Request**

        .. sourcecode:: json

            {
                "is_interesting": true,
                "labels": [
                    {"name": "male"},
                    {"name": "british"},
                    ...
                ],
                ...
            }

        **Example Response**

        .. sourcecode:: json

            {
                "avatar_url": "https://quickpin/api/file/1",
                "avatar_thumb_url": "https://quickpin/api/file/2",
                "description": "A human being.",
                "follower_count": 71,
                "friend_count": 28,
                "id": 1,
                "is_stub": false,
                "is_interesting": true,
                "join_date": "2012-01-30T15:11:35",
                "labels": [
                    {
                        "id": 1,
                        "name": "male"
                    },
                    {
                        "id": 2,
                        "name": "british"
                    },
                ],
                "last_update": "2015-08-18T10:51:16",
                "location": "Washington, DC",
                "name": "John Doe",
                "post_count": 1666,
                "private": false,
                "site": "twitter",
                "site_name": "Twitter",
                "time_zone": "Central Time (US & Canada)",
                "upstream_id": "11009418",
                "url": "https://quickpin/api/profile/1",
                "username": "******",
                "usernames": [
                    {
                        "end_date": "2012-06-30T15:00:00",
                        "start_date": "2012-01-01T12:00:00",
                        "username": "******"
                    },
                    ...
                ]
            }

        :<header Content-Type: application/json
        :<header X-Auth: the client's auth token
        :<json bool is_interesting: whether profile is marked as interesting
        :<json list labels: list of profile labels

        :>header Content-Type: application/json
        :>json str avatar_url: URL to the user's current avatar
        :>json str avatar_thumb_url: URL to a 32x32px thumbnail of the user's
            current avatar
        :>json str description: profile description
        :>json int follower_count: number of followers
        :>json int friend_count: number of friends (a.k.a. followees)
        :>json int id: unique identifier for profile
        :>json bool is_stub: indicates that this is a stub profile, e.g.
            related to another profile but has not been fully imported
        :>json bool is_interesting: indicates whether this profile has been
            marked as interesting. The value can be null.
        :>json str join_date: the date this profile joined its social network
            (ISO-8601)
        :>json list labels: list of labels for this profile
        :>json int label[n].id: the unique id for this label
        :>json str label[n].name: the label
        :>json str last_update: the last time that information about this
            profile was retrieved from the social media site (ISO-8601)
        :>json str location: geographic location provided by the user, as free
            text
        :>json str name: the full name provided by this user
        :>json int post_count: the number of posts made by this profile
        :>json bool private: true if this is a private account (i.e. not world-
            readable)
        :>json str site: machine-readable site name that this profile belongs to
        :>json str site_name: human-readable site name that this profile belongs
            to
        :>json str time_zone: the user's provided time zone as free text
        :>json str upstream_id: the user ID assigned by the social site
        :>json str url: URL endpoint for retrieving more data about this profile
        :>json str username: the current username for this profile
        :>json list usernames: list of known usernames for this profile
        :>json str usernames[n].end_date: the last known date this username was
            used for this profile
        :>json str usernames[n].start_date: the first known date this username
            was used for this profile
        :>json str usernames[n].username: a username used for this profile

        :status 202: accepted for background processing
        :status 400: invalid request body
        :status 401: authentication required

        '''

        redis = worker.get_redis()

        # Get profile.
        id_ = get_int_arg('id_', id_)

        current_avatar_id = self._current_avatar_subquery()

        result = g.db.query(Profile, Avatar) \
                     .outerjoin(Avatar, Avatar.id == current_avatar_id) \
                     .filter(Profile.id == id_).first()

        if result is None:
            raise NotFound("Profile '%s' does not exist." % id_)

        profile, avatar = result

        request_json = request.get_json()

        # Validate put data and set attributes
        # Only 'is_interesting' and 'labels' are modifiable
        if 'is_interesting' in request_json:
            if isinstance(request_json['is_interesting'], bool):
                profile.is_interesting = request_json['is_interesting']
            elif request_json['is_interesting'] is None:
                profile.is_interesting = None
            else:
                raise BadRequest("Attribute 'is_interesting' must be a"
                                 " boolean or null")

        # labels expects the string 'name' rather than id, to avoid the need to
        # create labels before adding them.
        if 'labels' in request_json:
            labels = []
            if isinstance(request_json['labels'], list):
                for label_json in request_json['labels']:
                    if 'name' in label_json:
                        name = label_json['name'].lower().strip()
                        label = g.db.query(Label) \
                                    .filter(Label.name == name) \
                                    .first()
                        if label is None:
                            try:
                                label = Label(name=name)

                                g.db.add(label)
                                g.db.flush()

                                redis.publish(
                                    'label',
                                    json.dumps(label.as_dict())
                                )
                            except IntegrityError:
                                g.db.rollback()
                                raise BadRequest('Label could not be saved')
                            except AssertionError:
                                g.db.rollback()
                                raise BadRequest(
                                    '"{}" contains non-alphanumeric character'
                                    .format(
                                        label_json['name']
                                    )
                                )

                        labels.append(label)
                    else:
                        raise BadRequest("Label 'name' is required")

                profile.labels = labels
            else:
                raise BadRequest("'labels' must be a list")

        response = profile.as_dict()
        response['url'] = url_for('ProfileView:get', id_=profile.id)

        # Save the profile
        try:
            g.db.commit()
            redis.publish('profile_update', json.dumps(profile.as_dict()))
        except DBAPIError:
            g.db.rollback()
            raise BadRequest('Profile could not be saved')

        # Create usernames list.
        usernames = list()

        for username in profile.usernames:
            if username.end_date is not None:
                end_date = username.end_date.isoformat()
            else:
                end_date = None

            if username.start_date is not None:
                start_date = username.start_date.isoformat()
            else:
                start_date = None

            usernames.append({
                'end_date': end_date,
                'username': username.username,
                'start_date': start_date,
            })

        response['usernames'] = usernames

        # Create avatar attributes.
        if avatar is not None:
            response['avatar_url'] = url_for(
                'FileView:get',
                id_=avatar.file.id
            )
            response['avatar_thumb_url'] = url_for(
                'FileView:get',
                id_=avatar.thumb_file.id
            )
        else:
            response['avatar_url'] = url_for(
                'static',
                filename='img/default_user.png'
            )
            response['avatar_thumb_url'] = url_for(
                'static',
                filename='img/default_user_thumb.png'
            )

        # Send response.
        return jsonify(**response)
Example #14
    def put(self, id_):
        '''
        Update the profile identified by `id` with submitted data.

        The following attributes are modifiable:

           * is_interesting
           * labels
           * score

        **Example Request**

        .. sourcecode:: json

            {
                "is_interesting": true,
                "labels": [
                    {"name": "male"},
                    {"name": "british"},
                    ...
                ],
                "score": 2323.0,
                ...
            }

        **Example Response**

        .. sourcecode:: json

            {
                "avatar_url": "https://quickpin/api/file/1",
                "avatar_thumb_url": "https://quickpin/api/file/2",
                "description": "A human being.",
                "follower_count": 71,
                "friend_count": 28,
                "id": 1,
                "is_stub": false,
                "is_interesting": true,
                "join_date": "2012-01-30T15:11:35",
                "labels": [
                    {
                        "id": 1,
                        "name": "male"
                    },
                    {
                        "id": 2,
                        "name": "british"
                    },
                ],
                "last_update": "2015-08-18T10:51:16",
                "location": "Washington, DC",
                "name": "John Doe",
                "post_count": 1666,
                "private": false,
                "score": "-2.0621606863",
                "site": "twitter",
                "site_name": "Twitter",
                "time_zone": "Central Time (US & Canada)",
                "upstream_id": "11009418",
                "url": "https://quickpin/api/profile/1",
                "username": "******",
                "usernames": [
                    {
                        "end_date": "2012-06-30T15:00:00",
                        "start_date": "2012-01-01T12:00:00",
                        "username": "******"
                    },
                    ...
                ]
            }

        :<header Content-Type: application/json
        :<header X-Auth: the client's auth token
        :<json bool is_interesting: whether profile is marked as interesting
        :<json list labels: list of profile labels
        :<json float score: profile score

        :>header Content-Type: application/json
        :>json str avatar_url: URL to the user's current avatar
        :>json str avatar_thumb_url: URL to a 32x32px thumbnail of the user's
            current avatar
        :>json str description: profile description
        :>json int follower_count: number of followers
        :>json int friend_count: number of friends (a.k.a. followees)
        :>json int id: unique identifier for profile
        :>json bool is_stub: indicates that this is a stub profile, e.g.
            related to another profile but has not been fully imported
        :>json bool is_interesting: indicates whether this profile has been
            marked as interesting. The value can be null.
        :>json str join_date: the date this profile joined its social network
            (ISO-8601)
        :>json list labels: list of labels for this profile
        :>json int label[n].id: the unique id for this label
        :>json str label[n].name: the label
        :>json str last_update: the last time that information about this
            profile was retrieved from the social media site (ISO-8601)
        :>json str location: geographic location provided by the user, as free
            text
        :>json str name: the full name provided by this user
        :>json int note[n].id: the unique id for this note
        :>json str note[n].category: the user-defined category of this note
        :>json str note[n].body: the user-defined text-body of this note
        :>json int post_count: the number of posts made by this profile
        :>json bool private: true if this is a private account (i.e. not world-
            readable)
        :>json str score: user-defined score for this profile. Can be null.
        :>json str site: machine-readable site name that this profile belongs to
        :>json str site_name: human-readable site name that this profile belongs
            to
        :>json str time_zone: the user's provided time zone as free text
        :>json str upstream_id: the user ID assigned by the social site
        :>json str url: URL endpoint for retrieving more data about this profile
        :>json str username: the current username for this profile
        :>json list usernames: list of known usernames for this profile
        :>json str usernames[n].end_date: the last known date this username was
            used for this profile
        :>json str usernames[n].start_date: the first known date this username
            was used for this profile
        :>json str usernames[n].username: a username used for this profile

        :status 202: accepted for background processing
        :status 400: invalid request body
        :status 401: authentication required
        '''

        redis = worker.get_redis()

        # Get profile.
        id_ = get_int_arg('id_', id_)

        result = g.db.query(Profile, Avatar) \
                     .outerjoin(Profile.current_avatar) \
                     .filter(Profile.id == id_).first()

        if result is None:
            raise NotFound("Profile '%s' does not exist." % id_)

        profile, avatar = result

        request_json = request.get_json()

        # Validate put data and set attributes
        # Only 'is_interesting', 'score', and 'labels' are modifiable
        if 'is_interesting' in request_json:
            if isinstance(request_json['is_interesting'], bool):
                profile.is_interesting = request_json['is_interesting']
            elif request_json['is_interesting'] is None:
                profile.is_interesting = None
            else:
                raise BadRequest("'is_interesting' must be a boolean"
                                 " (true or false) or null.")

        if 'score' in request_json:
            if request_json['score'] is None:
                profile.score = None
            else:
                try:
                    profile.score = float(request_json['score'])
                except (TypeError, ValueError):
                    raise BadRequest("'score' must be a decimal number.")

        # labels expects the string 'name' rather than id, to avoid the need to
        # create labels before adding them.
        if 'labels' in request_json:
            labels = []
            if isinstance(request_json['labels'], list):
                for label_json in request_json['labels']:
                    if 'name' in label_json:
                        name = label_json['name'].lower().strip()
                        label = g.db.query(Label) \
                                    .filter(Label.name == name) \
                                    .first()
                        if label is None:
                            try:
                                label = Label(name=name)

                                g.db.add(label)
                                g.db.flush()

                                redis.publish(
                                    'label',
                                    json.dumps(label.as_dict())
                                )
                            except IntegrityError:
                                g.db.rollback()
                                raise BadRequest('Label could not be saved')
                            except AssertionError:
                                g.db.rollback()
                                raise BadRequest(
                                    '"{}" contains non-alphanumeric character'
                                    .format(
                                        label_json['name']
                                    )
                                )

                        labels.append(label)
                    else:
                        raise BadRequest("Label 'name' is required")

                profile.labels = labels
            else:
                raise BadRequest("'labels' must be a list")

        response = profile.as_dict()
        response['url'] = url_for('ProfileView:get', id_=profile.id)

        # Save the profile
        try:
            g.db.commit()
            redis.publish('profile', json.dumps(profile.as_dict()))
        except DBAPIError:
            g.db.rollback()
            raise BadRequest('Profile could not be saved')

        # Create usernames list.
        usernames = list()

        for username in profile.usernames:
            if username.end_date is not None:
                end_date = username.end_date.isoformat()
            else:
                end_date = None

            if username.start_date is not None:
                start_date = username.start_date.isoformat()
            else:
                start_date = None

            usernames.append({
                'end_date': end_date,
                'username': username.username,
                'start_date': start_date,
            })

        response['usernames'] = usernames

        # Create avatar attributes.
        if avatar is not None:
            response['avatar_url'] = url_for(
                'FileView:get',
                id_=avatar.file.id
            )
            response['avatar_thumb_url'] = url_for(
                'FileView:get',
                id_=avatar.thumb_file.id
            )
        else:
            response['avatar_url'] = url_for(
                'static',
                filename='img/default_user.png'
            )
            response['avatar_thumb_url'] = url_for(
                'static',
                filename='img/default_user_thumb.png'
            )

        # Send response.
        return jsonify(**response)
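# A hedged client-side call for the PUT handler above, mirroring the
# docstring's request body. The host and token are placeholders.
import requests

body = {'is_interesting': True,
        'labels': [{'name': 'male'}, {'name': 'british'}],
        'score': 2323.0}
resp = requests.put('https://quickpin/api/profile/1',  # hypothetical host
                    json=body,
                    headers={'X-Auth': 'auth-token-here'})
print(resp.json().get('labels'))  # newly attached labels, with ids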
Example #15
def scrape_instagram_relations(id_):
    '''
    Fetch friends and followers for the Instagram user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    friends_results = 0
    followers_results = 0
    #max_results = _get_max_relations(db)['instagram']
    max_results = get_config(db, 'max_relations_instagram', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    friends_params = {}
    followers_params = {}
    total_results = max_results * 2

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile.
    friends_query = (
        db.query(Profile.upstream_id)
        .join(profile_join_self,
              profile_join_self.c.friend_id == Profile.id)
        .filter(profile_join_self.c.follower_id == id_)
    )
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = (
        db.query(Profile.upstream_id)
        .join(profile_join_self,
              profile_join_self.c.follower_id == Profile.id)
        .filter(profile_join_self.c.friend_id == id_)
    )
    current_followers_ids = [follower.upstream_id
                             for follower in followers_query]

    worker.start_job(total=total_results)

    # Get friend IDs.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
                  .format(profile.upstream_id)

    while friends_results < max_results:
        # Get friends from Instagram API
        friends_response = requests.get(
            friends_url,
            params=friends_params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()
        pagination = friends_response.json()['pagination']

        for friend in friends_response.json()['data']:
            # Only store friends that are not already in db.
            if friend['id'] not in current_friends_ids:
                related_profile = Profile(
                    'instagram',
                    friend['id'],
                    friend['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == friend['id']) \
                            .one()

                related_profile.name = friend['full_name']
                profile.friends.append(related_profile)
                friends_results += 1
                worker.update_job(current=friends_results)

                if friends_results == max_results:
                    break

        # If there are more results, set the cursor parameter; otherwise finish.
        if 'next_cursor' in pagination:
            friends_params['cursor'] = pagination['next_cursor']
        else:
            break  # No more results

    # Get follower IDs.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
                    .format(profile.upstream_id)

    # Get followers from Instagram API
    while followers_results < max_results:
        # Get friends from Instagram API
        followers_response = requests.get(
            followers_url,
            params=followers_params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        pagination = followers_response.json()['pagination']

        for follower in followers_response.json()['data']:
            # Only store followers that are not already in db.
            if follower['id'] not in current_followers_ids:
                related_profile = Profile(
                    'instagram',
                    follower['id'],
                    follower['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == follower['id']) \
                            .one()

                related_profile.name = follower['full_name']
                profile.followers.append(related_profile)
                followers_results += 1
                worker.update_job(current=friends_results + followers_results)

                if followers_results == max_results:
                    break

        # If there are more results, set the cursor parameter; otherwise finish.
        if 'next_cursor' in pagination:
            followers_params['cursor'] = pagination['next_cursor']
        else:
            break  # No more results

    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
Example #16
    def post(self):
        """
        Create a label.

        **Example Request**

        .. sourcecode:: json

            {
                "labels": [
                    {"name": "gender"},
                    {"name": "age"},
                    ...
                ]
            }

        **Example Response**

        .. sourcecode:: json

            {
                "message": "2 new labels created."
            }

        :<header Content-Type: application/json
        :<header X-Auth: the client's auth token
        :<json list labels: a list of labels to create
        :<json str labels[n].name: name of label to create

        :>header Content-Type: application/json
        :>json str message: api response message

        :status 202: created
        :status 400: invalid request body
        :status 401: authentication required
        """

        request_json = request.get_json()
        redis = worker.get_redis()
        labels = list()

        # Validate input and create labels
        if 'labels' not in request_json:
            raise BadRequest('`labels` is required.')

        for t in request_json['labels']:
            if t['name'].strip() == '':
                raise BadRequest('Label name is required')
            else:
                try:
                    label = Label(name=t['name'].lower().strip())
                    g.db.add(label)
                    g.db.flush()
                    redis.publish('label', json.dumps(label.as_dict()))
                    labels.append(label.as_dict())
                except IntegrityError:
                    g.db.rollback()
                    raise BadRequest(
                        'Label "{}" already exists'.format(label.name)
                    )
                except AssertionError:
                    g.db.rollback()
                    raise BadRequest(
                        '"{}" contains non-alphanumeric character'
                        .format(t['name'])
                    )

        # Save labels
        g.db.commit()

        message = '{} new labels created'.format(len(request_json['labels']))
        response = jsonify(
            message=message,
            labels=labels
        )
        response.status_code = 202

        return response
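# A hedged client-side call for the label-creation endpoint above; the host
# and route are assumptions, the payload mirrors the docstring.
import requests

resp = requests.post('https://quickpin/api/label',  # hypothetical route
                     json={'labels': [{'name': 'gender'}, {'name': 'age'}]},
                     headers={'X-Auth': 'auth-token-here'})
print(resp.status_code)  # 202 => "2 new labels created"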
Example #17
    def post(self):
        '''
        Create a label.

        **Example Request**

        .. sourcecode:: json

            {
                "labels": [
                    {"name": "gender"},
                    {"name": "age"},
                    ...
                ]
            }

        **Example Response**

        .. sourcecode:: json

            {
                "message": "2 new labels created."
            }

        :<header Content-Type: application/json
        :<header X-Auth: the client's auth token
        :<json list labels: a list of labels to create
        :<json str labels[n].name: name of label to create

        :>header Content-Type: application/json
        :>json str message: api response message

        :status 202: created
        :status 400: invalid request body
        :status 401: authentication required
        '''

        request_json = request.get_json()
        redis = worker.get_redis()
        labels = list()

        # Validate input and create labels
        if 'labels' not in request_json:
            raise BadRequest('`labels` is required.')

        for t in request_json['labels']:
            if t['name'].strip() == '':
                raise BadRequest('Label name is required')
            else:
                try:
                    label = Label(name=t['name'].lower().strip())
                    g.db.add(label)
                    g.db.flush()
                    redis.publish('label', json.dumps(label.as_dict()))
                    labels.append(label.as_dict())
                except IntegrityError:
                    g.db.rollback()
                    raise BadRequest(
                        'Label "{}" already exists'.format(label.name)
                    )
                except AssertionError:
                    g.db.rollback()
                    raise BadRequest(
                        '"{}" contains non-alphanumeric character'
                        .format(t['name'])
                    )

        # Save labels
        g.db.commit()

        message = '{} new labels created'.format(len(request_json['labels']))
        response = jsonify(
            message=message,
            labels=labels
        )
        response.status_code = 202

        return response
Example #18
def scrape_twitter_relations(id_):
    '''
    Fetch friends and followers for the Twitter user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    #max_results = _get_max_relations(db)['twitter']
    max_results = get_config(db, 'max_relations_twitter', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_twitter must be an integer'
        )

    friends_results = 0
    friends_ids = []
    followers_results = 0
    followers_ids = []
    friends_cursor = -1
    followers_cursor = -1

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    params = {
        'count': 5000,
        'user_id': profile.upstream_id,
        'stringify_ids': True,
    }

    # Get friends currently stored in db for this profile.
    friends_query = (
        db.query(Profile.upstream_id)
        .join(profile_join_self,
              profile_join_self.c.friend_id == Profile.id)
        .filter(profile_join_self.c.follower_id == id_)
    )
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = (
        db.query(Profile.upstream_id)
        .join(profile_join_self,
              profile_join_self.c.follower_id == Profile.id)
        .filter(profile_join_self.c.friend_id == id_)
    )
    current_followers_ids = [follower.upstream_id
                             for follower in followers_query]

    # Get friend IDs.
    friends_url = 'https://api.twitter.com/1.1/friends/ids.json'
    params['cursor'] = friends_cursor

    while friends_results < max_results:
        friends_response = requests.get(
            friends_url,
            params=params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()

        # Ignore friends already in the db
        for friend_id in friends_response.json()['ids']:
            if friend_id not in current_friends_ids:
                friends_ids.append(friend_id)
                friends_results += 1
                if friends_results == max_results:
                    break

        friends_cursor = friends_response.json()['next_cursor']

        if friends_cursor == 0:
            break # No more results
        else:
            params['cursor'] = friends_cursor

    # Get follower IDs.
    followers_url = 'https://api.twitter.com/1.1/followers/ids.json'
    params['cursor'] = followers_cursor

    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        followers_body = followers_response.json()

        # Ignore followers already in the db
        for follower_id in followers_body['ids']:
            if follower_id not in current_followers_ids:
                followers_ids.append(follower_id)
                followers_results += 1
                if followers_results == max_results:
                    break

        followers_cursor = followers_body['next_cursor']

        if followers_cursor == 0:
            break  # No more results
        else:
            params['cursor'] = followers_cursor

    # Get username for each of the friend/follower IDs and create
    # a relationship in QuickPin.
    user_ids = [(uid, 'friend') for uid in friends_ids] + \
               [(uid, 'follower') for uid in followers_ids]
    worker.start_job(total=len(user_ids))
    chunk_size = 100
    for chunk_start in range(0, len(user_ids), chunk_size):
        chunk_end = chunk_start + chunk_size
        chunk = user_ids[chunk_start:chunk_end]
        chunk_lookup = {uid: relation for uid, relation in chunk}

        lookup_url = 'https://api.twitter.com/1.1/users/lookup.json'
        lookup_response = requests.post(
            lookup_url,
            proxies=_get_proxies(db),
            verify=False,
            data={'user_id': ','.join(chunk_lookup.keys())}
        )
        lookup_response.raise_for_status()
        relations = lookup_response.json()

        for related_dict in relations:
            uid = related_dict['id_str']
            username = related_dict['screen_name']
            related_profile = Profile('twitter', uid, username, is_stub=True)
            db.add(related_profile)

            try:
                db.commit()
            except IntegrityError:
                # Already exists: use the existing profile.
                db.rollback()
                related_profile = db \
                    .query(Profile) \
                    .filter(Profile.site=='twitter') \
                    .filter(Profile.upstream_id==uid) \
                    .one()

            _twitter_populate_profile(related_dict, related_profile)
            relation = chunk_lookup[uid]

            if relation == 'friend':
                profile.friends.append(related_profile)
            else:  # relation == 'follower'
                profile.followers.append(related_profile)

            db.commit()

        worker.update_job(current=chunk_end)

    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
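Both loops above are instances of Twitter's cursor pagination: start with cursor=-1, follow next_cursor from each page, and stop when it comes back as 0. Factored out on its own (a sketch, where fetch_page is a hypothetical stand-in for the authenticated requests.get call above):

def collect_ids(fetch_page, max_results):
    """Accumulate up to `max_results` ids from a cursored Twitter endpoint.

    `fetch_page(cursor)` is assumed to return the decoded JSON body,
    a dict with 'ids' and 'next_cursor' keys, as friends/ids and
    followers/ids do.
    """
    ids = []
    cursor = -1  # -1 requests the first page
    while len(ids) < max_results:
        page = fetch_page(cursor)
        ids.extend(page['ids'][:max_results - len(ids)])
        cursor = page['next_cursor']
        if cursor == 0:  # 0 signals the last page
            break
    return ids

The de-duplication against ids already stored in the database, as done above, is omitted here for brevity.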
Example #19
0
def scrape_twitter_relations(id_):
    """
    Fetch friends and followers for the Twitter user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_relations_twitter', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_twitter must be an integer'
        )

    friends_results = 0
    friends_ids = []
    followers_results = 0
    followers_ids = []
    friends_cursor = -1
    followers_cursor = -1

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    params = {
        'count': 5000,
        'user_id': profile.upstream_id,
        'stringify_ids': True,
    }

    # Get friends currently stored in db for this profile.
    friends_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.friend_id == Profile.id
            ) \
            .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.follower_id == Profile.id
            ) \
            .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = \
        [follower.upstream_id for follower in followers_query]

    # Get friend IDs.
    friends_url = 'https://api.twitter.com/1.1/friends/ids.json'
    params['cursor'] = friends_cursor

    while friends_results < max_results:
        friends_response = requests.get(
            friends_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS
        )
        friends_response.raise_for_status()
        friends_body = friends_response.json()

        # Ignore friends already in the db
        for friend_id in friends_body['ids']:
            if friend_id not in current_friends_ids:
                friends_ids.append(friend_id)
                friends_results += 1
                if friends_results == max_results:
                    break

        friends_cursor = friends_body['next_cursor']

        if friends_cursor == 0:
            break  # No more results
        else:
            params['cursor'] = friends_cursor

    # Get follower IDs.
    followers_url = 'https://api.twitter.com/1.1/followers/ids.json'
    params['cursor'] = followers_cursor

    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS,
        )
        followers_response.raise_for_status()
        followers_body = followers_response.json()

        # Ignore followers already in the db
        for follower_id in followers_body['ids']:
            if follower_id not in current_followers_ids:
                followers_ids.append(follower_id)
                followers_results += 1
                if followers_results == max_results:
                    break

        followers_cursor = followers_body['next_cursor']

        if followers_cursor == 0:
            break  # No more results
        else:
            params['cursor'] = followers_cursor

    # Get username for each of the friend/follower IDs and create
    # a relationship in QuickPin.
    user_ids = [(uid, 'friend') for uid in friends_ids] + \
               [(uid, 'follower') for uid in followers_ids]
    worker.start_job(total=len(user_ids))
    chunk_size = 100
    for chunk_start in range(0, len(user_ids), chunk_size):
        chunk_end = chunk_start + chunk_size
        chunk = user_ids[chunk_start:chunk_end]
        chunk_lookup = {uid: relation for uid, relation in chunk}

        lookup_url = 'https://api.twitter.com/1.1/users/lookup.json'
        lookup_response = requests.post(
            lookup_url,
            proxies=_get_proxies(db),
            verify=False,
            headers=TWITTER_HEADERS,
            data={'user_id': ','.join(chunk_lookup.keys())}
        )
        lookup_response.raise_for_status()
        relations = lookup_response.json()

        for related_dict in relations:
            uid = related_dict['id_str']
            username = related_dict['screen_name']
            related_profile = Profile('twitter', uid, username, is_stub=True)
            db.add(related_profile)

            try:
                db.commit()
            except IntegrityError:
                # Already exists: use the existing profile.
                db.rollback()
                related_profile = db \
                    .query(Profile) \
                    .filter(Profile.site=='twitter') \
                    .filter(Profile.upstream_id==uid) \
                    .one()

            _twitter_populate_profile(related_dict, related_profile)
            relation = chunk_lookup[uid]

            if relation == 'friend':
                profile.friends.append(related_profile)
            else:  # relation == 'follower'
                profile.followers.append(related_profile)

            db.commit()

        worker.update_job(current=chunk_end)

    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
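Apart from the TWITTER_HEADERS argument on each request, this version is the same as Example #18. One detail both share: users/lookup accepts at most 100 ids per call, which is why the collected ids are hydrated in slices of 100. The slicing itself is a generic pattern worth isolating (a sketch; lookup_url refers to the endpoint above):

def chunked(items, size=100):
    """Yield consecutive slices of `items`, each at most `size` long."""
    for start in range(0, len(items), size):
        yield items[start:start + size]

# Each batch becomes one comma-separated `user_id` form parameter:
for batch in chunked([str(n) for n in range(250)]):
    payload = {'user_id': ','.join(batch)}
    # requests.post(lookup_url, data=payload, headers=TWITTER_HEADERS, ...)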
Example #20
0
def scrape_twitter_posts(id_, recent):
    """
    Fetch tweets for the user identified by id_.
    Checks tweets already stored in db, and will only fetch older or newer
    tweets depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent tweets not already stored in the db.
    The number of tweets to fetch is configured in the Admin.
    """
    db = worker.get_session()
    max_results = get_config(db, 'max_posts_twitter', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException('Value of max_posts_twitter must be an integer')

    worker.start_job(total=max_results)
    redis = worker.get_redis()
    author = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    results = 0
    max_id = None
    more_results = True
    count = 200

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get posts currently stored in db for this profile.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())

    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    params = {'count': count, 'user_id': author.upstream_id}

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            since_id = post_query[0].upstream_id
            params['since_id'] = str(since_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    # Collect ids of newly stored posts across all pages so they can be
    # indexed once the loop finishes.
    post_ids = []

    while more_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS,
        )
        response.raise_for_status()

        tweets = response.json()
        # A short (or empty) page means there are no more results.
        if len(tweets) < count:
            more_results = False

        for tweet in tweets:
            # Twitter API result set includes the tweet with the max_id/since_id
            # so ignore it.
            if tweet['id_str'] != max_id:
                post = Post(
                    author,
                    tweet['id_str'],
                    dateutil.parser.parse(tweet['created_at']),
                    tweet['text']
                )

                if tweet['lang'] is not None:
                    post.language = tweet['lang']

                if tweet['coordinates'] is not None:
                    # GeoJSON coordinates are ordered [longitude, latitude].
                    post.longitude, post.latitude = \
                        tweet['coordinates']['coordinates']

                place = tweet['place']

                if place is not None:
                    # Set longitude/latitude to the center the of bounding polygon.
                    total_lon = 0
                    total_lat = 0
                    num_coords = 0

                    for lon, lat in place['bounding_box']['coordinates'][0]:
                        total_lon += lon
                        total_lat += lat
                        num_coords += 1

                    post.longitude = total_lon / num_coords
                    post.latitude = total_lat / num_coords

                    # Set location to string identifying the place.
                    post.location = '{}, {}'.format(
                        place['full_name'],
                        place['country']
                    )

                db.add(post)
                db.flush()
                post_ids.append(post.id)
                # Set the max_id to the last tweet to get the next set of
                # results
                max_id = tweet['id_str']
                params['max_id'] = max_id
                results += 1
                worker.update_job(current=results)

                if results == max_results:
                    more_results = False
                    break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
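The loop pages backwards through the timeline with max_id (or forwards with since_id when recent=True). Twitter's result set includes the tweet whose id equals max_id, which is why the code above skips it explicitly; a common alternative is to request max_id - 1 instead. A stripped-down sketch of backward paging under that alternative (fetch_timeline is a hypothetical stand-in for the authenticated request):

def page_timeline(fetch_timeline, max_results, count=200):
    """Walk a user's timeline backwards using max_id paging.

    `fetch_timeline(params)` is assumed to return a list of tweet dicts,
    newest first, as statuses/user_timeline does.
    """
    tweets = []
    params = {'count': count}
    while len(tweets) < max_results:
        page = fetch_timeline(params)
        if not page:
            break
        tweets.extend(page[:max_results - len(tweets)])
        # max_id is inclusive, so step just below the oldest id seen.
        params['max_id'] = str(int(page[-1]['id_str']) - 1)
        if len(page) < count:
            break  # short page: no more history
    return tweets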
Example #21
0
def scrape_instagram_relations(id_):
    """
    Fetch friends and followers for the Instagram user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    friends_results = 0
    followers_results = 0
    max_results = get_config(db, 'max_relations_instagram', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    friends_params = {}
    followers_params = {}
    total_results = max_results * 2

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile.
    friends_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.friend_id == Profile.id
            ) \
            .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
            .join(
                profile_join_self,
                profile_join_self.c.follower_id == Profile.id
            ) \
            .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = \
        [follower.upstream_id for follower in followers_query]

    worker.start_job(total=total_results)

    # Get friend IDs.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
                  .format(profile.upstream_id)

    while friends_results < max_results:
        # Get friends from Instagram API
        friends_response = requests.get(
            friends_url,
            params=friends_params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()
        pagination = friends_response.json()['pagination']

        for friend in friends_response.json()['data']:
            # Only store friends that are not already in db.
            if friend['id'] not in current_friends_ids:
                related_profile = Profile(
                    'instagram',
                    friend['id'],
                    friend['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site=='instagram') \
                            .filter(Profile.upstream_id==friend['id']) \
                            .one()

                related_profile.name = friend['full_name']
                profile.friends.append(related_profile)
                friends_results += 1
                worker.update_job(current=friends_results)

                if friends_results == max_results:
                    break

        # If there are more results, set the cursor parameter; otherwise finish.
        if 'next_cursor' in pagination:
            friends_params['cursor'] = pagination['next_cursor']
        else:
            break # No more results

    # Get follower IDs.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
                    .format(profile.upstream_id)

    # Get followers from Instagram API
    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=followers_params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        pagination = followers_response.json()['pagination']

        for follower in followers_response.json()['data']:
            # Only store followers that are not already in db.
            if follower['id'] not in current_followers_ids:
                related_profile = Profile(
                    'instagram',
                    follower['id'],
                    follower['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site=='instagram') \
                            .filter(Profile.upstream_id==follower['id']) \
                            .one()

                related_profile.name = follower['full_name']
                profile.followers.append(related_profile)
                followers_results += 1
                worker.update_job(current=friends_results + followers_results)

                if followers_results == max_results:
                    break

        # If there are more results, set the cursor parameter; otherwise finish.
        if 'next_cursor' in pagination:
            followers_params['cursor'] = pagination['next_cursor']
        else:
            break # No more results

    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
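Instagram paginates differently from Twitter: each response body carries a `pagination` object, and the client resubmits with cursor=pagination['next_cursor'] until that key disappears. The skeleton both loops above follow (a sketch; fetch_page is a hypothetical stand-in for the proxied requests.get):

def collect_relations(fetch_page, max_results):
    """Accumulate up to `max_results` user dicts from a paginated endpoint.

    `fetch_page(params)` is assumed to return the decoded JSON body,
    a dict with 'data' (a list of users) and 'pagination' keys.
    """
    users = []
    params = {}
    while len(users) < max_results:
        body = fetch_page(params)
        users.extend(body['data'][:max_results - len(users)])
        pagination = body.get('pagination', {})
        if 'next_cursor' not in pagination:
            break  # an absent cursor means this was the last page
        params['cursor'] = pagination['next_cursor']
    return users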
Example #22
0
def scrape_instagram_posts(id_, recent):
    """
    Fetch instagram posts for the user identified by id_.
    Checks posts already stored in db, and will only fetch older or newer
    posts depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent posts not already stored in the db.
    The number of posts to fetch is configured in the Admin.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_posts_instagram', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException('Value of max_posts_instagram must be an integer')

    min_id = None
    results = 0
    params = {}

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    url = 'https://api.instagram.com/v1/users/{}/media/recent' \
          .format(author.upstream_id)

    # Get last post currently stored in db for this profile.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            min_id = post_query[0].upstream_id
            params['min_id'] = str(min_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    worker.start_job(total=max_results)

    # Collect ids of newly stored posts across all pages so they can be
    # indexed once the loop finishes.
    post_ids = []

    while results < max_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )

        response.raise_for_status()
        response_json = response.json()['data']
        pagination = response.json()['pagination']

        # Instagram API result includes post with min_id so remove it
        response_json[:] = [d for d in response_json if d.get('id') != min_id]

        for gram in response_json:
            if gram['caption'] is not None:
                text = gram['caption']['text']
            else:
                text = None

            post = Post(
                author,
                gram['id'],
                datetime.fromtimestamp(int(gram['created_time'])),
                text
            )

            if gram['location'] is not None:
                if 'latitude' in gram['location']:
                    post.latitude = gram['location']['latitude']
                    post.longitude = gram['location']['longitude']

                if 'name' in gram['location']:
                    post.location = gram['location']['name']

                    if 'street_address' in gram['location']:
                        post.location += ' ' + gram['location']['street_address']

            if 'images' in gram:
                image_url = gram['images']['standard_resolution']['url']
                name = os.path.basename(urlparse(image_url).path)
                img_response = requests.get(image_url, verify=False)
                mime = img_response.headers['Content-type']
                image = img_response.content
                post.attachments.append(File(name, mime, image))

            db.add(post)
            db.flush()
            post_ids.append(post.id)
            results += 1
            worker.update_job(current=results)
            if results == max_results:
                break

        # If there are more results, set the max_id param; otherwise finish.
        if 'next_max_id' in pagination:
            params['max_id'] = pagination['next_max_id']
        else:
            break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
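Each post's standard-resolution image is downloaded inline and stored as a `File` attachment. The fetch itself is ordinary requests usage; isolated as a sketch (the (name, mime, content) triple mirrors the File(...) call above, and the timeout is an added precaution, not in the original):

import os
from urllib.parse import urlparse

import requests

def download_attachment(image_url):
    """Return (name, mime, content) for an image URL."""
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
    name = os.path.basename(urlparse(image_url).path)
    # requests headers are case-insensitive, so 'Content-Type' matches
    # the 'Content-type' spelling used above.
    mime = response.headers.get('Content-Type', 'application/octet-stream')
    return name, mime, response.content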
Example #23
0
def check_username(username,
                   site_id,
                   category_id,
                   total,
                   tracker_id,
                   user_id,
                   test=False):
    """
    Check if `username` exists on the specified site.
    """

    worker.start_job()
    redis = worker.get_redis()
    db_session = worker.get_session()

    # Get site
    site = db_session.query(Site).get(site_id)

    # Check site for username
    splash_result = _splash_username_request(username, site)
    # Save image file
    image_file = _save_image(db_session=db_session,
                             scrape_result=splash_result,
                             user_id=user_id,
                             censor=site.censor_images)

    # Save result to DB.
    result = Result(tracker_id=tracker_id,
                    site_id=splash_result['site']['id'],
                    site_name=splash_result['site']['name'],
                    site_url=splash_result['url'],
                    status=splash_result['status'],
                    image_file_id=image_file.id,
                    username=username,
                    error=splash_result['error'],
                    user_id=user_id)

    if result.status == 'f':
        result.html = splash_result['html']

    db_session.add(result)
    db_session.commit()

    if not test:
        # Notify clients of the result.
        current = redis.incr(tracker_id)
        result_dict = result.as_dict()
        result_dict['current'] = current
        result_dict['total'] = total
        redis.publish('result', json.dumps(result_dict))

        # If this username search is complete, then queue an archive job.
        if current == total:
            description = 'Archiving results ' \
                          'for username "{}"'.format(username)
            worker.archive.create_archive.enqueue(
                username=username,
                category_id=category_id,
                tracker_id=tracker_id,
                jobdesc=description,
                timeout=_redis_worker['archive_timeout'],
                user_id=user_id)

    worker.finish_job()
    return result.id
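The redis.incr(tracker_id) call is doing real coordination work here: it gives all workers on the same search an atomic shared counter, so exactly one of them (the one that sees current == total) enqueues the archive job. A minimal sketch of that pattern (assumes a reachable Redis and the redis package; names are illustrative):

import json

import redis

r = redis.Redis()

def report_result(tracker_id, total, payload):
    """Publish one result and return True iff this was the last one."""
    current = r.incr(tracker_id)  # atomic across concurrent workers
    payload.update({'current': current, 'total': total})
    r.publish('result', json.dumps(payload))
    return current == total

# Caller side: only the final worker triggers the follow-up job.
# if report_result('tracker-123', total=5, payload=result.as_dict()):
#     enqueue_archive_job(...)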
Example #24
0
def scrape_instagram_posts(id_, recent):
    """
    Fetch instagram posts for the user identified by id_.
    Checks posts already stored in db, and will only fetch older or newer
    posts depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent posts not already stored in the db.
    The number of posts to fetch is configured in the Admin.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    author = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_posts_instagram', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException('Value of max_posts_instagram must be an integer')

    min_id = None
    results = 0
    params = {}

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    url = 'https://api.instagram.com/v1/users/{}/media/recent' \
          .format(author.upstream_id)

    # Get last post currently stored in db for this profile.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            min_id = post_query[0].upstream_id
            params['min_id'] = str(min_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    worker.start_job(total=max_results)

    # Collect ids of newly stored posts across all pages so they can be
    # indexed once the loop finishes.
    post_ids = []

    while results < max_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )

        response.raise_for_status()
        response_json = response.json()['data']
        pagination = response.json()['pagination']

        # Instagram API result includes post with min_id so remove it
        response_json[:] = [d for d in response_json if d.get('id') != min_id]

        for gram in response_json:
            if gram['caption'] is not None:
                text = gram['caption']['text']
            else:
                text = None

            post = Post(
                author,
                gram['id'],
                datetime.fromtimestamp(int(gram['created_time'])),
                text
            )

            if gram['location'] is not None:
                if 'latitude' in gram['location']:
                    post.latitude = gram['location']['latitude']
                    post.longitude = gram['location']['longitude']

                if 'name' in gram['location']:
                    post.location = gram['location']['name']

                    if 'street_address' in gram['location']:
                        post.location += ' ' + gram['location']['street_address']

            if 'images' in gram:
                image_url = gram['images']['standard_resolution']['url']
                name = os.path.basename(urlparse(image_url).path)
                img_response = requests.get(image_url, verify=False)
                mime = img_response.headers['Content-type']
                image = img_response.content
                post.attachments.append(File(name, mime, image))

            db.add(post)
            db.flush()
            post_ids.append(post.id)
            results += 1
            worker.update_job(current=results)
            if results == max_results:
                break

        # If there are more results, set the max_id param; otherwise finish.
        if 'next_max_id' in pagination:
            params['max_id'] = pagination['next_max_id']
        else:
            break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)