Example #1
def get_active_time(page_id):
    """get lastest post

    Args:
        page_id (TYPE): Description
    """
    # construct the URL string
    node = page_id
    edge = 'posts'
    fields = ['created_time', 'type', 'id']
    parameters = {
        'access_token': constant.ACCESS_TOKEN,
        'fields': ','.join(fields),
        'limit': 1
    }

    url = '%s/%s/%s/?%s' % (constant.BASE_GRAPH_API, node, edge,
                            build_param_string(parameters))
    data = request_until_succeed(url).decode('utf-8')
    posts = json.loads(data)['data']

    if len(posts) > 0:
        post = posts[0]
        last_active_time = dateutil.parser.parse(post['created_time'])
        return last_active_time

    return None
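Example #1 relies on two module-level helpers that are not shown in these snippets: build_param_string, which turns the parameters dict into a query string, and request_until_succeed, which retries the Graph API request until it succeeds. A minimal sketch of what they might look like, assuming a urllib-based implementation (only the names come from the code above; the bodies and the retry policy are assumptions):

import logging
import time
from urllib.error import URLError
from urllib.parse import urlencode
from urllib.request import urlopen

logger = logging.getLogger(__name__)


def build_param_string(parameters):
    """Assumed helper: encode the parameters dict as a URL query string."""
    # e.g. {'limit': 1, 'fields': 'id'} -> 'limit=1&fields=id'
    return urlencode(parameters)


def request_until_succeed(url, retry_delay=5):
    """Assumed helper: retry the request until it returns HTTP 200."""
    while True:
        try:
            response = urlopen(url)
            if response.getcode() == 200:
                # Callers decode these bytes with .decode('utf-8').
                return response.read()
        except URLError as error:
            logger.warning('request to %s failed (%s), retrying in %ds',
                           url, error, retry_delay)
        time.sleep(retry_delay)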
Example #2
def get_page_info(page_id):
    """
    get page infor by page id

    Args:
        page_id (TYPE): Description
    """
    fields = [
        'about', 'username', 'name', 'description', 'fan_count', 'mission',
        'talking_about_count', 'company_overview'
    ]

    # construct the URL string
    node = page_id
    parameters = {
        'access_token': constant.ACCESS_TOKEN,
        'fields': ','.join(fields)
    }
    url = '%s/%s/?%s' % (constant.BASE_GRAPH_API, node,
                         build_param_string(parameters))
    data = request_until_succeed(url).decode('utf-8')
    page_info = json.loads(data)
    logger.debug('id=%s: %s', page_id, page_info)

    # prefer the username, fall back to the display name
    if 'username' in page_info:
        name = page_info['username']
    else:
        name = page_info['name']
    fb_page = FbPage(page_id, name)

    # text fields
    fb_page.name = page_info.get('name', '')
    fb_page.about = page_info.get('about', '')
    if 'description' in page_info:
        fb_page.description = page_info['description']
    if 'company_overview' in page_info:
        fb_page.company_overview = page_info['company_overview']
    if 'mission' in page_info:
        fb_page.mission = page_info['mission']

    # number fields
    if 'fan_count' in page_info:
        fb_page.fan_count = page_info['fan_count']
    if 'talking_about_count' in page_info:
        fb_page.talking_about_count = page_info['talking_about_count']

    return fb_page
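get_page_info fills an FbPage object whose class is not part of these snippets. A plain container along the following lines would cover the attributes used above and in Example #5 (the defaults and the to_dict method are assumptions):

class FbPage(object):
    """Assumed minimal container for the page attributes used above."""

    def __init__(self, page_id, username):
        self.page_id = page_id
        self.username = username
        self.name = ''
        self.about = ''
        self.description = ''
        self.company_overview = ''
        self.mission = ''
        self.fan_count = 0
        self.talking_about_count = 0
        self.last_active = None   # e.g. filled via get_active_time
        self.liked_pages = set()  # ids of pages this page likes (Example #5)

    def to_dict(self):
        # Used when exporting pages with pandas.DataFrame.from_records.
        return dict(self.__dict__)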
Example #3
def test_page(page_name):
    """Summary

    Args:
        page_name (TYPE): Description

    """
    # construct the URL string

    node = page_name
    parameters = {'access_token': constant.ACCESS_TOKEN}
    url = '%s/%s/?%s' % (constant.BASE_GRAPH_API, node,
                         build_param_string(parameters))
    logger.debug('url: %s', url)

    # retrieve data
    response = urlopen(url)
    data = response.read().decode('utf-8')
    logger.debug('type of response: %s', str(type(data)))
    data_json = json.loads(data)
    logger.info(data_json)
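test_page is a quick sanity check that the access token and page name resolve before running the heavier crawlers; a call could look like this (the page name is a placeholder):

test_page('examplepage')  # placeholder page name; logs the raw page JSON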
Example #4
def get_page_id(page_name):
    """
    find id of FB page from it content. Pattern to find id is:
    - <a href="https://m.me/1000515996721686"... (mobile mode)
    - content="fb://page/?id=1000515996721686 (desktop mode)

    Args:
        page_name (TYPE): Description

    Returns:
        TYPE: Description
    """
    node = page_name
    parameters = {'access_token': constant.ACCESS_TOKEN}
    url = '%s/%s/?%s' % (constant.BASE_GRAPH_API, node,
                         build_param_string(parameters))
    logger.debug('url: %s', url)

    # retrieve data
    response = urlopen(url)
    data = response.read().decode('utf-8')
    logger.debug('type of response: %s', str(type(data)))
    data_json = json.loads(data)
    return data_json['id']
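The docstring above lists two HTML patterns (an m.me link in mobile mode, a fb://page/?id= meta value in desktop mode), while the function itself simply reads 'id' from the Graph API response. A scrape-based fallback built on those patterns might look like this (the function name and the regexes are assumptions, not part of the original code):

import re
from urllib.request import urlopen


def find_page_id_in_html(page_url):
    """Hypothetical fallback: extract the page id from raw page HTML."""
    html = urlopen(page_url).read().decode('utf-8')
    # mobile mode: <a href="https://m.me/1000515996721686"...
    match = re.search(r'href="https://m\.me/(\d+)"', html)
    if match is None:
        # desktop mode: content="fb://page/?id=1000515996721686
        match = re.search(r'content="fb://page/\?id=(\d+)', html)
    return match.group(1) if match else None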
Example #5
def get_liked_pages_by():
    """Summary
    """
    seed_path = os.path.join(constant.BASE_DIR, 'data', 'seed_pages.csv')
    df = pd.read_csv(seed_path, dtype={'page_id': str, 'name': str})
    df = df.fillna('')
    logger.info(df.head())

    ignored_page_ids = set()
    inactive_pages = {}
    active_pages = {}
    for index, row in df.iterrows():
        p_id = row['page_id']
        active_pages[p_id] = get_page_info(p_id)

    while len(active_pages) < 100:
        page_ids = list(active_pages.keys())
        for p_id in page_ids:
            page = active_pages[p_id]

            # construct the URL string
            node = p_id
            edge = 'likes'
            parameters = {'access_token': constant.ACCESS_TOKEN}
            url = '%s/%s/%s/?%s' % (constant.BASE_GRAPH_API, node, edge,
                                    build_param_string(parameters))
            data = request_until_succeed(url).decode('utf-8')
            liked_pages = json.loads(data)['data']
            logger.info('page=%s likes %s', page.name, liked_pages)

            # update liked pages list of current page
            page.liked_pages = {liked['id'] for liked in liked_pages}

            # check whether each liked page is already known (active or
            # inactive); if not, fetch its info and classify it
            for liked_page in liked_pages:
                liked_id = liked_page['id']
                if liked_id in active_pages or liked_id in inactive_pages:
                    continue

                liked_page_info = get_page_info(liked_id)
                if not is_vietnamese_page(liked_page_info):
                    ignored_page_ids.add(liked_id)
                    continue

                if is_inactive_page(liked_page_info):
                    inactive_pages[liked_id] = liked_page_info
                else:
                    active_pages[liked_id] = liked_page_info

        logger.info('TOTAL CRAWLED %d pages', len(active_pages))

    # update active page list
    df = pd.DataFrame.from_records(
        [page.to_dict() for page in active_pages.values()])

    # extract full page info
    df.to_csv(os.path.join(constant.BASE_DIR, FUL_ACTIVE_PAGE_PATH),
              index=False)
    logger.info('export FULL active page info to file: %s',
                FUL_ACTIVE_PAGE_PATH)

    active_file_path = os.path.join(constant.BASE_DIR, ACTIVE_PAGE_PATH)
    df = df[[
        'page_id', 'username', 'name', 'fan_count', 'talking_about_count',
        'last_active', 'liked_pages'
    ]]
    df.to_csv(active_file_path, index=False)
    logger.info('export active page info to file: %s', ACTIVE_PAGE_PATH)

    # update inactive page ids
    inactive_file_path = os.path.join(constant.BASE_DIR, INACTIVE_PAGE_PATH)
    df = pd.DataFrame.from_records(
        [page.to_dict() for page in inactive_pages.values()])
    df = df[['page_id', 'username']]
    df.to_csv(inactive_file_path, index=False)
    logger.info('export inactive page ids to file: %s', INACTIVE_PAGE_PATH)

    # update ignored page ids
    ignored_file_path = os.path.join(constant.BASE_DIR, 'ignored_pages.csv')
    df = pd.DataFrame.from_records([{
        'page_id': p_id
    } for p_id in ignored_page_ids])
    df.to_csv(ignored_file_path, index=False)
    logger.info('export ignored page ids to file: %s', ignored_file_path)
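get_liked_pages_by filters candidates with is_vietnamese_page and is_inactive_page, neither of which appears in these snippets. One plausible reading of is_inactive_page, built on get_active_time from Example #1 (the 30-day threshold and the overall logic are assumptions):

from datetime import datetime, timedelta

from dateutil import tz


def is_inactive_page(fb_page, max_idle_days=30):
    """Assumed check: the page has no posts, or its latest post is too old."""
    last_active = get_active_time(fb_page.page_id)
    if last_active is None:
        return True
    # created_time parsed by dateutil is timezone-aware, so compare against
    # an aware "now" in UTC.
    now = datetime.now(tz.tzutc())
    return now - last_active > timedelta(days=max_idle_days)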
def get_posts_by_periods(page_name, output_path, periods):
    """Summary

    Args:
        page_name (TYPE): Description
        output_path (TYPE): Description
        periods (TYPE): Description
    """
    total_posts = 0
    page_id = get_page_id(page_name=page_name)

    # construct the URL string
    node = page_name
    edge = 'posts'
    fields = [
        'message', 'link', 'created_time', 'type', 'id',
        'comments.limit(1).summary(true)',
        'reactions.type(LIKE).limit(0).summary(1).as(like)',
        'reactions.type(LOVE).limit(0).summary(1).as(love)',
        'reactions.type(HAHA).limit(0).summary(1).as(haha)',
        'reactions.type(WOW).limit(0).summary(1).as(wow)',
        'reactions.type(SAD).limit(0).summary(1).as(sad)',
        'reactions.type(ANGRY).limit(0).summary(1).as(angry)'
    ]
    parameters = {
        'access_token': constant.ACCESS_TOKEN,
        'fields': ','.join(fields),
        'limit': 100
    }

    url = '%s/%s/%s/?%s' % (constant.BASE_GRAPH_API, node, edge,
                            build_param_string(parameters))

    no_page_to_load = False
    for period in periods:
        fb_posts = []
        crawled_utctime = datetime.utcnow()
        jump_to_prev_period = False
        while url is not None:
            logger.info('crawl url: %s', url)
            # retrieve data
            data = request_until_succeed(url).decode('utf-8')
            posts = json.loads(data)['data']
            for p in posts:
                p['crawled_utctime'] = crawled_utctime
                try:
                    post_obj = FbPost(page_id, p)
                    logger.debug(post_obj.__dict__)

                    if post_obj.created_time > period[1]:
                        logger.info(
                            'crawled post is newer than the period, skip it')
                        continue

                    if post_obj.created_time < period[0]:
                        jump_to_prev_period = True
                        break
                    fb_posts.append(post_obj)
                except Exception as e:
                    logger.error(e)
                    logger.error('ERROR when parsing data_json=%s', p)

            if jump_to_prev_period:
                export_posts_to_file(fb_posts, page_id, period[0], output_path)
                break

            # get next page result
            url = None
            paging = json.loads(data)['paging']
            if 'next' in paging.keys():
                url = paging['next']
            else:
                logger.info('there is no more page to load')
                no_page_to_load = True
                export_posts_to_file(fb_posts, page_id, period[0], output_path)
                break

        total_posts += len(fb_posts)
        logger.info('There are %d posts in the period starting %s',
                    len(fb_posts), period[0])

        if no_page_to_load:
            break

    logger.info('There are %d posts from %s to %s.', total_posts,
                periods[-1][0], periods[0][1])
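The final log line suggests that periods is a list of (start, end) datetime pairs ordered from newest to oldest. A call could be set up as follows (the page name and output path are placeholders, and this assumes FbPost parses created_time into timezone-aware datetimes):

from datetime import datetime, timedelta

from dateutil import tz

# Three consecutive 30-day windows as (start, end) pairs, newest first.
now = datetime.now(tz.tzutc())
periods = [(now - timedelta(days=30 * (i + 1)), now - timedelta(days=30 * i))
           for i in range(3)]

get_posts_by_periods(page_name='examplepage',   # placeholder page name
                     output_path='data/posts',  # placeholder output path
                     periods=periods)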