import json
from datetime import datetime

from selenium.common.exceptions import StaleElementReferenceException

# project-level helpers (db, log, config utilities, parse_url, sluggify,
# is_article, get_image_for_a_link, date-format constants, etc.) are
# imported / defined elsewhere in this module's package.


def scrape_link(link_arg_set):
    """ scrape a single link element on a promo page and store it in redis """
    promo_url, link, time_bucket, data_source, config = link_arg_set

    try:
        link_url = link.get_attribute("href")
    except StaleElementReferenceException:
        pass
    else:
        # continue under specified conditions
        if isinstance(link_url, basestring) and is_article(link_url, config):

            # parse link text
            try:
                link_text = link.text.encode('utf-8').strip()
            except Exception:
                link_text = None

            if link_text is not None and link_text != '':
                pos_x = int(link.location['x'])
                pos_y = int(link.location['y'])

                if pos_x > 0 and pos_y > 0:

                    # get image
                    img_dict = get_image_for_a_link(link)

                    # parse link
                    link_url = link_url.encode('utf-8')
                    article_url = parse_url(link_url)

                    # sluggify
                    article_slug = sluggify(article_url)

                    # scrape
                    log.info("PROMOPAGE\tLink detected on %s\t%s" % (promo_url, article_url))
                    link_dict = {
                        'article_slug': article_slug,
                        'article_url': article_url,
                        'time_bucket': time_bucket,
                        'raw_timestamp': int(datetime.now().strftime("%s")),
                        'pp_promo_url': promo_url,
                        'pp_link_url': link_url,
                        'pp_headline': link_text,
                        # strip the trailing 'px'; float() handles fractional sizes like '15.4px'
                        'pp_font_size': int(float(link.value_of_css_property('font-size')[:-2])),
                        'pp_pos_x': pos_x,
                        'pp_pos_y': pos_y
                    }

                    value = json.dumps({data_source: dict(img_dict.items() + link_dict.items())})

                    # upsert url
                    upsert_url(article_url, article_slug, data_source, config)

                    # upload data to redis
                    db.zadd(article_slug, time_bucket, value)

def scrape_link(link_arg_set):
    """ variant of scrape_link that also accepts image-only links;
        if both definitions live in the same module, this one takes precedence """
    promo_url, link, time_bucket, data_source, config = link_arg_set

    try:
        link_url = link.get_attribute("href")
    except StaleElementReferenceException:
        pass
    else:
        # parse link text
        try:
            link_text = link.text.encode('utf-8').strip()
        except Exception:
            link_text = None

        # get image
        img_dict = get_image_for_a_link(link)

        # tests
        test_link = link_url is not None and isinstance(link_url, basestring) and is_article(link_url, config)
        test_text = link_text is not None and link_text != ''
        test_image = img_dict['pp_is_img'] == 1
        test_existence = link.location['x'] > 0 or link.location['y'] > 0
        tests = [any([test_image, test_text]), test_link, test_existence]

        # continue under specified conditions
        if all(tests):

            # parse link
            link_url = link_url.encode('utf-8')
            article_url = parse_url(link_url)

            # sluggify
            article_slug = sluggify(article_url)

            # scrape
            print "INFO\tPROMOPAGE\tlink detected on %s re: %s" % (promo_url, article_slug)
            link_dict = {
                'article_slug': article_slug,
                'article_url': article_url,
                'time_bucket': time_bucket,
                'raw_timestamp': int(datetime.now().strftime("%s")),
                'pp_promo_url': promo_url,
                'pp_link_url': link_url,
                'pp_headline': link_text,
                # strip the trailing 'px'; float() handles fractional sizes like '15.4px'
                'pp_font_size': int(float(link.value_of_css_property('font-size')[:-2])),
                'pp_pos_x': int(link.location['x']),
                'pp_pos_y': int(link.location['y'])
            }

            value = json.dumps({data_source: dict(img_dict.items() + link_dict.items())})

            # upsert url
            upsert_url(article_url, article_slug, data_source, config)

            # upload data to redis
            db.zadd(article_slug, time_bucket, value)

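# A minimal driver sketch for the promo-page scraper above, showing one way
# scrape_link could be invoked. This is an assumption about the surrounding
# pipeline, not part of the original code: scrape_promo_page, its arguments,
# and the choice of webdriver are placeholders; gen_time_bucket is the helper
# used in the facebook code below.
from selenium import webdriver

def scrape_promo_page(promo_url, data_source, config):
    """ collect every anchor element on a promo page and feed it to scrape_link """
    driver = webdriver.Firefox()  # or a headless driver, depending on the setup
    try:
        driver.get(promo_url)
        time_bucket = gen_time_bucket(config)
        for link in driver.find_elements_by_tag_name('a'):
            scrape_link((promo_url, link, time_bucket, data_source, config))
    finally:
        driver.quit()
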
def upsert_url(article_url, article_slug, data_source, config):
    """ register an article url the first time any scraper sees it """
    if not db.sismember('article_set', article_url):

        # add it to the set
        db.sadd('article_set', article_url)

        # insert metadata
        ts = current_timestamp(config)
        value = json.dumps({
            "url": article_url,
            "slug": article_slug,
            "timestamp": ts,
            "data_source": data_source
        })
        db.zadd('article_sorted_set', ts, value)

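# Sketch of how the article index written by upsert_url could be read back.
# The key names come straight from upsert_url; 'db' is assumed to be the same
# redis-py client used above, and recent_articles itself is a hypothetical
# helper added here for illustration.
def recent_articles(since_ts, until_ts):
    """ return metadata dicts for articles first seen between two timestamps """
    raw = db.zrangebyscore('article_sorted_set', since_ts, until_ts)
    return [json.loads(item) for item in raw]
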
def insert_new_post(post_arg_set):
    """ insert new post into redis """
    api, post_data, acct_data, page_id, config = post_arg_set

    try:
        post_id = post_data['id'] if post_data.has_key('id') else None
    except Exception as e:
        print e
    else:
        if is_insights(page_id, config):
            insights_value = get_insights_data(api, page_id, post_id)
        else:
            insights_value = {}

        # parse date
        if post_data.has_key('created_time') and post_data['created_time'] is not None:
            dt = datetime.strptime(post_data['created_time'], FB_DATE_FORMAT)
            date_time = tz_adj(dt, config)
            time_bucket = round_datetime(date_time, config)
            raw_timestamp = int(date_time.strftime("%s"))
        else:
            time_bucket = None
            raw_timestamp = None

        # extract message so we can find links within the msg if not in url
        article_urls = [get_fb_link(post_data, config, unshorten=True)]
        message = post_data['message'].encode('utf-8') if post_data.has_key('message') else None
        message_urls = get_message_urls(article_urls, message, config)

        # detect article links, unshorten and parse
        article_urls = [
            parse_url(unshorten_link(article_url, config))
            for article_url in article_urls + message_urls
            if article_url is not None and is_article(article_url, config)
        ]

        if article_urls:
            for article_url in set(article_urls):

                # sluggify url
                article_slug = sluggify(article_url)

                # format data
                post_value = {
                    'article_slug': article_slug,
                    'article_url': article_url,
                    'time_bucket': time_bucket,
                    'fb_post_created': time_bucket,
                    'raw_timestamp': raw_timestamp,
                    'fb_raw_link': get_fb_link(post_data, config=config),
                    'fb_page_id': page_id,
                    'fb_post_id': post_id,
                    'fb_page_likes': acct_data['likes'] if acct_data.has_key('likes') else None,
                    'fb_page_talking_about': acct_data['talking_about_count'] if acct_data.has_key('talking_about_count') else None,
                    'fb_type': post_data['type'] if post_data.has_key('type') else None,
                    'fb_status_type': post_data['status_type'] if post_data.has_key('status_type') else None,
                    'fb_message': message
                }

                # always insert insights data
                if is_insights(page_id, config):
                    print "INFO\tINSIGHTS\tAdding data from %s re: %s" % (page_id, article_slug)
                    data_source = "facebook_insights_%s" % page_id

                    # upsert url
                    upsert_url(article_url, article_slug, data_source, config)

                    # insert id
                    db.sadd('facebook_post_ids', post_id)

                    # format time bucket: insights rows are keyed to the current
                    # time bucket, so drop any stale 'time_bucket' values first
                    current_time_bucket = gen_time_bucket(config)
                    insights_value.pop('time_bucket', current_time_bucket)
                    post_value.pop('time_bucket', None)

                    value = json.dumps({
                        data_source: dict(post_value.items() + insights_value.items())
                    })

                    # upload data to redis
                    db.zadd(article_slug, current_time_bucket, value)

                # only insert new posts
                elif not db.sismember('facebook_post_ids', post_id):
                    print "INFO\tFACEBOOK\tnew post %s re: %s" % (post_id, article_slug)

                    # insert id
                    db.sadd('facebook_post_ids', post_id)

                    # upsert url
                    data_source = "facebook_%s" % page_id
                    upsert_url(article_url, article_slug, data_source, config)

                    value = json.dumps({
                        data_source: dict(post_value.items() + insights_value.items())
                    })

                    # upload data to redis
                    db.zadd(article_slug, time_bucket, value)

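# A hedged sketch of how the post_arg_set tuples consumed by insert_new_post
# might be built, assuming the facebook-sdk GraphAPI client. The function name,
# its arguments, and the requested fields are illustrative placeholders, not
# part of the original pipeline.
import facebook

def build_fb_post_arg_sets(page_id, access_token, config):
    """ yield (api, post_data, acct_data, page_id, config) tuples for one page """
    api = facebook.GraphAPI(access_token)
    # page-level metadata; 'likes' and 'talking_about_count' are what
    # insert_new_post looks for in acct_data
    acct_data = api.get_object(page_id, fields='likes,talking_about_count')
    posts = api.get_connections(page_id, 'posts')
    for post_data in posts.get('data', []):
        yield (api, post_data, acct_data, page_id, config)
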
def parse_tweet(tweet_arg_set):
    """ parse a tweet from a list timeline and store relevant article links in redis """
    slug, t, config = tweet_arg_set

    # check if id exists
    twt_id = t.id_str
    if not db.sismember('twitter_twt_ids', twt_id):

        # if not, add id to id_set
        db.sadd('twitter_twt_ids', twt_id)

        # check for relevant urls
        raw_urls = [u['expanded_url'] for u in t.entities['urls']]

        # parse urls
        article_urls = set([parse_url(unshorten_link(u, config)) for u in raw_urls])

        if any([is_article(u, config) for u in article_urls]):

            # parse dates
            # sometimes t.created_at is already a datetime object
            if isinstance(t.created_at, datetime):
                dt = t.created_at
            else:
                dt = datetime.strptime(t.created_at, TWT_DATE_FORMAT)

            date_time = tz_adj(dt, config)
            time_bucket = round_datetime(date_time, config) if date_time is not None else None
            raw_timestamp = int(date_time.strftime('%s'))

            for article_url in article_urls:

                # sluggify url
                article_slug = sluggify(article_url)

                screen_name = t.user.screen_name
                log.info("TWITTER\tNew Tweet %s/%s\t%s" % (screen_name, twt_id, article_url))

                # format data
                value = {
                    'article_slug': article_slug,
                    'article_url': article_url,
                    'time_bucket': time_bucket,
                    'raw_timestamp': raw_timestamp,
                    'twt_list': slug,
                    'twt_post_created': raw_timestamp,
                    'twt_id': twt_id,
                    'twt_screen_name': screen_name,
                    'twt_text': t.text,
                    'twt_followers': t.author.followers_count,
                    'twt_friends': t.author.friends_count,
                    'twt_lang': t.lang,
                    'twt_raw_links': raw_urls,
                    'twt_hashtags': t.entities['hashtags'],
                    'twt_user_mentions': t.entities['user_mentions'],
                    'twt_in_reply_to_screen_name': t.in_reply_to_screen_name,
                    'twt_in_reply_to_status_id_str': t.in_reply_to_status_id_str
                }

                data_source = "twitter_%s" % slug

                # upsert url
                upsert_url(article_url, article_slug, data_source, config)
                value = json.dumps({data_source: value})

                # add data to redis
                db.zadd(article_slug, time_bucket, value)

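# A hedged sketch of how tweets from a Twitter list could be fed into
# parse_tweet. It assumes tweepy is the client in use (the attributes read
# above -- id_str, entities, user.screen_name -- match tweepy Status objects);
# scrape_twitter_list, the credential keys, and the item limit are
# illustrative placeholders.
import tweepy

def scrape_twitter_list(owner_screen_name, slug, config, creds):
    """ pull a list timeline and parse each tweet in it """
    auth = tweepy.OAuthHandler(creds['consumer_key'], creds['consumer_secret'])
    auth.set_access_token(creds['access_token'], creds['access_token_secret'])
    api = tweepy.API(auth)
    for t in tweepy.Cursor(api.list_timeline,
                           owner_screen_name=owner_screen_name,
                           slug=slug).items(200):
        parse_tweet((slug, t, config))
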