Example #1
def scrape_link(link_arg_set):
  promo_url, link, time_bucket, data_source, config = link_arg_set

  try:
    link_url = link.get_attribute("href")

  except StaleElementReferenceException:
    pass

  else:                  
    # continue under specified conditions
    if isinstance(link_url, basestring) and is_article(link_url, config):
      
      # parse link text
      try:
        link_text = link.text.encode('utf-8').strip()
      except Exception:
        link_text = None

      if link_text is not None and link_text != '':

        pos_x = int(link.location['x'])
        pos_y = int(link.location['y'])

        if pos_x > 0 and pos_y > 0:
          
          # get image
          img_dict = get_image_for_a_link(link)

          # parse link
          link_url = link_url.encode('utf-8')
          article_url = parse_url(link_url)

          # sluggify
          article_slug = sluggify(article_url)

          # scrape
          log.info("PROMOPAGE\tLink detected on %s\t%s" % (promo_url, article_url))

          link_dict = {
            'article_slug' : article_slug,
            'article_url': article_url,
            'time_bucket': time_bucket,
            'raw_timestamp': int(datetime.now().strftime("%s")),
            'pp_promo_url' : promo_url,
            'pp_link_url': link_url,
            'pp_headline' : link_text,
            'pp_font_size' : int(float(link.value_of_css_property('font-size')[:-2])),  # strip 'px'; may be fractional
            'pp_pos_x' : pos_x,
            'pp_pos_y' : pos_y
          }

          value = json.dumps({data_source : dict(img_dict.items() + link_dict.items())})

          # upsert url
          upsert_url(article_url, article_slug, data_source, config)

          # upload data to redis
          db.zadd(article_slug, time_bucket, value)
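For context, a minimal driver sketch for scrape_link, assuming a Selenium WebDriver plus the module-level names the snippet relies on (config, the redis client db, and the gen_time_bucket helper that appears in Example #4). The promo URL and data_source label are placeholders:

# Hypothetical driver for scrape_link; uses the legacy Selenium element
# lookups to match the Python 2 era of these snippets.
from selenium import webdriver

driver = webdriver.Firefox()
promo_url = "http://example.com/"               # placeholder promo page
driver.get(promo_url)

time_bucket = gen_time_bucket(config)           # assumed helper (see Example #4)
data_source = "promopage_example"               # placeholder label

for link in driver.find_elements_by_tag_name("a"):
  scrape_link((promo_url, link, time_bucket, data_source, config))

driver.quit()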
Example #2
def scrape_link(link_arg_set):
    promo_url, link, time_bucket, data_source, config = link_arg_set

    try:
        link_url = link.get_attribute("href")
    except StaleElementReferenceException:
        pass
    else:
        # parse link text
        try:
            link_text = link.text.encode('utf-8').strip()
        except Exception:
            link_text = None
        
        # get image
        img_dict = get_image_for_a_link(link)

        # tests
        test_link = link_url is not None and isinstance(link_url, basestring) and is_article(link_url, config)
        test_text = link_text is not None and link_text != ''
        test_image = img_dict['pp_is_img'] == 1
        test_existence = link.location['x'] > 0 or link.location['y'] > 0
        tests = [any([test_image, test_text]), test_link, test_existence]
        
        # continue under specified conditions
        if all(tests):

            # parse link
            link_url = link_url.encode('utf-8')
            article_url = parse_url(link_url)

            # sluggify
            article_slug = sluggify(article_url)

            # scrape
            print "INFO\tPROMOPAGE\tlink detected on %s re: %s" % (promo_url, article_slug)

            link_dict = {
                'article_slug' : article_slug,
                'article_url': article_url,
                'time_bucket': time_bucket,
                'raw_timestamp': int(datetime.now().strftime("%s")),
                'pp_promo_url' : promo_url,
                'pp_link_url': link_url,
                'pp_headline' : link_text,
                'pp_font_size' : int(float(link.value_of_css_property('font-size')[:-2])),  # strip 'px'; may be fractional
                'pp_pos_x' : int(link.location['x']),
                'pp_pos_y' : int(link.location['y'])
            }

            value = json.dumps({data_source : dict(img_dict.items() + link_dict.items())})

            # upsert url
            upsert_url(article_url, article_slug, data_source, config)

            # upload data to redis
            db.zadd(article_slug, time_bucket, value)
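One portability note on the final write: db.zadd(article_slug, time_bucket, value) uses the redis-py 2.x calling convention, where the score precedes the member. redis-py 3.0 changed zadd to take a mapping of members to scores, so under a modern client (assuming db is a redis.Redis instance) the same write would be:

# redis-py >= 3.0 equivalent of db.zadd(article_slug, time_bucket, value):
# members map to scores, so the JSON blob is keyed by its time bucket.
db.zadd(article_slug, {value: time_bucket})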
Example #3
def upsert_url(article_url, article_slug, data_source, config):
  if not db.sismember('article_set', article_url):
    # add it to the set
    db.sadd('article_set', article_url)

    # insert metadata
    ts = current_timestamp(config)
    value = json.dumps({
      "url" : article_url,
      "slug": article_slug,
      "timestamp" : ts,
      "data_source": data_source
      })
    
    db.zadd('article_sorted_set', ts, value)
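To round out the picture, a sketch of reading back what upsert_url writes, assuming db is a redis-py client and current_timestamp is the same helper used above. Entries in article_sorted_set are JSON blobs scored by their timestamp:

import json

# Fetch all article metadata recorded up to now, oldest first.
for raw in db.zrangebyscore('article_sorted_set', 0, current_timestamp(config)):
  meta = json.loads(raw)
  print meta['slug'], meta['data_source']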
Example #4
def insert_new_post(post_arg_set):
  """
  insert new post into redis
  """
  api, post_data, acct_data, page_id, config = post_arg_set

  try:
    post_id = post_data.get('id')
  except Exception as e:
    print e
  else:
    if is_insights(page_id, config):
      insights_value = get_insights_data(api, page_id, post_id)
    else:
      insights_value = {}
    
    # parse date
    if post_data.get('created_time') is not None:
      dt = datetime.strptime(post_data['created_time'], FB_DATE_FORMAT)
      date_time = tz_adj(dt, config)
      time_bucket = round_datetime(date_time, config)
      raw_timestamp = int(date_time.strftime("%s"))
    
    else:
      time_bucket = None
      raw_timestamp = None
    
    # extract message so we can find links within the msg if not in url
    article_urls = [get_fb_link(post_data, config, unshorten=True)]
    message = post_data['message'].encode('utf-8') if 'message' in post_data else None
    message_urls = get_message_urls(article_urls, message, config)

    # detect article links, unshorten and parse
    article_urls = [
      parse_url(unshorten_link(article_url, config)) \
      for article_url in article_urls + message_urls
      if article_url is not None and is_article(article_url, config)
    ]

    if article_urls:
      for article_url in set(article_urls):

        # sluggify url
        article_slug = sluggify(article_url)

        # format data
        post_value = {
          'article_slug': article_slug,
          'article_url': article_url,
          'time_bucket': time_bucket,
          'fb_post_created': time_bucket,
          'raw_timestamp': raw_timestamp,
          'fb_raw_link' : get_fb_link(post_data, config=config),
          'fb_page_id': page_id,
          'fb_post_id': post_id,
          'fb_page_likes': acct_data.get('likes'),
          'fb_page_talking_about': acct_data.get('talking_about_count'),
          'fb_type': post_data.get('type'),
          'fb_status_type': post_data.get('status_type'),
          'fb_message': message
        }
          
        # always insert insights data
        if is_insights(page_id, config):
          print "INFO\tINSIGHTS\tAdding data from %s re: %s" % (page_id, article_slug)
          data_source = "facebook_insights_%s" % page_id
          # upsert url
          upsert_url(article_url, article_slug, data_source, config)

          # insert id
          db.sadd('facebook_post_ids', post_id)

          # use the current time bucket for insights data and drop any stale
          # 'time_bucket' fields from both dicts before merging
          current_time_bucket = gen_time_bucket(config)
          insights_value.pop('time_bucket', None)
          post_value.pop('time_bucket', None)
          
          value = json.dumps({
            data_source : dict(post_value.items() + insights_value.items())
          })

          # upload data to redis
          db.zadd(article_slug, current_time_bucket, value)        
            
        # only insert new posts
        elif not db.sismember('facebook_post_ids', post_id):
          
          print "INFO\tFACEBOOK\tnew post %s re: %s" % (post_id, article_slug)
          
          # insert id
          db.sadd('facebook_post_ids', post_id)     
          
          # upsert url
          data_source = "facebook_%s" % page_id
          upsert_url(article_url, article_slug, data_source, config)

          value = json.dumps({
            data_source : dict(post_value.items() + insights_value.items())
          })

          # upload data to redis
          db.zadd(article_slug, time_bucket, value)
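A final note on the merge idiom: dict(post_value.items() + insights_value.items()) only works on Python 2, where items() returns concatenable lists. On Python 3 the equivalent merge (with insights_value winning key collisions, as here) would be:

# Python 3 equivalent of dict(post_value.items() + insights_value.items())
merged = dict(post_value)
merged.update(insights_value)
value = json.dumps({data_source: merged})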
Example #5
def parse_tweet(tweet_arg_set):
  slug, t, config = tweet_arg_set

  # check if id exists
  twt_id = t.id_str
  if not db.sismember('twitter_twt_ids', twt_id):

    # if not, add id to id_set
    db.sadd('twitter_twt_ids', twt_id)
    
    # check for relevant urls
    raw_urls = [u['expanded_url'] for u in t.entities['urls']]

    # parse urls
    article_urls = set([parse_url(unshorten_link(u, config)) for u in raw_urls])

    if any([is_article(u, config) for u in article_urls]):

      # parse dates
      # sometimes t.created_at is a datetime object
      if isinstance(t.created_at, datetime):
        dt = t.created_at
      else:
        dt = datetime.strptime(t.created_at, TWT_DATE_FORMAT)
      
      date_time = tz_adj(dt, config)
      time_bucket = round_datetime(date_time, config) if date_time is not None else None
      
      raw_timestamp = int(date_time.strftime('%s')) if date_time is not None else None

      for article_url in article_urls:
        # sluggify url
        article_slug = sluggify(article_url)
        screen_name = t.user.screen_name
        log.info( "TWITTER\tNew Tweet %s/%s\t%s" % (screen_name, twt_id, article_url) )

        # format data
        value = {
          'article_slug': article_slug,
          'article_url': article_url,
          'time_bucket': time_bucket,
          'raw_timestamp' :  raw_timestamp,
          'twt_list' : slug,
          'twt_post_created': raw_timestamp,
          'twt_id': twt_id,
          'twt_screen_name': t.user.screen_name,
          'twt_text': t.text,
          'twt_followers': t.author.followers_count,
          'twt_friends': t.author.friends_count,
          'twt_lang': t.lang,    
          'twt_raw_links': raw_urls,
          'twt_hashtags': t.entities['hashtags'],
          'twt_user_mentions': t.entities['user_mentions'],
          'twt_in_reply_to_screen_name': t.in_reply_to_screen_name,
          'twt_in_reply_to_status_id_str': t.in_reply_to_status_id_str
        }
        
        data_source = "twitter_%s" % slug
        
        # upsert url
        upsert_url(article_url, article_slug, data_source, config)

        value = json.dumps({ data_source : value})
        
        # add data to redis
        db.zadd(article_slug, time_bucket, value)
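For completeness, a sketch of how parse_tweet might be fed. The attribute names on t (id_str, entities, user.screen_name, author.followers_count) match tweepy Status objects, so this assumes a tweepy client; the credentials and the list owner are placeholders, and config comes from the surrounding module:

import tweepy

# Hypothetical auth; replace the placeholder credentials.
auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_SECRET")
api = tweepy.API(auth)

slug = "news-outlets"                            # placeholder list slug
for t in api.list_timeline("list_owner", slug):  # placeholder list owner
  parse_tweet((slug, t, config))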