Example #1
import logging
import re
import time
from datetime import datetime

# Assumed module-level context from the surrounding project: "common",
# "global_config", "tweepy_backend", "uniquify_list", and the option
# key prefix "ns" are defined elsewhere in this module.

def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
  """Collects snarks from your Twitter screen name and @mentions.
  This is much more reliable than a plain search, but it only
  works for your own account.

  This parser adds non-standard attributes to snarks:
  "user_url" and "msg_url", links to the user's twitter
  page and to the specific tweet. Exporters might
  disregard this info.

  :param src_path: Not used.
  :param first_msg: If not None, ignore comments prior to the one containing this substring.
  :param options: A dict of extra options specific to this parser.
                  since_date (optional):
                      UTC Datetime to limit dredging up old tweets.
                  until_date (optional):
                      UTC Datetime to limit dredging up new tweets.
  :param keep_alive_func: Optional replacement callable; fetching continues while it returns True.
  :param sleep_func: Optional replacement callable to sleep N seconds.
  :return: A List of snark dicts.
  :raises: ParserError
  """
  if (keep_alive_func is None): keep_alive_func = global_config.keeping_alive
  if (sleep_func is None): sleep_func = global_config.nap

  since_date = None
  if (ns+"since_date" in options and options[ns+"since_date"]):
    since_date = options[ns+"since_date"]

  until_date = None
  if (ns+"until_date" in options and options[ns+"until_date"]):
    until_date = options[ns+"until_date"]

  snarks = []

  tweepy = tweepy_backend.get_tweepy()
  tweepy_api = tweepy_backend.get_api()

  try:
    my_screen_name = tweepy_api.auth.get_username()

    # List of pattern/replacement tuples to strip reply topic from comments.
    reply_name_escaped = re.escape(my_screen_name)
    reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
                     (re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

    mention_args = {"count":200, "include_entities":"false", "include_rts":"false"}
    mention_rate = {"reset":None, "limit":0, "remaining":0, "res_family":"statuses", "res_name":"/statuses/mentions_timeline"}
    timeline_args = {"count":200, "include_entities":"false", "include_rts":"false"}
    timeline_rate = {"reset":None, "limit":0, "remaining":0, "res_family":"statuses", "res_name":"/statuses/user_timeline"}

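    # Each search tuple pairs a label and an API call with its kwargs, a hard
    # cap on how many tweets to fetch, and a mutable rate-limit record that
    # update_rate_info() refreshes in-place from rate_limit_status().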
    searches = []
    searches.append(("Mentions", tweepy_api.mentions_timeline, mention_args, 800, mention_rate))
    searches.append(("Timeline", tweepy_api.user_timeline, timeline_args, 3200, timeline_rate))

    def update_rate_info():
      # Sets new rate info values for the searches.
      rate_status = tweepy_api.rate_limit_status()
      for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
        rate_info.update(rate_status["resources"][rate_info["res_family"]][rate_info["res_name"]])

    update_rate_info()

    for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
      done = False
      query_count = 0
      results_count = 0
      last_max_id = None

      while (keep_alive_func() and done is False and results_count < search_cap and rate_info["remaining"] > 0):
        results = tweepy_func(**tweepy_func_args)
        rate_info["remaining"] -= 1
        if (not results):
          done = True
          break
        else:
          query_count += 1
          results_count += len(results)
          logging.info("%s Query % 2d: % 3d results." % (search_type, query_count, len(results)))

          last_status_id = None
          for status in results:
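            # max_id is inclusive, so each page re-returns the previous
            # page's oldest tweet; skip that duplicate.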
            if (last_max_id == status.id): continue
            last_status_id = status.id

            snark = {}
            snark["user"] = "******" % common.asciify(status.author.screen_name)
            snark["msg"] = status.text
            for (reply_ptn, reply_rep) in reply_regexes:
              snark["msg"] =  reply_ptn.sub(reply_rep, snark["msg"])
            snark["msg"] = common.asciify(common.html_unescape(snark["msg"]))

            snark["date"] = status.created_at

            snark["user_url"] = "http://www.twitter.com/%s" % common.asciify(status.author.screen_name)
            snark["msg_url"] = "http://twitter.com/#!/%s/status/%d" % (common.asciify(status.author.screen_name), status.id)

            if (until_date and snark["date"] > until_date):
              continue  # This snark is too recent.

            if (since_date and snark["date"] < since_date):
              done = True  # This snark is too early.
              break

            snarks.append(snark)

            if (first_msg):
              if (snark["msg"].find(first_msg) != -1):
                done = True  # Found the first comment.
                break

          if (last_status_id is not None):
            # Dig deeper into the past on the next loop.
            tweepy_func_args["max_id"] = last_status_id
            last_max_id = last_status_id
          else:
            # Must've only gotten the "max_id" tweet again.
            done = True
            break

          if (rate_info["reset"] is not None and time.time() >= float(rate_info["reset"])):
            update_rate_info()

            reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
            logging.info("API limit for '%s' reset. Calls left: %d (Until %s)" % (rate_info["res_name"], rate_info["remaining"], reset_string))

      if (done is False and rate_info["remaining"] <= 0):
        logging.warning("Twitter API rate limit truncated results for '%s'." % rate_info["res_name"])
        break  # No more searches.

    update_rate_info()
    logging.info("Twitter API calls left...")
    for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
      reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
      logging.info("'%s': %d (Until %s)." % (rate_info["res_name"], rate_info["remaining"], reset_string))
    logging.info("Current Time: %s" % datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

  except Exception:
    logging.exception("Parser failed.")
    raise common.ParserError("Parser failed.")

  snarks = sorted(snarks, key=lambda k: k["date"])

  # Drop duplicates from multiple passes.
  snarks = uniquify_list(snarks)

  if (first_msg):
    first_index = -1
    for i in range(len(snarks)):
      if (snarks[i]["msg"].find(first_msg) != -1):
        # Finally reached the expected first msg.
        first_index = i
    if (first_index >= 0):
      snarks = snarks[first_index:]
    else:
      logging.warning("first_msg string \"%s\" was not found." % first_msg)
      snarks = []

  return snarks
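
A minimal usage sketch, assuming the module-level names noted above (ns, common, global_config, tweepy_backend) exist and tweepy authentication is already configured; the option key's value here is illustrative only:

from datetime import datetime, timedelta

options = {
  ns + "since_date": datetime.utcnow() - timedelta(days=7),  # ignore tweets older than a week
}
snarks = fetch_snarks(None, None, options=options)
for snark in snarks:
  print("%s: %s" % (snark["user"], snark["msg"]))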
Example #2
import logging
import re
import time
from datetime import datetime

# Assumed module-level context, as in Example #1: "common", "global_config",
# "tweepy_backend", "uniquify_list", and the option key prefix "ns".

def fetch_snarks(src_path, first_msg, options={}, keep_alive_func=None, sleep_func=None):
  """Collects snarks from a Twitter search. Finds
  tweets from any account and @reply mentions of it.
  See: https://dev.twitter.com/docs/api/1/get/search

  This parser adds non-standard attributes to snarks:
  "user_url" and "msg_url", links to the user's twitter
  page and to the specific tweet. Exporters might
  disregard this info.

  Twitter's search API only reaches back a few days
  and may be incomplete. :/

  :param src_path: Not used.
  :param first_msg: If not None, ignore comments prior to one containing this substring.
  :param options: A dict of extra options specific to this parser.
                  reply_name:
                      The name to which replies were directed (no "@").
                  since_date (optional):
                      UTC Datetime to limit dredging up old tweets.
                  until_date (optional):
                      UTC Datetime to limit dredging up new tweets.
  :param keep_alive_func: Optional replacement callable; fetching continues while it returns True.
  :param sleep_func: Optional replacement callable to sleep N seconds.
  :return: A List of snark dicts.
  :raises: ParserError
  """
  if (keep_alive_func is None): keep_alive_func = global_config.keeping_alive
  if (sleep_func is None): sleep_func = global_config.nap

  since_date = None
  if (ns+"since_date" in options and options[ns+"since_date"]):
    since_date = options[ns+"since_date"]

  until_date = None
  if (ns+"until_date" in options and options[ns+"until_date"]):
    until_date = options[ns+"until_date"]

  missing_options = [o for o in ["reply_name"] if ((ns+o) not in options or not options[ns+o])]
  if (len(missing_options) > 0):
    logging.error("Required parser options weren't provided: %s." % ", ".join(missing_options))
    raise common.ParserError("Parser failed.")

  snarks = []

  tweepy = tweepy_backend.get_tweepy()
  tweepy_api = tweepy_backend.get_api()

  try:
    # List of pattern/replacement tuples to strip reply topic from comments.
    reply_name_escaped = re.escape(options[ns+"reply_name"])
    reply_regexes = [(re.compile(" +@"+ reply_name_escaped +" +", re.IGNORECASE), " "),
                     (re.compile(" *@"+ reply_name_escaped +" *", re.IGNORECASE), "")]

    search_args = {"rpp":100, "include_entities":"false", "result_type":"recent"}
    search_args["q"] = "@%s OR from:%s" % (options[ns+"reply_name"], options[ns+"reply_name"])
    if (since_date): search_args["since"] = since_date.strftime("%Y-%m-%d")
    if (until_date): search_args["until"] = until_date.strftime("%Y-%m-%d")
    search_rate = {"reset":None, "limit":0, "remaining":0, "res_family":"search", "res_name":"/search/tweets"}

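    # As in Example #1: the search tuple carries the API call, its kwargs, a
    # fetch cap, and a rate-limit record refreshed by update_rate_info().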
    searches = []
    searches.append(("Search", tweepy_api.search, search_args, 1500, search_rate))

    def update_rate_info():
      # Sets new rate info values for the searches.
      rate_status = tweepy_api.rate_limit_status()
      for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
        rate_info.update(rate_status["resources"][rate_info["res_family"]][rate_info["res_name"]])

    update_rate_info()

    for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
      done = False
      query_count = 0
      results_count = 0
      last_max_id = None

      while (keep_alive_func() and done is False and results_count < search_cap and rate_info["remaining"] > 0):
        results = tweepy_func(**tweepy_func_args)
        rate_info["remaining"] -= 1
        if (not results):
          done = True
          break
        else:
          query_count += 1
          results_count += len(results)
          logging.info("%s Query % 2d: % 3d results." % (search_type, query_count, len(results)))

          last_id = None
          for search_result in results:
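            # max_id is inclusive, so each page re-returns the previous
            # page's oldest tweet; skip that duplicate.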
            if (last_max_id == search_result.id): continue
            last_id = search_result.id

            snark = {}
            snark["user"] = "******" % common.asciify(search_result.from_user)
            snark["msg"] = search_result.text
            for (reply_ptn, reply_rep) in reply_regexes:
              snark["msg"] =  reply_ptn.sub(reply_rep, snark["msg"])
            snark["msg"] = common.asciify(common.html_unescape(snark["msg"]))

            snark["date"] = search_result.created_at

            snark["user_url"] = "http://www.twitter.com/%s" % common.asciify(search_result.from_user)
            snark["msg_url"] = "http://twitter.com/#!/%s/status/%d" % (common.asciify(search_result.from_user), search_result.id)

            if (until_date and snark["date"] > until_date):
              continue  # This snark is too recent.

            if (since_date and snark["date"] < since_date):
              done = True  # This snark is too early.
              break

            snarks.append(snark)

            if (first_msg):
              if (snark["msg"].find(first_msg) != -1):
                done = True  # Found the first comment.
                break

          if (last_id is not None):
            # Dig deeper into the past on the next loop.
            tweepy_func_args["max_id"] = last_id
            last_max_id = last_id
          else:
            # Must've only gotten the "max_id" tweet again.
            done = True
            break

          if (rate_info["reset"] is not None and time.time() >= float(rate_info["reset"])):
            update_rate_info()

            reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
            logging.info("API limit for '%s' reset. Calls left: %d (Until %s)" % (rate_info["res_name"], rate_info["remaining"], reset_string))

      if (done is False and rate_info["remaining"] <= 0):
        logging.warning("Twitter API rate limit truncated results for '%s'." % rate_info["res_name"])
        break  # No more searches.

    update_rate_info()
    logging.info("Twitter API calls left...")
    for (search_type, tweepy_func, tweepy_func_args, search_cap, rate_info) in searches:
      reset_string = datetime.fromtimestamp(float(rate_info["reset"])).strftime("%Y-%m-%d %H:%M:%S")
      logging.info("'%s': %d (Until %s)." % (rate_info["res_name"], rate_info["remaining"], reset_string))
    logging.info("Current Time: %s" % datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

  except Exception:
    logging.exception("Parser failed.")
    raise common.ParserError("Parser failed.")

  snarks = sorted(snarks, key=lambda k: k["date"])

  # Drop duplicates from multiple passes.
  snarks = uniquify_list(snarks)

  if (first_msg):
    first_index = -1
    for i in range(len(snarks)):
      if (snarks[i]["msg"].find(first_msg) != -1):
        # Finally reached the expected first msg.
        first_index = i
    if (first_index >= 0):
      snarks = snarks[first_index:]
    else:
      logging.warning("first_msg string \"%s\" was not found." % first_msg)
      snarks = []

  return snarks
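
As above, a hedged usage sketch; reply_name is the only required option, and the screen name shown is a placeholder, not a real account:

options = {
  ns + "reply_name": "SomeScreenName",  # the account to search (no "@")
}
try:
  snarks = fetch_snarks(None, None, options=options)
except common.ParserError:
  logging.error("Twitter search parser failed.")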