Beispiel #1
0
 def _split_hashes(self):
     """
     Splits hashtags via feature_extraction.freq_splitter.split_text (q.v.)

     :return: list of strings, one split result per entry of self.hashtags
     """
     # split_text returns a tuple; index 1 carries the split string.
     # Comprehension replaces the original manual loop-and-append.
     return [split_text(hashtag)[1] for hashtag in self.hashtags]
Beispiel #2
0
 def _split_hashes(self):
     """
     Splits hashtags via feature_extraction.freq_splitter.split_text (q.v.)

     :return: list of strings (the element at index 1 of each split_text result)
     """
     # Idiomatic list comprehension instead of loop + append (ruff PERF401).
     return [split_text(tag)[1] for tag in self.hashtags]
 def extract_tweet(self, tweet, splitter=None):
     """
     Extracts relevant tweet-related information from a tweet.

     :param tweet: the tweet to process
     :type tweet: dict
     :param splitter: tokenizing function. Defaults to nltk.tokenize.wordpunct_tokenize (q.v.)
     :return: (string, dict) tuple — the tweet's id_str and the extracted-feature dict
     """
     d = {
         "hashtags": [],
         "split_hashtags": [],
         "urls": [],
         "domains": [],
         "day": -1,          # -1 signals "unknown": created_at missing or unparseable
         "hour": -1,
         "is_weekday": -1,
         "words": [],
         "mentions": [],
         "user": "",
     }
     splitter = splitter or wordpunct_tokenize
     cleaned_text = re.sub(self.clean_pat, "", tweet.get("text", ""))
     self.tweets.append(tweet)
     d["words"] = splitter(cleaned_text)
     entities = tweet.get("entities", {})
     d["hashtags"] = [hashtag.get("text", "") for hashtag in entities.get("hashtags", [])]
     d["user"] = tweet.get("user", {}).get("id_str", "")
     d["mentions"] = [user.get("id_str", "") for user in entities.get("user_mentions", [])]
     d["split_hashtags"] = [split_text(hashtag)[1] for hashtag in d["hashtags"]]
     # BUG FIX: entities.get("urls") returned None when the key was absent,
     # making the for-loop raise TypeError; default to an empty list.
     for url in entities.get("urls", []):
         # BUG FIX: the original passed the url entity *dict* to requests.get,
         # which always raised and fell through to the except branch, so the
         # redirect-resolution intent (recording r.url) never ran. Extract the
         # URL string first, preferring expanded_url > display_url > url.
         target = url.get("expanded_url", url.get("display_url", url.get("url", "")))
         try:
             r = requests.get(target)
             d["urls"].append(r.url)
         except (requests.RequestException, HTTPError, URLError, TypeError):
             # Best effort: keep the unresolved entity URL when the request fails.
             d["urls"].append(target)
     for url in d["urls"]:
         m = re.match(self.domain_pat, url)
         d["domains"].append(m.group(1) if m else None)
     d["day"], d["hour"] = self.parse_created(tweet.get("created_at"))
     if d["day"] != -1:
         # BUG FIX: assuming parse_created follows datetime.weekday() numbering
         # (0=Monday), days 5 and 6 are the weekend, so day > 4 means NOT a
         # weekday — the original set is_weekday = 1 for those days (inverted).
         # TODO(review): confirm parse_created's day numbering convention.
         d["is_weekday"] = 0 if d["day"] > 4 else 1
     return tweet.get("id_str"), d
Beispiel #4
0
 def extract_tweet(self, tweet, splitter=None):
     """
     Extracts relevant tweet-related information from a tweet.

     :param tweet: the tweet to process
     :type tweet: dict
     :param splitter: tokenizing function. Defaults to nltk.tokenize.wordpunct_tokenize (q.v.)
     :return: (string, dict) tuple of (tweet id_str, feature dict)
     """
     d = {"hashtags": [], "split_hashtags": [], "urls": [], "domains": [],
          # -1 means "unknown" (created_at absent or unparseable)
          "day": -1, "hour": -1, "is_weekday": -1,
          "words": [], "mentions": [], "user": ""}
     splitter = splitter or wordpunct_tokenize
     cleaned_text = re.sub(self.clean_pat, "", tweet.get("text", ""))
     self.tweets.append(tweet)
     d["words"] = splitter(cleaned_text)
     entities = tweet.get("entities", {})
     d["hashtags"] = [hashtag.get("text", "") for hashtag in entities.get("hashtags", [])]
     d["user"] = tweet.get("user", {}).get("id_str", "")
     d["mentions"] = [user.get("id_str", "") for user in entities.get("user_mentions", [])]
     d["split_hashtags"] = [split_text(hashtag)[1] for hashtag in d["hashtags"]]
     # BUG FIX: missing default — entities.get("urls") was None for tweets
     # without a "urls" entity, and iterating None raises TypeError.
     for url_entity in entities.get("urls", []):
         # BUG FIX: requests.get was called with the entity *dict*, which always
         # raised into the except branch, defeating redirect resolution (r.url).
         # Pull out the URL string (expanded_url > display_url > url) first.
         raw_url = url_entity.get("expanded_url",
                                  url_entity.get("display_url", url_entity.get("url", "")))
         try:
             r = requests.get(raw_url)
             d["urls"].append(r.url)
         except (requests.RequestException, HTTPError, URLError, TypeError):
             # Fall back to the unresolved entity URL on any request failure.
             d["urls"].append(raw_url)
     for url in d["urls"]:
         m = re.match(self.domain_pat, url)
         d["domains"].append(m.group(1) if m else None)
     d["day"], d["hour"] = self.parse_created(tweet.get("created_at"))
     if d["day"] != -1:
         # BUG FIX: with datetime.weekday() numbering (0=Monday), day > 4 is the
         # weekend, so is_weekday must be 0 there — the original was inverted.
         # TODO(review): confirm parse_created's day numbering convention.
         d["is_weekday"] = 0 if d["day"] > 4 else 1
     return tweet.get("id_str"), d