# Imports these methods rely on; in the actual module they belong at the
# top of the file.
import re
from urllib.error import HTTPError, URLError

import requests
from nltk.tokenize import wordpunct_tokenize

from feature_extraction.freq_splitter import split_text


def _split_hashes(self):
    """
    Splits hashtags via feature_extraction.freq_splitter.split_text (q.v.)

    :return: list of strings
    """
    return [split_text(hashtag)[1] for hashtag in self.hashtags]
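# Usage sketch (hypothetical values): split_text's exact return shape is
# not shown in this module; the [1] indexing above implies it returns a
# sequence whose second element is the split string, e.g. something like
# (score, "now playing"). Under that assumption:
#
#     self.hashtags = ["nowplaying", "machinelearning"]
#     self._split_hashes()  # -> ["now playing", "machine learning"]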
def extract_tweet(self, tweet, splitter=None):
    """
    Extracts relevant tweet-related information from a tweet.

    :param tweet: the tweet to process
    :type tweet: dict
    :param splitter: tokenizing function. Defaults to
        nltk.tokenize.wordpunct_tokenize (q.v.)
    :return: (string, dict) tuple
    """
    d = {
        "hashtags": [],
        "split_hashtags": [],
        "urls": [],
        "domains": [],
        "day": -1,
        "hour": -1,
        "is_weekday": -1,
        "words": [],
        "mentions": [],
        "user": "",
    }
    splitter = splitter or wordpunct_tokenize
    cleaned_text = re.sub(self.clean_pat, "", tweet.get("text", ""))
    self.tweets.append(tweet)
    d["words"] = splitter(cleaned_text)
    entities = tweet.get("entities", {})
    d["hashtags"] = [hashtag.get("text", "") for hashtag in entities.get("hashtags", [])]
    d["user"] = tweet.get("user", {}).get("id_str", "")
    d["mentions"] = [user.get("id_str", "") for user in entities.get("user_mentions", [])]
    d["split_hashtags"] = [split_text(hashtag)[1] for hashtag in d["hashtags"]]
    # The "urls" key may be absent, and each entry is an entity dict rather
    # than a URL string, so resolve the best candidate URL before requesting.
    for url in entities.get("urls", []):
        target = url.get("expanded_url", url.get("display_url", url.get("url", "")))
        try:
            # Follow redirects so shortened links resolve to their final URL.
            r = requests.get(target)
            d["urls"].append(r.url)
        except (requests.RequestException, HTTPError, URLError, TypeError):
            d["urls"].append(target)
    for url in d["urls"]:
        m = re.match(self.domain_pat, url)
        d["domains"].append(m.group(1) if m else None)
    d["day"], d["hour"] = self.parse_created(tweet.get("created_at"))
    if d["day"] != -1:
        # Assuming parse_created follows datetime.weekday() (Monday == 0),
        # days 5 and 6 are the weekend.
        d["is_weekday"] = 0 if d["day"] > 4 else 1
    return tweet.get("id_str"), d
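# Usage sketch (hypothetical names and values): extract_tweet expects a
# Twitter API-style status dict and returns an (id_str, features) tuple.
# The instance and its clean_pat / domain_pat attributes are assumed to be
# set up elsewhere in this class.
#
#     tweet = {
#         "id_str": "123",
#         "text": "Check this out #nowplaying https://t.co/abc",
#         "created_at": "Mon Apr 06 22:19:45 +0000 2020",
#         "user": {"id_str": "42"},
#         "entities": {
#             "hashtags": [{"text": "nowplaying"}],
#             "user_mentions": [],
#             "urls": [{"expanded_url": "https://example.com/a"}],
#         },
#     }
#     tweet_id, features = extractor.extract_tweet(tweet)
#     # features["split_hashtags"] -> e.g. ["now playing"]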