def _handle_data(self, url, link, title, date):
    """Normalize the link, title and date of a single feed entry.

    Args:
        url: Address of the feed the entry came from; only compared
            against one hard-coded feed address.
        link: Entry link, passed through ``normalize_urls``.
        title: Entry title text.
        date: Entry date, passed through ``self._parse_date``.

    Returns:
        A ``(link, title, date)`` tuple holding the processed values.
    """
    # Collapse every run of spaces into a single space.
    cleaned_title = re.sub(' +', ' ', title)

    # Bring the link to the uniform format.
    normalized_link = normalize_urls(link)

    # Titles from this one feed additionally get their newlines removed.
    if url == "https://slon.ru/export/all.xml":
        cleaned_title = cleaned_title.replace("\n", "")

    # Convert the date to the common format and time zone
    # (as described by the original comment; done by self._parse_date).
    parsed_date = self._parse_date(date)

    return normalized_link, cleaned_title, parsed_date
def parse_tweet(self, data):
    """Flatten a raw tweet payload into a dict of tweet and author fields.

    Args:
        data: JSON text of a single tweet. It is handed to
            ``self._get_tweet`` as a gate and then decoded with
            ``json.loads``.

    Returns:
        ``None`` when ``self._get_tweet(data)`` returns a falsy value.
        Otherwise a dict with the keys ``url``, ``tw_id``, ``is_retweet``,
        ``created_at``, ``tw_geo``, ``tw_source`` and the ``user_*`` /
        ``screen_name`` author fields. When the payload has no ``user``
        object every author field is ``None`` (previously this case raised
        ``UnboundLocalError``).
    """
    # NOTE(review): the value returned by _get_tweet is used only as a
    # go/no-go check and the payload is decoded again below -- confirm
    # whether _get_tweet already returns the parsed tweet.
    if not self._get_tweet(data):
        return None
    tweet = json.loads(data)

    # "entities" or its "urls" list may be missing; treat both as "no link".
    urls = (tweet.get("entities") or {}).get("urls") or []
    if urls:
        # Only the first URL attached to the tweet is used.
        link = normalize_urls(urls[0]["expanded_url"])
    else:
        link = "NoLink"

    # An absent author becomes an empty mapping so that every lookup below
    # yields None instead of leaving the variables undefined.
    user = tweet.get("user") or {}
    if user:
        user_verified = 1 if user.get("verified") else 0
        user_date_created = self._parse_date(user.get("created_at"))
    else:
        user_verified = None
        user_date_created = None

    contributors = user.get("contributors")
    if contributors:
        user_contributors = ",".join(str(x) for x in contributors)
    else:
        user_contributors = None

    created_at = self._parse_date(tweet.get("created_at"))

    tw_dict = {
        "url": link,
        "tw_id": tweet.get("id"),
        # "retweeted_count": retweeted_count,
        # "favorite_count":favorite_count,
        "is_retweet": 1 if tweet.get("retweeted_status") else 0,
        "created_at": created_at,
        # Any falsy geo value is stored as None.
        "tw_geo": tweet.get("geo") or None,
        "tw_source": tweet.get("source"),
        "user_id": user.get("id_str"),
        "screen_name": user.get("screen_name"),
        "user_followers_count": user.get("followers_count"),
        "user_listed_count": user.get("listed_count"),
        "user_friends_count": user.get("friends_count"),
        "user_favourites_count": user.get("favourites_count"),
        "user_statuses_count": user.get("statuses_count"),
        "user_verified": user_verified,
        "user_avatar": user.get("profile_image_url"),
        "user_date_created": user_date_created,
        "user_location": user.get("location"),
        "user_timezone": user.get("time_zone"),
        "user_contributors": user_contributors,
    }
    return tw_dict