def main():
    """Continuously publish fake Hacker News mentions to Kafka.

    Runs forever: each iteration wraps a random sentence in a Mention,
    sends it to the "hacker-news" topic, and sleeps up to one second.
    Intended as a load/demo generator, not a real downloader.
    """
    ensure_topics_exist()
    while True:
        sample_text = get_random_sentence()
        # One timestamp for both fields — the two back-to-back utcnow()
        # calls previously produced slightly different values.
        # NOTE(review): naive UTC; consider datetime.now(timezone.utc).
        now = datetime.utcnow()
        mention = Mention(
            text=sample_text,
            url="https://www.google.com",
            source="hacker-news",
            origin_date=now,
            download_date=now,
            metadata=HackerNewsMetadata(author="YOLO author"),
        )
        producer.send("hacker-news", mention.to_json())
        # Lazy %-style args: the message is only formatted if INFO is enabled.
        logging.info("SENT: %s", sample_text)
        time.sleep(random.uniform(0, 1))
def put(self, mention: Mention):
    """Publish *mention* (as JSON) to the Kinesis stream.

    The underlying producer is created lazily on first use.

    Raises:
        SinkNotAvailableError: if the Kinesis producer cannot be created.
    """
    if self._producer is None:
        try:
            producer = KinesisProducer(stream_name="sentinel-stream")
        except Exception as e:
            raise SinkNotAvailableError from e
        self._producer = producer
    self._producer.put(mention.to_json())
def put(self, mention: Mention):
    """Send *mention* (as JSON) to the Kafka topic named after its source.

    On first use, verifies the expected topics exist and then builds the
    producer; subsequent calls reuse it.
    """
    if self._producer is None:
        self._ensure_topics_exist()
        serialize = lambda payload: json.dumps(payload).encode("utf8")
        self._producer = kafka.KafkaProducer(
            bootstrap_servers=[self.KAFKA_URL],
            value_serializer=serialize,
        )
    self._producer.send(mention.source, mention.to_json())
def clean_mention_text(mention, text_clean_func):
    """Return a copy of *mention* with its text run through *text_clean_func*.

    Every other field is carried over unchanged; the input mention is
    not mutated.
    """
    cleaned_text = text_clean_func(mention.text)
    return Mention(
        id=mention.id,
        text=cleaned_text,
        url=mention.url,
        origin_date=mention.origin_date,
        download_date=mention.download_date,
        source=mention.source,
        metadata=mention.metadata,
    )
def _create_hn_mention(comment_json) -> Mention:
    """Build a Hacker News Mention from a raw UTF-8 JSON comment payload.

    Raises:
        ValueError: if the decoded comment fails model validation.
    """
    comment = json.loads(str(comment_json, "utf-8"))
    try:
        return Mention(
            text=clean_html(comment["body"]),
            url=f'https://news.ycombinator.com/item?id={comment["id"]}',
            origin_date=datetime.utcnow(),
            download_date=datetime.utcnow(),
            source="hacker-news",
            metadata=HackerNewsMetadata(author=comment["author"]),
        )
    except ValidationError as e:
        raise ValueError("Data parsing error", str(e), str(comment)) from e
def stream_comments(self) -> Iterator[Mention]:
    """Yield a Mention for every tweet arriving on the live stream."""
    created_at_format = "%a %b %d %H:%M:%S +0000 %Y"
    for tweet in self._get_stream():
        screen_name = tweet["user"]["screen_name"]
        yield Mention(
            text=tweet["text"],
            url=f"https://twitter.com/{screen_name}/status/{tweet['id_str']}",
            origin_date=datetime.strptime(tweet["created_at"], created_at_format),
            download_date=datetime.utcnow(),
            source="twitter",
            metadata=self.create_twitter_mention_metadata(tweet),
        )
def create_gn_mention(article: Dict) -> Mention:
    """Build a Google News Mention from a NewsAPI-style article dict.

    The text is the title, description, and content joined with spaces,
    skipping any that are missing/empty.

    Raises:
        ValueError: if the article fails model validation.
    """
    fragments = [article["title"], article["description"], article["content"]]
    text = " ".join(fragment for fragment in fragments if fragment)
    try:
        return Mention(
            text=text,
            url=article["url"],
            # NOTE(review): publishedAt is forwarded as-is, while the other
            # downloaders parse origin_date into a datetime — confirm the
            # Mention model coerces this string.
            origin_date=article["publishedAt"],
            download_date=datetime.utcnow(),
            source="google-news",
            metadata=create_gn_mention_metadata(article),
        )
    except ValidationError as e:
        raise ValueError("Data parsing error", str(e), str(article)) from e
def download_mentions(self, keywords: List[str], since: datetime, until: datetime) -> Iterator[Mention]:
    """Yield Hacker News Mentions matching *keywords* between *since* and *until*.

    Raises:
        ValueError: if a search hit fails model validation.
    """
    created_at_format = "%Y-%m-%dT%H:%M:%S.%fZ"
    for keyword in keywords:
        for hit in self._search(keyword, since, until):
            try:
                yield Mention(
                    text=clean_html(hit["comment_text"]),
                    url=hit["story_url"],
                    origin_date=datetime.strptime(hit["created_at"], created_at_format),
                    download_date=datetime.utcnow(),
                    source="hacker-news",
                    metadata=self.create_hn_mention_metadata(hit),
                )
            except ValidationError as e:
                raise ValueError("Data parsing error", str(e), str(hit)) from e
def map_reddit_comment(comment: praw.models.Comment) -> Mention:
    """Convert a praw Comment into a Mention.

    Raises:
        ValueError: when the comment cannot be mapped — either the model
            fails validation, or the comment's author is unavailable
            (praw returns author=None for deleted/suspended accounts, so
            .id/.link_karma access raises AttributeError, which previously
            escaped this function unhandled).
    """
    try:
        metadata = RedditMetadata(
            redditor=comment.author.id,
            redditor_link_karma=comment.author.link_karma,
            redditor_comment_karma=comment.author.comment_karma,
            score=comment.score,
            submission=comment.submission.id,
        )
        return Mention(
            text=comment.body,
            url="https://reddit.com" + comment.permalink,
            origin_date=datetime.fromtimestamp(comment.created_utc),
            download_date=datetime.utcnow(),
            source="reddit",
            metadata=metadata,
        )
    except (ValidationError, AttributeError) as e:
        raise ValueError("Data parsing error", str(e), str(comment)) from e
def download_mentions(self, keywords: List[str], since: datetime, until: datetime) -> Iterator[Mention]:
    """Yield Twitter Mentions for *keywords* posted between *since* and *until*.

    Raises:
        ValueError: if a tweet fails model validation.
    """
    query = self._build_query(keywords)
    for tweet in self._search(query, since, until):
        try:
            metadata = self.create_twitter_mention_metadata(tweet)
            screen_name = tweet.user.screen_name
            yield Mention(
                text=tweet.text,
                url=f"https://twitter.com/{screen_name}/status/{tweet.id_str}",
                origin_date=tweet.created_at,
                download_date=datetime.utcnow(),
                source="twitter",
                metadata=metadata,
            )
        except ValidationError as e:
            raise ValueError("Data parsing error", str(e), str(tweet)) from e
def analize_and_save(event, context):
    """AWS Lambda handler: run sentiment analysis on Kinesis-delivered mentions and persist them.

    For each Kinesis record: base64-decodes the payload, skips payloads
    exceeding Comprehend's 5000-byte text limit, detects sentiment via
    Comprehend, and saves all (mention, score) pairs to the database.
    Returns a 200 response echoing the save result and input event.
    """
    data = []
    for record in event["Records"]:
        payload = base64.b64decode(record["kinesis"]["data"]).decode("utf-8")
        # len() of the encoded payload is its actual byte length;
        # sys.getsizeof measured the bytes *object* (header overhead
        # included), overstating the size against the 5000-byte limit.
        if len(payload.encode("utf-8")) <= 5000:
            mention = Mention.from_json(payload)
            sentiment_data = client.detect_sentiment(Text=mention.text, LanguageCode="en")
            sentiment_score = map_sentiment_value(sentiment_data)
            data.append((mention, sentiment_score))
    result = save_to_db(data)
    for mention_db in result:
        print(f"{mention_db}")
    body = {"message": f"{result}", "input": event}
    response = {"statusCode": 200, "body": json.dumps(body)}
    return response
def to_mention(data_tuple):
    """Deserialize a (key, raw_value) stream tuple into a Mention.

    The value is JSON-decoded once before Mention.from_json — presumably
    because the Kafka producer's value_serializer json.dumps the already
    serialized mention (double encoding); verify against the producer.
    """
    raw_value = data_tuple[1]
    return Mention.from_json(json.loads(raw_value))