Example #1
def main():
    # producer and get_random_sentence are defined at module level; the loop
    # publishes synthetic mentions to the "hacker-news" topic at random
    # sub-second intervals.
    ensure_topics_exist()
    while True:
        sample_text = get_random_sentence()
        mention = Mention(
            text=sample_text,
            url="https://www.google.com",
            source="hacker-news",
            origin_date=datetime.utcnow(),
            download_date=datetime.utcnow(),
            metadata=HackerNewsMetadata(author="YOLO author"),
        )
        producer.send("hacker-news", mention.to_json())
        logging.info(f"SENT: {sample_text}")
        time.sleep(random.uniform(0, 1))
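
Every example on this page builds the same Mention object. Its definition is not part of this listing; the block below is only a sketch inferred from the call sites, assuming a pydantic v1 model (the ValidationError handling in the later examples suggests one) with to_json/from_json convenience methods. Field names come from the constructor calls; the types are guesses.

# A minimal sketch, not the project's actual definition: field types and the
# to_json/from_json helpers are assumptions inferred from the call sites.
from datetime import datetime
from typing import Any, Optional

from pydantic import BaseModel

class Mention(BaseModel):
    id: Optional[str] = None
    text: str
    url: str
    origin_date: datetime
    download_date: datetime
    source: str
    metadata: Any  # HackerNewsMetadata, RedditMetadata, TwitterMentionMetadata, ...

    def to_json(self) -> str:
        return self.json()

    @classmethod
    def from_json(cls, raw: str) -> "Mention":
        return cls.parse_raw(raw)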
Example #2
    def put(self, mention: Mention):
        # Create the Kinesis producer lazily on the first put; connection
        # failures surface as SinkNotAvailableError.
        if self._producer is None:
            try:
                self._producer = KinesisProducer(stream_name="sentinel-stream")
            except Exception as e:
                raise SinkNotAvailableError from e

        self._producer.put(mention.to_json())
Example #3
    def put(self, mention: Mention):
        # Same lazy-initialization pattern as the Kinesis sink in Example #2,
        # but backed by Kafka; each mention is routed to the topic named
        # after its source.
        if self._producer is None:
            self._ensure_topics_exist()
            self._producer = kafka.KafkaProducer(
                bootstrap_servers=[self.KAFKA_URL],
                value_serializer=lambda m: json.dumps(m).encode("utf8"),
            )

        self._producer.send(mention.source, mention.to_json())
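
Examples #2 and #3 expose the same put(mention) surface over two different backends, each creating its producer lazily on the first call. A sketch of how a collector might be wired to either one; KafkaSink, KinesisSink, and collector are hypothetical names, not from this listing:

# Hypothetical wiring; the sink classes wrap the put() methods shown in
# Examples #2 and #3.
sink = KafkaSink()  # or KinesisSink(), the interface is the same
for mention in collector.stream_comments():
    sink.put(mention)  # the underlying producer is created on the first put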
Example #4
def clean_mention_text(mention, text_clean_func):
    # Return a copy of the mention with its text passed through
    # text_clean_func; every other field is preserved as-is.
    return Mention(
        id=mention.id,
        text=text_clean_func(mention.text),
        url=mention.url,
        origin_date=mention.origin_date,
        download_date=mention.download_date,
        source=mention.source,
        metadata=mention.metadata,
    )
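
Because clean_mention_text returns a fresh Mention instead of mutating its argument, cleaning functions compose naturally. A small usage sketch; normalize_whitespace is a made-up cleaner and mention is assumed to be an existing Mention instance:

# Illustrative cleaner, not part of the project.
def normalize_whitespace(text: str) -> str:
    return " ".join(text.split())

cleaned = clean_mention_text(mention, normalize_whitespace)
assert cleaned.url == mention.url  # every field except text is carried over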
Example #5
def _create_hn_mention(comment_json) -> Mention:
    # comment_json arrives as UTF-8 encoded JSON bytes.
    comment = json.loads(comment_json.decode("utf-8"))
    try:
        metadata = HackerNewsMetadata(author=comment["author"])
        return Mention(
            text=clean_html(comment["body"]),
            url=f'https://news.ycombinator.com/item?id={comment["id"]}',
            origin_date=datetime.utcnow(),
            download_date=datetime.utcnow(),
            source="hacker-news",
            metadata=metadata,
        )
    except ValidationError as e:
        raise ValueError("Data parsing error", str(e), str(comment)) from e
Example #6
    def stream_comments(self) -> Iterator[Mention]:
        for tweet in self._get_stream():
            twitter_mention_metadata = self.create_twitter_mention_metadata(
                tweet)
            username = tweet["user"]["screen_name"]
            url = f"https://twitter.com/{username}/status/{tweet['id_str']}"
            yield Mention(
                text=tweet["text"],
                url=url,
                origin_date=datetime.strptime(tweet["created_at"],
                                              "%a %b %d %H:%M:%S +0000 %Y"),
                download_date=datetime.utcnow(),
                source="twitter",
                metadata=twitter_mention_metadata,
            )
Example #7
def create_gn_mention(article: Dict) -> Mention:
    text = " ".join(
        filter(None, [article["title"], article["description"], article["content"]])
    )

    try:
        article_metadata = create_gn_mention_metadata(article)

        return Mention(
            text=text,
            url=article["url"],
            # NewsAPI's publishedAt is an ISO-8601 string; it is handed to
            # Mention as-is and left for the model to coerce.
            origin_date=article["publishedAt"],
            download_date=datetime.utcnow(),
            source="google-news",
            metadata=article_metadata,
        )
    except ValidationError as e:
        raise ValueError("Data parsing error", str(e), str(article)) from e
Example #8
    def download_mentions(self, keywords: List[str], since: datetime,
                          until: datetime) -> Iterator[Mention]:
        for keyword in keywords:
            response = self._search(keyword, since, until)
            for hit in response:
                try:
                    hn_metadata = self.create_hn_mention_metadata(hit)
                    yield Mention(
                        text=clean_html(hit["comment_text"]),
                        url=hit["story_url"],
                        origin_date=datetime.strptime(hit["created_at"],
                                                      "%Y-%m-%dT%H:%M:%S.%fZ"),
                        download_date=datetime.utcnow(),
                        source="hacker-news",
                        metadata=hn_metadata,
                    )
                except ValidationError as e:
                    raise ValueError("Data parsing error", str(e),
                                     str(hit)) from e
Example #9
def map_reddit_comment(comment: praw.models.Comment) -> Mention:
    try:
        metadata = RedditMetadata(
            redditor=comment.author.id,
            redditor_link_karma=comment.author.link_karma,
            redditor_comment_karma=comment.author.comment_karma,
            score=comment.score,
            submission=comment.submission.id,
        )

        return Mention(
            text=comment.body,
            url="https://reddit.com" + comment.permalink,
            # created_utc is a Unix timestamp in UTC; utcfromtimestamp keeps
            # it consistent with the UTC download_date below (fromtimestamp
            # would convert to local time).
            origin_date=datetime.utcfromtimestamp(comment.created_utc),
            download_date=datetime.utcnow(),
            source="reddit",
            metadata=metadata,
        )
    except ValidationError as e:
        raise ValueError("Data parsing error", str(e), str(comment)) from e
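
One caveat worth noting: PRAW returns comment.author as None when the account has been deleted, so the metadata lookup above can raise AttributeError, which the ValidationError handler does not catch. A defensive variant might guard first; this is a sketch, not the project's code:

# Sketch of a guard against deleted accounts; the error message is made up.
if comment.author is None:
    raise ValueError("Comment author deleted", str(comment))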
Example #10
    def download_mentions(self, keywords: List[str], since: datetime,
                          until: datetime) -> Iterator[Mention]:
        query = self._build_query(keywords)
        tweet_generator = self._search(query, since, until)
        for tweet in tweet_generator:
            try:
                twitter_mention_metadata = self.create_twitter_mention_metadata(
                    tweet)
                username = tweet.user.screen_name
                url = f"https://twitter.com/{username}/status/{tweet.id_str}"
                yield Mention(
                    text=tweet.text,
                    url=url,
                    origin_date=tweet.created_at,
                    download_date=datetime.utcnow(),
                    source="twitter",
                    metadata=twitter_mention_metadata,
                )
            except ValidationError as e:
                raise ValueError("Data parsing error", str(e),
                                 str(tweet)) from e
Example #11
def analize_and_save(event, context):
    data = []

    for record in event["Records"]:
        payload = base64.b64decode(record["kinesis"]["data"]).decode("utf-8")
        # Amazon Comprehend's detect_sentiment accepts at most 5,000 bytes of
        # text; len() measures the payload's byte length (getsizeof would add
        # Python object overhead to the count).
        if len(payload.encode("utf-8")) <= 5000:
            mention = Mention.from_json(payload)
            sentiment_data = client.detect_sentiment(Text=mention.text,
                                                     LanguageCode="en")
            sentiment_score = map_sentiment_value(sentiment_data)
            data.append((mention, sentiment_score))

    result = save_to_db(data)

    for mention_db in result:
        print(mention_db)

    body = {"message": f"{result}", "input": event}

    response = {"statusCode": 200, "body": json.dumps(body)}

    return response
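
map_sentiment_value is not shown on this page. boto3's detect_sentiment returns a dict with a Sentiment label and a SentimentScore breakdown (Positive, Negative, Neutral, Mixed), so one plausible reading is a signed score; the mapping below is a guess, not the project's implementation:

# Assumed helper: collapse Comprehend's score breakdown into one number.
def map_sentiment_value(sentiment_data: dict) -> float:
    scores = sentiment_data["SentimentScore"]
    return scores["Positive"] - scores["Negative"]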
Example #12
def to_mention(data_tuple):
    # data_tuple is a (key, value) record. The value was JSON-encoded twice:
    # once by to_json() and again by the producer's json.dumps serializer
    # (see Example #3), so one json.loads unwraps it back to the JSON string
    # that from_json expects.
    _, mention_raw = data_tuple
    return Mention.from_json(json.loads(mention_raw))
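
A quick round-trip check of the double decoding, with made-up values; json.dumps(mention.to_json()) mimics the double encoding performed by Example #3's value_serializer:

# Illustrative round trip; mention is assumed to be an existing Mention.
raw_value = json.dumps(mention.to_json())
restored = to_mention(("some-key", raw_value))
assert restored.text == mention.text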