def test_get_tweet_urls() -> None:
    """Test get_tweet_urls()."""
    # plain tweet: urls come straight out of entities.urls[].expanded_url
    simple_tweet = {
        'entities': {
            'urls': [
                {'expanded_url': 'foo'},
                {'expanded_url': 'bar'},
            ],
        },
    }
    assert sorted(get_tweet_urls(simple_tweet)) == ['bar', 'foo']

    # retweet: urls from both the tweet itself and the retweeted status are returned
    retweet = {
        'entities': {
            'urls': [{'expanded_url': 'url foo'}, {'expanded_url': 'url bar'}],
        },
        'retweeted_status': {
            'entities': {
                'urls': [{'expanded_url': 'rt url foo'}, {'expanded_url': 'rt url bar'}],
            },
        },
    }
    expected = ['url bar', 'url foo', 'rt url foo', 'rt url bar']
    assert sorted(get_tweet_urls(retweet)) == sorted(expected)
def get_post_urls(self, post: dict) -> list:
    """Given a post, return a list of urls included in the post."""
    data = post['data']

    # the tweet payload may be nested either one or two levels deep
    if 'data' in data and 'tweet' in data['data']:
        return get_tweet_urls(data['data']['tweet'])

    if 'tweet' in data:
        return get_tweet_urls(data['tweet'])

    # no tweet payload found; defer to the parent implementation
    return super().get_post_urls(post)
def _add_tweet_story(
        db: DatabaseHandler,
        topic: Dict[str, Any],
        tweet: dict,
        topic_fetch_urls: List[Dict[str, Any]]) -> dict:
    """Generate a story based on the given tweet, as returned by the twitter api."""
    screen_name = tweet['user']['screen_name']
    content = tweet['text']
    tweet_date = tweet['created_at']
    title = f"{screen_name}: {content}"
    url = f"https://twitter.com/{screen_name}/status/{tweet['id']}"

    # create the story for the tweet itself and attach it to the topic
    story = generate_story(db=db, url=url, content=content, title=title, publish_date=tweet_date)
    add_to_topic_stories(db=db, story=story, topic=topic, link_mined=True)

    # point every fetch url that produced this tweet at the new story
    for topic_fetch_url in topic_fetch_urls:
        topic_fetch_url = _log_tweet_added(db, topic_fetch_url, story)
        try_update_topic_link_ref_stories_id(db, topic_fetch_url)

    # add a topic link for each url mentioned in the tweet, skipping self links
    for tweet_url in get_tweet_urls(tweet):
        if skip_self_linked_domain_url(db, topic['topics_id'], story['url'], tweet_url):
            log.debug("skipping self linked domain url...")
            continue

        topic_link = {
            'topics_id': topic['topics_id'],
            'stories_id': story['stories_id'],
            'url': tweet_url,
        }
        db.create('topic_links', topic_link)
        increment_domain_links(db, topic_link)

    return story
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None:
    """Reparse the tweet json for a given topic and try to reinsert all tweet urls.

    Iterates every topic_post belonging to the topic, decodes the stored tweet
    json, extracts urls from the tweet payload and reinserts them via
    _insert_post_urls().
    """
    # BUG FIX: the WHERE clause previously referenced a bare `topics_id`, but all
    # three joined tables have a topics_id column (both ON clauses use it), which
    # makes the unqualified reference ambiguous in PostgreSQL. Qualify it.
    topic_posts_ids = db.query(
        """
            SELECT topic_posts.topic_posts_id
            FROM topic_posts
                INNER JOIN topic_post_days ON
                    topic_posts.topics_id = topic_post_days.topics_id AND
                    topic_posts.topic_post_days_id = topic_post_days.topic_post_days_id
                INNER JOIN topic_seed_queries ON
                    topic_post_days.topics_id = topic_seed_queries.topics_id AND
                    topic_post_days.topic_seed_queries_id = topic_seed_queries.topic_seed_queries_id
            WHERE topic_posts.topics_id = %(topics_id)s
        """,
        {
            'topics_id': topic['topics_id'],
        }
    ).flat()

    num_posts = len(topic_posts_ids)
    for (i, topic_posts_id) in enumerate(topic_posts_ids):
        if i % 1000 == 0:
            # lazy %-args so the message is only formatted when emitted
            log.info('regenerate tweet urls: %d/%d', i, num_posts)

        topic_post = db.require_by_id('topic_posts', topic_posts_id)
        data = decode_json(topic_post['data'])
        urls = get_tweet_urls(data['data']['tweet'])
        _insert_post_urls(db, topic_post, urls)
def _get_post_urls(post: dict) -> list: """Given a post, return a list of urls included in the post.""" # let the underlying module pass the urls in a field rather than parsing them out try: return post['urls'] except: pass # for ch tweets, find the tweets in the tweet payload so that we get the expanded urls rather than ti.co's if 'data' in post['data'] and 'tweet' in post['data']['data']: return get_tweet_urls(post['data']['data']['tweet']) elif 'tweet' in post['data']: return get_tweet_urls(post['data']['tweet']) links = [] for url in re.findall(r'https?://[^\s\")]+', post['content']): url = re.sub(r'\W+$', '', url) links.append(url) return links
def _get_post_urls(post: dict) -> list: """Given a post, return a list of urls included in the post.""" if 'urls' in post: return post['urls'] if 'tweet' in post: return get_tweet_urls(post['tweet']) links = [] for url in re.findall(r'https?://[^\s\")]+', post['content']): url = re.sub(r'\W+$', '', url) links.append(url) return links
def regenerate_post_urls(db: DatabaseHandler, topic: dict) -> None:
    """Reparse the tweet json for a given topic and try to reinsert all tweet urls.

    Iterates every topic_post for the topic, decodes the stored tweet json and
    reinserts the urls found in the tweet payload via _insert_post_urls().
    """
    # BUG FIX: `topics_id` was unqualified; if topic_post_days also carries a
    # topics_id column (as the rest of the schema suggests) the reference is
    # ambiguous under the `using` join. Qualifying it as tt.topics_id is
    # unambiguous and selects the intended column either way.
    topic_posts_ids = db.query(
        """
            select tt.topic_posts_id
                from topic_posts tt
                    join topic_post_days ttd using ( topic_post_days_id )
                where tt.topics_id = %(a)s
        """,
        {'a': topic['topics_id']}
    ).flat()

    num_posts = len(topic_posts_ids)
    for (i, topic_posts_id) in enumerate(topic_posts_ids):
        if i % 1000 == 0:
            # lazy %-args so the message is only formatted when emitted
            log.info('regenerate tweet urls: %d/%d', i, num_posts)

        topic_post = db.require_by_id('topic_posts', topic_posts_id)
        data = decode_json(topic_post['data'])
        urls = get_tweet_urls(data['data']['tweet'])
        _insert_post_urls(db, topic_post, urls)