Example #1
def _extract_result(result, keywords, domain=u"google_search"):
    post = Post()
    post.title = result['title']
    post.url = result['link']
    post.domain = domain
    post.content = result['snippet']
    post.post_type = result['kind']
    if 'cacheId' not in result:
        post.guid = compute_author_guid_by_author_name(u'{}_{}'.format(
            post.url, keywords))
    else:
        post.guid = compute_author_guid_by_author_name(u'{}_{}_{}'.format(
            post.url, keywords, result['cacheId']))
    post.post_id = post.guid
    return post
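Every example in this listing leans on the project's GUID helpers, compute_author_guid_by_author_name and compute_post_guid, whose implementations are not shown here. As a rough mental model only (an assumption, not the project's actual code), they can be pictured as deterministic hashes over their string arguments:

# Hypothetical stand-ins for the GUID helpers used throughout these examples.
# Assumption: the real helpers return a deterministic hash of their inputs;
# the exact normalization and hash function may differ in the project.
import hashlib

def compute_author_guid_by_author_name(author_name):
    return hashlib.md5(author_name.lower().encode('utf-8')).hexdigest()

def compute_post_guid(url, post_identifier, date_str):
    key = u'{0}_{1}_{2}'.format(url, post_identifier, date_str)
    return hashlib.md5(key.encode('utf-8')).hexdigest()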
Example #2
    def save_to_db(self):
        self._db.addPosts(self._posts)
        self._db.add_claim_connections(self._claim_tweet_connections)
        self._db.add_claim_connections(self._post_comment_connections)
        authors = []
        reddit_authors = []
        for i, redditor in enumerate(set(self._redditors)):
            for attempt in xrange(self._number_of_attempts):
                try:
                    self._retrive_reddit_author(authors, i, reddit_authors, redditor)
                    print("\rretrieve reddit author {0}/{1}".format(i, len(self._redditors)), end='')
                    break  # success, stop retrying this redditor
                except prawcore.exceptions.ServerError as e:
                    print('Server overload (HTTP 503); saving progress to DB, sleeping, and retrying')
                    self.save_to_db()
                    time.sleep(5)  # originally 30 seconds
                except Exception as e:
                    print('\rretrieve reddit author {0}/{1} exception: {2}'.format(i, len(self._redditors), e.message), end='')
            print()
        for i, redditor in enumerate(set(self._deleted_redditors)):
            author = Author()
            author.name = "deleted"
            author.author_guid = compute_author_guid_by_author_name(redditor)
            author.domain = u'reddit'
            author.author_type = u'deleted'
            authors.append(author)

        self._db.add_authors_fast(authors)
        self._db.add_reddit_authors(reddit_authors)
        self._posts = []
        self._claim_tweet_connections = []
        self._redditors = []
        self._deleted_redditors = []
        self._post_comment_connections = []
Example #3
    def _convert_row_to_post(self, row):
        post = Post()

        claim_id = unicode(row['claim_id'])
        title = unicode(row['title'], errors='replace')
        post.content = title

        description = unicode(row['description'], errors='replace')
        post.description = description

        url = unicode(row['url'])
        post.url = url

        publication_date = row['publication_date']
        post.date = date(publication_date)

        post_guid = compute_post_guid(self._social_network_url, claim_id, publication_date)
        post.guid = post_guid
        post.post_id = post_guid
        post.domain = self._domain
        post.author = self._author_name
        author_guid = compute_author_guid_by_author_name(self._author_name)
        post.author_guid = author_guid
        post.post_osn_guid = post_guid

        keywords = unicode(row['keywords'])
        post.tags = keywords

        post_type = unicode(row['post_type'])
        post.post_type = post_type

        return post
    def _add_post(self, author, date, post_osn_id, score=0, upvote_ratio=-1):
        post = Post()
        post.post_osn_id = post_osn_id
        post.author = str(author)
        post.author_guid = compute_author_guid_by_author_name(post.author)
        post.created_at = str_to_date(date, formate="%d/%m/%Y %H:%M")
        post.url = 'https://www.reddit.com{}'.format(
            post.author)  # just for test
        post.guid = compute_post_guid(post.url, post.post_osn_id,
                                      date_to_str(post.created_at))
        post.domain = 'reddit_comment'
        post.post_type = 'reddit_comment'
        post.post_id = post.guid

        reddit_post = RedditPost()
        reddit_post.post_id = post.post_id
        reddit_post.guid = post.guid
        reddit_post.score = score
        if upvote_ratio != -1:
            post.domain = 'reddit_post'
            post.post_type = 'reddit_post'
            reddit_post.upvote_ratio = upvote_ratio
            reddit_post.ups = int(
                round((reddit_post.upvote_ratio * reddit_post.score) /
                      (2 * reddit_post.upvote_ratio - 1))
                if reddit_post.upvote_ratio != 0.5
                else round(reddit_post.score / 2))
            reddit_post.downs = reddit_post.ups - reddit_post.score
        else:
            reddit_post.ups = -1
            reddit_post.downs = -1
            reddit_post.upvote_ratio = -1

        self._db.addPosts([post, reddit_post])
        return post, reddit_post
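A note on the arithmetic in _add_post: since score = ups - downs and upvote_ratio = ups / (ups + downs), solving for ups gives ups = upvote_ratio * score / (2 * upvote_ratio - 1), which is exactly the expression above; the score / 2 branch only avoids the division by zero when the ratio is 0.5. A quick sanity check of that reconstruction:

# Sanity check (not part of the original code): with ups=75 and downs=25
# we expect score=50 and upvote_ratio=0.75, and the formula recovers them.
score, upvote_ratio = 50, 0.75
ups = int(round(upvote_ratio * score / (2 * upvote_ratio - 1)))  # -> 75
downs = ups - score                                              # -> 25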
    def _add_author(self,
                    name=None,
                    link_karma=None,
                    comment_karma=None,
                    is_employee=0,
                    is_mod=0,
                    is_gold=0,
                    author_osn_id=None):
        author = Author()
        reddit_author = RedditAuthor()
        author.name = name
        author.author_screen_name = author.name
        author.author_guid = compute_author_guid_by_author_name(author.name)
        author.domain = 'reddit'
        author.author_osn_id = author_osn_id
        author.author_full_name = name
        author.url = 'https://www.reddit.com/user/' + name

        reddit_author.name = author.name
        reddit_author.author_guid = author.author_guid

        reddit_author.comments_count = None
        reddit_author.comment_karma = comment_karma
        reddit_author.link_karma = link_karma
        reddit_author.is_gold = is_gold
        reddit_author.is_moderator = is_mod
        reddit_author.is_employee = is_employee

        self._db.add_authors([author])
        self._db.add_reddit_authors([reddit_author])
    def _convert_tweet_to_post(self, tweet, post_type):
        post = Post()

        post.post_osn_id = str(tweet.id)
        post_creation_date = tweet.date
        created_at = str(date_to_str(post_creation_date))
        post.created_at = created_at

        post.date = post_creation_date
        post.favorite_count = tweet.favorites
        post.retweet_count = tweet.retweets
        post.content = str(tweet.text)

        author_name = str(tweet.username)
        post.author = author_name
        post.author_guid = compute_author_guid_by_author_name(author_name)
        post_url = tweet.permalink
        post.url = str(post_url)

        post_guid = compute_post_guid(post_url, author_name, created_at)
        post.guid = post_guid
        post.post_id = post_guid
        post.domain = self._domain

        post.post_type = post_type
        return post
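_convert_tweet_to_post only touches a handful of attributes on the scraped-tweet object (id, date, favorites, retweets, text, username, permalink). A hypothetical stand-in built from those attribute names (values invented here) is enough to exercise it:

# Hypothetical stand-in for the scraped tweet consumed above; the attribute
# names mirror the accesses in _convert_tweet_to_post, the values are made up.
from collections import namedtuple
from datetime import datetime

ScrapedTweet = namedtuple('ScrapedTweet',
                          ['id', 'date', 'favorites', 'retweets',
                           'text', 'username', 'permalink'])
tweet = ScrapedTweet(id=123, date=datetime(2019, 1, 1), favorites=3,
                     retweets=1, text='hello world', username='someone',
                     permalink='https://twitter.com/someone/status/123')
# post = crawler._convert_tweet_to_post(tweet, post_type=u'tweet')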
Example #7
    def _convert_row_to_post(self, row):
        # [site, social_id, username_hash, comment_time, comment_tokens]
        print("\rInsert post to DataFrame {0}/{1}".format(
            self._current_row, len(self.posts_csv_df)),
              end="")
        self._current_row += 1
        date = datetime.datetime.fromtimestamp(row['comment_time'])
        post = Post()
        claim_id = compute_author_guid_by_author_name(str(row['social_id']))
        post.post_id = str(
            compute_post_guid(row['site'] + str(claim_id),
                              row['username_hash'], date_to_str(date)))
        post.content = str(row['comment_tokens'])
        post.author = str(row['username_hash'])
        post.author_guid = str(row['username_hash'])
        post.domain = str(row['site'])
        post.date = date
        self._posts.append(post)

        claim_tweet_connection = Claim_Tweet_Connection()
        claim_tweet_connection.claim_id = str(claim_id)
        claim_tweet_connection.post_id = str(post.post_id)
        self._claim_tweet_connections.append(claim_tweet_connection)

        if self._current_row % self._max_posts_without_save == 0:
            self._save_posts_and_connections()
Example #8
    def _convert_reddit_author_to_author(self, redditor):
        author = Author()
        author.name = getattr(redditor, 'name', '')
        author.author_screen_name = author.name
        author.author_guid = compute_author_guid_by_author_name(author.name)
        author.domain = u'reddit'
        author.created_at = datetime.fromtimestamp(getattr(redditor, 'created_utc', 0))
        author.author_osn_id = getattr(redditor, 'id', '')
        author.author_full_name = getattr(redditor, 'fullname', '')
        author.url = u'https://www.reddit.com/user/' + redditor.name
        return author
    def _json_user_to_db_author_converter(self, user, domain='Instagram_author'):
        author = Author()
        author.name = user['username']
        author.author_screen_name = author.name
        author.author_guid = compute_author_guid_by_author_name(author.name)
        author.domain = domain
        author.author_type = domain
        author.author_osn_id = user['id']
        author.author_full_name = user['full_name']
        author.description = user.setdefault('biography', None)
        author.url = 'https://www.instagram.com/' + author.author_screen_name
        author.profile_image_url = user['profile_pic_url']
        return author
    def _json_comment_to_db_comment_converter(self, post, domain="Instagram_comment"):
        rpost = Post()
        rpost.post_osn_id = str(post['id'])
        rpost.created_at = datetime.fromtimestamp(post['created_at'])
        rpost.author = post['owner']['id']
        rpost.author_guid = compute_author_guid_by_author_name(rpost.author)
        rpost.url = str('https://www.instagram.com/p/{}/'.format(post['shortcode']))
        rpost.content = post['text']
        rpost.guid = compute_post_guid(rpost.url, rpost.post_osn_id, date_to_str(rpost.created_at))
        rpost.domain = domain
        rpost.post_type = domain
        rpost.post_id = rpost.guid
        return rpost
    def _json_post_to_db_post_converter(self, post, domain="Instagram_post"):
        rpost = Post()
        rpost.post_osn_id = str(post['id'])
        rpost.created_at = datetime.fromtimestamp(post['taken_at_timestamp'])
        rpost.author = post['owner']['id']
        rpost.author_guid = compute_author_guid_by_author_name(rpost.author)
        rpost.url = str('https://www.instagram.com/p/{}/'.format(post['shortcode']))
        rpost.content = ', '.join(x['node']['text'] for x in post['edge_media_to_caption']['edges'])
        rpost.guid = compute_post_guid(rpost.url, rpost.post_osn_id, date_to_str(rpost.created_at))
        rpost.domain = domain
        rpost.post_type = domain
        rpost.post_id = rpost.guid
        return rpost
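The Instagram converters above read only a few keys from the JSON they receive. A minimal, hypothetical input for _json_post_to_db_post_converter (field names taken from the dictionary accesses in the code, values invented) looks like this:

# Hypothetical minimal input for _json_post_to_db_post_converter; the keys
# mirror the accesses above, the values are made up.
sample_post = {
    'id': '1234567890',
    'taken_at_timestamp': 1500000000,
    'owner': {'id': '42'},
    'shortcode': 'BVabc123',
    'edge_media_to_caption': {
        'edges': [{'node': {'text': 'caption text'}}],
    },
}
# db_post = crawler._json_post_to_db_post_converter(sample_post)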
Example #12
    def _convert_row_to_claim(self, claim_row):
        claim = Claim()
        claim.claim_id = compute_author_guid_by_author_name(
            str(claim_row['social_id']))
        claim.domain = str(claim_row['site'])
        claim.verdict = str(claim_row['ruling_val'])
        claim.verdict_date = datetime.datetime.fromtimestamp(
            claim_row['ruling_time'])
        self._claims.append(claim)

        attribute_name = self.__class__.__name__ + "_claim_type"
        author_feature = BaseFeatureGenerator.create_author_feature(
            attribute_name, claim.claim_id, claim.verdict, self._window_start,
            self._window_end)
        self._author_features.append(author_feature)
        attribute_name = self.__class__.__name__ + "_claim_id"
        author_feature = BaseFeatureGenerator.create_author_feature(
            attribute_name, claim.claim_id, claim.claim_id, self._window_start,
            self._window_end)
        self._author_features.append(author_feature)
Example #13
    def convert_comment_to_post(self, comment, submission, domain=u"Reddit"):
        post = Post()
        post.post_osn_id = unicode(comment.id)
        post.created_at = datetime.fromtimestamp(comment.created)
        post.date = datetime.fromtimestamp(comment.created)
        if hasattr(comment, 'author') and comment.author:
            post.author = unicode(comment.author.name)
            self._redditors.append(comment.author)
        else:
            self._deleted_redditors.append(str(post.date))
            post.author = unicode('')
        post.author_guid = compute_author_guid_by_author_name(post.author)
        post.url = unicode('https://www.reddit.com' + '/'.join(getattr(comment, 'permalink', '').split('/')[3:7]))
        post.title = unicode(submission.title)
        post.content = unicode(getattr(comment, 'body', u'')).strip()
        post.guid = compute_post_guid(post.url, post.post_osn_id, date_to_str(post.created_at))
        post.domain = domain
        post.post_type = domain
        post.post_id = post.guid
        # The guid above is computed from the URL built at the top; the post
        # URL is then replaced with the comment's full permalink.
        post.url = u'https://www.reddit.com{}'.format(comment.permalink)
        return post
    def _convert_tweet_dict_to_post(self, tweet_dict):
        post = Post()

        post_osn_id = tweet_dict['id_str']
        post.post_osn_id = post_osn_id

        author_osn_id = tweet_dict['author_osn_id']
        author = self._author_osn_id_author_dict[author_osn_id]
        author_screen_name = author.author_screen_name
        post.author = author_screen_name

        post.author_guid = compute_author_guid_by_author_name(
            author_screen_name)

        created_at = tweet_dict['created_at']
        post.created_at = created_at

        creation_date_str = extract_tweet_publiction_date(created_at)
        creation_date = str_to_date(creation_date_str)
        post.date = creation_date

        post.favorite_count = tweet_dict['favorite_count']
        post.retweet_count = tweet_dict['retweet_count']
        post.reply_count = tweet_dict['reply_count']
        post.content = str(tweet_dict['full_text'])
        post.domain = self._domain
        post.language = str(tweet_dict['lang'])

        post_url = "https://twitter.com/{0}/status/{1}".format(
            author_screen_name, post_osn_id)
        post.url = post_url

        post_guid = compute_post_guid(post_url, author_screen_name,
                                      creation_date_str)
        post.guid = post_guid
        post.post_id = post_guid

        return post
    def execute(self, window_start):
        author_screen_names_df = pd.read_csv(self._path_to_file)
        author_screen_names = author_screen_names_df[
            'author_screen_name'].tolist()

        authors = []
        for i, author_screen_name in enumerate(author_screen_names):
            author = Author()
            msg = "\rCreate author: [{0}/{1}]".format(i,
                                                      len(author_screen_names))
            print(msg, end="")

            author.author_screen_name = author_screen_name
            author.name = author_screen_name

            author_guid = compute_author_guid_by_author_name(
                author_screen_name)
            author.author_guid = author_guid

            author.domain = self._domain

            authors.append(author)
        # Authors are persisted via addPosts (presumably a generic bulk-insert helper).
        self._db.addPosts(authors)
Example #16
    def _parse_articles_lst_to_articles(self, all_articles_lst_of_dics):
        print("###### 'Entering _parse_articles_lst_to_articles'")
        parsed_articles_lst = []
        claims_lst = []
        posts_lst = []
        articles_lst = []
        article_items_lst = []

        # Parsing articles list of dictionaries data, received using the API.
        for news_articles_dic in all_articles_lst_of_dics:
            print("###### 'PARSING: {}'".format(str(news_articles_dic)))
            parsed_articles_lst += self._parse_news_article(news_articles_dic)

        # For each news article dictionary commit:
        for parsed_news_article in parsed_articles_lst:
            print("###### 'Iterating parsed_articles_lst single item: {}'".
                  format(str(parsed_news_article)))
            # Building: claim & News_Article & News_Article_Item objects.
            claim = Claim()
            post = Post()
            article = News_Article()
            article_item = News_Article_Item()

            # Initializing Claim object with data:
            identifier = compute_post_guid(parsed_news_article['url'],
                                           parsed_news_article['author'],
                                           parsed_news_article['publishedAt'])
            claim.claim_id = post.post_id = post.guid = post.post_osn_guid = article.post_id = article_item.post_id = unicode(
                identifier)

            author_guid = compute_author_guid_by_author_name(
                parsed_news_article['author'])
            post.author_guid = article.author_guid = article_item.author_guid = unicode(
                author_guid)

            post.author = article.author = unicode(
                parsed_news_article['author'])

            post.title = claim.title = article.title = unicode(
                parsed_news_article['title'])

            post.content = article_item.content = unicode(
                parsed_news_article['content'])

            post.description = claim.description = article.description = unicode(
                parsed_news_article['description'])

            post.date = post.created_at = claim.verdict_date = article.published_date = datetime.datetime.strptime(
                parsed_news_article['publishedAt'], '%Y-%m-%d %H:%M:%S')

            article_item.source_newsapi_internal_id = unicode(
                parsed_news_article['source_id'])

            article_item.source_newsapi_internal_name = unicode(
                parsed_news_article['source_name'])

            post.url = claim.url = article.url = unicode(
                parsed_news_article['url'])

            article_item.img_url = unicode(parsed_news_article['urlToImage'])

            post.post_type = claim.verdict = unicode(
                "TRUE"
            )  # todo: Add constant. We assume all news articles are true.

            post.domain = claim.domain = unicode(
                "NewsSite")  # todo: Add constant.

            # Update objects lists:
            posts_lst.append(post)
            claims_lst.append(claim)
            articles_lst.append(article)
            article_items_lst.append(article_item)
        print("###### 'EXITING _parse_articles_lst_to_articles'")
        return posts_lst, claims_lst, articles_lst, article_items_lst