Exemple #1
0
 def process_item(self, item, spider):
     """Validate a raw URL item and insert it into the database (first phase).

     Parameters
     ----------
     item : dict-like
         Item whose ``'raw'`` entry holds the raw URL. The entry is
         truncated in place when longer than ``MAX_URL_LEN`` and
         ``'site_id'`` is filled in on success.
     spider : object
         Provides ``session`` and ``platform_id`` used for the insert.

     Returns
     -------
     The same `item`, with ``item['site_id']`` set.

     Raises
     ------
     DropItem
         If the URL cannot be parsed or has no hostname, if the hostname
         does not belong to any site in ``self.site_tuples``, or if the
         database insert fails.
     """
     # validate URL length
     if len(item['raw']) > MAX_URL_LEN:
         item['raw'] = item['raw'][:MAX_URL_LEN]
         logger.error('Raw URL too long, trucate it! %r', item['raw'])
     # parse raw URL
     purl = get_parsed_url(item['raw'])
     if purl is None or purl.hostname is None:
         raise DropItem('Invalide URL')
     site_id = belongs_to_site(purl.hostname, self.site_tuples)
     if site_id is None:
         # Exceptions do not interpolate logging-style arguments, so the
         # message has to be formatted before it is raised.
         raise DropItem('Offsite domain: %s' % (item,))
     item['site_id'] = site_id
     # insert URL into table
     try:
         get_or_create_murl(spider.session, item, spider.platform_id)
     except SQLAlchemyError as e:
         logger.error(e)
         # Leave the session usable for the next item.
         spider.session.rollback()
         raise DropItem('Fail to insert database of url: %s' % (item,))
     return item
    def parse_existed_one(self, tw_id, jd, session, g_urls_map, g_uusers_set,
                          g_edges_set):
        """Parse a tweet that is already saved, completing its missing URLs.

        Every URL found by the level 1 parsing that has no entry yet in
        `g_urls_map` is either marked with ``-1`` (longer than
        ``MAX_URL_LEN``) or saved and associated with the tweet. The level 2
        parsing is run afterwards.

        Parameters
        ---------
        tw_id : integer
            Id used as ``tweet_id`` of the ``AssTweetUrl`` records.
        jd : json
            Tweet json data.
        session : object
            Database session used to add, commit and rollback.
        g_urls_map : dict
            Mapping from raw URL to URL id, updated in place.
        g_uusers_set, g_edges_set :
            Passed through unchanged to `self._parse_l2`.
        """
        logger.debug('Parsing tweet %r begin ...', jd['id'])
        logger.debug('Level 1 parsing, roughly parse ...')
        l_urls, l_mentions, l_hashtags = self._parse_l1(jd)
        # Make sure every URL of this tweet has an id in the map.
        for raw_url in l_urls['union']:
            if g_urls_map.get(raw_url) is not None:
                continue
            if len(raw_url) > MAX_URL_LEN:
                logger.warning(
                    'URL %s of tweet %s was ignored because of too long',
                    raw_url, jd['id'])
                # -1 marks a URL that was deliberately not saved.
                g_urls_map[raw_url] = -1
                continue
            logger.warning(
                'Previously incomplete parsing, missing %s of tweet %s',
                raw_url, jd['id'])
            saved_url = get_or_create_murl(session,
                                           data={'raw': raw_url},
                                           platform_id=self.platform_id)
            url_id = saved_url.id
            # Saving AssTweetUrl
            session.add(AssTweetUrl(tweet_id=tw_id, url_id=url_id))
            try:
                session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_url IntegrityError, see: %s', e)
                session.rollback()
            g_urls_map[raw_url] = url_id
        logger.debug('Level 2 parsing, deeply parse ...')
        self._parse_l2(jd, l_urls, l_mentions, g_urls_map, g_uusers_set,
                       g_edges_set)
Exemple #3
0
    def parse(self, jd):
        """Parse one tweet and save it with its URLs and hashtags.

        Parameters
        ---------
        jd : json
            Tweet json data.

        Procedures
        ----------
        1) validate `jd`
        2) extract URLs and hashtags from `jd` and from its quoted and
           retweeted statuses
        3) insert user, tweet and associations into database

        Returns
        -------
        None
            In every case. Invalid tweets, tweets without URL (when
            `self.save_none_url_tweet` is False) and tweets already in the
            database are skipped before anything else is written.
        """
        logger.debug('Parsing one tweet, begin')
        # 1) validation: these three fields are mandatory
        try:
            tw_raw_id = jd['id']
            created_at = utc_from_str(jd['created_at'])
            user_raw_id = jd['user']['id']
        except KeyError as e:
            logger.error('Invalid tweet: %s', e)
            return None
        # 2) collect the statuses that may carry entities: the tweet itself,
        #    its quoted status, its retweeted status and the status quoted
        #    by the retweeted one.
        statuses = [jd]
        if 'quoted_status' in jd:
            statuses.append(jd['quoted_status'])
        if 'retweeted_status' in jd:
            retweeted = jd['retweeted_status']
            statuses.append(retweeted)
            if 'quoted_status' in retweeted:
                statuses.append(retweeted['quoted_status'])
        entities_list = [s['entities'] for s in statuses if 'entities' in s]
        urls_set = set()
        hashtags_set = set()
        for entities in entities_list:
            if not entities:
                continue
            self._parse_entities(entities, urls_set, hashtags_set)
        # This tweet should contain urls
        if not urls_set and self.save_none_url_tweet is False:
            logger.debug('No url found in %s, ignore!', tw_raw_id)
            return None
        # 3) insert into database, user first
        logger.debug('creating user')
        user_row = get_or_create_m(self.session,
                                   TwitterUser,
                                   data={'raw_id': user_raw_id},
                                   fb_uk='raw_id')
        logger.debug('creating tweet')
        tweet_row = Tweet(raw_id=tw_raw_id,
                          json_data=jd,
                          created_at=created_at,
                          user_id=user_row.id)
        self.session.add(tweet_row)
        try:
            self.session.commit()
        except IntegrityError as e:
            # Nothing more to do for a tweet that is already stored.
            logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
            self.session.rollback()
            return None
        else:
            logger.debug('Inserted tweet %r', tw_raw_id)
        # URLs and their association with the tweet, committed one by one
        # so that a failing association only rolls back itself.
        logger.debug('creating urls')
        for raw_url in urls_set:
            url_row = get_or_create_murl(self.session,
                                         data={'raw': raw_url},
                                         platform_id=self.platform_id)
            self.session.add(
                AssTweetUrl(tweet_id=tweet_row.id, url_id=url_row.id))
            try:
                self.session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_url IntegrityError, see: %s', e)
                self.session.rollback()
        # hashtags and their association with the tweet, same policy
        logger.debug('creating hashtags')
        for tag_text in hashtags_set:
            hashtag_row = get_or_create_m(self.session,
                                          Hashtag,
                                          data={'text': tag_text},
                                          fb_uk='text')
            self.session.add(
                AssTweetHashtag(tweet_id=tweet_row.id,
                                hashtag_id=hashtag_row.id))
            try:
                self.session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_hashtag IntegrityError, see: %s', e)
                self.session.rollback()
        # associated tweets are extracted by the database from the stored
        # json_data column
        insert_ass_tweet = """
        INSERT INTO ass_tweet (id, retweeted_status_id, quoted_status_id,
            in_reply_to_status_id)
        SELECT id,
            CAST(json_data#>>'{retweeted_status, id}' AS BIGINT),
            CAST(json_data#>>'{quoted_status, id}' AS BIGINT),
            CAST(json_data->>'in_reply_to_status_id' AS BIGINT)
        FROM tweet
        WHERE id=:tweet_id
        """
        insert_ass_tweet = text(insert_ass_tweet).bindparams(
            tweet_id=tweet_row.id)
        try:
            self.session.execute(insert_ass_tweet)
            self.session.commit()
        except DataError as e:
            # Handle \u0000 exception that postgresql json do not support:
            # escape it in the stored json, then retry the insert once.
            logger.warning(e)
            self.session.rollback()
            escape_null = r"""
            UPDATE tweet SET json_data=regexp_replace(
                        json_data::text, '\\u0000', '\\\\u0000', 'g')::json
            WHERE id=:tweet_id
            """
            escape_null = text(escape_null).bindparams(tweet_id=tweet_row.id)
            self.session.execute(escape_null)
            self.session.commit()
            logger.warning('json_data is updated (\\u0000 to \\\\u0000)')
            self.session.execute(insert_ass_tweet)
            self.session.commit()
        logger.debug('Parsing one tweet, done.')
 def parse_new_one(self, jd, session, g_urls_map, g_uusers_set,
                   g_edges_set):
     """Parse a tweet that is not saved yet and store all of its records.

     The tweet is validated, saved together with its user, its
     associated tweet ids, its URLs and its hashtags, and finally handed
     to the level 2 parsing.

     Parameters
     ----------
     jd : json
         Tweet json data.
     session : object
         Database session used to add, commit and rollback.
     g_urls_map : dict
         Mapping from raw URL to URL id, updated in place. URLs longer
         than ``MAX_URL_LEN`` are not saved and are mapped to ``-1``.
     g_uusers_set, g_edges_set :
         Passed through unchanged to `self._parse_l2`.

     Returns
     -------
     None
         In every case. Invalid tweets, tweets without URL (when
         `self.save_none_url_tweet` is False) and tweets already in the
         database are skipped early.
     """
     # validate jd
     jd = replace_null_byte(jd)
     try:
         tw_raw_id = jd['id']
         created_at = utc_from_str(jd['created_at'])
         user_raw_id = jd['user']['id']
     except KeyError as e:
         logger.error('Invalid tweet: %s', e)
         return None
     # parsing, level 1
     l_urls, l_mentions, l_hashtags = self._parse_l1(jd)
     if len(l_urls['union']) == 0 and self.save_none_url_tweet is False:
         logger.warning('Ignore tweet %r with no urls!', tw_raw_id)
         return None
     # saving, level 1
     logger.debug('Saving this user ...')
     muser = get_or_create_m(session,
                             TwitterUser,
                             data=dict(raw_id=user_raw_id),
                             fb_uk='raw_id')
     logger.debug('Saving this tweet ...')
     muser_id = muser.id
     mtweet = Tweet(raw_id=tw_raw_id,
                    json_data=jd,
                    created_at=created_at,
                    user_id=muser_id)
     session.add(mtweet)
     try:
         session.commit()
         logger.debug('Inserted tweet %r', tw_raw_id)
     except IntegrityError as e:
         logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
         session.rollback()
         return None
     # Read the id once; every association below uses this cached value.
     mtweet_id = mtweet.id
     logger.debug('Saving AssTweet ...')
     retweeted_status_id = None
     quoted_status_id = None
     if 'quoted_status' in jd:
         quoted_status_id = jd['quoted_status']['id']
     if 'retweeted_status' in jd:
         retweeted_status_id = jd['retweeted_status']['id']
     # The tweet is already committed at this point, so a missing key must
     # not abort the remaining saving steps: treat it as "not a reply".
     in_reply_to_status_id = jd.get('in_reply_to_status_id')
     session.add(
         AssTweet(id=mtweet_id,
                  retweeted_status_id=retweeted_status_id,
                  quoted_status_id=quoted_status_id,
                  in_reply_to_status_id=in_reply_to_status_id))
     try:
         session.commit()
     except IntegrityError as e:
         logger.warning(e)
         session.rollback()
     logger.debug('Saving urls ...')
     for u in l_urls['union']:
         if len(u) > MAX_URL_LEN:
             # Same policy and message as parse_existed_one: the URL is
             # not saved and is marked with -1 in the map.
             logger.warning(
                 'URL %s of tweet %s was ignored because of too long',
                 u, tw_raw_id)
             murl_id = -1
         else:
             murl_id = get_or_create_murl(session,
                                          data=dict(raw=u),
                                          platform_id=self.platform_id).id
             # Saving AssTweetUrl
             session.add(AssTweetUrl(tweet_id=mtweet_id, url_id=murl_id))
             try:
                 session.commit()
             except IntegrityError as e:
                 logger.error('ass_tweet_url IntegrityError, see: %s', e)
                 session.rollback()
         g_urls_map[u] = murl_id
     # creating hashtags
     logger.debug('creating hashtags ...')
     for hashtag in l_hashtags['union']:
         mhashtag = get_or_create_m(session,
                                    Hashtag,
                                    data=dict(text=hashtag),
                                    fb_uk='text')
         session.add(
             AssTweetHashtag(tweet_id=mtweet_id, hashtag_id=mhashtag.id))
         try:
             session.commit()
         except IntegrityError as e:
             logger.error('ass_tweet_hashtag IntegrityError, see: %s', e)
             session.rollback()
     self._parse_l2(jd, l_urls, l_mentions, g_urls_map, g_uusers_set,
                    g_edges_set)
    def parse(self, jd):
        """Parse one tweet and save it with its users, URLs and edges.

        Parameters
        ---------
        jd : json
            Tweet json data.

        Procedures
        ----------
        1) do roughly parsing to validate `jd`
        2) carefully parsing and insert into database
        3) other associations

        Returns
        -------
        None
            In every case. Invalid tweets, tweets without URL (when
            `self.save_none_url_tweet` is False) and, when
            `self.saved_tweet` is False, tweets already in the database
            are skipped early.

        Notes
        -----
        When `self.saved_tweet` is True the tweet row is looked up instead
        of inserted, and the tweet-url, tweet-hashtag and ass_tweet records
        are not written again.
        """
        logger.debug('Parsing one tweet, begin ...')
        #
        # 1) do roughly parsing to validate the tweet
        #
        # 1-1) parsing necessary fields, if failed then it is not a valid tweet
        logger.debug('Replacing null byte if existing ...')
        jd = replace_null_byte(jd, self.fp)
        logger.debug('1) Roughly parsing ...')
        try:
            tw_raw_id = jd['id']
            created_at = utc_from_str(jd['created_at'])
            user_raw_id = jd['user']['id']
        except KeyError as e:
            logger.error('Invalid tweet: %s', e)
            return None
        # 1-2) roughly parsing: collect the entities of the tweet and of its
        #      quoted / retweeted statuses, and remember the related ids
        entities_list = []
        quoted_status_id = None
        retweeted_status_id = None
        if 'entities' in jd:
            entities_list.append(jd['entities'])
        if 'quoted_status' in jd:
            quoted_jd = jd['quoted_status']
            quoted_user_jd = jd['quoted_status']['user']
            quoted_status_id = quoted_jd['id']
            if 'entities' in quoted_jd:
                entities_list.append(quoted_jd['entities'])
        if 'retweeted_status' in jd:
            retweeted_jd = jd['retweeted_status']
            retweeted_user_jd = jd['retweeted_status']['user']
            retweeted_status_id = retweeted_jd['id']
            if 'entities' in retweeted_jd:
                entities_list.append(retweeted_jd['entities'])
        in_reply_to_status_id = jd['in_reply_to_status_id']
        in_reply_to_user_id = jd['in_reply_to_user_id']
        in_reply_to_screen_name = jd['in_reply_to_screen_name']

        urls_set = set()
        hashtags_set = set()
        mentions_set = set()
        for entities in entities_list:
            if entities:
                self._parse_entities(entities, urls_set, hashtags_set,
                                     mentions_set)
        # This tweet should contain urls
        if len(urls_set) == 0 and self.save_none_url_tweet is False:
            logger.warning('No url found in tweet %s, ignore!', tw_raw_id)
            return None
        #
        # 2) carefully parsing and saving into database
        #
        logger.debug('2) Carefully parsing and saving ...')
        logger.debug('2-0) Saving twitter_user raw_id=%s ...', user_raw_id)
        muser = get_or_create_m(self.session,
                                TwitterUser,
                                data=dict(raw_id=user_raw_id),
                                fb_uk='raw_id')
        logger.debug('Saving this user into twitter_user_union as well ...')
        create_or_update_muser(
            self.session,
            data=dict(raw_id=user_raw_id,
                      screen_name=jd['user']['screen_name'],
                      followers_count=jd['user']['followers_count'],
                      profile=jd['user'],
                      updated_at=created_at))
        # creating tweet
        logger.debug('2-0) Saving tweet raw_id=%s ...', tw_raw_id)
        if self.saved_tweet is True:
            # The tweet is expected to exist already: fetch it.
            mtweet = self.session.query(Tweet).filter_by(
                raw_id=tw_raw_id).one()
        else:
            mtweet = Tweet(raw_id=tw_raw_id,
                           json_data=jd,
                           created_at=created_at,
                           user_id=muser.id)
            self.session.add(mtweet)
            try:
                self.session.commit()
                logger.debug('Inserted tweet %r', tw_raw_id)
            except IntegrityError as e:
                logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
                self.session.rollback()
                return None
        tweet_id = mtweet.id
        # Saving all urls and mapping the saved id
        url_map = dict()
        logger.debug('2-0) Saving all urls and associating with tweet...')
        for url in urls_set:
            murl = get_or_create_murl(self.session,
                                      data=dict(raw=url),
                                      platform_id=self.platform_id)
            url_map[url] = murl.id
            # saving ass_tweet_url
            if self.saved_tweet is False:
                self.session.add(
                    AssTweetUrl(tweet_id=tweet_id, url_id=url_map[url]))
                try:
                    self.session.commit()
                except IntegrityError as e:
                    logger.error('ass_tweet_url IntegrityError, see: %s', e)
                    self.session.rollback()
        # 2-1) retweet, focusing on retweeted_status
        #               edge direction: from retweeted_user to current user
        if retweeted_status_id is not None:
            logger.debug(
                '2-1-a) Saving the retweeted user into twitter_user_union ...')
            retweeted_user_id = retweeted_user_jd['id']
            retweeted_screen_name = retweeted_user_jd['screen_name']
            create_or_update_muser(
                self.session,
                data=dict(raw_id=retweeted_user_id,
                          screen_name=retweeted_screen_name,
                          followers_count=retweeted_user_jd['followers_count'],
                          profile=retweeted_user_jd,
                          updated_at=created_at))
            # retweeted user has been saved above, should be removed from mentions
            try:
                mentions_set.remove((retweeted_user_id, retweeted_screen_name))
            except KeyError:
                logger.warning('Tweet %r: retweeted user not in mentions',
                               tw_raw_id)
            logger.debug('2-1-a) Saving edges for retweet ...')
            self._save_edges(url_map,
                             retweeted_jd['entities'],
                             tweet_id,
                             tw_raw_id,
                             from_raw_id=retweeted_user_id,
                             to_raw_id=user_raw_id,
                             is_quoted_url=False,
                             is_mention=False,
                             tweet_type='retweet')
        # 2-2) reply, focusing on current status
        #             edges direction: from current user to mentions
        if in_reply_to_status_id is not None:
            # mentioned users would be saved later
            logger.debug('2-1-b) Saving edges for reply ...')
            # in_reply_to_user
            self._save_edges(url_map,
                             jd['entities'],
                             tweet_id,
                             tw_raw_id,
                             from_raw_id=user_raw_id,
                             to_raw_id=in_reply_to_user_id,
                             is_quoted_url=False,
                             is_mention=False,
                             tweet_type='reply')
            # mentions, the replied user excluded (handled just above)
            for m in jd['entities']['user_mentions']:
                to_raw_id = m.get('id')
                if to_raw_id and to_raw_id != in_reply_to_user_id:
                    self._save_edges(url_map,
                                     jd['entities'],
                                     tweet_id,
                                     tw_raw_id,
                                     from_raw_id=user_raw_id,
                                     to_raw_id=to_raw_id,
                                     is_quoted_url=False,
                                     is_mention=True,
                                     tweet_type='reply')
        # 2-3) quote
        if quoted_status_id is not None:
            logger.debug(
                '2-1-c) Saving the quoted user into twitter_user_union ...')
            quoted_user_id = quoted_user_jd['id']
            quoted_screen_name = quoted_user_jd['screen_name']
            create_or_update_muser(
                self.session,
                data=dict(raw_id=quoted_user_id,
                          screen_name=quoted_screen_name,
                          followers_count=quoted_user_jd['followers_count'],
                          profile=quoted_user_jd,
                          updated_at=created_at))
            # 2-3-1) retweeted quote, focusing on quoted_status
            #                         treated as retweet edge
            if retweeted_status_id is not None:
                logger.debug(
                    '2-1-c) Saving edges for quoting part of retweet ...')
                self._save_edges(url_map,
                                 quoted_jd['entities'],
                                 tweet_id,
                                 tw_raw_id,
                                 from_raw_id=retweeted_user_jd['id'],
                                 to_raw_id=user_raw_id,
                                 is_quoted_url=True,
                                 is_mention=False,
                                 tweet_type='retweet')
            # 2-3-2) replied quote, focusing on quoted_status
            #                       treated as reply edge
            elif in_reply_to_status_id is not None:
                logger.debug(
                    '2-1-c) Saving edges for quoting part of reply ...')
                # in_reply_to_user
                self._save_edges(url_map,
                                 quoted_jd['entities'],
                                 tweet_id,
                                 tw_raw_id,
                                 from_raw_id=user_raw_id,
                                 to_raw_id=in_reply_to_user_id,
                                 is_quoted_url=True,
                                 is_mention=False,
                                 tweet_type='reply')
                # mentions, the replied user excluded (handled just above)
                for m in jd['entities']['user_mentions']:
                    to_raw_id = m.get('id')
                    if to_raw_id and to_raw_id != in_reply_to_user_id:
                        self._save_edges(url_map,
                                         quoted_jd['entities'],
                                         tweet_id,
                                         tw_raw_id,
                                         from_raw_id=user_raw_id,
                                         to_raw_id=to_raw_id,
                                         is_quoted_url=True,
                                         is_mention=True,
                                         tweet_type='reply')
            # 2-3-3) pure quote
            else:
                logger.debug(
                    '2-1-c) Saving edge for pure quote part of quote ...')
                self._save_edges(url_map,
                                 quoted_jd['entities'],
                                 tweet_id,
                                 tw_raw_id,
                                 from_raw_id=quoted_user_jd['id'],
                                 to_raw_id=user_raw_id,
                                 is_quoted_url=True,
                                 is_mention=False,
                                 tweet_type='quote')
                logger.debug(
                    '2-1-c) Saving edges for original part of quote ...')
                for m in jd['entities']['user_mentions']:
                    to_raw_id = m.get('id')
                    if to_raw_id:
                        self._save_edges(url_map,
                                         jd['entities'],
                                         tweet_id,
                                         tw_raw_id,
                                         from_raw_id=user_raw_id,
                                         to_raw_id=to_raw_id,
                                         is_quoted_url=False,
                                         is_mention=True,
                                         tweet_type='quote')
        # 2-4) original tweet: neither retweet, reply nor quote
        if retweeted_status_id is None and in_reply_to_status_id is None\
            and quoted_status_id is None and 'entities' in jd and\
            'user_mentions' in jd['entities']:
            logger.debug('2-1-d) Saving edges for original tweet ...')
            for m in jd['entities']['user_mentions']:
                to_raw_id = m.get('id')
                if to_raw_id:
                    self._save_edges(url_map,
                                     jd['entities'],
                                     tweet_id,
                                     tw_raw_id,
                                     from_raw_id=user_raw_id,
                                     to_raw_id=to_raw_id,
                                     is_quoted_url=False,
                                     is_mention=True,
                                     tweet_type='origin')
        # saving all mentions ...
        logger.debug('3) Saving all mentions ...')
        # add the in_reply_to_user, but only when the tweet replies to
        # somebody: otherwise the pair is (None, None) and is not a user
        if in_reply_to_user_id is not None:
            mentions_set.add((in_reply_to_user_id, in_reply_to_screen_name))
        # dedicated loop names so the author's user_raw_id is not rebound
        for mention_raw_id, mention_screen_name in mentions_set:
            create_or_update_muser(self.session,
                                   data=dict(raw_id=mention_raw_id,
                                             screen_name=mention_screen_name,
                                             updated_at=created_at))
        # saving hashtags
        logger.debug('3) creating hashtags')
        if self.saved_tweet is False:
            for hashtag in hashtags_set:
                mhashtag = get_or_create_m(self.session,
                                           Hashtag,
                                           data=dict(text=hashtag),
                                           fb_uk='text')
                self.session.add(
                    AssTweetHashtag(tweet_id=tweet_id, hashtag_id=mhashtag.id))
                try:
                    self.session.commit()
                except IntegrityError as e:
                    logger.error('ass_tweet_hashtag IntegrityError, see: %s',
                                 e)
                    self.session.rollback()
        # saving associate tweet
        logger.debug('3 Saving ass_tweet ...')
        if self.saved_tweet is False:
            create_m(self.session,
                     AssTweet,
                     data=dict(id=tweet_id,
                               retweeted_status_id=retweeted_status_id,
                               quoted_status_id=quoted_status_id,
                               in_reply_to_status_id=in_reply_to_status_id))
        logger.debug('Parsing one tweet, done.')