def process_item(self, item, spider):
    """Main function that processes a URL item (first phase)."""
    # validate URL length
    if len(item['raw']) > MAX_URL_LEN:
        item['raw'] = item['raw'][:MAX_URL_LEN]
        logger.error('Raw URL too long, truncate it! %r', item['raw'])
    # parse raw URL
    purl = get_parsed_url(item['raw'])
    if purl is None or purl.hostname is None:
        raise DropItem('Invalid URL')
    site_id = belongs_to_site(purl.hostname, self.site_tuples)
    if site_id is None:
        raise DropItem('Offsite domain: %s' % item)
    item['site_id'] = site_id
    # insert URL into table
    try:
        get_or_create_murl(spider.session, item, spider.platform_id)
    except SQLAlchemyError as e:
        logger.error(e)
        spider.session.rollback()
        raise DropItem('Failed to insert url into database: %s' % item)
    return item

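# The two URL helpers used in process_item above are project-specific. The
# following is only a minimal sketch of the behavior assumed there (the real
# implementations may differ): `get_parsed_url` returns a parse result or
# None, and `belongs_to_site` maps a hostname to a site_id from
# (site_id, domain) tuples. Both names with the `_sketch` suffix are
# hypothetical and not part of the original module.
from urllib.parse import urlparse


def get_parsed_url_sketch(raw):
    """Hypothetical sketch: parse a raw URL, returning None on failure."""
    try:
        return urlparse(raw)
    except ValueError:
        return None


def belongs_to_site_sketch(hostname, site_tuples):
    """Hypothetical sketch: return the site_id whose domain matches hostname."""
    for site_id, domain in site_tuples:
        if hostname == domain or hostname.endswith('.' + domain):
            return site_id
    return None
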
def parse_existed_one(self, tw_id, jd, session, g_urls_map, g_uusers_set,
                      g_edges_set):
    """The main parse function for an already saved tweet.

    This function parses the tweet into the different components that
    correspond to the related table records.

    Parameters
    ----------
    tw_id : integer
        The database id of the already saved tweet.
    jd : json
        Tweet json data.
    """
    logger.debug('Parsing tweet %r begin ...', jd['id'])
    logger.debug('Level 1 parsing, roughly parse ...')
    l_urls, l_mentions, l_hashtags = self._parse_l1(jd)
    # Make sure we have saved and fetched all url_ids
    for u in l_urls['union']:
        if g_urls_map.get(u) is None:
            if len(u) > MAX_URL_LEN:
                logger.warning(
                    'URL %s of tweet %s was ignored because it is too long',
                    u, jd['id'])
                murl_id = -1
            else:
                logger.warning(
                    'Previously incomplete parsing, missing %s of tweet %s',
                    u, jd['id'])
                murl_id = get_or_create_murl(
                    session, data=dict(raw=u),
                    platform_id=self.platform_id).id
                # Saving AssTweetUrl
                session.add(AssTweetUrl(tweet_id=tw_id, url_id=murl_id))
                try:
                    session.commit()
                except IntegrityError as e:
                    logger.error('ass_tweet_url IntegrityError, see: %s', e)
                    session.rollback()
            g_urls_map[u] = murl_id
    logger.debug('Level 2 parsing, deeply parse ...')
    self._parse_l2(jd, l_urls, l_mentions, g_urls_map, g_uusers_set,
                   g_edges_set)

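# A minimal usage sketch (an assumption, not part of the original module) of
# how parse_existed_one could be driven: iterate over already saved tweets and
# share the accumulators g_urls_map, g_uusers_set and g_edges_set across calls
# so that the level-2 results can be bulk-saved afterwards. The helper name
# and the `saved_tweets` iterable are hypothetical.
def reparse_saved_tweets_sketch(parser, session, saved_tweets):
    """Hypothetical driver; `saved_tweets` yields (tweet_db_id, tweet_json)."""
    g_urls_map = dict()
    g_uusers_set = set()
    g_edges_set = set()
    for tw_id, jd in saved_tweets:
        parser.parse_existed_one(tw_id, jd, session, g_urls_map,
                                 g_uusers_set, g_edges_set)
    return g_urls_map, g_uusers_set, g_edges_set
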
def parse(self, jd):
    """The main parse function.

    Parameters
    ----------
    jd : json
        Tweet json data.

    Procedures
    ----------
    1) validate `jd`
    2) extract URL and hashtag from `jd`
    3) insert into database
    """
    logger.debug('Parsing one tweet, begin')
    #
    # validation
    #
    try:
        tw_raw_id = jd['id']
        created_at = utc_from_str(jd['created_at'])
        user_raw_id = jd['user']['id']
    except KeyError as e:
        logger.error('Invalid tweet: %s', e)
        return None
    #
    # extract url, hashtag and associated tweet status id
    #
    urls_set = set()
    hashtags_set = set()
    entities_list = []
    if 'entities' in jd:
        entities_list.append(jd['entities'])
    if 'quoted_status' in jd:
        q_jd = jd['quoted_status']
        if 'entities' in q_jd:
            entities_list.append(q_jd['entities'])
    if 'retweeted_status' in jd:
        re_jd = jd['retweeted_status']
        if 'entities' in re_jd:
            entities_list.append(re_jd['entities'])
        if 'quoted_status' in re_jd and\
                'entities' in re_jd['quoted_status']:
            entities_list.append(re_jd['quoted_status']['entities'])
    for entities in entities_list:
        if entities:
            self._parse_entities(entities, urls_set, hashtags_set)
    # This tweet should contain urls
    if len(urls_set) == 0 and self.save_none_url_tweet is False:
        logger.debug('No url found in %s, ignore!', tw_raw_id)
        return None
    #
    # Insert into database
    #
    # creating user
    logger.debug('creating user')
    muser = get_or_create_m(self.session, TwitterUser,
                            data=dict(raw_id=user_raw_id), fb_uk='raw_id')
    # creating tweet
    logger.debug('creating tweet')
    mtweet = Tweet(raw_id=tw_raw_id, json_data=jd, created_at=created_at,
                   user_id=muser.id)
    self.session.add(mtweet)
    try:
        self.session.commit()
        logger.debug('Inserted tweet %r', tw_raw_id)
    except IntegrityError as e:
        logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
        self.session.rollback()
        return None
    # creating urls
    logger.debug('creating urls')
    for url in urls_set:
        murl = get_or_create_murl(self.session, data=dict(raw=url),
                                  platform_id=self.platform_id)
        self.session.add(AssTweetUrl(tweet_id=mtweet.id, url_id=murl.id))
        try:
            self.session.commit()
        except IntegrityError as e:
            logger.error('ass_tweet_url IntegrityError, see: %s', e)
            self.session.rollback()
    # creating hashtags
    logger.debug('creating hashtags')
    for hashtag in hashtags_set:
        mhashtag = get_or_create_m(self.session, Hashtag,
                                   data=dict(text=hashtag), fb_uk='text')
        self.session.add(
            AssTweetHashtag(tweet_id=mtweet.id, hashtag_id=mhashtag.id))
        try:
            self.session.commit()
        except IntegrityError as e:
            logger.error('ass_tweet_hashtag IntegrityError, see: %s', e)
            self.session.rollback()
    # parsing associated tweet
    q1 = """
        INSERT INTO ass_tweet (id, retweeted_status_id, quoted_status_id,
                               in_reply_to_status_id)
        SELECT id,
               CAST(json_data#>>'{retweeted_status, id}' AS BIGINT),
               CAST(json_data#>>'{quoted_status, id}' AS BIGINT),
               CAST(json_data->>'in_reply_to_status_id' AS BIGINT)
        FROM tweet
        WHERE id=:tweet_id
        """
    q1 = text(q1).bindparams(tweet_id=mtweet.id)
    try:
        self.session.execute(q1)
        self.session.commit()
    except DataError as e:
        # Handle the \u0000 escape that PostgreSQL json does not support
        logger.warning(e)
        self.session.rollback()
        q2 = r"""
            UPDATE tweet
            SET json_data=regexp_replace(
                json_data::text, '\\u0000', '\\\\u0000', 'g')::json
            WHERE id=:tweet_id
            """
        q2 = text(q2).bindparams(tweet_id=mtweet.id)
        self.session.execute(q2)
        self.session.commit()
        logger.warning('json_data is updated (\\u0000 to \\\\u0000)')
        self.session.execute(q1)
        self.session.commit()
    logger.debug('Parsing one tweet, done.')

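# For reference, the `#>>` and `->>` operators in q1 above extract text at a
# JSON path / key, and the CASTs convert it to BIGINT. The hypothetical helper
# below (not part of the original module) mirrors that SELECT in plain Python
# against the same tweet json, purely for illustration.
def extract_ass_tweet_fields_sketch(jd):
    """Hypothetical sketch mirroring the SELECT in q1 for one tweet json."""
    def as_bigint(value):
        return int(value) if value is not None else None
    return dict(
        retweeted_status_id=as_bigint(
            jd.get('retweeted_status', {}).get('id')),
        quoted_status_id=as_bigint(jd.get('quoted_status', {}).get('id')),
        in_reply_to_status_id=as_bigint(jd.get('in_reply_to_status_id')))
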
def parse_new_one(self, jd, session, g_urls_map, g_uusers_set, g_edges_set):
    """Parse and save a tweet that is not yet in the database."""
    # validate jd
    jd = replace_null_byte(jd)
    try:
        tw_raw_id = jd['id']
        created_at = utc_from_str(jd['created_at'])
        user_raw_id = jd['user']['id']
    except KeyError as e:
        logger.error('Invalid tweet: %s', e)
        return None
    # parsing, level 1
    l_urls, l_mentions, l_hashtags = self._parse_l1(jd)
    if len(l_urls['union']) == 0 and self.save_none_url_tweet is False:
        logger.warning('Ignore tweet %r with no urls!', tw_raw_id)
        return None
    # saving, level 1
    logger.debug('Saving this user ...')
    muser = get_or_create_m(session, TwitterUser,
                            data=dict(raw_id=user_raw_id), fb_uk='raw_id')
    logger.debug('Saving this tweet ...')
    muser_id = muser.id
    mtweet = Tweet(raw_id=tw_raw_id, json_data=jd, created_at=created_at,
                   user_id=muser_id)
    session.add(mtweet)
    try:
        session.commit()
        logger.debug('Inserted tweet %r', tw_raw_id)
    except IntegrityError as e:
        logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
        session.rollback()
        return None
    mtweet_id = mtweet.id
    logger.debug('Saving AssTweet ...')
    retweeted_status_id = None
    quoted_status_id = None
    if 'quoted_status' in jd:
        quoted_status_id = jd['quoted_status']['id']
    if 'retweeted_status' in jd:
        retweeted_status_id = jd['retweeted_status']['id']
    in_reply_to_status_id = jd['in_reply_to_status_id']
    session.add(
        AssTweet(id=mtweet_id, retweeted_status_id=retweeted_status_id,
                 quoted_status_id=quoted_status_id,
                 in_reply_to_status_id=in_reply_to_status_id))
    try:
        session.commit()
    except IntegrityError as e:
        logger.warning(e)
        session.rollback()
    logger.debug('Saving urls ...')
    for u in l_urls['union']:
        if len(u) > MAX_URL_LEN:
            murl_id = -1
        else:
            murl_id = get_or_create_murl(session, data=dict(raw=u),
                                         platform_id=self.platform_id).id
            # Saving AssTweetUrl
            session.add(AssTweetUrl(tweet_id=mtweet_id, url_id=murl_id))
            try:
                session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_url IntegrityError, see: %s', e)
                session.rollback()
        g_urls_map[u] = murl_id
    # creating hashtags
    logger.debug('creating hashtags ...')
    for hashtag in l_hashtags['union']:
        mhashtag = get_or_create_m(session, Hashtag,
                                   data=dict(text=hashtag), fb_uk='text')
        session.add(
            AssTweetHashtag(tweet_id=mtweet_id, hashtag_id=mhashtag.id))
        try:
            session.commit()
        except IntegrityError as e:
            logger.error('ass_tweet_hashtag IntegrityError, see: %s', e)
            session.rollback()
    self._parse_l2(jd, l_urls, l_mentions, g_urls_map, g_uusers_set,
                   g_edges_set)

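# `replace_null_byte` is a project helper; the sketch below is only a guess at
# the behavior parse_new_one relies on: PostgreSQL json columns reject the
# \u0000 escape, so null bytes in any string field are stripped before the
# tweet json is stored. The name and signature are hypothetical and may differ
# from the real helper (which, elsewhere in this module, also accepts a file
# pointer argument).
def replace_null_byte_sketch(obj, replacement=''):
    """Hypothetical sketch: recursively drop \\u0000 from all strings."""
    if isinstance(obj, str):
        return obj.replace('\u0000', replacement)
    if isinstance(obj, list):
        return [replace_null_byte_sketch(v, replacement) for v in obj]
    if isinstance(obj, dict):
        return {k: replace_null_byte_sketch(v, replacement)
                for k, v in obj.items()}
    return obj
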
def parse(self, jd):
    """The main parse function.

    Parameters
    ----------
    jd : json
        Tweet json data.

    Procedures
    ----------
    1) roughly parse to validate `jd`
    2) carefully parse and insert into database
    3) other associations
    """
    logger.debug('Parsing one tweet, begin ...')
    #
    # 1) roughly parse to validate the tweet
    #
    # 1-1) parse necessary fields; if this fails, it is not a valid tweet
    logger.debug('Replacing null byte if existing ...')
    jd = replace_null_byte(jd, self.fp)
    logger.debug('1) Roughly parsing ...')
    try:
        tw_raw_id = jd['id']
        created_at = utc_from_str(jd['created_at'])
        user_raw_id = jd['user']['id']
    except KeyError as e:
        logger.error('Invalid tweet: %s', e)
        return None
    # 1-2) roughly parsing
    entities_list = []
    quoted_status_id = None
    retweeted_status_id = None
    if 'entities' in jd:
        entities_list.append(jd['entities'])
    if 'quoted_status' in jd:
        quoted_jd = jd['quoted_status']
        quoted_user_jd = jd['quoted_status']['user']
        quoted_status_id = quoted_jd['id']
        if 'entities' in quoted_jd:
            entities_list.append(quoted_jd['entities'])
    if 'retweeted_status' in jd:
        retweeted_jd = jd['retweeted_status']
        retweeted_user_jd = jd['retweeted_status']['user']
        retweeted_status_id = retweeted_jd['id']
        if 'entities' in retweeted_jd:
            entities_list.append(retweeted_jd['entities'])
    in_reply_to_status_id = jd['in_reply_to_status_id']
    in_reply_to_user_id = jd['in_reply_to_user_id']
    in_reply_to_screen_name = jd['in_reply_to_screen_name']
    urls_set = set()
    hashtags_set = set()
    mentions_set = set()
    for entities in entities_list:
        if entities:
            self._parse_entities(entities, urls_set, hashtags_set,
                                 mentions_set)
    # This tweet should contain urls
    if len(urls_set) == 0 and self.save_none_url_tweet is False:
        logger.warning('No url found in tweet %s, ignore!', tw_raw_id)
        return None
    #
    # 2) carefully parse and save into database
    #
    logger.debug('2) Carefully parsing and saving ...')
    logger.debug('2-0) Saving twitter_user raw_id=%s ...', user_raw_id)
    muser = get_or_create_m(self.session, TwitterUser,
                            data=dict(raw_id=user_raw_id), fb_uk='raw_id')
    logger.debug('Saving this user into twitter_user_union as well ...')
    create_or_update_muser(
        self.session,
        data=dict(raw_id=user_raw_id,
                  screen_name=jd['user']['screen_name'],
                  followers_count=jd['user']['followers_count'],
                  profile=jd['user'],
                  updated_at=created_at))
    # creating tweet
    logger.debug('2-0) Saving tweet raw_id=%s ...', tw_raw_id)
    if self.saved_tweet is True:
        mtweet = self.session.query(Tweet).filter_by(
            raw_id=tw_raw_id).one()
    else:
        mtweet = Tweet(raw_id=tw_raw_id, json_data=jd,
                       created_at=created_at, user_id=muser.id)
        self.session.add(mtweet)
        try:
            self.session.commit()
            logger.debug('Inserted tweet %r', tw_raw_id)
        except IntegrityError as e:
            logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
            self.session.rollback()
            return None
    tweet_id = mtweet.id
    # Saving all urls and mapping the saved id
    url_map = dict()
    logger.debug('2-0) Saving all urls and associating with tweet ...')
    for url in urls_set:
        murl = get_or_create_murl(self.session, data=dict(raw=url),
                                  platform_id=self.platform_id)
        url_map[url] = murl.id
        # saving ass_tweet_url
        if self.saved_tweet is False:
            self.session.add(
                AssTweetUrl(tweet_id=tweet_id, url_id=url_map[url]))
            try:
                self.session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_url IntegrityError, see: %s', e)
                self.session.rollback()
    # 2-1) retweet, focusing on retweeted_status
    # edge direction: from retweeted_user to current user
    if retweeted_status_id is not None:
        logger.debug(
            '2-1-a) Saving the retweeted user into twitter_user_union ...')
        retweeted_user_id = retweeted_user_jd['id']
        retweeted_screen_name = retweeted_user_jd['screen_name']
        create_or_update_muser(
            self.session,
            data=dict(raw_id=retweeted_user_id,
                      screen_name=retweeted_screen_name,
                      followers_count=retweeted_user_jd['followers_count'],
                      profile=retweeted_user_jd,
                      updated_at=created_at))
        # The retweeted user has been saved above and should be removed
        # from mentions
        try:
            mentions_set.remove((retweeted_user_id, retweeted_screen_name))
        except KeyError:
            logger.warning('Tweet %r: retweeted user not in mentions',
                           tw_raw_id)
        logger.debug('2-1-a) Saving edges for retweet ...')
        self._save_edges(url_map, retweeted_jd['entities'], tweet_id,
                         tw_raw_id,
                         from_raw_id=retweeted_user_id,
                         to_raw_id=user_raw_id,
                         is_quoted_url=False,
                         is_mention=False,
                         tweet_type='retweet')
    # 2-2) reply, focusing on current status
    # edge direction: from current user to mentions
    if in_reply_to_status_id is not None:
        # mentioned users will be saved later
        logger.debug('2-1-b) Saving edges for reply ...')
        # in_reply_to_user
        self._save_edges(url_map, jd['entities'], tweet_id, tw_raw_id,
                         from_raw_id=user_raw_id,
                         to_raw_id=in_reply_to_user_id,
                         is_quoted_url=False,
                         is_mention=False,
                         tweet_type='reply')
        # mentions
        for m in jd['entities']['user_mentions']:
            to_raw_id = m.get('id')
            if to_raw_id and to_raw_id != in_reply_to_user_id:
                self._save_edges(url_map, jd['entities'], tweet_id,
                                 tw_raw_id,
                                 from_raw_id=user_raw_id,
                                 to_raw_id=to_raw_id,
                                 is_quoted_url=False,
                                 is_mention=True,
                                 tweet_type='reply')
    # 2-3) quote
    if quoted_status_id is not None:
        logger.debug(
            '2-1-c) Saving the quoted user into twitter_user_union ...')
        quoted_user_id = quoted_user_jd['id']
        quoted_screen_name = quoted_user_jd['screen_name']
        create_or_update_muser(
            self.session,
            data=dict(raw_id=quoted_user_id,
                      screen_name=quoted_screen_name,
                      followers_count=quoted_user_jd['followers_count'],
                      profile=quoted_user_jd,
                      updated_at=created_at))
        # 2-3-1) retweeted quote, focusing on quoted_status
        # treated as a retweet edge
        if retweeted_status_id is not None:
            logger.debug(
                '2-1-c) Saving edges for quoting part of retweet ...')
            self._save_edges(url_map, quoted_jd['entities'], tweet_id,
                             tw_raw_id,
                             from_raw_id=retweeted_user_jd['id'],
                             to_raw_id=user_raw_id,
                             is_quoted_url=True,
                             is_mention=False,
                             tweet_type='retweet')
        # 2-3-2) replied quote, focusing on quoted_status
        # treated as a reply edge
        elif in_reply_to_status_id is not None:
            logger.debug(
                '2-1-c) Saving edges for quoting part of reply ...')
            # in_reply_to_user
            self._save_edges(url_map, quoted_jd['entities'], tweet_id,
                             tw_raw_id,
                             from_raw_id=user_raw_id,
                             to_raw_id=in_reply_to_user_id,
                             is_quoted_url=True,
                             is_mention=False,
                             tweet_type='reply')
            # mentions
            for m in jd['entities']['user_mentions']:
                to_raw_id = m.get('id')
                if to_raw_id and to_raw_id != in_reply_to_user_id:
                    self._save_edges(url_map, quoted_jd['entities'],
                                     tweet_id, tw_raw_id,
                                     from_raw_id=user_raw_id,
                                     to_raw_id=to_raw_id,
                                     is_quoted_url=True,
                                     is_mention=True,
                                     tweet_type='reply')
        # 2-3-3) pure quote
        else:
            logger.debug(
                '2-1-c) Saving edge for pure quote part of quote ...')
            self._save_edges(url_map, quoted_jd['entities'], tweet_id,
                             tw_raw_id,
                             from_raw_id=quoted_user_jd['id'],
                             to_raw_id=user_raw_id,
                             is_quoted_url=True,
                             is_mention=False,
                             tweet_type='quote')
            logger.debug(
                '2-1-c) Saving edges for original part of quote ...')
            for m in jd['entities']['user_mentions']:
                to_raw_id = m.get('id')
                if to_raw_id:
                    self._save_edges(url_map, jd['entities'], tweet_id,
                                     tw_raw_id,
                                     from_raw_id=user_raw_id,
                                     to_raw_id=to_raw_id,
                                     is_quoted_url=False,
                                     is_mention=True,
                                     tweet_type='quote')
    # 2-4) original tweet
    if retweeted_status_id is None and in_reply_to_status_id is None\
            and quoted_status_id is None and 'entities' in jd and\
            'user_mentions' in jd['entities']:
        logger.debug('2-1-d) Saving edges for original tweet ...')
        for m in jd['entities']['user_mentions']:
            to_raw_id = m.get('id')
            if to_raw_id:
                self._save_edges(url_map, jd['entities'], tweet_id,
                                 tw_raw_id,
                                 from_raw_id=user_raw_id,
                                 to_raw_id=to_raw_id,
                                 is_quoted_url=False,
                                 is_mention=True,
                                 tweet_type='origin')
    # saving all mentions ...
    logger.debug('3) Saving all mentions ...')
    # add the in_reply_to_user
    mentions_set.add((in_reply_to_user_id, in_reply_to_screen_name))
    for user_raw_id, screen_name in mentions_set:
        create_or_update_muser(
            self.session,
            data=dict(raw_id=user_raw_id,
                      screen_name=screen_name,
                      updated_at=created_at))
    # saving hashtags
    logger.debug('3) Creating hashtags ...')
    if self.saved_tweet is False:
        for hashtag in hashtags_set:
            mhashtag = get_or_create_m(self.session, Hashtag,
                                       data=dict(text=hashtag),
                                       fb_uk='text')
            self.session.add(
                AssTweetHashtag(tweet_id=tweet_id,
                                hashtag_id=mhashtag.id))
            try:
                self.session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_hashtag IntegrityError, see: %s', e)
                self.session.rollback()
    # saving associated tweet
    logger.debug('3) Saving ass_tweet ...')
    if self.saved_tweet is False:
        create_m(self.session, AssTweet,
                 data=dict(id=tweet_id,
                           retweeted_status_id=retweeted_status_id,
                           quoted_status_id=quoted_status_id,
                           in_reply_to_status_id=in_reply_to_status_id))
    logger.debug('Parsing one tweet, done.')

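# `utc_from_str` is used throughout the parsers above to convert Twitter's
# `created_at` string (e.g. 'Wed Aug 27 13:08:45 +0000 2008') into a datetime.
# The sketch below only illustrates that assumed behavior; the name with the
# `_sketch` suffix is hypothetical and the real helper may differ.
from datetime import datetime, timezone


def utc_from_str_sketch(created_at):
    """Hypothetical sketch: parse a Twitter timestamp and normalize to UTC."""
    dt = datetime.strptime(created_at, '%a %b %d %H:%M:%S %z %Y')
    return dt.astimezone(timezone.utc)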