def parse_node(self, response, node):
    """Parse response into UrlItem."""
    item = UrlItem()
    if self.provider == 'self':
        link = node.xpath('link/text()').extract_first()
    elif self.provider == 'feedburner':
        link = node.xpath(
            '*[local-name()="origLink"]/text()').extract_first()
    else:
        logger.error('Unrecognized feed provider %r', self.provider)
        return
    date_published = node.xpath('pubDate/text()').extract_first()
    if link is not None:
        item['raw'] = link.strip()
        item['date_published'] = utc_from_str(date_published)
        yield item
    else:
        logger.error('Unexpected item: (%s, %s) from %r', link,
                     date_published, response.url)
        return
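# Usage sketch for parse_node (illustrative, not from the source): a
# feedburner <item> hides the original article URL behind a proxied
# <link>, so the spider reads <feedburner:origLink> instead. The
# local-name() XPath matches origLink regardless of which namespace
# prefix the feed binds. Assuming scrapy is available:

from scrapy.selector import Selector

sample_item = '''
<item xmlns:feedburner="http://rssnamespace.org/feedburner/ext/1.0">
  <link>http://feedproxy.google.com/~r/site/~3/abc</link>
  <feedburner:origLink>http://example.com/article</feedburner:origLink>
  <pubDate>Mon, 01 Jan 2018 00:00:00 GMT</pubDate>
</item>
'''
node = Selector(text=sample_item, type='xml')
print(node.xpath('//*[local-name()="origLink"]/text()').extract_first())
# -> http://example.com/article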
def parse(self, jd):
    """The main parse function.

    Parameters
    ----------
    jd : json
        Tweet JSON data.

    Procedures
    ----------
    1) validate `jd`
    2) extract URLs and hashtags from `jd`
    3) insert into database
    """
    logger.debug('Parsing one tweet, begin')
    #
    # validation
    #
    try:
        tw_raw_id = jd['id']
        created_at = utc_from_str(jd['created_at'])
        user_raw_id = jd['user']['id']
    except KeyError as e:
        logger.error('Invalid tweet: %s', e)
        return None
    #
    # extract urls, hashtags and associated tweet status ids
    #
    urls_set = set()
    hashtags_set = set()
    entities_list = []
    if 'entities' in jd:
        entities_list.append(jd['entities'])
    if 'quoted_status' in jd:
        q_jd = jd['quoted_status']
        if 'entities' in q_jd:
            entities_list.append(q_jd['entities'])
    if 'retweeted_status' in jd:
        re_jd = jd['retweeted_status']
        if 'entities' in re_jd:
            entities_list.append(re_jd['entities'])
        if 'quoted_status' in re_jd and\
                'entities' in re_jd['quoted_status']:
            entities_list.append(re_jd['quoted_status']['entities'])
    for entities in entities_list:
        if entities:
            self._parse_entities(entities, urls_set, hashtags_set)
    # This tweet should contain urls
    if len(urls_set) == 0 and self.save_none_url_tweet is False:
        logger.debug('No url found in %s, ignore!', tw_raw_id)
        return None
    #
    # Insert into database
    #
    # creating user
    logger.debug('creating user')
    muser = get_or_create_m(self.session,
                            TwitterUser,
                            data=dict(raw_id=user_raw_id),
                            fb_uk='raw_id')
    # creating tweet
    logger.debug('creating tweet')
    mtweet = Tweet(raw_id=tw_raw_id,
                   json_data=jd,
                   created_at=created_at,
                   user_id=muser.id)
    self.session.add(mtweet)
    try:
        self.session.commit()
        logger.debug('Inserted tweet %r', tw_raw_id)
    except IntegrityError as e:
        logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
        self.session.rollback()
        return
    # creating urls
    logger.debug('creating urls')
    for url in urls_set:
        murl = get_or_create_murl(self.session,
                                  data=dict(raw=url),
                                  platform_id=self.platform_id)
        self.session.add(AssTweetUrl(tweet_id=mtweet.id, url_id=murl.id))
        try:
            self.session.commit()
        except IntegrityError as e:
            logger.error('ass_tweet_url IntegrityError, see: %s', e)
            self.session.rollback()
    # creating hashtags
    logger.debug('creating hashtags')
    for hashtag in hashtags_set:
        mhashtag = get_or_create_m(self.session,
                                   Hashtag,
                                   data=dict(text=hashtag),
                                   fb_uk='text')
        self.session.add(
            AssTweetHashtag(tweet_id=mtweet.id, hashtag_id=mhashtag.id))
        try:
            self.session.commit()
        except IntegrityError as e:
            logger.error('ass_tweet_hashtag IntegrityError, see: %s', e)
            self.session.rollback()
    # parsing associated tweet statuses
    q1 = """
        INSERT INTO ass_tweet (id, retweeted_status_id, quoted_status_id,
                               in_reply_to_status_id)
        SELECT id,
               CAST(json_data#>>'{retweeted_status, id}' AS BIGINT),
               CAST(json_data#>>'{quoted_status, id}' AS BIGINT),
               CAST(json_data->>'in_reply_to_status_id' AS BIGINT)
        FROM tweet
        WHERE id=:tweet_id
        """
    q1 = text(q1).bindparams(tweet_id=mtweet.id)
    try:
        self.session.execute(q1)
        self.session.commit()
    except DataError as e:
        # Handle the \u0000 sequence, which PostgreSQL json does not
        # support
        logger.warning(e)
        self.session.rollback()
        q2 = r"""
            UPDATE tweet
            SET json_data=regexp_replace(
                json_data::text, '\\u0000', '\\\\u0000', 'g')::json
            WHERE id=:tweet_id
            """
        q2 = text(q2).bindparams(tweet_id=mtweet.id)
        self.session.execute(q2)
        self.session.commit()
        logger.warning('json_data is updated (\\u0000 to \\\\u0000)')
        self.session.execute(q1)
        self.session.commit()
    logger.debug('Parsing one tweet, done.')
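# Usage sketch for parse() above (assumptions: `parser` is an instance
# of the enclosing parser class with a live SQLAlchemy session, and
# _parse_entities reads the standard Twitter entities layout). Only the
# fields validated at the top are strictly required:

minimal_jd = {
    'id': 1001,                                      # tweet raw id
    'created_at': 'Mon Jan 01 00:00:00 +0000 2018',  # Twitter date format
    'user': {'id': 42},                              # user raw id
    'entities': {
        'urls': [{'expanded_url': 'http://example.com/article'}],
        'hashtags': [{'text': 'news'}],
    },
}
parser.parse(minimal_jd)  # inserts tweet, url, hashtag and ass_tweet rows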
def search(self,
           query,
           n1=100,
           n2=100000,
           sort_by='relevant',
           use_lucene_syntax=False,
           min_score_of_recent_sorting=0.4,
           min_date_published=None):
    """Return the matched articles from lucene.

    Parameters
    ----------
    query : string
        The query string.
    n1 : int
        How many results are finally returned.
    n2 : int
        How many search results are returned when sorting by recent.
    sort_by : string
        {'relevant', 'recent'}, the sorting order when doing lucene
        searching.
    min_score_of_recent_sorting : float
        The minimum score when sorting by 'recent'.
    min_date_published : datetime
        The minimum date_published when filtering lucene searching
        results.

    Returns
    -------
    tuple
        (total_hits, df), where total_hits represents the total number
        of hits and df is a pandas.DataFrame object. df.columns = ['id',
        'canonical_url', 'title', 'date_published', 'domain',
        'site_type', 'score']
    """
    if min_date_published is not None:
        dt2 = datetime.utcnow()
        if isinstance(min_date_published, datetime):
            dt1 = min_date_published
        elif isinstance(min_date_published, str):
            dt1 = utc_from_str(min_date_published)
        q_dates = self.query_between_dates(dt1, dt2)
    try:
        if use_lucene_syntax is False:
            query = clean_query(query)
        q = self.mul_parser.parse(self.mul_parser, query)
        logger.warning(q)
        if 'date_published:' in query:
            end = query.find('AND date_published')
            q_without_date_published = query[:end]
            logger.warning(q_without_date_published)
            q = self.mul_parser.parse(self.mul_parser,
                                      q_without_date_published)
            date_published_splits = query.split('date_published:[')
            date_range = date_published_splits[-1]
            date_range = date_range[:-1]
            logger.warning(date_range)
            if 'TO' in date_range:
                date_range_splits = date_range.split('TO')
                dt1_string = date_range_splits[0]
                # handle a trailing wildcard if present
                if '*' in dt1_string:
                    date1_end = dt1_string.find('*') - 1
                    dt1_string = dt1_string[:date1_end]
                logger.warning(dt1_string)
                dt1 = utc_from_str(dt1_string)
                dt2_string = date_range_splits[1]
                if '*' in dt2_string:
                    date2_end = dt2_string.find('*') - 1
                    dt2_string = dt2_string[:date2_end]
                logger.warning(dt2_string)
                dt2 = utc_from_str(dt2_string)
                query_dates = self.query_between_dates(dt1, dt2)
                q = combine_queries(q, query_dates)
        if min_date_published is not None:
            q = combine_queries(q, q_dates)
        logger.warning('Parsed query: %s', q)
    except Exception as e:
        logger.error(e)
        if use_lucene_syntax is True:
            raise APIParseError("""Error when parsing the query string! \
You are querying with lucene syntax, be careful with your query string!""")
        else:
            raise APIParseError('Error when parsing the query string!')
    cnames = [
        'id', 'canonical_url', 'title', 'date_published', 'domain',
        'site_type', 'score'
    ]
    if sort_by == 'relevant':
        top_docs = self.isearcher.search(q, n1)
        score_docs = top_docs.scoreDocs
        total_hits = top_docs.totalHits
        if total_hits == 0:
            df = pd.DataFrame()
        else:
            records = [self.fetch_one_doc(sd) for sd in score_docs]
            # Index in each record of canonical URL and title
            canonical_url, title = 1, 2
            # Store 2-tuples of (canonical URL, article title) as keys
            # in a dict, then turn back to a list
            unique_docs = dict()
            for record in records:
                key = (record[canonical_url], record[title])
                if key not in unique_docs:
                    unique_docs[key] = record
            # Include only unique records
            records = list(unique_docs.values())
            df = pd.DataFrame(records, columns=cnames)
            df['date_published'] = pd.to_datetime(df['date_published'])
        return total_hits, df
    elif sort_by == 'recent':
        counter = 0
        records = []
        top_field_docs = self.isearcher.search(q, n2,
                                               self.sort_by_recent, True,
                                               True)
        if top_field_docs.maxScore >= min_score_of_recent_sorting:
            for sd in top_field_docs.scoreDocs:
                if sd.score >= min_score_of_recent_sorting:
                    records.append(self.fetch_one_doc(sd))
                    counter += 1
                    if counter == n1:
                        break
        if counter == 0:
            df = pd.DataFrame()
        else:
            df = pd.DataFrame(records, columns=cnames)
            df['date_published'] = pd.to_datetime(df['date_published'])
        return counter, df
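# Usage sketch (assumption: `searcher` is an instance of the enclosing
# class over a built Lucene index). With use_lucene_syntax=True, a
# date_published range is split off the query string and re-applied as
# a date-range query, as implemented above:

total_hits, df = searcher.search(
    'pizzagate AND date_published:[2016-11-01 TO 2016-12-01]',
    n1=50,
    sort_by='relevant',
    use_lucene_syntax=True)
# df has columns ['id', 'canonical_url', 'title', 'date_published',
# 'domain', 'site_type', 'score'], deduplicated on (canonical_url, title)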
def _parse_l2(self, jd):
    """Second level parsing; the main job is to build parsed objects
    for the most complicated table: twitter_network_edge.

    Parameters
    ----------
    jd : json
        A tweet JSON object.
    """
    #
    # Make sure class members reused across tweets are flushed
    #
    self.full_user = list()
    self.edges = set()
    # start parsing
    tweet_raw_id = jd['id']
    user_raw_id = jd['user']['id']
    user_screen_name = jd['user']['screen_name']
    self.created_at = utc_from_str(jd['created_at'])
    # add this user as full_user
    self.full_user.append(
        (user_raw_id, user_screen_name, jd['user']['followers_count'],
         jd['user'], self.created_at))
    quoted_status_id = None
    retweeted_status_id = None
    if 'quoted_status' in jd:
        quoted_user_id = jd['quoted_status']['user']['id']
        quoted_screen_name = jd['quoted_status']['user']['screen_name']
        quoted_status_id = jd['quoted_status']['id']
        self.full_user.append(
            (quoted_user_id, quoted_screen_name,
             jd['quoted_status']['user']['followers_count'],
             jd['quoted_status']['user'], self.created_at))
    if 'retweeted_status' in jd:
        retweeted_user_id = jd['retweeted_status']['user']['id']
        retweeted_screen_name = jd['retweeted_status']['user'][
            'screen_name']
        retweeted_status_id = jd['retweeted_status']['id']
        self.full_user.append(
            (retweeted_user_id, retweeted_screen_name,
             jd['retweeted_status']['user']['followers_count'],
             jd['retweeted_status']['user'], self.created_at))
    in_reply_to_status_id = jd['in_reply_to_status_id']
    in_reply_to_user_id = jd['in_reply_to_user_id']
    in_reply_to_screen_name = jd['in_reply_to_screen_name']
    if in_reply_to_user_id is not None and\
            in_reply_to_screen_name is not None:
        self.in_reply_to_user = (in_reply_to_user_id,
                                 in_reply_to_screen_name)
    self.ass_tweet = (tweet_raw_id, retweeted_status_id,
                      quoted_status_id, in_reply_to_status_id)
    #
    # Building edges
    #
    # 2-1) retweet, focusing on retweeted_status
    # edge direction: from retweeted_user to current user
    if retweeted_status_id is not None:
        logger.debug('2-1-a) building edges for retweet=%s',
                     tweet_raw_id)
        for u in self.urls['retweet']:
            self.edges.add((tweet_raw_id, retweeted_user_id, user_raw_id,
                            u, False, False, 'retweet'))
    # 2-2) reply, focusing on current status
    # edge direction: from current user to mentions
    if in_reply_to_status_id is not None:
        logger.debug('2-1-b) building edges for reply=%s', tweet_raw_id)
        # in_reply_to_user, edge
        for url in self.urls['this']:
            self.edges.add((tweet_raw_id, user_raw_id,
                            in_reply_to_user_id, url, False, False,
                            'reply'))
        # mentions, edges
        for mention_id, mention_screen_name in self.mentions['this']:
            if mention_id != in_reply_to_user_id:
                for url in self.urls['this']:
                    self.edges.add((tweet_raw_id, user_raw_id,
                                    mention_id, url, False, True,
                                    'reply'))
    # 2-3) quote
    if quoted_status_id is not None:
        # 2-3-1) retweeted quote, focusing on quoted_status
        # treated as a retweet edge
        if retweeted_status_id is not None:
            logger.debug(
                '2-1-c) building edges for the quote of a retweet=%s',
                tweet_raw_id)
            for url in self.urls['quote']:
                self.edges.add((tweet_raw_id, retweeted_user_id,
                                user_raw_id, url, True, False,
                                'retweet'))
        # 2-3-2) replied quote, focusing on quoted_status
        # treated as a reply edge
        elif in_reply_to_status_id is not None:
            logger.debug(
                '2-1-c) building edges for the quote of a reply=%s',
                tweet_raw_id)
            # in_reply_to_user, edges for quoted urls
            for url in self.urls['quote']:
                self.edges.add(
                    (tweet_raw_id, user_raw_id, in_reply_to_user_id,
                     url, True, False, 'reply'))
            # mentions, edges for quoted urls
            for mention_id, mention_screen_name in self.mentions['this']:
                if mention_id != in_reply_to_user_id:
                    for url in self.urls['quote']:
                        self.edges.add(
                            (tweet_raw_id, user_raw_id, mention_id,
                             url, True, True, 'reply'))
        # 2-3-3) pure quote
        else:
            logger.debug('2-1-c) building edges for a pure quote=%s',
                         tweet_raw_id)
            # a. information edges: from quoted_user to this_user,
            #    for urls added by the quoted user
            for url in self.urls['quote']:
                self.edges.add((tweet_raw_id, quoted_user_id,
                                user_raw_id, url, True, False, 'quote'))
            # b. information edges: from this_user to the mentioned
            #    users of this_user, for urls added both by this user
            #    and by the quoted user
            for mention_id, mention_screen_name in self.mentions['this']:
                for url in self.urls['quote']:
                    self.edges.add((tweet_raw_id, user_raw_id,
                                    mention_id, url, True, True,
                                    'quote'))
                for url in self.urls['this']:
                    self.edges.add((tweet_raw_id, user_raw_id,
                                    mention_id, url, False, True,
                                    'quote'))
    # 2-4) original tweet
    if retweeted_status_id is None and in_reply_to_status_id is None\
            and quoted_status_id is None:
        logger.debug('2-1-d) building edges for original tweet=%s',
                     tweet_raw_id)
        for mention_id, mention_screen_name in self.mentions['this']:
            for url in self.urls['this']:
                self.edges.add((tweet_raw_id, user_raw_id, mention_id,
                                url, False, True, 'origin'))
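# For reference, every tuple added to self.edges above follows one
# layout; a namedtuple makes the field order explicit (the field names
# are descriptive, not from the source, which stores plain tuples):

from collections import namedtuple

Edge = namedtuple('Edge', [
    'tweet_raw_id', 'from_raw_id', 'to_raw_id', 'url', 'is_quoted_url',
    'is_mention', 'tweet_type'
])
# e.g. the retweeted-quote case in 2-3-1: the quoted URL flows from the
# retweeted user to the current user
example_edge = Edge(1001, 42, 24, 'http://example.com/a', True, False,
                    'retweet')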
def search(self,
           query,
           n1=100,
           n2=100000,
           sort_by='relevant',
           use_lucene_syntax=False,
           min_score_of_recent_sorting=0.4,
           min_date_published=None):
    """Return the matched articles from lucene.

    Parameters
    ----------
    query : string
        The query string.
    n1 : int
        How many results are finally returned.
    n2 : int
        How many search results are returned when sorting by recent.
    sort_by : string
        {'relevant', 'recent'}, the sorting order when doing lucene
        searching.
    min_score_of_recent_sorting : float
        The minimum score when sorting by 'recent'.
    min_date_published : datetime
        The minimum date_published when filtering lucene searching
        results.

    Returns
    -------
    tuple
        (total_hits, df), where total_hits represents the total number
        of hits and df is a pandas.DataFrame object. df.columns = ['id',
        'canonical_url', 'title', 'date_published', 'domain',
        'site_type', 'score']
    """
    if min_date_published is not None:
        dt2 = datetime.utcnow()
        if isinstance(min_date_published, datetime):
            dt1 = min_date_published
        elif isinstance(min_date_published, basestring):
            dt1 = utc_from_str(min_date_published)
        sf = self.prepare_chained_filter(dt1, dt2)
    else:
        sf = self.dup_filter
    try:
        if use_lucene_syntax is False:
            query = clean_query(query)
        q = self.mul_parser.parse(self.mul_parser, query)
        logger.debug('Parsed query: %s', q)
    except Exception as e:
        logger.error(e)
        if use_lucene_syntax is True:
            raise APIParseError("""Error when parsing the query string! \
You are querying with lucene syntax, be careful with your query string!""")
        else:
            raise APIParseError('Error when parsing the query string!')
    cnames = [
        'id', 'canonical_url', 'title', 'date_published', 'domain',
        'site_type', 'score'
    ]
    if sort_by == 'relevant':
        top_docs = self.isearcher.search(q, sf, n1)
        score_docs = top_docs.scoreDocs
        total_hits = top_docs.totalHits
        if total_hits == 0:
            df = pd.DataFrame()
        else:
            records = [self.fetch_one_doc(sd) for sd in score_docs]
            df = pd.DataFrame(records, columns=cnames)
            df['date_published'] = pd.to_datetime(df['date_published'])
        return total_hits, df
    elif sort_by == 'recent':
        counter = 0
        records = []
        top_field_docs = self.isearcher.search(q, sf, n2,
                                               self.sort_by_recent, True,
                                               True)
        if top_field_docs.maxScore >= min_score_of_recent_sorting:
            for sd in top_field_docs.scoreDocs:
                if sd.score >= min_score_of_recent_sorting:
                    records.append(self.fetch_one_doc(sd))
                    counter += 1
                    if counter == n1:
                        break
        if counter == 0:
            df = pd.DataFrame()
        else:
            df = pd.DataFrame(records, columns=cnames)
            df['date_published'] = pd.to_datetime(df['date_published'])
        return counter, df
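# Usage sketch for the filter-based variant above (assumption:
# `searcher` as before). Sorting by 'recent' first pulls up to n2 hits
# in date order, then keeps at most n1 whose relevance score clears
# min_score_of_recent_sorting:

counter, df = searcher.search(
    'vaccines',
    n1=20,
    sort_by='recent',
    min_score_of_recent_sorting=0.4,
    min_date_published='2017-01-01')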
def parse_new_one(self, jd, session, g_urls_map, g_uusers_set,
                  g_edges_set):
    # validate jd
    jd = replace_null_byte(jd)
    try:
        tw_raw_id = jd['id']
        created_at = utc_from_str(jd['created_at'])
        user_raw_id = jd['user']['id']
    except KeyError as e:
        logger.error('Invalid tweet: %s', e)
        return None
    # parsing, level 1
    l_urls, l_mentions, l_hashtags = self._parse_l1(jd)
    if len(l_urls['union']) == 0 and self.save_none_url_tweet is False:
        logger.warning('Ignore tweet %r with no urls!', tw_raw_id)
        return None
    # saving, level 1
    logger.debug('Saving this user ...')
    muser = get_or_create_m(session,
                            TwitterUser,
                            data=dict(raw_id=user_raw_id),
                            fb_uk='raw_id')
    logger.debug('Saving this tweet ...')
    muser_id = muser.id
    mtweet = Tweet(raw_id=tw_raw_id,
                   json_data=jd,
                   created_at=created_at,
                   user_id=muser_id)
    session.add(mtweet)
    try:
        session.commit()
        logger.debug('Inserted tweet %r', tw_raw_id)
    except IntegrityError as e:
        logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
        session.rollback()
        return None
    mtweet_id = mtweet.id
    logger.debug('Saving AssTweet ...')
    retweeted_status_id = None
    quoted_status_id = None
    if 'quoted_status' in jd:
        quoted_status_id = jd['quoted_status']['id']
    if 'retweeted_status' in jd:
        retweeted_status_id = jd['retweeted_status']['id']
    in_reply_to_status_id = jd['in_reply_to_status_id']
    session.add(
        AssTweet(id=mtweet_id,
                 retweeted_status_id=retweeted_status_id,
                 quoted_status_id=quoted_status_id,
                 in_reply_to_status_id=in_reply_to_status_id))
    try:
        session.commit()
    except IntegrityError as e:
        logger.warning(e)
        session.rollback()
    logger.debug('Saving urls ...')
    for u in l_urls['union']:
        if len(u) > MAX_URL_LEN:
            murl_id = -1
        else:
            murl_id = get_or_create_murl(
                session, data=dict(raw=u),
                platform_id=self.platform_id).id
            # saving ass_tweet_url
            session.add(AssTweetUrl(tweet_id=mtweet_id, url_id=murl_id))
            try:
                session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_url IntegrityError, see: %s', e)
                session.rollback()
        g_urls_map[u] = murl_id
    # creating hashtags
    logger.debug('creating hashtags ...')
    for hashtag in l_hashtags['union']:
        mhashtag = get_or_create_m(session,
                                   Hashtag,
                                   data=dict(text=hashtag),
                                   fb_uk='text')
        session.add(
            AssTweetHashtag(tweet_id=mtweet_id, hashtag_id=mhashtag.id))
        try:
            session.commit()
        except IntegrityError as e:
            logger.error('ass_tweet_hashtag IntegrityError, see: %s', e)
            session.rollback()
    self._parse_l2(jd, l_urls, l_mentions, g_urls_map, g_uusers_set,
                   g_edges_set)
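# Usage sketch for parse_new_one (assumptions: `parser` defines it and
# `tweet_stream` is any iterable of tweet JSON objects). The g_*
# arguments are shared accumulators that outlive a single call, so the
# users and edges collected by _parse_l2 can be bulk-saved afterwards:

g_urls_map = dict()   # url -> url table id (-1 for over-long urls)
g_uusers_set = set()  # (user_raw_id, screen_name) tuples
g_edges_set = set()   # twitter_network_edge tuples
for jd in tweet_stream:
    parser.parse_new_one(jd, session, g_urls_map, g_uusers_set,
                         g_edges_set)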
def parse(self, jd):
    """The main parse function.

    Parameters
    ----------
    jd : json
        Tweet JSON data.

    Procedures
    ----------
    1) roughly parse to validate `jd`
    2) carefully parse and insert into database
    3) other associations
    """
    logger.debug('Parsing one tweet, begin ...')
    #
    # 1) rough parsing to validate the tweet
    #
    # 1-1) parse the necessary fields; if this fails, it is not a valid
    #      tweet
    logger.debug('Replacing null byte if existing ...')
    jd = replace_null_byte(jd, self.fp)
    logger.debug('1) Roughly parsing ...')
    try:
        tw_raw_id = jd['id']
        created_at = utc_from_str(jd['created_at'])
        user_raw_id = jd['user']['id']
    except KeyError as e:
        logger.error('Invalid tweet: %s', e)
        return None
    # 1-2) rough parsing
    entities_list = []
    quoted_status_id = None
    retweeted_status_id = None
    if 'entities' in jd:
        entities_list.append(jd['entities'])
    if 'quoted_status' in jd:
        quoted_jd = jd['quoted_status']
        quoted_user_jd = jd['quoted_status']['user']
        quoted_status_id = quoted_jd['id']
        if 'entities' in quoted_jd:
            entities_list.append(quoted_jd['entities'])
    if 'retweeted_status' in jd:
        retweeted_jd = jd['retweeted_status']
        retweeted_user_jd = jd['retweeted_status']['user']
        retweeted_status_id = retweeted_jd['id']
        if 'entities' in retweeted_jd:
            entities_list.append(retweeted_jd['entities'])
    in_reply_to_status_id = jd['in_reply_to_status_id']
    in_reply_to_user_id = jd['in_reply_to_user_id']
    in_reply_to_screen_name = jd['in_reply_to_screen_name']
    urls_set = set()
    hashtags_set = set()
    mentions_set = set()
    for entities in entities_list:
        if entities:
            self._parse_entities(entities, urls_set, hashtags_set,
                                 mentions_set)
    # This tweet should contain urls
    if len(urls_set) == 0 and self.save_none_url_tweet is False:
        logger.warning('No url found in tweet %s, ignore!', tw_raw_id)
        return None
    #
    # 2) careful parsing and saving into the database
    #
    logger.debug('2) Carefully parsing and saving ...')
    logger.debug('2-0) Saving twitter_user raw_id=%s ...', user_raw_id)
    muser = get_or_create_m(self.session,
                            TwitterUser,
                            data=dict(raw_id=user_raw_id),
                            fb_uk='raw_id')
    logger.debug('Saving this user into twitter_user_union as well ...')
    create_or_update_muser(
        self.session,
        data=dict(raw_id=user_raw_id,
                  screen_name=jd['user']['screen_name'],
                  followers_count=jd['user']['followers_count'],
                  profile=jd['user'],
                  updated_at=created_at))
    # creating tweet
    logger.debug('2-0) Saving tweet raw_id=%s ...', tw_raw_id)
    if self.saved_tweet is True:
        mtweet = self.session.query(Tweet).filter_by(
            raw_id=tw_raw_id).one()
    else:
        mtweet = Tweet(raw_id=tw_raw_id,
                       json_data=jd,
                       created_at=created_at,
                       user_id=muser.id)
        self.session.add(mtweet)
        try:
            self.session.commit()
            logger.debug('Inserted tweet %r', tw_raw_id)
        except IntegrityError as e:
            logger.warning('Tweet %s existed in db: %s', tw_raw_id, e)
            self.session.rollback()
            return None
    tweet_id = mtweet.id
    # Saving all urls and mapping the saved ids
    url_map = dict()
    logger.debug('2-0) Saving all urls and associating with tweet ...')
    for url in urls_set:
        murl = get_or_create_murl(self.session,
                                  data=dict(raw=url),
                                  platform_id=self.platform_id)
        url_map[url] = murl.id
        # saving ass_tweet_url
        if self.saved_tweet is False:
            self.session.add(
                AssTweetUrl(tweet_id=tweet_id, url_id=url_map[url]))
            try:
                self.session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_url IntegrityError, see: %s', e)
                self.session.rollback()
    # 2-1) retweet, focusing on retweeted_status
    # edge direction: from retweeted_user to current user
    if retweeted_status_id is not None:
        logger.debug(
            '2-1-a) Saving the retweeted user into twitter_user_union ...')
        retweeted_user_id = retweeted_user_jd['id']
        retweeted_screen_name = retweeted_user_jd['screen_name']
        create_or_update_muser(
            self.session,
            data=dict(raw_id=retweeted_user_id,
                      screen_name=retweeted_screen_name,
                      followers_count=retweeted_user_jd['followers_count'],
                      profile=retweeted_user_jd,
                      updated_at=created_at))
        # the retweeted user has been saved above and should be removed
        # from mentions
        try:
            mentions_set.remove((retweeted_user_id,
                                 retweeted_screen_name))
        except KeyError:
            logger.warning('Tweet %r: retweeted user not in mentions',
                           tw_raw_id)
        logger.debug('2-1-a) Saving edges for retweet ...')
        self._save_edges(url_map,
                         retweeted_jd['entities'],
                         tweet_id,
                         tw_raw_id,
                         from_raw_id=retweeted_user_id,
                         to_raw_id=user_raw_id,
                         is_quoted_url=False,
                         is_mention=False,
                         tweet_type='retweet')
    # 2-2) reply, focusing on current status
    # edge direction: from current user to mentions
    if in_reply_to_status_id is not None:
        # mentioned users will be saved later
        logger.debug('2-1-b) Saving edges for reply ...')
        # in_reply_to_user
        self._save_edges(url_map,
                         jd['entities'],
                         tweet_id,
                         tw_raw_id,
                         from_raw_id=user_raw_id,
                         to_raw_id=in_reply_to_user_id,
                         is_quoted_url=False,
                         is_mention=False,
                         tweet_type='reply')
        # mentions
        for m in jd['entities']['user_mentions']:
            to_raw_id = m.get('id')
            if to_raw_id and to_raw_id != in_reply_to_user_id:
                self._save_edges(url_map,
                                 jd['entities'],
                                 tweet_id,
                                 tw_raw_id,
                                 from_raw_id=user_raw_id,
                                 to_raw_id=to_raw_id,
                                 is_quoted_url=False,
                                 is_mention=True,
                                 tweet_type='reply')
    # 2-3) quote
    if quoted_status_id is not None:
        logger.debug(
            '2-1-c) Saving the quoted user into twitter_user_union ...')
        quoted_user_id = quoted_user_jd['id']
        quoted_screen_name = quoted_user_jd['screen_name']
        create_or_update_muser(
            self.session,
            data=dict(raw_id=quoted_user_id,
                      screen_name=quoted_screen_name,
                      followers_count=quoted_user_jd['followers_count'],
                      profile=quoted_user_jd,
                      updated_at=created_at))
        # 2-3-1) retweeted quote, focusing on quoted_status
        # treated as a retweet edge
        if retweeted_status_id is not None:
            logger.debug(
                '2-1-c) Saving edges for the quoting part of a retweet ...')
            self._save_edges(url_map,
                             quoted_jd['entities'],
                             tweet_id,
                             tw_raw_id,
                             from_raw_id=retweeted_user_jd['id'],
                             to_raw_id=user_raw_id,
                             is_quoted_url=True,
                             is_mention=False,
                             tweet_type='retweet')
        # 2-3-2) replied quote, focusing on quoted_status
        # treated as a reply edge
        elif in_reply_to_status_id is not None:
            logger.debug(
                '2-1-c) Saving edges for the quoting part of a reply ...')
            # in_reply_to_user
            self._save_edges(url_map,
                             quoted_jd['entities'],
                             tweet_id,
                             tw_raw_id,
                             from_raw_id=user_raw_id,
                             to_raw_id=in_reply_to_user_id,
                             is_quoted_url=True,
                             is_mention=False,
                             tweet_type='reply')
            # mentions
            for m in jd['entities']['user_mentions']:
                to_raw_id = m.get('id')
                if to_raw_id and to_raw_id != in_reply_to_user_id:
                    self._save_edges(url_map,
                                     quoted_jd['entities'],
                                     tweet_id,
                                     tw_raw_id,
                                     from_raw_id=user_raw_id,
                                     to_raw_id=to_raw_id,
                                     is_quoted_url=True,
                                     is_mention=True,
                                     tweet_type='reply')
        # 2-3-3) pure quote
        else:
            logger.debug(
                '2-1-c) Saving edge for the pure quote part of a quote ...')
            self._save_edges(url_map,
                             quoted_jd['entities'],
                             tweet_id,
                             tw_raw_id,
                             from_raw_id=quoted_user_jd['id'],
                             to_raw_id=user_raw_id,
                             is_quoted_url=True,
                             is_mention=False,
                             tweet_type='quote')
            logger.debug(
                '2-1-c) Saving edges for the original part of a quote ...')
            for m in jd['entities']['user_mentions']:
                to_raw_id = m.get('id')
                if to_raw_id:
                    self._save_edges(url_map,
                                     jd['entities'],
                                     tweet_id,
                                     tw_raw_id,
                                     from_raw_id=user_raw_id,
                                     to_raw_id=to_raw_id,
                                     is_quoted_url=False,
                                     is_mention=True,
                                     tweet_type='quote')
    # 2-4) original tweet
    if retweeted_status_id is None and in_reply_to_status_id is None\
            and quoted_status_id is None and 'entities' in jd and\
            'user_mentions' in jd['entities']:
        logger.debug('2-1-d) Saving edges for original tweet ...')
        for m in jd['entities']['user_mentions']:
            to_raw_id = m.get('id')
            if to_raw_id:
                self._save_edges(url_map,
                                 jd['entities'],
                                 tweet_id,
                                 tw_raw_id,
                                 from_raw_id=user_raw_id,
                                 to_raw_id=to_raw_id,
                                 is_quoted_url=False,
                                 is_mention=True,
                                 tweet_type='origin')
    # saving all mentions ...
    logger.debug('3) Saving all mentions ...')
    # add the in_reply_to_user
    if in_reply_to_user_id is not None and\
            in_reply_to_screen_name is not None:
        mentions_set.add((in_reply_to_user_id, in_reply_to_screen_name))
    for m_raw_id, m_screen_name in mentions_set:
        create_or_update_muser(self.session,
                               data=dict(raw_id=m_raw_id,
                                         screen_name=m_screen_name,
                                         updated_at=created_at))
    # saving hashtags
    logger.debug('3) creating hashtags')
    if self.saved_tweet is False:
        for hashtag in hashtags_set:
            mhashtag = get_or_create_m(self.session,
                                       Hashtag,
                                       data=dict(text=hashtag),
                                       fb_uk='text')
            self.session.add(
                AssTweetHashtag(tweet_id=tweet_id,
                                hashtag_id=mhashtag.id))
            try:
                self.session.commit()
            except IntegrityError as e:
                logger.error('ass_tweet_hashtag IntegrityError, see: %s',
                             e)
                self.session.rollback()
    # saving associated tweet statuses
    logger.debug('3) Saving ass_tweet ...')
    if self.saved_tweet is False:
        create_m(self.session,
                 AssTweet,
                 data=dict(id=tweet_id,
                           retweeted_status_id=retweeted_status_id,
                           quoted_status_id=quoted_status_id,
                           in_reply_to_status_id=in_reply_to_status_id))
    logger.debug('Parsing one tweet, done.')
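# A minimal sketch of the null-byte scrubbing that replace_null_byte
# performs before parsing (an assumption; the real helper may differ).
# PostgreSQL json columns reject the \u0000 escape, which is why both
# parse() variants need it; this mirrors the SQL regexp_replace fix-up
# used in the first parse() above:

import json

def replace_null_byte_sketch(jd):
    # Escape literal \u0000 sequences in the serialized JSON so that
    # PostgreSQL accepts the re-parsed document.
    return json.loads(json.dumps(jd).replace('\\u0000', '\\\\u0000'))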