import tweepy
from SolrClient import SolrClient
# 'iu' is the project's indexing-utility module (provides solr_url,
# solr_core_tweets and commit); its import path is project-specific.


class TwitterSearch:
    __solr = None
    __core = None
    __api = None

    def __init__(self, oauth):
        super().__init__()
        self.__solr = SolrClient(iu.solr_url)
        self.__core = iu.solr_core_tweets
        self.__api = tweepy.API(oauth)

    def index(self, keywords):
        for keyword in keywords:
            count = 0
            for status in tweepy.Cursor(self.__api.search, q=keyword,
                                        tweet_mode="extended",
                                        lang="en").items(500):
                count += 1
                # format the tweet's (UTC) creation time for Solr
                str_solr_time = status.created_at.strftime(SOLR_TIME_PATTERN)
                docs = [{
                    'id': status.id,
                    'created_at': str_solr_time,
                    'status_text': status.full_text
                }]
                self.__solr.index(self.__core, docs)
            print(str(count) + "," + keyword)
        code = iu.commit(iu.solr_core_tweets)
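# Usage sketch for TwitterSearch (the credential strings and keywords below
# are placeholders, not values from the original project):
oauth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
oauth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
searcher = TwitterSearch(oauth)
searcher.index(["flood", "storm"])  # fetches up to 500 English tweets per keyword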
import numpy


def update_pmi_scores(existing_tags: dict, existing_tag_pairs: dict,
                      solr: SolrClient, core_name, batch_commit):
    count = 0
    batch = []
    for tag_pair, data in existing_tag_pairs.items():
        count += 1
        if count > batch_commit:
            solr.index(core_name, batch)
            code = util.commit(core_name)
            count = 0
            batch = []
            logger.info("\t done batch size={}".format(batch_commit))

        co_freq = data[util.tag_index_field_frequency]
        tags = tag_pair.split(" ")
        t1_freq = existing_tags[tags[0]][util.tag_index_field_frequency]
        t2_freq = existing_tags[tags[1]][util.tag_index_field_frequency]

        # PMI-style association score; the small constant in the denominator
        # guards against division by zero
        if co_freq == 0:
            pmi = 0
        else:
            pmi = numpy.emath.log(
                co_freq / (t1_freq * t2_freq + util.score_denominator_min))
        data[util.tag_index_field_pmi] = pmi
        data[util.tag_index_field_text] = tag_pair
        data[util.tag_index_field_type] = 1
        batch.append(data)

    # commit the rest
    solr.index(core_name, batch)
    code = util.commit(core_name)
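# Toy check of the PMI-style score computed above. EPS is an assumption
# standing in for util.score_denominator_min (a tiny smoothing constant).
import numpy

EPS = 1e-11
co_freq, t1_freq, t2_freq = 8, 20, 10  # hypothetical co-occurrence and tag frequencies
pmi = numpy.emath.log(co_freq / (t1_freq * t2_freq + EPS))
print(pmi)  # log(8 / 200) ~ -3.22; less negative means a stronger association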
import math


def update_tagrisk_scores(existing_tags: dict, solr: SolrClient, core_name,
                          batch_commit):
    count = 0
    batch = []
    for tag, data in existing_tags.items():
        count += 1
        if count > batch_commit:
            solr.index(core_name, batch)
            code = util.commit(core_name)
            count = 0
            batch = []
            logger.info("\t done batch size={}".format(batch_commit))

        freq = data[util.tag_index_field_frequency]
        freqh = data[util.tag_index_field_frequencyh]

        # risk score: square root of the tag's share of hateful occurrences
        if freqh == 0:
            riskscore = 0
        else:
            riskscore = math.sqrt(freqh / (freq + util.score_denominator_min))
        data[util.tag_index_field_risk_score] = riskscore
        data[util.tag_index_field_text] = tag
        data[util.tag_index_field_type] = 0
        batch.append(data)

    # commit the rest
    solr.index(core_name, batch)
    code = util.commit(core_name)
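# Toy check of the risk score above. EPS again stands in for
# util.score_denominator_min (assumed), and the frequencies are hypothetical.
import math

EPS = 1e-11
freq, freqh = 50, 8  # total occurrences vs. occurrences in hateful contexts
riskscore = math.sqrt(freqh / (freq + EPS))
print(riskscore)  # sqrt(8 / 50) ~ 0.4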
def update_ml_tag(solr: SolrClient, tweets_core_name, tags_core_name, docs,
                  feat_vectorizer, ml_model, selected_features,
                  hate_indicative_features, scaling_option, sysout, logger):
    tweets = []
    for d in docs:
        text = d['status_text']
        # strip the leading "rt @" retweet marker, keeping the rest of the text
        if "rt @" in text.lower():
            start = text.lower().index("rt @") + 4
            text = text[start:].strip()
        tweets.append(text)

    # ml classify, also compute risk scores
    logger.info("begin ml classification for tweets={}, time={}".format(
        len(tweets), datetime.datetime.now()))
    tags, risk_scores = ml_tag(tweets, feat_vectorizer, ml_model,
                               selected_features, hate_indicative_features,
                               scaling_option, sysout, logger, solr,
                               tags_core_name)
    logger.info("ml classification done. updating solr index...{}".format(
        datetime.datetime.now()))

    count = 0
    for idx, tag in enumerate(tags):
        if tag == 0:
            count += 1
        d = docs[idx]
        d['ml_tag'] = str(tag)
        d['tweet_risk'] = risk_scores[idx]
    print(count)

    solr.index(tweets_core_name, docs)
    code = iu.commit(tweets_core_name)
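# Sanity check for the retweet-prefix stripping in update_ml_tag, on a
# hypothetical status text:
text = "RT @someone: original tweet text"
if "rt @" in text.lower():
    start = text.lower().index("rt @") + 4
    text = text[start:].strip()
print(text)  # -> "someone: original tweet text"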
def handle(self, *args, **options):
    total = 0
    cycle = 0
    try:
        # Retrieve the Search and Field models from the database
        solr = SolrClient(settings.SOLR_SERVER_URL)
        try:
            self.search_target = Search.objects.get(search_id=options['search'])
            self.solr_core = self.search_target.solr_core_name
            self.all_fields = Field.objects.filter(search_id=self.search_target)
            if options['nothing_to_report']:
                self.search_fields = Field.objects.filter(
                    search_id=self.search_target,
                    alt_format='ALL') | Field.objects.filter(
                        search_id=self.search_target, alt_format='NTR')
            else:
                self.search_fields = Field.objects.filter(
                    search_id=self.search_target,
                    alt_format='ALL') | Field.objects.filter(
                        search_id=self.search_target, alt_format='')
            for search_field in self.search_fields:
                self.csv_fields[search_field.field_id] = search_field
                codes = Code.objects.filter(field_id=search_field)
                # Most csv_fields will not have codes, so the queryset will be zero length
                if len(codes) > 0:
                    code_dict = {}
                    for code in codes:
                        code_dict[code.code_id.lower()] = code
                    self.field_codes[search_field.field_id] = code_dict
        except Search.DoesNotExist as x:
            self.logger.error('Search not found: "{0}"'.format(x))
            exit(-1)
        except Field.DoesNotExist as x1:
            self.logger.error('Fields not found for search: "{0}"'.format(x1))

        # Process the records in the CSV file one at a time
        with open(options['csv'], 'r', encoding='utf-8-sig', errors="ignore") as csv_file:
            csv_reader = csv.DictReader(csv_file, dialect='excel')
            solr_items = []
            for csv_record in csv_reader:
                # Clear out the Solr core on the first row
                if total == 0 and not options['nothing_to_report']:
                    solr.delete_doc_by_query(self.solr_core, "*:*")
                    print("Purging all records")
                elif total == 0 and options['nothing_to_report']:
                    solr.delete_doc_by_query(self.solr_core, "format:NTR")
                    solr.commit(self.solr_core, softCommit=True)
                    print("Purging NTR records")
                total += 1
                cycle += 1

                # Call plugins if they exist for this search type. This is where a developer
                # can introduce code to customize the data that is loaded into Solr for a
                # particular search.
                search_type_plugin = 'search.plugins.{0}'.format(options['search'])
                if search_type_plugin in self.discovered_plugins:
                    include, filtered_record = self.discovered_plugins[
                        search_type_plugin].filter_csv_record(
                            csv_record, self.search_target, self.csv_fields,
                            self.field_codes,
                            'NTR' if options['nothing_to_report'] else '')
                    if not include:
                        continue
                    else:
                        csv_record = filtered_record

                # Create a dictionary for each record loaded into Solr
                solr_record = {
                    'format': 'NTR' if options['nothing_to_report'] else 'DEFAULT'
                }
                for csv_field in csv_reader.fieldnames:
                    # Verify that it is a known field
                    if csv_field not in self.csv_fields and csv_field not in (
                            'owner_org_title', 'owner_org'):
                        self.logger.error(
                            "CSV file contains an unknown field: {0}".format(csv_field))
                        exit(-1)
                    if csv_field == 'owner_org_title':
                        continue

                    # Handle multi-valued fields here
                    if self.csv_fields[csv_field].solr_field_multivalued:
                        solr_record[csv_field] = csv_record[csv_field].split(',')
                        # Copied fields for the report cannot use multi-values, so
                        # directly populate them with the original string
                        if self.csv_fields[csv_field].solr_field_export:
                            for extra_field in self.csv_fields[
                                    csv_field].solr_field_export.split(','):
                                solr_record[extra_field] = csv_record[csv_field]
                    else:
                        solr_record[csv_field] = csv_record[csv_field]

                    # Automatically expand out dates and numbers for use with the
                    # Solr export handler
                    if self.csv_fields[csv_field].solr_field_type == 'pdate':
                        try:
                            if csv_record[csv_field]:
                                csv_date = datetime.strptime(
                                    csv_record[csv_field], '%Y-%m-%d')
                                solr_record[csv_field + '_en'] = format_date(
                                    csv_date, locale='en')
                                solr_record[csv_field + '_fr'] = format_date(
                                    csv_date, locale='fr')
                                if self.csv_fields[csv_field].is_default_year:
                                    solr_record['year'] = csv_date.year
                                if self.csv_fields[csv_field].is_default_month:
                                    solr_record['month'] = csv_date.month
                            else:
                                solr_record[csv_field + '_en'] = ''
                                solr_record[csv_field + '_fr'] = ''
                        except ValueError as x2:
                            self.logger.error('Invalid date: "{0}"'.format(x2))
                            solr_record[csv_field] = ''
                            continue
                    elif self.csv_fields[csv_field].solr_field_type in ['pint', 'pfloat']:
                        if solr_record[csv_field]:
                            if solr_record[csv_field] == '.':
                                solr_record[csv_field] = "0"
                            csv_decimal = parse_decimal(
                                solr_record[csv_field], locale='en_US')
                            if self.csv_fields[csv_field].solr_field_is_currency:
                                solr_record[csv_field + '_en'] = format_currency(
                                    csv_decimal, 'CAD', locale='en_CA')
                                solr_record[csv_field + '_fr'] = format_currency(
                                    csv_decimal, 'CAD', locale='fr_CA')
                            else:
                                solr_record[csv_field + '_en'] = format_decimal(
                                    csv_decimal, locale='en_CA')
                                solr_record[csv_field + '_fr'] = format_decimal(
                                    csv_decimal, locale='fr_CA')
                        else:
                            solr_record[csv_field + '_en'] = ''
                            solr_record[csv_field + '_fr'] = ''

                    # Look up the expanded code value from the codes dict of dicts
                    if csv_field in self.field_codes:
                        if csv_record[csv_field]:
                            if self.csv_fields[csv_field].solr_field_multivalued:
                                codes_en = []
                                codes_fr = []
                                for code_value in csv_record[csv_field].split(","):
                                    if code_value.lower() in self.field_codes[csv_field]:
                                        codes_en.append(self.field_codes[csv_field][
                                            code_value.lower()].label_en)
                                        codes_fr.append(self.field_codes[csv_field][
                                            code_value.lower()].label_fr)
                                    else:
                                        self.logger.info(
                                            "Unknown code value: {0} for field: {1}".format(
                                                code_value, csv_field))
                                solr_record[csv_field + '_en'] = codes_en
                                solr_record[csv_field + '_fr'] = codes_fr
                            else:
                                if csv_record[csv_field].lower() in self.field_codes[csv_field]:
                                    solr_record[csv_field + '_en'] = self.field_codes[
                                        csv_field][csv_record[csv_field].lower()].label_en
                                    solr_record[csv_field + '_fr'] = self.field_codes[
                                        csv_field][csv_record[csv_field].lower()].label_fr
                                else:
                                    self.logger.info(
                                        "Unknown code value: {0} for field: {1}".format(
                                            csv_record[csv_field], csv_field))

                solr_record = self.set_empty_fields(solr_record)

                # Set the Solr ID field (Nothing To Report records are excluded)
                if not options['nothing_to_report']:
                    if self.search_target.id_fields:
                        id_values = []
                        for id_field in self.search_target.id_fields.split(","):
                            id_values.append(csv_record[id_field])
                        solr_record['id'] = ",".join(id_values)
                else:
                    if 'month' in solr_record:
                        solr_record['id'] = "{0}-{1}-{2}".format(
                            solr_record['owner_org'], solr_record['year'],
                            solr_record['month'])
                    elif 'quarter' in solr_record:
                        solr_record['id'] = "{0}-{1}-{2}".format(
                            solr_record['owner_org'], solr_record['year'],
                            solr_record['quarter'])

                # Call plugins if they exist for this search type. This is where a developer
                # can introduce code to customize the data that is loaded into Solr for a
                # particular search.
                if search_type_plugin in self.discovered_plugins:
                    solr_record = self.discovered_plugins[
                        search_type_plugin].load_csv_record(
                            csv_record, solr_record, self.search_target,
                            self.csv_fields, self.field_codes,
                            'NTR' if options['nothing_to_report'] else '')

                solr_items.append(solr_record)

                # Write to Solr whenever the cycle threshold is reached
                if cycle >= self.cycle_on:
                    # try to connect to Solr up to 10 times
                    for countdown in reversed(range(10)):
                        try:
                            solr.index(self.solr_core, solr_items)
                            print("{0} rows processed".format(total))
                            cycle = 0
                            solr_items.clear()
                            break
                        except ConnectionError as cex:
                            if not countdown:
                                raise
                            print("Solr error: {0}. Waiting to try again ... {1}".format(
                                cex, countdown))
                            time.sleep((10 - countdown) * 5)

            # Write any remaining records to Solr and commit
            if cycle > 0:
                # try to connect to Solr up to 10 times
                for countdown in reversed(range(10)):
                    try:
                        solr.index(self.solr_core, solr_items)
                        total += len(solr_items)
                        print("{0} rows processed".format(cycle))
                        cycle = 0
                        solr_items.clear()
                        break
                    except ConnectionError as cex:
                        if not countdown:
                            raise
                        print("Solr error: {0}. Waiting to try again ... {1}".format(
                            cex, countdown))
                        time.sleep((10 - countdown) * 5)

        solr.commit(self.solr_core, softCommit=True, waitSearcher=True)
        print("Total rows processed: {0}".format(total))
    except Exception as x:
        self.logger.error('Unexpected Error "{0}"'.format(x))
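# Invocation sketch for this management command. The command name and search id
# are placeholders; only --search, --csv and --nothing_to_report correspond to
# options actually read by handle() above:
#
#   python manage.py import_data_csv --search mysearch --csv records.csv
#   python manage.py import_data_csv --search mysearch --csv nil_records.csv --nothing_to_report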
class TwitterStream(StreamListener):
    __solr = None
    __core = None
    __count = 0
    __count_retweet = 0
    __google_maps = None

    def __init__(self, google_api_key):
        super().__init__()
        self.__solr = SolrClient(iu.solr_url)
        self.__core = iu.solr_core_tweets
        self.__google_maps = GoogleMaps(api_key=google_api_key)
        # self.__ml_model = util.load_ml_model(ml_model_file)
        # self.__selected_features = mutil.read_preselected_features(False, ml_selected_features)

    def ignoreRetweet(self, status_text):
        if "rt @" in status_text.lower():
            self.__count_retweet += 1
            if IGNORE_RETWEETS:
                return True
        return False

    def on_data(self, data):
        self.__count += 1
        # print(self.__count)
        if self.__count % 200 == 0:
            code = iu.commit(iu.solr_core_tweets)
            now = datetime.datetime.now()
            print("{} processed: {}, where {} are retweets".format(
                now, self.__count, self.__count_retweet))
            logger.info("{} processed: {}, where {} are retweets".format(
                now, self.__count, self.__count_retweet))
        jdata = None
        try:
            jdata = json.loads(data)
            if jdata is not None and "id" in jdata.keys() \
                    and not self.ignoreRetweet(jdata["text"]):
                # created_at_time: parse Twitter's (UTC) timestamp and
                # reformat it for Solr
                str_created_at = jdata["created_at"]
                created_time = datetime.datetime.strptime(str_created_at,
                                                          TWITTER_TIME_PATTERN)
                str_solr_time = created_time.strftime(SOLR_TIME_PATTERN)

                # entities hashtags
                hashtags = jdata["entities"]["hashtags"]
                hashtag_list = []
                for hashtag in hashtags:
                    hashtag_list.append(hashtag["text"].lower())

                # entities urls
                urls = jdata["entities"]["urls"]
                url_list = []
                for url in urls:
                    url_list.append(url["expanded_url"])

                # entities symbols
                symbols = jdata["entities"]["symbols"]
                symbols_list = []
                for symbol in symbols:
                    symbols_list.append(symbol["text"])

                # entities user_mentions
                user_mentions = jdata["entities"]["user_mentions"]
                user_mention_list = []
                for um in user_mentions:
                    user_mention_list.append(um["id"])

                # quoted status id if exists
                if "quoted_status_id" in jdata:
                    quoted_status_id = jdata["quoted_status_id"]
                else:
                    quoted_status_id = None

                # place exists
                place = jdata["place"]
                if place is not None:
                    place_full_name = place["full_name"]
                    place_coordinates = place['bounding_box']['coordinates'][0][0]
                else:
                    place_full_name = None
                    place_coordinates = None

                coordinates = jdata["coordinates"]

                # user_location, only compute geocode if other means have failed
                geocode_coordinates_of_user_location = []
                str_user_loc = jdata["user"]["location"]
                if str_user_loc is not None and "," in str_user_loc:
                    str_user_loc = str_user_loc.split(",")[0].strip()
                if str_user_loc is not None and len(str_user_loc) < 25 \
                        and coordinates is None and place_full_name is None:
                    geocode_obj = None
                    if str_user_loc in LOCATION_COORDINATES.keys():
                        geocode_obj = LOCATION_COORDINATES[str_user_loc]
                    else:
                        # currently the api for getting geo codes seems to be unstable
                        try:
                            geocode_obj = self.__google_maps.search(
                                location=str_user_loc)
                            if geocode_obj is not None:
                                geocode_obj = geocode_obj.first()
                            LOCATION_COORDINATES[str_user_loc] = geocode_obj
                            if geocode_obj is not None:
                                geocode_coordinates_of_user_location.append(
                                    geocode_obj.lat)
                                geocode_coordinates_of_user_location.append(
                                    geocode_obj.lng)
                        except Exception as exc:
                            # traceback.print_exc(file=sys.stdout)
                            print("\t\t gmap error for {}: {}".format(
                                str_user_loc, exc))
                            logger.error("\t\t gmap: {}".format(str_user_loc))
                            # fall back to GeoPy when the Google Maps lookup fails
                            try:
                                geocode_obj = geolocator.geocode(str_user_loc)
                                LOCATION_COORDINATES[str_user_loc] = geocode_obj
                                if geocode_obj is not None:
                                    geocode_coordinates_of_user_location.append(
                                        geocode_obj.latitude)
                                    geocode_coordinates_of_user_location.append(
                                        geocode_obj.longitude)
                            except Exception as exc:
                                # traceback.print_exc(file=sys.stdout)
                                print("\t\t GeoPy error for {}: {}".format(
                                    str_user_loc, exc))
                                logger.error("\t\t GeoPy {}".format(str_user_loc))

                # ml_tag = util.ml_tag(jdata['text'], feat_vectorizer, self.__ml_model,
                #                      self.__selected_features, SCALING_STRATEGY,
                #                      self.__sysout, logger)
                # placeholder classification and risk score until the ML model is wired in
                ml_tag = '0' if random.random() < 0.2 else '2'
                tweet_risk = random.uniform(0, 1.0)

                if coordinates is None:
                    coordinates = place_coordinates
                if coordinates is None:
                    coordinates = geocode_coordinates_of_user_location

                coord_lat = None
                coord_lon = None
                if coordinates is not None and len(coordinates) > 0:
                    coord_lat = coordinates[0]
                    coord_lon = coordinates[1]

                docs = [{
                    'id': jdata["id"],
                    'created_at': str_solr_time,
                    'coordinate_lat': coord_lat,
                    'coordinate_lon': coord_lon,
                    'favorite_count': jdata["favorite_count"],
                    'in_reply_to_screen_name': jdata["in_reply_to_screen_name"],
                    'in_reply_to_status_id': jdata["in_reply_to_status_id"],
                    'in_reply_to_user_id': jdata["in_reply_to_user_id"],
                    'lang': jdata["lang"],
                    'place_full_name': place_full_name,
                    'place_coordinates': place_coordinates,
                    'retweet_count': jdata["retweet_count"],
                    'retweeted': jdata["retweeted"],
                    'quoted_status_id': quoted_status_id,
                    'status_text': jdata["text"],
                    'entities_hashtag': hashtag_list,
                    'entities_symbol': symbols_list,
                    'entities_url': url_list,
                    'entities_user_mention': user_mention_list,
                    'user_id': jdata["user"]["id"],
                    'user_screen_name': jdata["user"]["screen_name"],
                    'user_statuses_count': jdata["user"]["statuses_count"],
                    'user_friends_count': jdata["user"]["friends_count"],
                    'user_followers_count': jdata["user"]["followers_count"],
                    'user_location': str_user_loc,
                    'user_location_coordinates': geocode_coordinates_of_user_location,
                    'ml_tag': ml_tag,
                    "tweet_risk": tweet_risk
                }]
                self.__solr.index(self.__core, docs)
        except Exception as exc:
            traceback.print_exc(file=sys.stdout)
            print("Error encountered for {}, error: {} (see log file for details)".format(
                self.__count, exc))
            if jdata is not None and "id" in jdata.keys():
                tweet_id = jdata["id"]
            else:
                tweet_id = "[failed to parse]"
            logger.info(
                "Error encountered for counter={}, tweet={}, error: {} "
                "(see log file for details)".format(self.__count, tweet_id, exc))
            if jdata is not None:
                file = LOG_DIR + "/" + str(tweet_id) + ".txt"
                logger.info("\t input data json written to {}".format(file))
                with open(file, 'w') as outfile:
                    json.dump(jdata, outfile)
        return True

    def on_error(self, status):
        print(status)

    def on_status(self, status):
        print(status.text)
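# Wiring sketch for this listener with tweepy 3.x streaming (assumes
# `import tweepy`; the credentials, API key and track keywords are placeholders):
auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
listener = TwitterStream(google_api_key="GOOGLE_MAPS_API_KEY")
stream = tweepy.Stream(auth=auth, listener=listener)
stream.filter(track=["flood", "storm"], languages=["en"])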
import os
import json
import requests
from SolrClient import SolrClient

CC_LINKS_FILES_DIRECTORIES = []
SOLR_INSTANCE_URL = ""
SOLR_CORE = ""
solr_client = SolrClient(SOLR_INSTANCE_URL)


def get_url_content(url):
    user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36')
    headers = {'User-Agent': user_agent}
    resp = requests.get(url, headers=headers)
    return resp.text


for directory in CC_LINKS_FILES_DIRECTORIES:
    # os.listdir returns bare names, so join with the directory before testing
    files = [f for f in os.listdir(directory)
             if os.path.isfile(os.path.join(directory, f))]
    for file in files:
        docs = []
        with open(os.path.join(directory, file), 'r') as f:
            for line in f:
                json_obj = json.loads(line)
                url = json_obj["url"]
                text = get_url_content(url)
                docs.append({"file_name": file, "html": text})
        solr_client.index(SOLR_CORE, docs)
solr_client.commit(SOLR_CORE, openSearcher=True)
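# Hypothetical configuration and input for the loader above: each directory
# holds newline-delimited JSON files where every line carries at least a "url"
# key, e.g. {"url": "https://example.com/page.html"}. The values below are
# illustrative only, not from the original project:
#
#   CC_LINKS_FILES_DIRECTORIES = ["/data/cc-links"]
#   SOLR_INSTANCE_URL = "http://localhost:8983/solr"
#   SOLR_CORE = "cc_pages"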
class TwitterStream(StreamListener):
    # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
    __solr_url = None
    __solr = None
    __core = None
    __count = 0
    __count_retweet = 0

    def __init__(self, solr_url):
        super().__init__()
        self.__solr_url = solr_url
        self.__solr = SolrClient(solr_url)
        self.__core = SOLR_CORE_TWEETS

    def on_data(self, data):
        self.__count += 1
        if self.__count % COMMIT_BATCH_SIZE == 0:
            code = commit(SOLR_CORE_TWEETS, self.__solr_url)
            now = datetime.datetime.now()
            print("{} processed: {}".format(now, self.__count))
            logger.info("{} processed: {}".format(now, self.__count))
        jdata = None
        try:
            jdata = json.loads(data)
            if jdata is not None and "id" in jdata.keys():
                # created_at_time: parse Twitter's (UTC) timestamp and
                # reformat it for Solr
                str_created_at = jdata["created_at"]
                created_time = datetime.datetime.strptime(str_created_at,
                                                          TWITTER_TIME_PATTERN)
                str_solr_time = created_time.strftime(SOLR_TIME_PATTERN)

                doc = {
                    'id': jdata["id_str"],
                    'created_at': str_solr_time,
                    'lang': jdata["lang"]
                }
                if "extended_tweet" in jdata:
                    doc['status_text'] = jdata["extended_tweet"]["full_text"]
                else:
                    doc['status_text'] = jdata["text"]

                self.collect_tweet_entities(doc, jdata)
                self.collect_tweet_quote_info(doc, jdata)
                self.collect_retweet_info(doc, jdata)
                self.collect_tweet_favorite_info(doc, jdata)
                self.collect_tweet_location_info(doc, jdata)
                self.collect_tweet_reply_info(doc, jdata)
                self.collect_tweet_user_info(doc, jdata)
                self.__solr.index(self.__core, [doc])
        except Exception as exc:
            traceback.print_exc(file=sys.stdout)
            print("Error encountered for {}, error: {} (see log file for details)".format(
                self.__count, exc))
            if jdata is not None and "id" in jdata.keys():
                tweet_id = jdata["id"]
            else:
                tweet_id = "[failed to parse]"
            logger.info(
                "Error encountered for counter={}, tweet={}, error: {} "
                "(see log file for details)".format(self.__count, tweet_id, exc))
            if jdata is not None:
                file = LOG_DIR + "/" + str(tweet_id) + ".txt"
                logger.info("\t input data json written to {}".format(file))
                with open(file, 'w') as outfile:
                    json.dump(jdata, outfile)
        return True

    def on_error(self, status):
        print(status)

    def on_status(self, status):
        print(status.text)

    def collect_tweet_entities(self, doc: dict, tweet_json: dict):
        ##################### tweet entities ###################
        if "extended_tweet" in tweet_json:
            entities = tweet_json["extended_tweet"]["entities"]
        else:
            entities = tweet_json["entities"]

        # entities hashtags
        hashtags = entities["hashtags"]
        if 'entities_hashtag' in doc:
            hashtag_list = doc['entities_hashtag']
        else:
            hashtag_list = []
        for hashtag in hashtags:
            h = hashtag["text"].lower()
            if h not in hashtag_list:
                hashtag_list.append(h)

        # entities urls
        urls = entities["urls"]
        if 'entities_url' in doc:
            url_list = doc['entities_url']
        else:
            url_list = []
        for url in urls:
            url_list.append(url["expanded_url"])

        # entities symbols
        symbols = entities["symbols"]
        if 'entities_symbol' in doc:
            symbols_list = doc['entities_symbol']
        else:
            symbols_list = []
        for symbol in symbols:
            s = symbol["text"].lower()
            if s not in symbols_list:
                symbols_list.append(s)

        # entities user_mentions
        user_mentions = entities["user_mentions"]
        if 'entities_user_mention' in doc:
            user_mention_list = doc['entities_user_mention']
        else:
            user_mention_list = []
        for um in user_mentions:
            um_id = str(um["id_str"]).lower()
            if um_id not in user_mention_list:
                user_mention_list.append(um_id)

        # media
        if "extended_tweet" in tweet_json and "extended_entities" in tweet_json[
                "extended_tweet"]:
            ext_entities = tweet_json["extended_tweet"]["extended_entities"]
        elif "extended_entities" in tweet_json:
            ext_entities = tweet_json["extended_entities"]
        else:
            ext_entities = None
        if ext_entities is not None:
            media = ext_entities["media"]
            if media is not None:
                doc['entities_media_url'] = media[0]["media_url"]
                doc['entities_media_type'] = media[0]["type"]

        doc['entities_hashtag'] = hashtag_list
        doc['entities_symbol'] = symbols_list
        doc['entities_url'] = url_list
        doc['entities_user_mention'] = user_mention_list

    def collect_tweet_quote_info(self, doc: dict, tweet_json: dict):
        ################# quote ####################
        # quoted status id if exists
        if "quoted_status_id_str" in tweet_json:
            quoted_status_id = tweet_json["quoted_status_id_str"]
            self.collect_tweet_entities(doc, tweet_json['quoted_status'])
        else:
            quoted_status_id = None
        doc['quoted_status_id_str'] = quoted_status_id
        doc['is_quote_status'] = tweet_json["is_quote_status"]
        if "quote_count" in tweet_json:
            doc['quote_count'] = tweet_json["quote_count"]

    def collect_tweet_reply_info(self, doc: dict, tweet_json: dict):
        if "in_reply_to_screen_name" in tweet_json:
            doc['in_reply_to_screen_name'] = tweet_json["in_reply_to_screen_name"]
        if "in_reply_to_status_id_str" in tweet_json:
            doc['in_reply_to_status_id_str'] = tweet_json["in_reply_to_status_id_str"]
        if "in_reply_to_user_id_str" in tweet_json:
            doc['in_reply_to_user_id_str'] = tweet_json["in_reply_to_user_id_str"]
        doc['reply_count'] = tweet_json["reply_count"]

    def collect_retweet_info(self, doc: dict, tweet_json: dict):
        doc['retweet_count'] = tweet_json["retweet_count"]
        doc['retweeted'] = tweet_json["retweeted"]
        if "retweeted_status" in tweet_json:
            doc['retweeted_status_id_str'] = tweet_json["retweeted_status"]["id_str"]
            self.collect_tweet_entities(doc, tweet_json['retweeted_status'])

    def collect_tweet_favorite_info(self, doc: dict, tweet_json: dict):
        doc['favorite_count'] = tweet_json["favorite_count"]  # nullable

    def collect_tweet_user_info(self, doc: dict, tweet_json: dict):
        doc['user_id_str'] = tweet_json["user"]["id_str"]
        doc['user_screen_name'] = tweet_json["user"]["screen_name"]
        doc['user_statuses_count'] = tweet_json["user"]["statuses_count"]
        doc['user_friends_count'] = tweet_json["user"]["friends_count"]
        doc['user_followers_count'] = tweet_json["user"]["followers_count"]
        doc['user_desc'] = tweet_json["user"]["description"]

    def collect_tweet_location_info(self, doc: dict, tweet_json: dict):
        # place exists
        place = tweet_json["place"]
        if place is not None:
            place_full_name = place["full_name"]
            place_coordinates = place['bounding_box']['coordinates'][0][0]
        else:
            place_full_name = None
            place_coordinates = None

        coordinates = tweet_json["coordinates"]
        # user_location, only compute geocode if other means have failed
        # if coordinates is None:
        #     coordinates = place_coordinates
        #
        # coord_lat = None
        # coord_lon = None
        # if coordinates is not None and len(coordinates) > 0:
        #     coord_lat = coordinates[0]
        #     coord_lon = coordinates[1]
        # doc['coordinate_lat'] = coord_lat
        # doc['coordinate_lon'] = coord_lon
        doc['place_full_name'] = place_full_name
        doc['place_coordinates'] = place_coordinates