import csv
import os
import re
import string

import parselogic  # project-local text formatting/matching module
import decoder     # project-local geo decoding module


def kwmatch(lc, keywords=keywords):
    # `keywords` and t_col() are bound from the enclosing scope at definition time
    text = ' ' + lc[t_col('t_text')] + ' ' + lc[t_col('t_quote')] + ' '
    text = parselogic.reformat(text, emojis=None, mode=4.5)  # format text for matching
    for k in keywords:  # for each keyword
        if parselogic.match(k, text):  # test it against the tweet
            return True
    return False
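
# Usage sketch (illustrative, not part of the pipeline): kwmatch() expects one
# pre-split data row; field positions come from t_col().
#   >>> row = line.split('\t')
#   >>> kwmatch(row)
#   True
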
def checkForKWs(self, kwtext):
    hit = False
    formattedtext = parselogic.reformat(kwtext, self.emojis, mode=4.5, lcase=True)
    for kw in self.keywords:
        if parselogic.match(kw, formattedtext):
            hit = True
            self.n_matches += 1
            break  # one hit is enough; breaking early skips the remaining keywords
    return hit
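
# Usage sketch (illustrative): assumes an instance exposing .emojis, .keywords,
# and an .n_matches counter; `parser` is a hypothetical name.
#   >>> parser.checkForKWs('get your flu shot')
#   True
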
def gethashtags(data=data, hash_n=hash_n):
    hashdict = {}
    hashtop = {}
    for line in data:
        # Get tweet text (tweet body plus quoted text):
        l_list = line.split(self.delimit)
        text = ' ' + l_list[cols['t_text']] + ' ' + l_list[cols['t_quote']] + ' '
        text = parselogic.reformat(text, emojis=None, mode=4.5, lcase=True)
        text = re.sub(r'\\', ' ', text)  # raw string: a bare '\\' is an invalid pattern
        text = re.sub('# ', '#', text)   # re-attach hashes split from their tags
        text = re.sub('#', ' #', text)   # ensure every hash starts a new token
        # Get unique hashtags from within tweet text:
        hashtags = list(set(part[1:] for part in text.split()
                            if part.startswith('#')))
        # Remove empty entry, if present
        try:
            hashtags.remove('')
        except ValueError:
            pass
        # Add hashtag and count to hashdict
        for hashtag in hashtags:
            # Remove punctuation from hashtag strings:
            hashtag = hashtag.strip(string.punctuation)
            if not hashtag:
                continue  # skip tags that were only punctuation
            # Add hashtag to hashdict:
            if hashtag not in hashdict:
                hashdict[hashtag] = 1
            else:
                hashdict[hashtag] += 1
    n = 0
    for tophash in sorted(hashdict, key=hashdict.get, reverse=True):
        if n < hash_n or not hash_n:  # '<' so exactly hash_n tags are kept (0 = keep all)
            # Add top hashtags to hashtop dict
            hashtop[str(tophash)] = hashdict[tophash]
            n = n + 1
        else:
            # Stop after hash_n is reached
            break
    return hashtop, data
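
# Usage sketch (illustrative): `data`, `cols`, and `self` are bound from the
# enclosing scope, so the common call supplies at most hash_n.
#   >>> hashtop, data = gethashtags(hash_n=20)  # 20 most frequent hashtags
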
def writeToCSV(self, data, parsed_text, parsed_quote, fn, count):
    entities = []
    outfile = self.dirOut + fn[:14] + '_data' + self.out_extension
    # Changed from .csv to .tsv (20180716 JC)

    ###################
    ### User-level data
    ###################
    entities.append('\'' + str(data['user']['id']))  # u_id
    entities.append(data['user']['screen_name'])  # u_handle
    if data['user']['name']:
        name = parselogic.reformat(data['user']['name'], self.emojis,
                                   mode=1.0, lcase=self.lcase)
        entities.append(name)  # u_name
    else:
        entities.append('')
    if data['user']['description']:
        desc = parselogic.reformat(data['user']['description'], self.emojis,
                                   mode=self.mode, lcase=self.lcase)
        entities.append(desc)  # u_desc
    else:
        entities.append('')
    try:
        entities.append(data['user']['url'])  # u_url
    except:
        entities.append('')
    try:
        created = parselogic.ts(data['user']['created_at'], format=True)
    except:
        created = data['user']['created_at']
    entities.append(created)  # u_create
    entities.append(str(data['user']['statuses_count']))  # u_tweets
    entities.append(str(data['user']['friends_count']))  # u_fo_out
    entities.append(str(data['user']['followers_count']))  # u_fo_in
    entities.append(str(data['user']['favourites_count']))  # u_likes
    # Deprecated by Twitter; kept for backward compatibility
    try:
        entities.append(str(int(data['user']['utc_offset']) / 3600))  # u_utcoff
    except:
        entities.append('')
    try:
        loc = parselogic.reformat(data['user']['location'], self.emojis,
                                  mode=self.mode, lcase=self.lcase)
        entities.append(loc)  # u_locate
    except:
        entities.append('')
    # Deprecated by Twitter; kept for backward compatibility.
    # .lower() is required: str(True) yields 'True', not 'true'
    if str(data['user']['geo_enabled']).lower() == 'true':
        entities.append(1)  # u_geotag
    else:
        entities.append(0)  # u_geotag
    try:
        entities.append(data['user']['lang'])  # u_lang
    except:
        entities.append('')
    try:
        entities.append(data['user']['profile_image_url'])  # u_imgurl
    except:
        entities.append('')
    try:
        entities.append(data['user']['profile_banner_url'])  # u_bgurl
    except:
        entities.append('')
    if str(data['user']['protected']).lower() == 'true':
        entities.append(1)  # u_privat
    else:
        entities.append(0)  # u_privat
    if str(data['user']['verified']).lower() == 'true':
        entities.append(1)  # u_verify
    else:
        entities.append(0)  # u_verify
    # Placeholder for tracking number of captured tweets / user
    entities.append('')  # u_n_capt

    ####################
    ### Tweet-level data
    ####################
    try:
        t_id = '\'' + data['id_str']
    except:
        try:
            t_id = '\'' + str(data['id'])
        except:
            t_id = '\''
    entities.append(t_id)  # t_id
    text = parselogic.reformat(parsed_text, self.emojis, mode=self.mode,
                               lcase=self.lcase)
    entities.append(text)  # t_text
    quote = parselogic.reformat(parsed_quote, self.emojis, mode=self.mode,
                                lcase=self.lcase)
    entities.append(quote)  # t_quote
    entities.append('http://twitter.com/' + str(data['user']['screen_name']) +
                    '/status/' + t_id.strip('\''))  # t_url
    try:
        date = parselogic.ts(data['created_at'], format=True)
    except:
        date = data['created_at']
    entities.append(date)  # t_date
    coords = decoder.getCoords(self, data)
    coords_str = str(coords[1]) + ' ' + str(coords[0])
    entities.append(coords_str)  # t_geopoint (lat lon)
    poly_coords = decoder.getPolygonCoords(self, data)
    entities.append(poly_coords[0] + ' ' + poly_coords[1])  # t_geopoly
    try:
        place = parselogic.reformat(data['place']['full_name'], self.emojis,
                                    mode=1.0, lcase=self.lcase)
    except:
        place = ''
    entities.append(place)  # t_place
    try:
        lang = data['lang']
    except:
        lang = ''
    entities.append(lang)  # t_lang
    try:
        entities.append('\'' + data['in_reply_to_status_id_str'])  # re_t_id
        entities.append('\'' + data['in_reply_to_user_id_str'])  # re_u_id
    except:
        entities.append('')  # re_t_id
        entities.append('')  # re_u_id
    try:
        entities.append('\'' + data['quoted_status']['id_str'])  # qu_t_id
    except:
        entities.append('')
    try:
        entities.append('\'' + data['quoted_status']['user']['id_str'])  # qu_u_id
    except:
        entities.append('')
    try:
        entities.append(data['quoted_status']['retweet_count'])  # qu_n_rt
    except:
        entities.append('')
    try:
        entities.append(data['quoted_status']['favorite_count'])  # qu_n_fav
    except:
        entities.append('')
    try:
        entities.append(data['quoted_status']['reply_count'])  # qu_n_re
    except:
        entities.append('')
    try:
        entities.append(data['quoted_status']['quote_count'])  # qu_n_qu
    except:
        entities.append('')
    try:
        entities.append('\'' + data['retweeted_status']['id_str'])  # rt_t_tid
    except:
        entities.append('')
    try:
        entities.append('\'' + data['retweeted_status']['user']['id_str'])  # rt_u_id
    except:
        entities.append('')
    try:
        entities.append(data['retweeted_status']['retweet_count'])  # rt_n_rt
    except:
        entities.append('')
    try:
        entities.append(data['retweeted_status']['favorite_count'])  # rt_n_fav
    except:
        entities.append('')
    try:
        entities.append(data['retweeted_status']['reply_count'])  # rt_n_re
    except:
        entities.append('')
    try:
        entities.append(data['retweeted_status']['quote_count'])  # rt_n_qu
    except:
        entities.append('')
    # Added for age prediction modelling
    if str(data['user']['default_profile']).lower() == 'true':
        entities.append(1)  # u_profile
    else:
        entities.append(0)
    if str(data['user']['default_profile_image']).lower() == 'true':
        entities.append(1)  # u_profile_img
    else:
        entities.append(0)
    try:
        entities.append(data['user']['listed_count'])  # u_list
    except:
        entities.append('')
    try:
        entities.append(len(data['entities']['hashtags']))  # t_hashtags
    except:
        entities.append('')
    try:
        entities.append(len(data['entities']['urls']))  # t_urls
    except:
        entities.append('')
    try:
        entities.append(len(data['entities']['user_mentions']))  # t_mentions
    except:
        entities.append('')
    try:
        media_list = data['extended_entities']['media']
        entities.append(len(media_list))  # t_media
    except:
        entities.append(0)

    with open(outfile, 'a', encoding='utf-8') as csvfile:
        saveFile = csv.writer(csvfile, delimiter='\t', lineterminator='\n')
        if count == 0:
            # Column names, listed in the same order the fields are appended above
            saveFile.writerow([
                'u_id', 'u_handle', 'u_name', 'u_desc', 'u_url', 'u_create',
                'u_tweets', 'u_fo_out', 'u_fo_in', 'u_likes', 'u_utcoff',
                'u_locate', 'u_geotag', 'u_lang', 'u_imgurl', 'u_bgurl',
                'u_privat', 'u_verify', 'u_n_capt', 't_id', 't_text',
                't_quote', 't_url', 't_date', 't_geopoint', 't_geopoly',
                't_place', 't_lang', 're_t_id', 're_u_id', 'qu_t_id',
                'qu_u_id', 'qu_n_rt', 'qu_n_fav', 'qu_n_re', 'qu_n_qu',
                'rt_t_tid', 'rt_u_id', 'rt_n_rt', 'rt_n_fav', 'rt_n_re',
                'rt_n_qu', 'u_profile', 'u_profile_img', 'u_list',
                't_hashtags', 't_urls', 't_mentions', 't_media'
            ])
        saveFile.writerow(entities)
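
# Usage sketch (illustrative): `status` is one decoded Twitter status (JSON)
# object; `count` is the running row index, so the header row is written only
# for the first tweet. `parser` is a hypothetical instance name.
#   >>> parser.writeToCSV(status, parsed_text, parsed_quote,
#   ...                   '20180716120000.json', count=0)
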
def __init__(self, data, dirout='', header=True, combine=True, kw_redux=[],
             kw_incol=12, quote_incol=13, rt_ignore=True, rt_incol=9,
             geo_only=False, geo_incol=16):
    self.data = data
    self.kw_incol = kw_incol
    self.returns = {}
    self.delimit = ','
    # Tweets only need to be parsed when a per-tweet filter is active
    if len(kw_redux) > 0 or rt_ignore or geo_only:
        read_tweet = True
    else:
        read_tweet = False
    if isinstance(data, str):
        # String data should refer to a directory of CSV data files
        datadict = {}
        datalines = []
        head = None
        try:
            files = sorted(os.listdir(data))
        except OSError:
            raise IOError('String object passed, but it is not a valid directory.')
        i_total = 0
        i_rt_total = 0
        i_kw = 0
        i_rt_kw = 0
        for fn in files:  # <--- Implement date range constraints about here
            if fn[-4:] in ['.csv', '.tsv']:
                i_line = 0
                with open(data + fn, 'r') as infile:
                    for line in infile:
                        if i_line == 0 and '\t' in line:
                            self.delimit = '\t'  # determine TSV format from 1st line
                        if i_line == 0 and header:
                            head = line  # if a header is expected, save the 1st line
                        else:
                            if read_tweet:  # if tweets need to be read
                                l_list = line.split(self.delimit)
                                added = False
                                ignored = False
                                is_rt = False
                                if rt_ignore and len(l_list[rt_incol]) > 0:
                                    i_rt_total = i_rt_total + 1
                                    is_rt = True
                                    ignored = True  # ignore RTs
                                if geo_only and len(l_list[geo_incol]) < 1:
                                    ignored = True  # ignore non-geocoded
                                if len(kw_redux) > 0:
                                    kw_is_rt = False
                                    for kw in kw_redux:
                                        # Remember to search text and quoted text!
                                        text = l_list[kw_incol] + ' ' + l_list[quote_incol]
                                        text = parselogic.reformat(text, mode=4.5,
                                                                   lcase=True)
                                        if parselogic.match(kw, text):
                                            if not added and not ignored:
                                                datadict.setdefault(fn, []).append(line)
                                                datalines.append(line)
                                                added = True
                                                i_kw = i_kw + 1
                                            elif is_rt:
                                                kw_is_rt = True
                                    if kw_is_rt:
                                        i_kw = i_kw + 1
                                        i_rt_kw = i_rt_kw + 1
                                else:
                                    if not added and not ignored:
                                        datadict.setdefault(fn, []).append(line)
                                        datalines.append(line)
                                        added = True
                            else:  # fast and simple: keep every line
                                datadict.setdefault(fn, []).append(line)
                                datalines.append(line)
                        i_line = i_line + 1  # counting all read lines per file
                i_total = i_total + i_line - 1  # total data lines (-1 for header line)
        self.head = head
        self.datadict = datadict
        self.data = datalines
        print('\n----- FILE TOTALS:')
        print('Tweets observed: ' + str(i_total))
        print('Retweets observed: ' + str(i_rt_total))
        print('Original tweets: ' + str(i_total - i_rt_total))
        if len(kw_redux) > 0:
            print('\n----- KEYWORD-MATCHED:')
            print('Tweets observed: ' + str(i_kw))
            print('Retweets observed: ' + str(i_rt_kw))
            print('Original tweets: ' + str(i_kw - i_rt_kw))
        if rt_ignore:
            print('\nIGNORING RETWEETS...\n')
        print('Tweets in sample: ' + str(len(datalines)))
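
# Usage sketch (illustrative; the enclosing class name is not shown in this
# section, so `CSVSample` below is hypothetical):
#   >>> sample = CSVSample('data/', kw_redux=['vaccine'], rt_ignore=True)
#   >>> len(sample.data)  # keyword-matched, RT-filtered lines
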
def __init__(self, data=None, head=None, dir_in='', datafiles=[], header=True,
             rt_ignore=True, geo_only=False, kw_redux=[], uid_redux=[],
             val_redux=[]):
    self.data = data
    self.returns = {}
    self.delimit = '\t'
    cols = parselogic.t_col()
    self.col = cols
    match_kw = False
    match_uid = False
    match_val = False
    # If no data was passed, then read in data
    if not data:
        datadict = {}
        datalines = []
        read_tweet = False
        i_total = 0
        i_rt_total = 0
        i_kw = 0
        i_rt_kw = 0
        i_uid = 0
        i_rt_uid = 0
        i_val = 0
        i_rt_val = 0
        if len(kw_redux) > 0:
            read_tweet = True
            match_kw = True
        if len(uid_redux) > 0:
            read_tweet = True
            match_uid = True
        if len(val_redux) > 0:
            read_tweet = True
            match_val = True
        if rt_ignore or geo_only:
            read_tweet = True
        for fn in datafiles:
            i_line = 0
            with open(dir_in + fn, 'r', encoding='utf-8') as infile:
                for line in infile:
                    if i_line == 0 and header:
                        head = line  # if a header is expected, save the 1st line
                        cols = parselogic.t_col(head=head, delimit=self.delimit)
                        self.col = cols
                    else:
                        if read_tweet:  # if tweets need to be read
                            l_list = line.split(self.delimit)
                            added = False
                            ignored = False
                            is_rt = False
                            matched_kw = False
                            try:
                                # This column is misnamed 'rt_t_tid' in some files...
                                if len(l_list[cols['rt_t_tid']]) > 0:
                                    i_rt_total = i_rt_total + 1
                                    is_rt = True
                                    if rt_ignore:
                                        ignored = True  # ignore RTs
                            except:
                                # ...it should be 'rt_t_id'
                                if len(l_list[cols['rt_t_id']]) > 0:
                                    i_rt_total = i_rt_total + 1
                                    is_rt = True
                                    if rt_ignore:
                                        ignored = True  # ignore RTs
                            if geo_only and len(l_list[cols['u_geotag']]) == 0:
                                ignored = True  # ignore non-geocoded
                            if match_kw:
                                text = ' ' + l_list[cols['t_text']] + ' ' + \
                                       l_list[cols['t_quote']] + ' '
                                text = parselogic.reformat(text, emojis=None,
                                                           mode=4.5, lcase=True)
                                for kw in kw_redux:
                                    if parselogic.match(kw, text):
                                        matched_kw = True
                                        i_kw += 1
                                        if is_rt:
                                            i_rt_kw += 1
                                        break
                                if not matched_kw:
                                    ignored = True
                            if match_uid and not ignored:
                                if l_list[cols['u_id']] in uid_redux:
                                    i_uid += 1
                                    if is_rt:
                                        i_rt_uid += 1
                                else:
                                    ignored = True
                            if match_val and not ignored:
                                matched_val = parselogic.criteria_match(
                                    l_list=l_list, cols=cols, vals=val_redux)
                                if matched_val:
                                    i_val += 1
                                    if is_rt:
                                        i_rt_val += 1
                                else:
                                    ignored = True
                            if not ignored:
                                datadict.setdefault(fn, []).append(line)
                                datalines.append(line)
                        else:  # nothing to check: keep every line
                            datadict.setdefault(fn, []).append(line)
                            datalines.append(line)
                    i_line += 1  # counting all read lines per file
            i_total = i_total + i_line - 1  # total data lines (-1 for header line)
        self.head = head
        self.datadict = datadict
        self.data = datalines
        print('\n----- FILE TOTALS:')
        print('All Tweets+RTs: ' + str(i_total))
        print('Retweets: ' + str(i_rt_total))
        print('Original Tweets: ' + str(i_total - i_rt_total))
        if match_kw:
            print('\n----- KEYWORD MATCHED:')
            print('All Tweets+RTs: ', i_kw)
            print('Retweets: ', i_rt_kw)
            print('Original Tweets: ', i_kw - i_rt_kw)
        if match_uid:
            if match_kw:
                print('\n----- USER_ID MATCHED: (w/in keyword matched)')
            else:
                print('\n----- USER_ID MATCHED:')
            if rt_ignore:
                print('Retweets ignored')
            else:
                print('All Tweets+RTs: ', i_uid)
                print('Retweets: ', i_rt_uid)
                print('Original Tweets: ', i_uid - i_rt_uid)
        if match_val:
            if match_uid:
                print('\n----- VALUE MATCHED: (w/in user_id matched)')
            elif match_kw:
                print('\n----- VALUE MATCHED: (w/in keyword matched)')
            else:
                print('\n----- VALUE MATCHED:')
            if rt_ignore:
                print('Retweets ignored')
            else:
                print('All Tweets+RTs: ', i_val)
                print('Retweets: ', i_rt_val)
                print('Original Tweets: ', i_val - i_rt_val)
        print('\nTweets in sample: ', len(datalines))
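
# Usage sketch (illustrative; the enclosing class name is not shown here, so
# `TweetSample` is hypothetical). Filters compound: keyword, then user id,
# then value criteria.
#   >>> s = TweetSample(dir_in='data/', datafiles=['20180716_data.tsv'],
#   ...                 kw_redux=['vaccine'], uid_redux=['12345'])
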
def DataDict(dir_in, start=None, end=None, rt_ignore=True, rt_col=36,
             loc_check=False, loc_col=11, text_col=20, quote_col=21,
             f_ext='.tsv', delimiter='\t', keywords=[]):
    date_list = []
    datafiles = parselogic.filelist(dir_in, f_ext=f_ext, start=start, end=end)
    for fi in datafiles:
        date_list.append(fi[:8])  # filenames are assumed to start with YYYYMMDD
    data_dict = {}
    for date in list(set(date_list)):
        data_dict[date] = {}
        data_dict[date]['_tweets'] = 0
        data_dict[date]['_tweets_inloc'] = 0  # loc_check functionality not used
        data_dict[date]['_rts_ignored'] = 0
        data_dict[date]['_hits'] = 0
        data_dict[date]['_hits_inloc'] = 0  # loc_check functionality not used
        data_dict[date]['_error'] = 0  # _error functionality not used
        for k in keywords:
            data_dict[date][k] = 0
    for f in datafiles:
        with open(dir_in + f, 'r', encoding='utf-8') as o:
            i = 0
            for l in o:  # for each line in the open file
                i += 1
                try:
                    # Get text from the tweet and quote fields
                    text = l.split(delimiter)[text_col] + ' ' + \
                           l.split(delimiter)[quote_col]
                except Exception as e:
                    print('error: ' + str(e))
                    print('file: ' + f)
                    print('line: ' + str(i))
                    print('data: ' + str(l))
                    continue  # skip the bad line rather than reuse stale text
                text = parselogic.reformat(text, emojis=None, mode=4.5)  # format for matching
                hit = False
                if l.split(delimiter)[text_col].strip() in ['text', 't_text']:
                    pass  # ignore header row
                else:
                    if rt_ignore and not len(l.split(delimiter)[rt_col].strip()) > 0:
                        # Not a retweet: test each line of keyword(s) against the tweet
                        for k in keywords:
                            if parselogic.match(k, text):
                                data_dict[f[:8]][k] += 1
                                hit = True
                    elif rt_ignore:
                        data_dict[f[:8]]['_rts_ignored'] += 1
                    else:
                        # Retweets included: test each line of keyword(s)
                        for k in keywords:
                            if parselogic.match(k, text):
                                data_dict[f[:8]][k] += 1
                                hit = True
                    if hit:
                        data_dict[f[:8]]['_hits'] += 1
                        #if loc_check: data_dict[f[:8]]['_hits_inloc'] += parselogic.locmatch(l.split(delimiter)[loc_col])  # loc_check functionality not used
                    data_dict[f[:8]]['_tweets'] += 1
                    #if loc_check: data_dict[f[:8]]['_tweets_inloc'] += parselogic.locmatch(l.split(delimiter)[loc_col])  # loc_check functionality not used
    return data_dict
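
# Usage sketch (illustrative): assumes daily files whose names begin with
# YYYYMMDD, as implied by the fi[:8] date keys.
#   >>> counts = DataDict('data/', keywords=['flu', 'vaccine'])
#   >>> counts['20180716']['_hits']  # tweets matching any keyword that day
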