def kwmatch(lc, keywords=keywords):  # default binds the module-level 'keywords' at import time
    """Return True if any keyword matches the tweet text or quoted text."""
    text = ' ' + lc[t_col('t_text')] + ' ' + lc[t_col('t_quote')] + ' '
    text = parselogic.reformat(text, emojis=None, mode=4.5)  # format text for matching
    for k in keywords:  # for each keyword
        if parselogic.match(k, text):  # test it against the tweet
            return True
    return False
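# --- Usage sketch for kwmatch(), not part of the original module. ---
# The file path, delimiter, and keyword list below are hypothetical; kwmatch()
# itself only needs a split data line (lc) and an iterable of keywords.
def kwmatch_demo(path='tweets.tsv', demo_keywords=('flood', 'evacuation')):
    hits = []
    with open(path, 'r', encoding='utf-8') as infile:
        for line in infile:
            lc = line.split('\t')  # assumes a TSV layout matching t_col()
            if kwmatch(lc, keywords=demo_keywords):
                hits.append(line)
    return hits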
def checkForKWs(self, kwtext):
    """Return True if any of self.keywords matches kwtext; increments self.n_matches."""
    hit = False
    formattedtext = parselogic.reformat(kwtext, self.emojis, mode=4.5, lcase=True)
    for kw in self.keywords:
        if parselogic.match(kw, formattedtext):
            hit = True
            self.n_matches += 1
            break  # stop at the first match; the remaining keywords need not be tested
    return hit
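# --- Illustration only: how the early break in checkForKWs() pays off. ---
# A sketch with hypothetical names: once one keyword matches, per-tweet cost is
# bounded by the position of the first matching keyword, not the list length.
def count_matching_tweets(checker, texts):
    checker.n_matches = 0  # assumes the caller owns resetting the counter
    return sum(1 for t in texts if checker.checkForKWs(t))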
def __init__(self, data, dirout='', header=True, combine=True, kw_redux=[],
             kw_incol=12, quote_incol=13, rt_ignore=True, rt_incol=9,
             geo_only=False, geo_incol=16):
    self.data = data
    self.kw_incol = kw_incol
    self.returns = {}
    self.delimit = ','
    # Tweets only need to be parsed if any filter is active
    read_tweet = len(kw_redux) > 0 or rt_ignore or geo_only
    if type(data) == str:
        # String data should refer to a directory of CSV data files
        import os
        datadict = {}
        datalines = []
        head = ''  # fallback so self.head is defined even when header=False
        try:
            files = sorted(os.listdir(data))
        except OSError:
            raise IOError('String object passed, but it is not a valid directory.')
        i_total = 0
        i_rt_total = 0
        i_kw = 0
        i_rt_kw = 0
        for fn in files:  ###### <--- Implement date range constraints about here
            if fn[-4:] in ['.csv', '.tsv']:
                i_line = 0
                with open(data + fn, 'r') as infile:
                    for line in infile:
                        if i_line == 0 and '\t' in line:
                            self.delimit = '\t'  # determine TSV file format from 1st line
                        if i_line == 0 and header:
                            head = line  # if first line is expected to be a header, save it as "head"
                        else:
                            if read_tweet:  # if tweets need to be read
                                l_list = line.split(self.delimit)
                                added = False
                                ignored = False
                                is_rt = False
                                if rt_ignore and len(l_list[rt_incol]) > 0:
                                    i_rt_total += 1
                                    is_rt = True
                                    ignored = True  # ignore RTs
                                if geo_only and len(l_list[geo_incol]) < 1:
                                    ignored = True  # ignore non-geocoded
                                if len(kw_redux) > 0:
                                    kw_is_rt = False
                                    # Remember to search text and quoted text!
                                    text = l_list[kw_incol] + ' ' + l_list[quote_incol]
                                    text = parselogic.reformat(text, mode=4.5, lcase=True)
                                    for kw in kw_redux:
                                        if parselogic.match(kw, text):
                                            if not added and not ignored:
                                                datadict.setdefault(fn, []).append(line)
                                                datalines.append(line)
                                                added = True
                                                i_kw += 1
                                            elif is_rt:
                                                kw_is_rt = True
                                    if kw_is_rt:  # a keyword-matched RT: count it, but it stays ignored
                                        i_kw += 1
                                        i_rt_kw += 1
                                else:
                                    if not added and not ignored:
                                        datadict.setdefault(fn, []).append(line)
                                        datalines.append(line)
                                        added = True
                            else:  # fast and simple: keep every line
                                datadict.setdefault(fn, []).append(line)
                                datalines.append(line)
                        i_line += 1  # counting all read lines per file
                i_total += i_line - (1 if header else 0)  # total data lines (minus header line)
        self.head = head
        self.datadict = datadict
        self.data = datalines
        print('\n----- FILE TOTALS:')
        print('Tweets observed:   ' + str(i_total))
        print('Retweets observed: ' + str(i_rt_total))
        print('Original tweets:   ' + str(i_total - i_rt_total))
        if len(kw_redux) > 0:
            print('\n----- KEYWORD-MATCHED:')
            print('Tweets observed:   ' + str(i_kw))
            print('Retweets observed: ' + str(i_rt_kw))
            print('Original tweets:   ' + str(i_kw - i_rt_kw))
        if rt_ignore:
            print('\nIGNORING RETWEETS...\n')
        print('Tweets in sample: ' + str(len(datalines)))
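# --- Usage sketch for the CSV/TSV directory loader above. ---
# The class that owns this __init__ is not named here, so the sketch takes it
# as a parameter; the directory and keywords are hypothetical illustrations.
def load_csv_sample(loader_cls, data_dir='data/raw_csv/'):
    sample = loader_cls(data_dir, kw_redux=['flood', 'storm surge'],
                        rt_ignore=True, geo_only=False)
    print(len(sample.data), 'lines kept across', len(sample.datadict), 'files')
    return sample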
def __init__(self, data=None, head=None, dir_in='', datafiles=[], header=True,
             rt_ignore=True, geo_only=False, kw_redux=[], uid_redux=[],
             val_redux=[]):
    self.data = data
    self.returns = {}
    self.delimit = '\t'
    cols = parselogic.t_col()
    self.col = cols
    match_kw = len(kw_redux) > 0
    match_uid = len(uid_redux) > 0
    match_val = len(val_redux) > 0
    # If no data was passed, then read in data
    if not data:
        datadict = {}
        datalines = []
        # Tweets only need to be parsed if any filter is active
        read_tweet = match_kw or match_uid or match_val or rt_ignore or geo_only
        i_total = 0
        i_rt_total = 0
        i_kw = 0
        i_rt_kw = 0
        i_uid = 0
        i_rt_uid = 0
        i_val = 0
        i_rt_val = 0
        for fn in datafiles:
            i_line = 0
            with open(dir_in + fn, 'r', encoding='utf-8') as infile:
                for line in infile:
                    if i_line == 0 and header:
                        head = line  # if first line is expected to be a header, save it as "head"
                        cols = parselogic.t_col(head=head, delimit=self.delimit)
                        self.col = cols
                    else:
                        if read_tweet:  # if tweets need to be read
                            l_list = line.split(self.delimit)
                            ignored = False
                            is_rt = False
                            try:
                                # This column is misnamed 'rt_t_tid' in some files...
                                if len(l_list[cols['rt_t_tid']]) > 0:
                                    i_rt_total += 1
                                    is_rt = True
                                    if rt_ignore:
                                        ignored = True  # ignore RTs
                            except (KeyError, IndexError):
                                # ...it should be 'rt_t_id'
                                if len(l_list[cols['rt_t_id']]) > 0:
                                    i_rt_total += 1
                                    is_rt = True
                                    if rt_ignore:
                                        ignored = True  # ignore RTs
                            if geo_only and len(l_list[cols['u_geotag']]) == 0:
                                ignored = True  # ignore non-geocoded
                            if match_kw:
                                text = ' ' + l_list[cols['t_text']] + ' ' + l_list[cols['t_quote']] + ' '
                                text = parselogic.reformat(text, emojis=None,
                                                           mode=4.5, lcase=True)
                                matched_kw = False
                                for kw in kw_redux:
                                    if parselogic.match(kw, text):
                                        matched_kw = True
                                        i_kw += 1
                                        if is_rt:
                                            i_rt_kw += 1
                                        break  # first match is enough
                                if not matched_kw:
                                    ignored = True
                            if match_uid and not ignored:
                                if l_list[cols['u_id']] in uid_redux:
                                    i_uid += 1
                                    if is_rt:
                                        i_rt_uid += 1
                                else:
                                    ignored = True
                            if match_val and not ignored:
                                if parselogic.criteria_match(l_list=l_list,
                                                             cols=cols,
                                                             vals=val_redux):
                                    i_val += 1
                                    if is_rt:
                                        i_rt_val += 1
                                else:
                                    ignored = True
                            if not ignored:
                                datadict.setdefault(fn, []).append(line)
                                datalines.append(line)
                        else:  # nothing to check; keep every line
                            datadict.setdefault(fn, []).append(line)
                            datalines.append(line)
                    i_line += 1  # counting all read lines per file
            i_total += i_line - (1 if header else 0)  # total data lines (minus header line)
        self.head = head
        self.datadict = datadict
        self.data = datalines
        print('\n----- FILE TOTALS:')
        print('All Tweets+RTs:  ' + str(i_total))
        print('Retweets:        ' + str(i_rt_total))
        print('Original Tweets: ' + str(i_total - i_rt_total))
        if match_kw:
            print('\n----- KEYWORD MATCHED:')
            print('All Tweets+RTs:  ', i_kw)
            print('Retweets:        ', i_rt_kw)
            print('Original Tweets: ', i_kw - i_rt_kw)
        if match_uid:
            if match_kw:
                print('\n----- USER_ID MATCHED: (w/in keyword matched)')
            else:
                print('\n----- USER_ID MATCHED:')
            if rt_ignore:
                print('Retweets ignored')
            else:
                print('All Tweets+RTs:  ', i_uid)
                print('Retweets:        ', i_rt_uid)
                print('Original Tweets: ', i_uid - i_rt_uid)
        if match_val:
            if match_uid:
                print('\n----- VALUE MATCHED: (w/in user_id matched)')
            elif match_kw:
                print('\n----- VALUE MATCHED: (w/in keyword matched)')
            else:
                print('\n----- VALUE MATCHED:')
            if rt_ignore:
                print('Retweets ignored')
            else:
                print('All Tweets+RTs:  ', i_val)
                print('Retweets:        ', i_rt_val)
                print('Original Tweets: ', i_val - i_rt_val)
        print('\nTweets in sample: ', len(datalines))
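# --- Usage sketch for the filtered TSV reader above (hypothetical names). ---
# Demonstrates chaining the three filters: keyword, then user-id, then value
# criteria; each stage only sees lines the previous stage kept. The class is
# passed in as reader_cls since it is not named here; the filter values are
# illustrative placeholders.
def load_filtered(reader_cls, dir_in, files):
    return reader_cls(dir_in=dir_in, datafiles=files,
                      kw_redux=['flood'],    # keyword filter
                      uid_redux=['12345'],   # user-id filter (string ids)
                      val_redux=[],          # value criteria, per parselogic.criteria_match
                      rt_ignore=True)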
def DataDict(dir_in, start=None, end=None, rt_ignore=True, rt_col=36,
             loc_check=False, loc_col=11, text_col=20, quote_col=21,
             f_ext='.tsv', delimiter='\t', keywords=[]):
    """Build a per-date dict of tweet totals and per-keyword match counts.

    Assumes data file names begin with an 8-character date (YYYYMMDD).
    """
    datafiles = parselogic.filelist(dir_in, f_ext=f_ext, start=start, end=end)
    date_list = [fi[:8] for fi in datafiles]
    data_dict = {}
    for date in set(date_list):
        data_dict[date] = {}
        data_dict[date]['_tweets'] = 0
        data_dict[date]['_tweets_inloc'] = 0  # loc_check functionality not used
        data_dict[date]['_rts_ignored'] = 0
        data_dict[date]['_hits'] = 0
        data_dict[date]['_hits_inloc'] = 0  # loc_check functionality not used
        data_dict[date]['_error'] = 0  # counts malformed lines (see below)
        for k in keywords:
            data_dict[date][k] = 0
    for f in datafiles:
        date = f[:8]
        with open(dir_in + f, 'r', encoding='utf-8') as o:
            i = 0
            for l in o:  # for each line in the open file
                i += 1
                l_list = l.split(delimiter)  # split once; reuse below
                try:
                    # Get text from the tweet and quote fields
                    text = l_list[text_col] + ' ' + l_list[quote_col]
                except Exception as e:
                    print('error: ' + str(e))
                    print('file: ' + f)
                    print('line: ' + str(i))
                    print('data: ' + str(l))
                    data_dict[date]['_error'] += 1
                    continue  # skip malformed lines rather than crash on undefined text
                if l_list[text_col].strip() in ['text', 't_text']:
                    continue  # skip the header row entirely (don't count it as a tweet)
                text = parselogic.reformat(text, emojis=None, mode=4.5)  # format text for matching
                hit = False
                is_rt = len(l_list[rt_col].strip()) > 0
                if rt_ignore and is_rt:
                    data_dict[date]['_rts_ignored'] += 1
                else:
                    for k in keywords:  # for each line of keyword(s)
                        if parselogic.match(k, text):  # test it against the tweet
                            data_dict[date][k] += 1
                            hit = True
                if hit:
                    data_dict[date]['_hits'] += 1
                    # if loc_check: data_dict[date]['_hits_inloc'] += parselogic.locmatch(l_list[loc_col])  # loc_check functionality not used
                data_dict[date]['_tweets'] += 1
                # if loc_check: data_dict[date]['_tweets_inloc'] += parselogic.locmatch(l_list[loc_col])  # loc_check functionality not used
    return data_dict
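# --- Usage sketch for DataDict(), not part of the original module. ---
# The directory, date bounds, and keywords are hypothetical; the date keys
# come from the YYYYMMDD file-name prefix that DataDict() counts under.
def print_daily_counts(dir_in='data/tsv/', kws=('flood', 'evacuate')):
    counts = DataDict(dir_in, start='20200301', end='20200331',
                      keywords=list(kws))
    for date in sorted(counts):
        print(date, counts[date]['_tweets'], 'tweets,',
              counts[date]['_hits'], 'keyword hits')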