Beispiel #1
0
 def kwmatch(lc, keywords=keywords):
     """Report whether any keyword matches a row's tweet text or quoted text.

     Args:
         lc: Indexable row; the fields at ``t_col('t_text')`` and
             ``t_col('t_quote')`` are concatenated as strings.
         keywords: Iterable of keyword expressions, each handed to
             ``parselogic.match``. The default is the module-level
             ``keywords`` object as it was bound when this function was
             defined.

     Returns:
         True if at least one keyword matches, otherwise False.
     """
     # Search the tweet text and the quoted text together, padded with spaces.
     haystack = ' ' + lc[t_col('t_text')] + ' ' + lc[t_col('t_quote')] + ' '
     haystack = parselogic.reformat(haystack, emojis=None, mode=4.5)
     # any() stops at the first keyword that matches.
     return any(parselogic.match(term, haystack) for term in keywords)
Beispiel #2
0
 def checkForKWs(self, kwtext):
     """Check ``kwtext`` against ``self.keywords`` and count a hit.

     The text is normalised once with ``parselogic.reformat`` (using
     ``self.emojis``, mode 4.5, lower-cased) and then tested against each
     keyword in order.

     Args:
         kwtext: Raw text to test.

     Returns:
         True if any keyword matched, otherwise False. ``self.n_matches``
         is incremented by one on a hit (at most once per call).
     """
     normalized = parselogic.reformat(kwtext,
                                      self.emojis,
                                      mode=4.5,
                                      lcase=True)
     for keyword in self.keywords:
         if parselogic.match(keyword, normalized):
             self.n_matches += 1
             # Leaving on the first hit skips the remaining keywords.
             return True
     return False
Beispiel #3
0
    def __init__(self,
                 data,
                 dirout='',
                 header=True,
                 combine=True,
                 kw_redux=[],
                 kw_incol=12,
                 quote_incol=13,
                 rt_ignore=True,
                 rt_incol=9,
                 geo_only=False,
                 geo_incol=16):
        """Store ``data`` or, when it is a directory path, load tweets from it.

        If ``data`` is a string it is treated as a directory. Every file in
        it whose name ends in ``.csv`` or ``.tsv`` is read in sorted name
        order, its lines are filtered by the retweet / geo / keyword
        options, and a summary is printed. The kept lines end up in
        ``self.data`` (flat list) and ``self.datadict`` (per file name), and
        the last header line seen in ``self.head`` (``None`` if there was
        none). Any other ``data`` value is stored unchanged.

        Args:
            data: Directory path (str) or already-loaded data.
            dirout: Not used by this method.
            header: If true, the first line of each file is a header and is
                not treated as a tweet.
            combine: Not used by this method.
            kw_redux: Keyword expressions for ``parselogic.match``. When
                non-empty, only lines matching at least one are kept. The
                list is only read, never modified.
            kw_incol: Column index of the tweet text.
            quote_incol: Column index of the quoted text.
            rt_ignore: If true, lines whose ``rt_incol`` field is non-empty
                are counted as retweets and dropped.
            rt_incol: Column index of the retweet marker field.
            geo_only: If true, lines whose ``geo_incol`` field is empty are
                dropped.
            geo_incol: Column index of the geo field.

        Raises:
            IOError: If ``data`` is a string that cannot be listed as a
                directory.

        Note:
            Retweets are only detected when ``rt_ignore`` is true, so the
            printed retweet totals are 0 otherwise.
        """
        self.data = data
        self.kw_incol = kw_incol
        self.returns = {}
        self.delimit = ','

        # Lines only have to be split into columns when a filter is active.
        read_tweet = bool(len(kw_redux) > 0 or rt_ignore or geo_only)

        if isinstance(data, str):
            # String data should refer to a directory of CSV data files
            import os

            datadict = {}
            datalines = []
            # Stays None when header is false or no data file is found;
            # previously this case failed with an unbound local.
            head = None

            try:
                files = sorted(os.listdir(data))
            except (OSError, ValueError) as err:
                raise IOError(
                    'String object passed, but it is not a valid directory.'
                ) from err

            i_total = 0
            i_rt_total = 0
            i_kw = 0
            i_rt_kw = 0
            for fn in files:
                ###### <--- Implement date range constraints about here
                if fn[-4:] not in ['.csv', '.tsv']:
                    continue
                # os.path.join works with or without a trailing separator
                # on the directory name.
                with open(os.path.join(data, fn), 'r') as infile:
                    for i_line, line in enumerate(infile):
                        if i_line == 0:
                            if '\t' in line:
                                # Determine if TSV file format from 1st line
                                self.delimit = '\t'
                            if header:
                                # First line is the header, not a tweet
                                head = line
                                continue

                        # Count data lines only, so files without a header
                        # (or empty files) are not off by one.
                        i_total += 1

                        if not read_tweet:  # Fast and simple
                            datadict.setdefault(fn, []).append(line)
                            datalines.append(line)
                            continue

                        l_list = line.split(self.delimit)
                        is_rt = False
                        keep = True
                        if rt_ignore and len(l_list[rt_incol]) > 0:
                            i_rt_total += 1
                            is_rt = True
                            keep = False  # Ignore RTs
                        if geo_only and len(l_list[geo_incol]) < 1:
                            keep = False  # Ignore non-Geocoded

                        if len(kw_redux) > 0:
                            # Search text and quoted text together; the
                            # normalised text is the same for every keyword,
                            # so build it once per line.
                            text = l_list[kw_incol] + ' ' + l_list[quote_incol]
                            text = parselogic.reformat(text,
                                                       mode=4.5,
                                                       lcase=True)
                            matched = any(
                                parselogic.match(kw, text) for kw in kw_redux)
                            # Matches are counted for kept lines and for
                            # ignored retweets, but not for lines dropped
                            # only by the geo filter.
                            if matched and (keep or is_rt):
                                i_kw += 1
                                if is_rt:
                                    i_rt_kw += 1
                            keep = keep and matched

                        if keep:
                            datadict.setdefault(fn, []).append(line)
                            datalines.append(line)

            self.head = head
            self.datadict = datadict
            self.data = datalines
            print('\n----- FILE TOTALS:')
            print('Tweets observed:   ' + str(i_total))
            print('Retweets observed: ' + str(i_rt_total))
            print('Original tweets:   ' + str(i_total - i_rt_total))

            if len(kw_redux) > 0:
                print('\n----- KEYWORD-MATCHED:')
                print('Tweets observed:   ' + str(i_kw))
                print('Retweets observed: ' + str(i_rt_kw))
                print('Original tweets:   ' + str(i_kw - i_rt_kw))

            if rt_ignore: print('\nIGNORING RETWEETS...\n')

            print('Tweets in sample:  ' + str(len(datalines)))
Beispiel #4
0
    def __init__(self,
                 data=None,
                 head=None,
                 dir_in='',
                 datafiles=[],
                 header=True,
                 rt_ignore=True,
                 geo_only=False,
                 kw_redux=[],
                 uid_redux=[],
                 val_redux=[]):
        """Store ``data`` or, when none is given, load tweets from ``datafiles``.

        When ``data`` is falsy, each name in ``datafiles`` is opened as
        ``dir_in + name`` (UTF-8, tab-delimited), its lines are filtered by
        the retweet / geo / keyword / user-id / value options, and a summary
        is printed. The kept lines end up in ``self.data`` (flat list) and
        ``self.datadict`` (per file name), and the last header line in
        ``self.head``.

        Args:
            data: Already-loaded data; when truthy nothing is read.
            head: Header line used for ``self.head`` when no file header is
                read. NOTE(review): when ``data`` is passed, this method
                does not assign ``self.head`` - confirm that is intended.
            dir_in: Prefix concatenated with each file name.
            datafiles: File names to read, in order.
            header: If true, the first line of each file is a header; it is
                passed to ``parselogic.t_col`` to rebuild the column map.
            rt_ignore: If true, retweets are dropped.
            geo_only: If true, lines with an empty ``u_geotag`` field are
                dropped.
            kw_redux: Keyword expressions for ``parselogic.match``; when
                non-empty, lines matching none of them are dropped.
            uid_redux: User ids; when non-empty, lines whose ``u_id`` field
                is not among them are dropped.
            val_redux: Criteria for ``parselogic.criteria_match``; when
                non-empty, lines that do not satisfy them are dropped.

        The list arguments are only read, never modified.
        """
        self.data = data
        self.returns = {}
        self.delimit = '\t'

        cols = parselogic.t_col()
        self.col = cols

        # If no data was passed, then read-in data
        if not data:
            datadict = {}
            datalines = []

            match_kw = len(kw_redux) > 0
            match_uid = len(uid_redux) > 0
            match_val = len(val_redux) > 0
            # Lines only have to be split when some filter is active.
            read_tweet = bool(match_kw or match_uid or match_val
                              or rt_ignore or geo_only)
            # Build the lookup once instead of scanning a list per line.
            uid_lookup = set(uid_redux)

            i_total = 0
            i_rt_total = 0
            i_kw = 0
            i_rt_kw = 0
            i_uid = 0
            i_rt_uid = 0
            i_val = 0
            i_rt_val = 0

            for fn in datafiles:
                with open(dir_in + fn, 'r', encoding='utf-8') as infile:
                    for i_line, line in enumerate(infile):
                        if i_line == 0 and header:
                            # First line is the header: keep it and rebuild
                            # the column map from it.
                            head = line
                            cols = parselogic.t_col(head=head,
                                                    delimit=self.delimit)
                            self.col = cols
                            continue

                        # Count data lines only, so files without a header
                        # (or empty files) are not off by one.
                        i_total += 1

                        if not read_tweet:  # Nothing to check
                            datadict.setdefault(fn, []).append(line)
                            datalines.append(line)
                            continue

                        l_list = line.split(self.delimit)
                        ignored = False
                        is_rt = False

                        # The retweet-id column is misnamed 'rt_t_tid' in
                        # some files; fall back to 'rt_t_id' otherwise.
                        try:
                            rt_field = l_list[cols['rt_t_tid']]
                        except (KeyError, IndexError):
                            rt_field = l_list[cols['rt_t_id']]
                        if len(rt_field) > 0:
                            i_rt_total += 1
                            is_rt = True
                            if rt_ignore:
                                ignored = True  # Ignore RTs

                        if geo_only and len(l_list[cols['u_geotag']]) == 0:
                            ignored = True  # Ignore non-Geocoded

                        # Keyword statistics are collected even for lines
                        # already ignored above.
                        if match_kw:
                            text = (' ' + l_list[cols['t_text']] + ' ' +
                                    l_list[cols['t_quote']] + ' ')
                            text = parselogic.reformat(text,
                                                       emojis=None,
                                                       mode=4.5,
                                                       lcase=True)
                            if any(
                                    parselogic.match(kw, text)
                                    for kw in kw_redux):
                                i_kw += 1
                                if is_rt:
                                    i_rt_kw += 1
                            else:
                                ignored = True

                        if match_uid and not ignored:
                            if l_list[cols['u_id']] in uid_lookup:
                                i_uid += 1
                                if is_rt:
                                    i_rt_uid += 1
                            else:
                                ignored = True

                        if match_val and not ignored:
                            matched_val = parselogic.criteria_match(
                                l_list=l_list, cols=cols, vals=val_redux)
                            if matched_val:
                                i_val += 1
                                if is_rt:
                                    i_rt_val += 1
                            else:
                                ignored = True

                        if not ignored:
                            datadict.setdefault(fn, []).append(line)
                            datalines.append(line)

            self.head = head
            self.datadict = datadict
            self.data = datalines
            print('\n----- FILE TOTALS:')
            print('All Tweets+RTs:       ' + str(i_total))
            print('Retweets:             ' + str(i_rt_total))
            print('Original Tweets:      ' + str(i_total - i_rt_total))

            if match_kw:
                print('\n----- KEYWORD MATCHED:')
                print('All Tweets+RTs:      ', i_kw)
                print('Retweets:            ', i_rt_kw)
                print('Original Tweets:     ', i_kw - i_rt_kw)
            if match_uid:
                if match_kw:
                    print('\n----- USER_ID MATCHED: (w/in keyword matched)')
                else:
                    print('\n----- USER_ID MATCHED:')
                if rt_ignore:
                    print('Retweets ignored')
                else:
                    print('All Tweets+RTs:      ', i_uid)
                    print('Retweets:            ', i_rt_uid)
                print('Original Tweets:     ', i_uid - i_rt_uid)
            if match_val:
                if match_uid:
                    print('\n----- VALUE MATCHED: (w/in user_id matched)')
                elif match_kw:
                    print('\n----- VALUE MATCHED: (w/in keyword matched)')
                else:
                    print('\n----- VALUE MATCHED:')
                if rt_ignore:
                    print('Retweets ignored')
                else:
                    print('All Tweets+RTs:      ', i_val)
                    print('Retweets:            ', i_rt_val)
                print('Original Tweets:     ', i_val - i_rt_val)

            print('\nTweets in sample:    ', len(datalines))
Beispiel #5
0
def DataDict(dir_in,
             start=None,
             end=None,
             rt_ignore=True,
             rt_col=36,
             loc_check=False,
             loc_col=11,
             text_col=20,
             quote_col=21,
             f_ext='.tsv',
             delimiter='\t',
             keywords=[]):
    """Count keyword hits per date over the data files in ``dir_in``.

    The file list comes from ``parselogic.filelist``; the first eight
    characters of each file name are used as the date key. Each file is
    opened as ``dir_in + name`` (UTF-8) and read line by line.

    Args:
        dir_in: Directory passed to ``parselogic.filelist`` and prefixed to
            every file name.
        start: Passed through to ``parselogic.filelist``.
        end: Passed through to ``parselogic.filelist``.
        rt_ignore: If true, rows with a non-empty ``rt_col`` field are
            counted under ``'_rts_ignored'`` and not keyword-matched.
        rt_col: Column index of the retweet marker field.
        loc_check: Not used (location functionality is disabled).
        loc_col: Not used (location functionality is disabled).
        text_col: Column index of the tweet text.
        quote_col: Column index of the quoted text.
        f_ext: File extension passed to ``parselogic.filelist``.
        delimiter: Column delimiter.
        keywords: Keyword expressions for ``parselogic.match``; only read,
            never modified.

    Returns:
        dict mapping each date key to a dict with the counters
        ``'_tweets'``, ``'_tweets_inloc'``, ``'_rts_ignored'``, ``'_hits'``,
        ``'_hits_inloc'``, ``'_error'`` and one counter per keyword.
        ``'_error'`` counts rows too short to contain the text and quote
        columns; such rows are reported on stdout and skipped.
    """
    datafiles = parselogic.filelist(dir_in, f_ext=f_ext, start=start, end=end)

    data_dict = {}
    for fi in datafiles:
        date = fi[:8]
        if date in data_dict:
            continue
        counters = {
            '_tweets': 0,
            '_tweets_inloc': 0,  # loc_check functionality not used
            '_rts_ignored': 0,
            '_hits': 0,
            '_hits_inloc': 0,  # loc_check functionality not used
            '_error': 0,
        }
        for k in keywords:
            counters[k] = 0
        data_dict[date] = counters

    for f in datafiles:
        day = data_dict[f[:8]]
        with open(dir_in + f, 'r', encoding='utf-8') as o:
            for i, l in enumerate(o, start=1):  # for each line in the file
                fields = l.split(delimiter)  # split once, reuse below
                try:
                    tweet_text = fields[text_col]
                    # get text from tweet and quote fields
                    text = tweet_text + ' ' + fields[quote_col]
                except IndexError as e:
                    print('error: ' + str(e))
                    print('file:  ' + f)
                    print('line:  ' + str(i))
                    print('data:  ' + str(l))
                    # Skip the malformed row; continuing would reuse the
                    # previous row's text (or fail on the first row).
                    day['_error'] += 1
                    continue

                if tweet_text.strip() in ['text', 't_text']:
                    continue  # ignore header row

                if rt_ignore and len(fields[rt_col].strip()) > 0:
                    day['_rts_ignored'] += 1
                else:
                    # format text for matching
                    text = parselogic.reformat(text, emojis=None, mode=4.5)
                    hit = False
                    for k in keywords:  # for each line of keyword(s)
                        if parselogic.match(k, text):
                            day[k] += 1
                            hit = True
                    if hit:
                        day['_hits'] += 1

                # Ignored retweets still count towards the tweet total.
                day['_tweets'] += 1

    return data_dict