Example #1
 def kwmatch(lc, keywords=keywords):
     text = ' ' + lc[t_col('t_text')] + ' ' + lc[t_col('t_quote')] + ' '
     text = parselogic.reformat(text, emojis=None, mode=4.5)  # format text for matching
     for k in keywords:  # for each keyword
         matched = parselogic.match(k, text)  # test it against the tweet
         if matched:
             return True
     return False
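For context, a minimal sketch of how kwmatch might be invoked, assuming parselogic is importable and t_col resolves column names to indices as in the snippet above; the file name, delimiter, and keyword list are hypothetical.

import parselogic

keywords = ['flu', 'vaccine']  # hypothetical keyword list
with open('20180716_data.tsv', encoding='utf-8') as infile:
    next(infile)  # skip the header row
    # kwmatch expects a pre-split line and returns True on the first keyword hit
    hits = [line for line in infile if kwmatch(line.split('\t'), keywords)]
print(len(hits), 'matching tweets')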
Example #2
 def checkForKWs(self, kwtext):
     hit = False
     formattedtext = parselogic.reformat(kwtext,
                                         self.emojis,
                                         mode=4.5,
                                         lcase=True)
     for kw in self.keywords:
         if parselogic.match(kw, formattedtext):
             hit = True
             self.n_matches += 1
             break  # yes: stop at the first match so the remaining keywords are not tested
     return hit
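The break does save work: the loop exits at the first matching keyword instead of testing the rest. An equivalent, more idiomatic form uses any(), which short-circuits the same way; a minimal sketch assuming the same parselogic API as above.

def checkForKWs(self, kwtext):
    formattedtext = parselogic.reformat(kwtext, self.emojis, mode=4.5, lcase=True)
    # any() stops consuming the generator at the first True result
    hit = any(parselogic.match(kw, formattedtext) for kw in self.keywords)
    if hit:
        self.n_matches += 1
    return hit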
Example #3
        def gethashtags(data=data, hash_n=hash_n):
            hashdict = {}
            hashtop = {}
            for line in data:
                # Get tweet text:
                l_list = line.split(self.delimit)
                text = (' ' + l_list[cols['t_text']] + ' ' +
                        l_list[cols['t_quote']] + ' ')
                text = parselogic.reformat(text,
                                           emojis=None,
                                           mode=4.5,
                                           lcase=True)
                text = text.replace('\\', ' ')  # strip backslashes; re.sub('\\', ...) raises re.error
                text = re.sub('# ', '#', text)
                text = re.sub('#', ' #', text)
                # Get unique hashtags from within tweet text:
                hashtags = list(
                    set(part[1:] for part in text.split()
                        if part.startswith('#')))
                # Remove empty entry, if present
                try:
                    hashtags.remove('')
                except ValueError:
                    pass
                # Add hashtag and count to hashdict
                for hashtag in hashtags:
                    # Remove punctuation from hashtag strings:
                    hashtag = hashtag.strip(string.punctuation)
                    # Add hashtag to hashdict:
                    if hashtag not in hashdict:
                        hashdict[hashtag] = 1
                    else:
                        hashdict[hashtag] += 1
            n = 0
            for tophash in sorted(hashdict, key=hashdict.get, reverse=True):
                if n < hash_n or not hash_n:  # '<' keeps exactly hash_n entries; 0/None keeps all
                    # Add top hashtags to hashtop dict
                    hashtop[str(tophash)] = hashdict[tophash]
                    n = n + 1
                else:
                    # Stop after hash_n is reached
                    break
            return hashtop, data
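The hashtag tally itself can be isolated and tested without parselogic; a sketch using collections.Counter, with the lowercasing that reformat(..., lcase=True) performs emulated inline:

import string
from collections import Counter

def count_hashtags(texts, hash_n=10):
    counts = Counter()
    for text in texts:
        # Unique hashtags per tweet, lowercased and stripped of punctuation
        tags = {part[1:].strip(string.punctuation)
                for part in text.lower().split() if part.startswith('#')}
        counts.update(tag for tag in tags if tag)
    # hash_n=0 or None returns every hashtag, mirroring the 'or not hash_n' branch
    return dict(counts.most_common(hash_n or None))

print(count_hashtags(['#Flu season #flu', 'get your #vaccine!'], hash_n=2))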
Example #4
    def writeToCSV(self, data, parsed_text, parsed_quote, fn, count):

        entities = []

        outfile = self.dirOut + str(
            fn[:14] + '_data' +
            self.out_extension)  # Changed from .csv to .tsv (20180716 JC)

        ###################
        ### User-level data
        ###################

        entities.append('\'' + str(data['user']['id']))  # u_id

        entities.append(data['user']['screen_name'])  # u_handle

        if data['user']['name']:
            name = parselogic.reformat(data['user']['name'],
                                       self.emojis,
                                       mode=1.0,
                                       lcase=self.lcase)
            entities.append(name)  # u_name
        else:
            entities.append('')

        if data['user']['description']:
            desc = parselogic.reformat(data['user']['description'],
                                       self.emojis,
                                       mode=self.mode,
                                       lcase=self.lcase)
            entities.append(desc)  # u_desc
        else:
            entities.append('')

        try:
            entities.append(data['user']['url'])  # u_url
        except KeyError:
            entities.append('')

        try:
            created = parselogic.ts(data['user']['created_at'], format=True)
        except Exception:
            created = data['user']['created_at']
        entities.append(created)  # u_create

        entities.append(str(data['user']['statuses_count']))  #u_tweets
        entities.append(str(data['user']['friends_count']))  # u_fo_out
        entities.append(str(data['user']['followers_count']))  # u_fo_in
        entities.append(str(data['user']['favourites_count']))  # u_likes

        # Deprecated: utc_offset is no longer populated by the API
        try:
            entities.append(str(int(data['user']['utc_offset']) / 3600))  # u_utcoff
        except (KeyError, TypeError, ValueError):
            entities.append('')

        try:
            loc = parselogic.reformat(data['user']['location'],
                                      self.emojis,
                                      mode=self.mode,
                                      lcase=self.lcase)
            entities.append(loc)  # u_locate
        except Exception:
            entities.append('')

        # Deprecated
        if str(data['user']['geo_enabled']).lower() == 'true':
            entities.append(1)  # u_geotag
        else:
            entities.append(0)  # u_geotag

        try:
            entities.append(data['user']['lang'])  # u_lang
        except KeyError:
            entities.append('')

        try:
            entities.append(data['user']['profile_image_url'])  # u_imgurl
        except KeyError:
            entities.append('')

        try:
            entities.append(data['user']['profile_banner_url'])  # u_bgurl
        except KeyError:
            entities.append('')

        if str(data['user']['protected']).lower() == 'true':
            entities.append(1)  # u_privat
        else:
            entities.append(0)  # u_privat

        if str(data['user']['verified']).lower() == 'true':
            entities.append(1)  # u_verify
        else:
            entities.append(0)  # u_verify

        # placeholder for tracking number of captured tweets / user
        entities.append('')  # u_n_capt

        ####################
        ### Tweet-level data
        ####################

        try:
            t_id = '\'' + data['id_str']
        except KeyError:
            try:
                t_id = '\'' + str(data['id'])
            except KeyError:
                t_id = '\''
        entities.append(t_id)  # t_id

        text = parselogic.reformat(parsed_text,
                                   self.emojis,
                                   mode=self.mode,
                                   lcase=self.lcase)
        entities.append(text)  # t_text

        quote = parselogic.reformat(parsed_quote,
                                    self.emojis,
                                    mode=self.mode,
                                    lcase=self.lcase)
        entities.append(quote)  # t_quote

        entities.append('http://twitter.com/' +
                        str(data['user']['screen_name']) + '/status/' +
                        t_id.strip('\''))  # t_url

        try:
            date = parselogic.ts(data['created_at'], format=True)
        except Exception:
            date = data['created_at']
        entities.append(date)  # t_date

        coords = decoder.getCoords(self, data)
        coords_str = str(coords[1]) + ' ' + str(coords[0])
        entities.append(coords_str)  # t_geolat t_geolon

        poly_coords = decoder.getPolygonCoords(self, data)
        entities.append(poly_coords[0] + ' ' + poly_coords[1])  # t_geopoly

        try:
            place = parselogic.reformat(data['place']['full_name'],
                                        self.emojis,
                                        mode=1.0,
                                        lcase=self.lcase)
        except Exception:
            place = ''
        entities.append(place)  # t_place

        try:
            lang = data['lang']
        except KeyError:
            lang = ''
        entities.append(lang)  # t_lang

        # Two separate try blocks so a failure on the second field cannot
        # append three values and shift every later column
        try:
            entities.append('\'' + data['in_reply_to_status_id_str'])  # re_t_id
        except (KeyError, TypeError):
            entities.append('')  # re_t_id
        try:
            entities.append('\'' + data['in_reply_to_user_id_str'])  # re_u_id
        except (KeyError, TypeError):
            entities.append('')  # re_u_id

        try:
            entities.append('\'' + data['quoted_status']['id_str'])  # qu_t_id
        except (KeyError, TypeError):
            entities.append('')
        try:
            entities.append('\'' + data['quoted_status']['user']['id_str'])  # qu_u_id
        except (KeyError, TypeError):
            entities.append('')
        try:
            entities.append(data['quoted_status']['retweet_count'])  # qu_n_rt
        except (KeyError, TypeError):
            entities.append('')
        try:
            entities.append(data['quoted_status']['favorite_count'])  # qu_n_fav
        except (KeyError, TypeError):
            entities.append('')
        try:
            entities.append(data['quoted_status']['reply_count'])  # qu_n_re
        except (KeyError, TypeError):
            entities.append('')
        try:
            entities.append(data['quoted_status']['quote_count'])  # qu_n_qu
        except (KeyError, TypeError):
            entities.append('')
        try:
            entities.append('\'' + data['retweeted_status']['id_str'])  # rt_t_tid
        except (KeyError, TypeError):
            entities.append('')
        try:
            entities.append('\'' + data['retweeted_status']['user']['id_str'])  # rt_u_id
        except (KeyError, TypeError):
            entities.append('')
        try:
            entities.append(data['retweeted_status']['retweet_count'])  # rt_n_rt
        except (KeyError, TypeError):
            entities.append('')
        try:
            entities.append(data['retweeted_status']['favorite_count'])  # rt_n_fav
        except (KeyError, TypeError):
            entities.append('')
        try:
            entities.append(data['retweeted_status']['reply_count'])  # rt_n_re
        except (KeyError, TypeError):
            entities.append('')
        try:
            entities.append(data['retweeted_status']['quote_count'])  # rt_n_qu
        except (KeyError, TypeError):
            entities.append('')

        # Added for age prediction modelling
        if str(data['user']['default_profile']).lower() == 'true':
            entities.append(1)  # u_profile
        else:
            entities.append(0)  # u_profile

        if str(data['user']['default_profile_image']).lower() == 'true':
            entities.append(1)  # u_profile_img
        else:
            entities.append(0)  # u_profile_img

        try:
            entities.append(data['user']['listed_count'])  # u_list
        except KeyError:
            entities.append('')

        try:
            entities.append(len(data['entities']['hashtags']))  # t_hashtags
        except KeyError:
            entities.append('')

        try:
            entities.append(len(data['entities']['urls']))  # t_urls
        except KeyError:
            entities.append('')

        try:
            entities.append(len(data['entities']['user_mentions']))  # t_mentions
        except KeyError:
            entities.append('')

        try:
            media_list = data['extended_entities']['media']
            entities.append(len(media_list))  # t_media
        except KeyError:
            entities.append(0)

        ### Might want to update to csv.writer dependency?
        with open(outfile, 'a', encoding='utf-8') as csvfile:
            saveFile = csv.writer(csvfile, delimiter='\t', lineterminator='\n')
            if count == 0:
                # Header order matches the append order above
                saveFile.writerow([
                    'u_id', 'u_handle', 'u_name', 'u_desc', 'u_url',
                    'u_create', 'u_tweets', 'u_fo_out', 'u_fo_in', 'u_likes',
                    'u_utcoff', 'u_locate', 'u_geotag', 'u_lang', 'u_imgurl',
                    'u_bgurl', 'u_privat', 'u_verify', 'u_n_capt', 't_id',
                    't_text', 't_quote', 't_url', 't_date', 't_geopoint',
                    't_geopoly', 't_place', 't_lang', 're_t_id', 're_u_id',
                    'qu_t_id', 'qu_u_id', 'qu_n_rt', 'qu_n_fav', 'qu_n_re',
                    'qu_n_qu', 'rt_t_tid', 'rt_u_id', 'rt_n_rt', 'rt_n_fav',
                    'rt_n_re', 'rt_n_qu', 'u_profile', 'u_profile_img',
                    'u_list', 't_hashtags', 't_urls', 't_mentions', 't_media'
                ])

            saveFile.writerow(entities)
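Because the rows are tab-delimited with a header written on the first call, the file can be read back with csv.DictReader; a short sketch, with a hypothetical file name:

import csv

with open('20180716000000_data.tsv', encoding='utf-8', newline='') as f:
    for row in csv.DictReader(f, delimiter='\t'):
        print(row['u_handle'], row['t_date'])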
Example #5
    def __init__(self,
                 data,
                 dirout='',
                 header=True,
                 combine=True,
                 kw_redux=[],
                 kw_incol=12,
                 quote_incol=13,
                 rt_ignore=True,
                 rt_incol=9,
                 geo_only=False,
                 geo_incol=16):
        self.data = data
        self.kw_incol = kw_incol
        self.returns = {}
        self.delimit = ','

        read_tweet = len(kw_redux) > 0 or rt_ignore or geo_only

        if isinstance(data, str):
            # String data should refer to a directory of CSV data files

            datadict = {}
            datalines = []
            head = None  # keep self.head defined even when header=False

            import os

            try:
                files = sorted(os.listdir(data))
            except OSError:
                raise IOError(
                    'String object passed, but it is not a valid directory.')
            i_total = 0
            i_rt_total = 0
            i_kw = 0
            i_rt_kw = 0
            for fn in files:
                ###### <--- Implement date range constraints about here
                if fn[-4:] in ['.csv', '.tsv']:
                    i_line = 0
                    with open(data + fn, 'r') as infile:
                        for line in infile:
                            if i_line == 0 and '\t' in line:
                                self.delimit = '\t'  # detect TSV format from the first line

                            if i_line == 0 and header:
                                head = line  # if a header is expected, save the first line as "head"
                            else:
                                if read_tweet:  # If tweets need to be read
                                    l_list = line.split(self.delimit)
                                    added = False
                                    ignored = False
                                    is_rt = False
                                    if rt_ignore and len(l_list[rt_incol]) > 0:
                                        i_rt_total = i_rt_total + 1
                                        is_rt = True
                                        ignored = True  # Ignore RTs
                                    if geo_only and len(l_list[geo_incol]) < 1:
                                        ignored = True  # Ignore non-Geocoded

                                    if len(kw_redux) > 0:
                                        kw_is_rt = False
                                        # Search both the text and the quoted text,
                                        # formatted once rather than per keyword
                                        text = l_list[kw_incol] + ' ' + l_list[quote_incol]
                                        text = parselogic.reformat(text, mode=4.5, lcase=True)
                                        for kw in kw_redux:
                                            if parselogic.match(kw, text):
                                                if not added and not ignored:
                                                    datadict.setdefault(
                                                        fn, []).append(line)
                                                    datalines.append(line)
                                                    added = True
                                                    i_kw = i_kw + 1
                                                elif is_rt:
                                                    kw_is_rt = True
                                        if kw_is_rt:
                                            i_kw = i_kw + 1
                                            i_rt_kw = i_rt_kw + 1
                                    else:
                                        if not added and not ignored:
                                            datadict.setdefault(
                                                fn, []).append(line)
                                            datalines.append(line)
                                            added = True
                                else:  # Fast and simple
                                    datadict.setdefault(fn, []).append(line)
                                    datalines.append(line)
                            i_line = i_line + 1  #Counting all read lines per file
                        i_total = i_total + i_line - 1  #Total data lines (-1 for header line)

            self.head = head
            self.datadict = datadict
            self.data = datalines
            print('\n----- FILE TOTALS:')
            print('Tweets observed:   ' + str(i_total))
            print('Retweets observed: ' + str(i_rt_total))
            print('Original tweets:   ' + str(i_total - i_rt_total))

            if len(kw_redux) > 0:
                print('\n----- KEYWORD-MATCHED:')
                print('Tweets observed:   ' + str(i_kw))
                print('Retweets observed: ' + str(i_rt_kw))
                print('Original tweets:   ' + str(i_kw - i_rt_kw))

            if rt_ignore: print('\nIGNORING RETWEETS...\n')

            print('Tweets in sample:  ' + str(len(datalines)))
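A hypothetical instantiation, assuming this __init__ belongs to a sampler class (called TweetSample here purely for illustration) and that data names a directory of CSV/TSV files:

sample = TweetSample('data/parsed/',               # directory of .csv/.tsv files
                     kw_redux=['flu', 'vaccine'],  # hypothetical keywords
                     rt_ignore=True,
                     geo_only=False)
print(len(sample.data), 'lines retained')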
Example #6
    def __init__(self,
                 data=None,
                 head=None,
                 dir_in='',
                 datafiles=[],
                 header=True,
                 rt_ignore=True,
                 geo_only=False,
                 kw_redux=[],
                 uid_redux=[],
                 val_redux=[]):
        self.data = data
        self.returns = {}
        self.delimit = '\t'

        cols = parselogic.t_col()
        self.col = cols

        match_kw = False
        match_uid = False
        match_val = False

        # If no data was passed, then read-in data
        if not data:
            datadict = {}
            datalines = []

            read_tweet = False
            i_total = 0
            i_rt_total = 0
            i_kw = 0
            i_rt_kw = 0
            i_uid = 0
            i_rt_uid = 0
            i_val = 0
            i_rt_val = 0

            if len(kw_redux) > 0:
                read_tweet = True
                match_kw = True

            if len(uid_redux) > 0:
                read_tweet = True
                match_uid = True

            if len(val_redux) > 0:
                read_tweet = True
                match_val = True

            if rt_ignore or geo_only:
                read_tweet = True

            for fn in datafiles:
                i_line = 0
                with open(dir_in + fn, 'r', encoding='utf-8') as infile:
                    for line in infile:
                        if i_line == 0 and header:
                            head = line  #If first line is expected to be a header, save the line as "head"
                            cols = parselogic.t_col(head=head,
                                                    delimit=self.delimit)
                            self.col = cols
                        else:
                            if read_tweet:  # If tweets need to be read
                                l_list = line.split(self.delimit)
                                added = False
                                ignored = False
                                is_rt = False
                                matched_kw = False
                                matched_uid = False

                                try:
                                    # This column is misnamed 'rt_t_tid' in some files...
                                    if len(l_list[cols['rt_t_tid']]) > 0:
                                        i_rt_total = i_rt_total + 1
                                        is_rt = True
                                        if rt_ignore:
                                            ignored = True  # Ignore RTs
                                except KeyError:
                                    # ...but it should be 'rt_t_id'
                                    if len(l_list[cols['rt_t_id']]) > 0:
                                        i_rt_total = i_rt_total + 1
                                        is_rt = True
                                        if rt_ignore:
                                            ignored = True  # Ignore RTs

                                if geo_only and len(
                                        l_list[cols['u_geotag']]) == 0:
                                    ignored = True  # Ignore non-Geocoded

                                if match_kw:
                                    text = (' ' + l_list[cols['t_text']] + ' ' +
                                            l_list[cols['t_quote']] + ' ')
                                    text = parselogic.reformat(text,
                                                               emojis=None,
                                                               mode=4.5,
                                                               lcase=True)
                                    kw_is_rt = False
                                    for kw in kw_redux:
                                        if parselogic.match(kw, text):
                                            matched_kw = True
                                            i_kw += 1
                                            if is_rt:
                                                kw_is_rt = True
                                                i_rt_kw += 1
                                            break
                                    if not matched_kw:
                                        ignored = True

                                if match_uid and not ignored:
                                    if l_list[cols['u_id']] in uid_redux:
                                        matched_uid = True
                                        i_uid += 1
                                        if is_rt:
                                            i_rt_uid += 1
                                    else:
                                        ignored = True

                                if match_val and not ignored:
                                    matched_val = parselogic.criteria_match(
                                        l_list=l_list,
                                        cols=cols,
                                        vals=val_redux)
                                    if matched_val:
                                        i_val += 1
                                        if is_rt:
                                            i_rt_val += 1
                                    else:
                                        ignored = True

                                if not ignored:
                                    datadict.setdefault(fn, []).append(line)
                                    datalines.append(line)

                            else:  # Nothing to check
                                datadict.setdefault(fn, []).append(line)
                                datalines.append(line)
                        i_line += 1  #Counting all read lines per file
                    i_total = i_total + i_line - 1  #Total data lines (-1 for header line)

            self.head = head
            self.datadict = datadict
            self.data = datalines
            print('\n----- FILE TOTALS:')
            print('All Tweets+RTs:       ' + str(i_total))
            print('Retweets:             ' + str(i_rt_total))
            print('Original Tweets:      ' + str(i_total - i_rt_total))

            if match_kw:
                print('\n----- KEYWORD MATCHED:')
                print('All Tweets+RTs:      ', i_kw)
                print('Retweets:            ', i_rt_kw)
                print('Original Tweets:     ', i_kw - i_rt_kw)
            if match_uid:
                if match_kw:
                    print('\n----- USER_ID MATCHED: (w/in keyword matched)')
                else:
                    print('\n----- USER_ID MATCHED:')
                if rt_ignore:
                    print('Retweets ignored')
                else:
                    print('All Tweets+RTs:      ', i_uid)
                    print('Retweets:            ', i_rt_uid)
                print('Original Tweets:     ', i_uid - i_rt_uid)
            if match_val:
                if match_uid:
                    print('\n----- VALUE MATCHED: (w/in user_id matched)')
                elif match_kw:
                    print('\n----- VALUE MATCHED: (w/in keyword matched)')
                else:
                    print('\n----- VALUE MATCHED:')
                if rt_ignore:
                    print('Retweets ignored')
                else:
                    print('All Tweets+RTs:      ', i_val)
                    print('Retweets:            ', i_rt_val)
                print('Original Tweets:     ', i_val - i_rt_val)

            print('\nTweets in sample:    ', len(datalines))
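Usage follows the same pattern as the previous example; a hypothetical call, assuming the enclosing class is named TweetFilter here for illustration and that datafiles points at TSVs produced by writeToCSV:

reader = TweetFilter(dir_in='data/parsed/',
                     datafiles=['20180716_data.tsv'],  # hypothetical file
                     kw_redux=['vaccine'],             # hypothetical keyword
                     uid_redux=['12345678'],           # hypothetical user id
                     rt_ignore=True)
print(len(reader.data), 'lines retained')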
Example #7
def DataDict(dir_in,
             start=None,
             end=None,
             rt_ignore=True,
             rt_col=36,
             loc_check=False,
             loc_col=11,
             text_col=20,
             quote_col=21,
             f_ext='.tsv',
             delimiter='\t',
             keywords=[]):
    date_list = []
    datafiles = parselogic.filelist(dir_in, f_ext=f_ext, start=start, end=end)
    for fi in datafiles:
        date_list.append(fi[:8])

    data_dict = {}
    for date in list(set(date_list)):
        data_dict[date] = {}
        data_dict[date]['_tweets'] = 0
        data_dict[date]['_tweets_inloc'] = 0  #loc_check functionality not used
        data_dict[date]['_rts_ignored'] = 0
        data_dict[date]['_hits'] = 0
        data_dict[date]['_hits_inloc'] = 0  #loc_check functionality not used
        data_dict[date]['_error'] = 0  #_error functionality not used
        for k in keywords:
            data_dict[date][k] = 0

    for f in datafiles:
        with open(dir_in + f, 'r', encoding='utf-8') as o:
            i = 0
            for l in o:  # for each line in the open file
                i += 1
                fields = l.split(delimiter)  # split once and reuse below
                try:
                    # Get text from the tweet and quote fields
                    text = fields[text_col] + ' ' + fields[quote_col]
                except Exception as e:
                    print('error: ' + str(e))
                    print('file:  ' + f)
                    print('line:  ' + str(i))
                    print('data:  ' + str(l))
                    continue  # skip this line; otherwise stale text from the previous line is reused

                text = parselogic.reformat(text, emojis=None, mode=4.5)  # format text for matching
                if fields[text_col].strip() in ['text', 't_text']:
                    pass  # ignore header row
                else:
                    hit = False
                    is_rt = len(fields[rt_col].strip()) > 0
                    if rt_ignore and is_rt:
                        data_dict[f[:8]]['_rts_ignored'] += 1
                    else:
                        for k in keywords:  # for each line of keyword(s)
                            if parselogic.match(k, text):  # test it against the tweet
                                data_dict[f[:8]][k] += 1
                                hit = True

                    if hit:
                        data_dict[f[:8]]['_hits'] += 1
                        #if loc_check: data_dict[f[:8]]['_hits_inloc'] += parselogic.locmatch(fields[loc_col])  # loc_check functionality not used

                    data_dict[f[:8]]['_tweets'] += 1
                    #if loc_check: data_dict[f[:8]]['_tweets_inloc'] += parselogic.locmatch(fields[loc_col])  # loc_check functionality not used

    return data_dict
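A sketch of calling DataDict over a directory of daily TSVs; the path and keywords are hypothetical, and start/end are assumed to be YYYYMMDD strings, since counts are keyed on the first eight characters of each file name:

counts = DataDict('data/parsed/',
                  start='20180701', end='20180731',
                  keywords=['flu', 'vaccine'])
for date in sorted(counts):
    print(date, counts[date]['_hits'], 'hits in', counts[date]['_tweets'], 'tweets')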