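# A sketch of the imports this module appears to need (Python 2). The
# project-local module names below are assumptions, not confirmed by the
# source; only os/re/string/collections/HTMLParser/enchant/nltk are standard.
import os
import re
import string
from collections import defaultdict
from string import punctuation
from HTMLParser import HTMLParser

import enchant                             # pyenchant spell checker
from nltk.metrics import edit_distance     # assumed source of edit_distance

from cache import Cache                    # hypothetical local module
import interface_nlp                       # hypothetical local module
import interface_twitter                   # hypothetical local module
from config import enabled_modules         # hypothetical local module
# do_not_alter, elongated_characters, remove_duplicates are assumed to be
# project-local helpers:
from spell_utils import do_not_alter, elongated_characters, remove_duplicates

_debug = False
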
class TwitterNLP:

    def __init__(self, tagger, data=[]):
        # Lookup cache (constantly rerunning tagger takes time)
        cache_file = os.path.join(enabled_modules['caches'], 'twitter_nlp')
        self.cache = Cache(cache_file)

        # Output from the tagger
        self._words    = {}
        self._entities = {}
        self._pos      = {}
        self._events   = {}

        # Unescape data
        self.h = HTMLParser()

        # Resolve tweets
        self.tagger = tagger
        self.resolve(data)


    def resolve(self, data):

        #print 'resolve length: ', len(data)

        data = [ self.h.unescape(twt).strip() for twt in set(data) ]

        # Tag the data
        if self.tagger:

            # Tag all uncached data
            uncached = [ twt for twt in data if not self.cache.has_key(twt) ]

            #print uncached
            #print 'len     : ', len(uncached)
            #print 'uncached: '
            #for twt in uncached: print '\t', twt
            #print '\n\n\n'

            if uncached:
                if self.tagger == 'cache':
                    msg = 'Uncached twitter_nlp data. Tagger must be installed.'
                    raise Exception(msg)
                partial = interface_nlp.resolve(self.tagger, uncached)
                for twt,tag in zip(uncached,partial):
                    self.cache.add_map(twt, tag)

            # Lookup all tags
            tagged = [ self.cache.get_map(twt) for twt in data ]

        else:
            tagged = []

        # Store the data in the object
        for twt,tags in zip(data,tagged):
            self._words[twt]    = [  '/'.join(t.split('/')[:-3]) for t in tags ]
            self._entities[twt] = [           t.split('/')[ -3]  for t in tags ]
            self._pos[twt]      = [           t.split('/')[ -2]  for t in tags ]
            self._events[twt]   = [           t.split('/')[ -1]  for t in tags ]
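            # e.g. the tagged token 'Boston/B-geo-loc/NNP/O' (assumed format)
            # splits into word 'Boston', entity 'B-geo-loc', POS 'NNP', and
            # event 'O'; the [:-3] join keeps slashes inside the word itself.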
            #print 'tweet:    ', twt
            #print 'words:    ', self._words[twt]
            #print 'entities: ', self._entities[twt]
            #print 'POS:      ', self._pos[twt]
            #print 'events:   ', self._events[twt]
            #print



    def tokens(self, twt):
        twt = self.h.unescape(twt).strip()
        if twt not in self._words: 
            print 'not in: ', twt
            return []
        else:
            return self._words[twt]



    def entities(self, twt):
        twt = self.h.unescape(twt).strip()

        etype = None
        ents = []
        curr = []

        #print twt
        if twt not in self._words: return []

        for i in range(len(self._words[twt])):
            w   = self._words[   twt][i]
            tag = self._entities[twt][i]
            #print '\t', w, '\t', tag

            # Assumes an 'I' tag is always preceded by a 'B' (never directly follows an 'O')
            if tag[0] == 'I':
                curr.append(w)
            else:
                if curr:
                    ents.append( (etype,' '.join(curr)) )
                    curr = []

                if tag[0] == 'B':
                    etype = tag[2:]
                    curr = [w]

        # Flush remaining entity (if necessary)
        if curr: ents.append( (etype,' '.join(curr)) )

        #print ents
        return ents
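
    # A sketch of entities() on illustrative tags: words
    # ['saw', 'Justin', 'Bieber', 'today'] tagged
    # ['O', 'B-person', 'I-person', 'O'] yield [('person', 'Justin Bieber')].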




    def brown(self, twt):
        twt = self.h.unescape(twt).strip()

        etype = None
        ents = []
        curr = []

        #print twt
        if twt not in self._words: return []

        for i in range(len(self._words[twt])):
            w   = self._words[   twt][i]
            tag = self._entities[twt][i]
            #print '\t', w, '\t', tag

            # Collapse each entity span to its label; keep 'O' words as-is
            if tag[0] != 'I':
                if curr:
                    ents.append( ' '.join(curr) )
                    curr = []

                if tag[0] == 'B':
                    curr = [tag[2:]]
                else:
                    curr = [w]


        # Flush remaining entity (if necessary)
        if curr: ents.append( ' '.join(curr) )

        #print ents
        #print 

        return ents
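
    # A sketch of brown() on the same illustrative tags: the entity span
    # 'Justin Bieber' collapses to its label, giving ['saw', 'person', 'today'].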



    def update(self, data):

        """
        update()

        Purpose: Run the tagger on a batch of tweets (rather than individually)

        @param data. A list of strings (each string is the text of a tweet)
        """

        self.resolve(data)



    def features(self, twt):

        """
        features()

        Purpose: Get twitter_nlp features

        @param twt.  The string text of a tweet.
        @return      A feature dictionary.
        """

        # Feature dictionary
        feats = {}

        # Unescape text if not already done
        twt = self.h.unescape(twt).strip()

        # Feature: Entity types
        ents = self.entities(twt)
        for ent in ents:
            feats[ ('entity_type', ent[0]) ] = .5
            feats[ ('entity',      ent[1]) ] = .5

        # Feature: Brown Cluster bigrams
        clustered = self.brown(twt)
        for i in range(len(clustered)-1):
            bigram = (clustered[i], clustered[i+1])
            feats[ ('brown_bigram',bigram) ] = .5

        # Feature: POS counts
        pos_counts = defaultdict(int)
        for pos in self._pos.get(twt, []):
            if pos not in string.punctuation:
                pos_counts[pos] += 1
        for pos,count in pos_counts.items():
            featname = 'pos_count-%s' % pos
            feats[featname] = count

        #print 'nlp: ', twt
        #print '\t', feats

        return feats
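

# Example usage (a sketch; assumes a tagger name that interface_nlp
# recognizes, and that the example tweet gets the tags shown):
#
#   nlp = TwitterNLP('ark', ['I love Boston !'])
#   nlp.tokens('I love Boston !')      # -> ['I', 'love', 'Boston', '!']
#   nlp.entities('I love Boston !')    # -> e.g. [('geo-loc', 'Boston')]
#   nlp.features('I love Boston !')    # -> feature dict for a classifier
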
class TwitterData:
    def __init__(self, sids=[], data=[]):
        # Tweet cache
        self.cache = Cache('twitter_data')

        # Cache all given data
        self.resolve(sids, data)

    def resolve(self, sids, data):
        """
        resolve()

        Purpose: Wrapper for interface_twitter.resolve() (to use object's cache)

        @param sids.  A list of twiiter IDs.
        @return       A list of tweets.
        """

        # Compile list of tweets that need to be queried with API
        uncached = [sid for sid in sids if not self.cache.has_key(sid)]

        #print 'uncached: ', len(uncached)

        # Use API to lookup uncached tweets
        if uncached:
            partial = interface_twitter.resolve(uncached)
            for sid, twt in zip(uncached, partial):
                self.cache.add_map(sid, twt)

        # Get all tweets
        resolved = []
        for txt, sid in zip(data, sids):
            twt = self.cache.get_map(sid)
            if twt is not None and txt == twt['text']:
                res = twt
            else:
                res = None
            #print 'res: ', res
            resolved.append(res)

        return resolved
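
    # e.g. resolve(['123'], ['hello world']) returns [<tweet dict>] when the
    # cached tweet's text is exactly 'hello world', and [None] otherwise
    # (a sketch; the ID and text are illustrative).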

    def lookup(self, sids):
        """
        resolve()

        Purpose: Wrapper for interface_twitter.resolve() (to use object's cache)

        @param sids.  A list of twiiter IDs.
        @return       A list of tweets.
        """

        # Compile list of tweets that need to be queried with API
        uncached = [sid for sid in sids if not self.cache.has_key(sid)]

        #print 'uncached: ', len(uncached)

        # Use API to lookup uncached tweets
        if uncached:
            partial = interface_twitter.resolve(uncached)
            for sid, twt in zip(uncached, partial):
                self.cache.add_map(sid, twt)

        # Get all tweets
        resolved = []
        for sid in sids:
            twt = self.cache.get_map(sid)
            resolved.append(twt)

        return resolved

    def features(self, sid):
        """
        features()

        Purpose: Get features from tweet meta data

        @param sid.   A tweet ID
        @return       A dictionary of meta data features.
        """

        # Get tweet
        tweet = self.cache.get_map(sid)

        if tweet is None: return {}

        # Extract features
        feats = {}

        # Not available
        if tweet['text'] == 'Not Available': return {}

        # Features: Retweet & Favorite counts
        feats['favorite_count'] = tweet['favorite_count']  # 2
        feats['retweet_count'] = tweet['retweet_count']  # 1

        # Feature: Whether username contains word 'news'
        if 'news' in tweet['user']['screen_name'].lower():
            feats['is_news'] = 1
        if 'news' in tweet['user']['name'].lower():
            feats['is_news'] = 1

        # Feature: Whether tweet is reply
        if tweet['in_reply_to_status_id_str']:
            feats['is_reply'] = 1

        return feats
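

# Example usage (a sketch; the ID and counts are illustrative):
#
#   td = TwitterData()
#   [twt] = td.lookup(['123456789'])   # full tweet dict, via cache or API
#   td.features('123456789')           # -> {'favorite_count': 2,
#                                      #     'retweet_count': 1, ...}
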
class SpellChecker:
    def __init__(self):

        # Global spell checker
        #self.d = enchant.DictWithPWL("en_US", '/data1/wboag/ml/twitvec/twitvec/spelling/output.txt')
        self.d = enchant.Dict("en_US")

        # Common abbreviations and mistakes
        self.common = {}
        abbrevs = os.path.join(enabled_modules['spell'], 'abbrv.txt')
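        # Each line of abbrv.txt is assumed to look like
        #   'brb || be right back'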
        with open(abbrevs, 'r') as f:
            for line in f.readlines():
                if line == '\n': continue
                abbrev, full = tuple(line.strip('\n').split(' || '))
                self.common[abbrev] = full

        # Load cache of spell-corrected words
        self.cache = Cache('B-enchant')

    def correct_spelling(self, phrase, pos=None):

        # Memoized?
        key = tuple(phrase)
        if self.cache.has_key(key):
            return self.cache.get_map(key)

        cands = []

        # Build all possible candidates
        for i, w in enumerate(phrase):

            if _debug: print w

            # Special form
            if do_not_alter(w, pos, i):

                if _debug: print '\tSTATIC'
                cands.append([w])

            # Numbers
            elif re.search('\d', w):

                if _debug: print '\tNumber'
                cands.append(['000'])

            # Regexes
            elif re.search('^a*(?:h+q?a+)+h*$', w):
                if _debug: print '\tHAHA'
                cands.append(['haha'])
            elif re.search('^(?:h+e+)+$', w):
                if _debug: print '\tHEHE'
                cands.append(['haha'])
            elif re.search('^o*(?:xo)+x*$', w):
                if _debug: print '\tXOXO'
                cands.append(['xoxo'])
            elif re.search('^l(?:ol)+$', w):
                if _debug: print '\tLOLOL'
                cands.append(['lol'])

            # Common abbreviations / mistakes
            elif w.lower() in self.common:

                if _debug: print '\tCOMMON'
                cands.append([self.common[w.lower()]])

            # Normal
            else:

                # FIXME: do this during tokenization
                cand = w
                if w[-2:] == "'s": cand = w[:-2]
                if w[-2:] == "'m": cand = w[:-2]
                if w[-3:] == "'ve": cand = w[:-2]
                if w[-3:] == "'ll": cand = w[:-2]
                if len(cand): w = cand

                # ends in exclamation mark
                exclamation = False
                if re.search('^[^!]*!$', w):
                    if _debug: print '\tEXCLAMATION'
                    w = w.strip('!')
                    exclamation = True

                # Capitalized often means proper noun
                if w[0].isupper():
                    if _debug: print '\tMAYBE PROPER NOUN'
                    possible = [w]

                # Spelled correct?
                elif self.d.check(w):
                    if _debug: print '\tCORRECT!'
                    possible = [w]

                # Try fixing with repeated characters
                elif elongated_characters(w):
                    # Remove duplicated characters down to just 2 remaining
                    if _debug: print '\tELONGATED'
                    possible = [remove_duplicates(w)]
                    #print w, '\t->\t', possible[0]

                # Trailing apostrophe
                elif (w[-1] == "'") and (self.d.check(w[:-1])):
                    if _debug: print '\tAPOSTROPHE!'
                    possible = [w[:-1]]

                # Word not separated from punctuation
                elif (w[0] in punctuation) or (w[-1] in punctuation):
                    # Separate word from leading and trailing punctuation
                    match = re.search("([^a-zA-Z]*)([a-zA-Z']*)([^a-zA-Z]*)",
                                      w)
                    leading, word, trailing = match.groups()

                    possible = []
                    if leading: possible.append(leading)
                    if word in self.common:
                        possible.append(self.common[word])
                    else:
                        possible.append(word)
                    if trailing: possible.append(trailing)

                    #print w, ' -> ', possible

                # Backoff to spell checker correction
                else:
                    if _debug: print '\tCHECKING SUGGESTIONS'

                    if not self.cache.has_key(w):

                        # Run spell correction
                        possible = self.d.suggest(w)

                        # If no matches, then use original
                        if not possible:
                            possible = [w]

                        # Cache the suggestion only when it is a near miss
                        if enabled_modules['caches'] is not None:
                            if edit_distance(w, possible[0]) <= 2:
                                self.cache.add_map(w, possible)
                            else:
                                self.cache.add_map(w, [w])

                    # lookup cached spell corrections
                    else:
                        possible = self.cache.get_map(w)

                # trailing exclamation
                if exclamation:
                    possible = [w + ' !' for w in possible]

                cands.append(possible)

        #for c in cands:
        #    print c
        #print

        # Select proper candidate
        corrected = [choices[0] for choices in cands]

        # memoize
        if enabled_modules['caches'] is not None:
            self.cache.add_map(key, corrected)

        return corrected
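

# Example usage (a sketch; the correction depends on the enchant
# dictionary's suggestions):
#
#   sp = SpellChecker()
#   sp.correct_spelling(['thier', 'dog'])   # -> e.g. ['their', 'dog']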