Beispiel #1
0
 def ExtractStatus(cls, status):
     """
     Turn a status into a flat list of feature tokens.

     The status text is lower-cased and split on whitespace. Words that
     look like links (starting with http://, https:// or www.) are reduced
     to their lower-cased domain via extract.grabDomain; bit.ly links are
     resolved with extract.grabCanonicalUrl first when that succeeds. The
     remaining words are split with cls.splitre and filtered against
     cls.stopwords.

     The returned list holds, in order: link domains, text tokens, a
     'USER: <id>' marker and, when the status has a truthy
     in_reply_to_user_id, an 'IN_REPLY_TO_USER_ID: <id>' marker.

     Returns None if the status has no text or no user.
     """
     if not status.text or not status.user:
         return None
     link_prefixes = ('http://', 'https://', 'www.')
     tokens = []
     nolinkwords = []
     for word in status.text.lower().split():
         if not word.startswith(link_prefixes):
             nolinkwords.append(word)
             continue
         if 'bit.ly' in word:
             try:
                 domain = extract.grabDomain(extract.grabCanonicalUrl(word)).lower()
             except Exception:
                 # Best effort: resolving the short link can fail (e.g. the
                 # bit.ly API rate limit was exceeded), so fall back to the
                 # domain of the short link itself.
                 domain = extract.grabDomain(word).lower()
         else:
             domain = extract.grabDomain(word).lower()
         tokens.append(domain)
     # Re-join with spaces so that word boundaries survive until
     # cls.splitre runs; an empty separator would fuse neighbouring words.
     words = cls.splitre.split(" ".join(nolinkwords))
     tokens.extend([w for w in words if w and w not in cls.stopwords])
     tokens.append('USER: {0}'.format(status.user.id))
     # The attribute may be absent on some status objects; treat that the
     # same as "not a reply".
     reply_to_user_id = getattr(status, 'in_reply_to_user_id', None)
     if reply_to_user_id:
         tokens.append('IN_REPLY_TO_USER_ID: {0}'.format(reply_to_user_id))
     return tokens
Beispiel #2
0
 def ExtractWord(cls, word):
     """
     Normalize a single word into a token.

     Surrounding punctuation (cls.punctuation) is stripped first. Words that
     look like links (starting with http://, https:// or www., or for which
     cls._contains reports '.com' or '.ly') are reduced to their lower-cased
     domain via extract.grabDomain; bit.ly links are resolved with
     extract.grabCanonicalUrl first when that succeeds. Any other word is
     lower-cased and stemmed with cls.stemmer.

     Returns None for stopwords.

     NOTE: according to profiling this is probably the bottleneck.
     """
     word = word.strip(cls.punctuation)
     looks_like_link = (
         word.startswith(('http://', 'https://', 'www.'))
         or cls._contains(word, '.com')
         or cls._contains(word, '.ly')
     )
     if looks_like_link:
         if cls._contains(word, 'bit.ly'):
             try:
                 return extract.grabDomain(extract.grabCanonicalUrl(word)).lower()
             except Exception:
                 # Best effort: resolving the short link can fail (e.g. the
                 # bit.ly API rate limit was exceeded); fall through and use
                 # the domain of the short link itself.
                 pass
         return extract.grabDomain(word).lower()
     word = word.lower()
     if word in cls.stopwords:
         return None
     return cls.stemmer.stem(word)