def ExtractStatus(cls, status):
    """Turn a status into a flat list of feature tokens.

    The returned list contains, in this order:

    1. one lowercased domain token per whitespace-separated word that starts
       with ``http://``, ``https://`` or ``www.`` (case-insensitive);
    2. the remaining words, lowercased, split with ``cls.splitre`` and with
       empty pieces and members of ``cls.stopwords`` removed;
    3. a ``'USER: <id>'`` token built from ``status.user.id``;
    4. an ``'IN_REPLY_TO_USER_ID: <id>'`` token when the status has a truthy
       ``in_reply_to_user_id`` attribute.

    Args:
        cls: Object providing ``splitre`` (anything with a ``split(str)``
            method -- presumably a compiled regex) and ``stopwords`` (a
            container supporting ``in``).
        status: Object with ``text``, ``user`` (with an ``id`` attribute)
            and, optionally, ``in_reply_to_user_id``.

    Returns:
        The list of tokens, or ``None`` when the status has no text or no
        user.
    """
    if not status.text or not status.user:
        return None

    link_prefixes = ('http://', 'https://', 'www.')
    tokens = []
    plain_words = []

    # Split the ORIGINAL text rather than a lowercased copy: the path of a
    # short link is case-sensitive, so lowercasing before resolving it would
    # ask for a different (usually non-existent) link.
    for raw_word in status.text.split():
        word = raw_word.lower()
        if not word.startswith(link_prefixes):
            plain_words.append(word)
            continue

        domain = None
        if 'bit.ly' in word:
            try:
                domain = extract.grabDomain(
                    extract.grabCanonicalUrl(raw_word)).lower()
            except Exception:
                # Best effort: resolving the short link can fail (e.g. the
                # bit.ly API rate limit is exceeded); fall back to the
                # domain of the short link itself.
                domain = None
        if domain is None:
            domain = extract.grabDomain(word).lower()
        tokens.append(domain)

    # Split every word on its own so that the whitespace boundaries found
    # above are preserved; concatenating the words first would fuse
    # neighbouring words into a single token.
    for word in plain_words:
        tokens.extend(
            piece for piece in cls.splitre.split(word)
            if piece and piece not in cls.stopwords
        )

    tokens.append('USER: {0}'.format(status.user.id))

    # Not every status object carries the reply attribute.
    reply_to_user_id = getattr(status, 'in_reply_to_user_id', None)
    if reply_to_user_id:
        tokens.append('IN_REPLY_TO_USER_ID: {0}'.format(reply_to_user_id))

    return tokens
def ExtractWord(cls, word):
    """Normalize a single word into a feature token.

    Leading and trailing characters listed in ``cls.punctuation`` are
    stripped first. A word that looks like a link (it starts with
    ``http://``, ``https://`` or ``www.``, or ``cls._contains`` reports that
    it contains ``.com`` or ``.ly``) is replaced by its lowercased domain;
    bit.ly links are resolved to their canonical URL first when possible.
    Any other word is lowercased, dropped if it is a stopword, and stemmed
    with ``cls.stemmer``.

    NOTE (original author): profiling suggested this function is probably
    the bottleneck.

    Args:
        cls: Object providing ``punctuation``, ``stopwords``, ``stemmer``
            (with a ``stem(str)`` method) and ``_contains(word, needle)``.
        word: The raw word to normalize.

    Returns:
        The lowercased domain for links, the stemmed word otherwise, or
        ``None`` when the lowercased word is in ``cls.stopwords``.
    """
    word = word.strip(cls.punctuation)

    looks_like_link = (
        word.startswith(('http://', 'https://', 'www.'))
        or cls._contains(word, '.com')
        or cls._contains(word, '.ly')
    )
    if looks_like_link:
        if cls._contains(word, 'bit.ly'):
            try:
                return extract.grabDomain(
                    extract.grabCanonicalUrl(word)).lower()
            except Exception:
                # Best effort: resolving the short link can fail (e.g. the
                # bit.ly API rate limit is exceeded); fall through and use
                # the domain of the short link itself.
                pass
        return extract.grabDomain(word).lower()

    word = word.lower()
    if word in cls.stopwords:
        return None
    return cls.stemmer.stem(word)