from collections import Counter


def loadContext(user_id):
    # Load the user's words, drop stoplist entries, and return
    # term frequencies normalized by the total word count.
    words_arr = loadWordsArr(user_id)
    words_arr = removeStopList(words_arr)
    length = len(words_arr)
    if length == 0:
        return []
    context = Counter(words_arr).most_common()
    return [(word, count / float(length)) for word, count in context]
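# `loadWordsArr` and `removeStopList` are not defined in these snippets.
# A minimal sketch of `removeStopList`, assuming it filters tokens against a
# stoplist set (the stoplist contents below are placeholders):
STOPLIST = {'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'is', 'it'}

def removeStopList(words_arr):
    # Keep only tokens that are not stoplist words (case-insensitive).
    return [w for w in words_arr if w.lower() not in STOPLIST]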
import re


def vectorize(self, text):
    # Better filter, maybe: drop whole lines that state an age
    # instead of substituting the words out of the text.
    texts = text.split('\n')
    #texts = filter(lambda t: not re.search('year', t), texts)
    #texts = filter(lambda t: not re.search('old', t), texts)
    texts = [t for t in texts if not re.search(r'\d{2} years old', t)]
    text = '\n'.join(texts)

    # Replace Twitter-specific patterns.
    #text = re.sub(r'https?:\S+', ' ', text)
    #text = re.sub(r'\b\d{2} years old', ' ', text)
    #text = re.sub(r'old', ' ', text)
    #text = re.sub(r'years', ' ', text)
    #text = re.sub(r'\d+', ' ', text)
    #text = re.sub(r'#\w+', ' ', text)
    #text = re.sub(r'@\w+', ' ', text)

    # Tokenize and remove stoplist words.
    words_arr = re.findall(r'\w+', text)
    words_arr = removeStopList(words_arr)

    # Normalize counts by total word count.
    length = len(words_arr)
    if length == 0:
        return []
    context = Counter(words_arr).most_common()
    return [(word, count / float(length)) for word, count in context]
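# Quick illustration of the line filter alone (standalone check,
# not part of the original code):
lines = "I am 25 years old\ngreat game last night".split('\n')
kept = [t for t in lines if not re.search(r'\d{2} years old', t)]
print(kept)  # ['great game last night'] -- the age line is dropped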
def contextFromText(text):
    # Same normalization as above, but starting from raw text:
    # strip Twitter patterns, tokenize, remove stoplist words.
    text = replaceTWPattern(text)
    words_arr = re.findall(r'\w+', text)
    words_arr = removeStopList(words_arr)
    length = len(words_arr)
    if length == 0:
        return []
    context = Counter(words_arr).most_common()
    return [(word, count / float(length)) for word, count in context]
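# `replaceTWPattern` is not defined in these snippets. A minimal sketch,
# assuming it applies the Twitter-specific substitutions commented out in
# `vectorize` above (URLs, hashtags, mentions, bare numbers):
def replaceTWPattern(text):
    text = re.sub(r'https?:\S+', ' ', text)  # URLs
    text = re.sub(r'#\w+', ' ', text)        # hashtags
    text = re.sub(r'@\w+', ' ', text)        # mentions
    text = re.sub(r'\d+', ' ', text)         # bare numbers
    return text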
def vectorize(self, text):
    # Better filter, maybe: drop whole lines where the author
    # states their gender directly.
    texts = text.split("\n")
    texts = [t for t in texts
             if not re.search(r"\b(I am|I'm|Im) a (man|woman)\b", t, re.I)]
    text = "\n".join(texts)

    # Tokenize and remove stoplist words.
    words_arr = re.findall(r"\w+", text)
    words_arr = removeStopList(words_arr)

    # Normalize counts by total word count.
    length = len(words_arr)
    if length == 0:
        return []
    context = Counter(words_arr).most_common()
    return [(word, count / float(length)) for word, count in context]
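# Why the word boundaries and re.I matter (standalone illustration):
pattern = r"\b(I am|I'm|Im) a (man|woman)\b"
print(bool(re.search(pattern, "i'm a woman in tech", re.I)))       # True: line dropped
print(bool(re.search(pattern, "I'm a manager at a bank", re.I)))   # False: trailing \b keeps 'manager' from matching 'man'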