コード例 #1
0
def loadContext(user_id):
    """Build a normalized word-frequency context for a user.

    The user's words are fetched with loadWordsArr and passed through
    removeStopList (both defined elsewhere; presumably the latter strips
    stop words -- confirm against its definition).

    Args:
        user_id: Identifier forwarded unchanged to loadWordsArr.

    Returns:
        A list of (word, frequency) tuples in Counter.most_common() order
        (highest count first). Each frequency is the word's count divided
        by the number of words left after removeStopList. The list is empty
        when no words remain.
    """
    words_arr = removeStopList(loadWordsArr(user_id))

    # float() keeps this a true division on Python 2 as well.
    total = float(len(words_arr))

    # Return a real list rather than map(): on Python 3 map() yields a
    # one-shot iterator, which silently comes back empty on a second pass
    # and supports neither len() nor indexing.
    return [(word, count / total)
            for word, count in Counter(words_arr).most_common()]
コード例 #2
0
    def vectorize(self, text):
        """Convert raw text into normalized word frequencies.

        Lines matching r'\\d{2} years old' are discarded before tokenizing
        (presumably so explicit age statements do not end up in the
        features -- confirm with the caller).

        Args:
            text: Input string; lines are separated by '\\n'.

        Returns:
            A list of (word, frequency) tuples in Counter.most_common()
            order (highest count first). Each frequency is the word's count
            divided by the number of tokens left after removeStopList. The
            list is empty when no tokens remain.
        """
        # TODO: the original author noted that a better line filter may be
        # needed here.
        age_line = re.compile(r'\d{2} years old')
        kept_lines = [line for line in text.split('\n')
                      if not age_line.search(line)]
        text = '\n'.join(kept_lines)

        # Tokenize on word characters, then let removeStopList (defined
        # elsewhere) filter the tokens.
        words_arr = re.findall(r'\w+', text)
        words_arr = removeStopList(words_arr)

        # Normalize counts; float() keeps true division on Python 2 too.
        total = float(len(words_arr))

        # Return a real list rather than map(): on Python 3 map() yields a
        # one-shot iterator, which silently comes back empty on a second
        # pass and supports neither len() nor indexing.
        return [(word, count / total)
                for word, count in Counter(words_arr).most_common()]
コード例 #3
0
def contextFromText(text):
    """Build a normalized word-frequency context from a piece of text.

    The text is first passed through replaceTWPattern (defined elsewhere;
    by its name it presumably rewrites Twitter-specific patterns --
    confirm), then split into r'\\w+' tokens, which are filtered by
    removeStopList (also defined elsewhere).

    Args:
        text: Input string to analyze.

    Returns:
        A list of (word, frequency) tuples in Counter.most_common() order
        (highest count first). Each frequency is the word's count divided
        by the number of tokens left after removeStopList. The list is
        empty when no tokens remain.
    """
    cleaned = replaceTWPattern(text)

    words_arr = removeStopList(re.findall(r'\w+', cleaned))

    # float() keeps this a true division on Python 2 as well.
    total = float(len(words_arr))

    # Return a real list rather than map(): on Python 3 map() yields a
    # one-shot iterator, which silently comes back empty on a second pass
    # and supports neither len() nor indexing.
    return [(word, count / total)
            for word, count in Counter(words_arr).most_common()]
コード例 #4
0
    def vectorize(self, text):
        """Convert raw text into normalized word frequencies.

        Lines matching the case-insensitive pattern
        "(I am|I'm|Im) a (man|woman)" are discarded before tokenizing
        (presumably so explicit gender statements do not end up in the
        features -- confirm with the caller).

        Args:
            text: Input string; lines are separated by '\\n'.

        Returns:
            A list of (word, frequency) tuples in Counter.most_common()
            order (highest count first). Each frequency is the word's count
            divided by the number of tokens left after removeStopList. The
            list is empty when no tokens remain.
        """
        # TODO: the original author noted that a better line filter may be
        # needed here.
        gender_line = re.compile(r"\b(I am|I'm|Im) a (man|woman)\b", re.I)
        kept_lines = [line for line in text.split("\n")
                      if not gender_line.search(line)]
        text = "\n".join(kept_lines)

        # Tokenize on word characters, then let removeStopList (defined
        # elsewhere) filter the tokens.
        words_arr = re.findall(r"\w+", text)
        words_arr = removeStopList(words_arr)

        # Normalize counts; float() keeps true division on Python 2 too.
        total = float(len(words_arr))

        # Return a real list rather than map(): on Python 3 map() yields a
        # one-shot iterator, which silently comes back empty on a second
        # pass and supports neither len() nor indexing.
        return [(word, count / total)
                for word, count in Counter(words_arr).most_common()]