Example #1
import pickle
import re

import nltk
from nltk.tokenize import wordpunct_tokenize


def create_global_topic_list(articleList):
    # Capitalise standalone "de"/"du" before named-entity extraction
    e = re.compile(r"\s(de)\s")
    u = re.compile(r"\s(du)\s")
    globalTopicList = []

    i = 0
    for commList in articleList.values():
        # Article body + all comments
        art = commList[0].artBody
        for comm in commList:
            art += comm.body

        # Global list of named entities (extract_entities is a project helper, defined elsewhere)
        art = u.sub(" Du ", art)
        art = e.sub(" De ", art)
        entities = extract_entities(wordpunct_tokenize(art))
        globalTopicList += entities
        i += 1
        if i % 100 == 0:
            print(i, "comments processed for global vector")

    globalTopicList = nltk.FreqDist(globalTopicList)

    # Keep the 100 most frequent entities as a zero-initialised global feature vector
    tempVector = dict()
    for term, _ in globalTopicList.most_common(100):
        tempVector[term] = 0

    with open("globalTopics.pkl", 'wb') as f:
        pickle.dump(tempVector, f, pickle.HIGHEST_PROTOCOL)
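extract_entities above is a project helper that is not shown in this example. A minimal stand-in built on NLTK's default chunker could look like the sketch below; the behaviour is an assumption, not the original code, and it needs the tagger and NE-chunker data packages from nltk.download.

import nltk

def extract_entities(tokens):
    "Stand-in (assumed behaviour): return the named-entity phrases found by NLTK's default chunker."
    # Requires e.g. nltk.download('averaged_perceptron_tagger'), 'maxent_ne_chunker', 'words'
    entities = []
    tree = nltk.ne_chunk(nltk.pos_tag(tokens))
    for subtree in tree.subtrees():
        if subtree.label() != 'S':  # non-root subtrees are NE chunks (PERSON, GPE, ...)
            entities.append(" ".join(word for word, tag in subtree.leaves()))
    return entities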
def preprocessText(movie_id):
    # readPlot is defined elsewhere and returns the plot text for the given movie id
    doc = readPlot(movie_id)
    stopset = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english', ignore_stopwords=True)
    tokens = wordpunct_tokenize(doc)
    # Drop stopwords and very short tokens, then stem what remains
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    stemmed_text = [stemmer.stem(word) for word in clean]
    return stemmed_text
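Since readPlot is project-specific, the same pipeline applied to a raw string shows what the function returns (sample text and output are illustrative only; requires nltk.download('stopwords')):

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import wordpunct_tokenize

doc = "A retired detective is pulled back into the case that ended his career."
stopset = set(stopwords.words('english'))
stemmer = SnowballStemmer('english', ignore_stopwords=True)
clean = [t.lower() for t in wordpunct_tokenize(doc) if t.lower() not in stopset and len(t) > 2]
print([stemmer.stem(w) for w in clean])
# e.g. ['retir', 'detect', 'pull', 'back', 'case', 'end', 'career']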
def get_text_words(text, stopwords=sw):
    text = preprocess_text(text)
    user_set = set(["http", "://"])
    text_words = set(wordpunct_tokenize(text.lower()))
    text_words = text_words.difference(stopwords)
    text_words = text_words.difference(user_set)
    text_words = [w for w in text_words if len(w) > 2]
    return text_words
def get_tweet_words(_tweet, stopwords = []):
    tweet = preprocess_tweet(_tweet)
    user_set = set(["http", "://"])
    tweet_words = set(wordpunct_tokenize(tweet.lower()))
    tweet_words = tweet_words.difference(stopwords)
    tweet_words = tweet_words.difference(user_set)
    tweet_words = [w for w in tweet_words if len(w)>2]
    return tweet_words
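The "http"/"://" entries are filtered because wordpunct_tokenize splits on every punctuation boundary, so a URL falls apart into fragments:

from nltk.tokenize import wordpunct_tokenize

print(wordpunct_tokenize("check this out http://t.co/abc123 #nltk"))
# ['check', 'this', 'out', 'http', '://', 't', '.', 'co', '/', 'abc123', '#', 'nltk']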
def word_indicator(text, **kwargs):
    # CLASSIFIER and sw (stopword set) are module-level globals
    if CLASSIFIER == 'MultinomialNB':
        # Bag-of-words counts, minus stopwords and URL fragments
        features = dict(Counter(wordpunct_tokenize(text.lower())))
        for el in list(features):
            if el in sw or el in ["http", "://"]:
                del features[el]
    else:
        # Boolean "word is present" features
        features = defaultdict(list)
        text_words = get_text_words(text, **kwargs)
        for w in text_words:
            features[w] = True
    return features
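The two branches produce different feature shapes: per-token counts when CLASSIFIER is 'MultinomialNB', boolean presence features otherwise. A toy illustration with a made-up stopword set:

from collections import Counter
from nltk.tokenize import wordpunct_tokenize

text = "Free offer click the offer now"
sw = {"the", "now"}  # stands in for the module-level stopword set

counts = {w: c for w, c in Counter(wordpunct_tokenize(text.lower())).items()
          if w not in sw and w not in ["http", "://"]}
present = {w: True for w in counts}

print(counts)   # {'free': 1, 'offer': 2, 'click': 1}
print(present)  # {'free': True, 'offer': True, 'click': True}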
Example #8
def get_clean_text_tokens(in_text):
    "returns a list of lemmatised tokens, after cleaning in_text of stopwords, numbers, and one-letter words"

    stop_words = set(stopwords.words("english"))
    tokens = [t for t in wordpunct_tokenize(in_text)
              if t not in stop_words and t.isalpha() and len(t) > 1]

    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(t) for t in tokens]
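A quick call (the stopword list and WordNet lemmatizer typically need nltk.download('stopwords') and nltk.download('wordnet') first); note that tokens are not lowercased, so a capitalised stopword such as "The" survives the filter:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import wordpunct_tokenize

print(get_clean_text_tokens("The cats were chasing 3 mice near the houses."))
# e.g. ['The', 'cat', 'chasing', 'mouse', 'near', 'house']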
def tokenize(text):
    ret = []
    last_offset = 0
    if not text:
        return ret
    for token in wordpunct_tokenize(text):
        processed_token = token.lower().strip()
        # Skip empty, very short, and stopword tokens
        if not processed_token or len(processed_token) < 3 or is_stop_word(token):
            continue
        processed_token = lemmatize_word(processed_token)
        # Record the character offset of the token in the original text
        last_offset = text.find(token, last_offset)
        ret.append((processed_token, last_offset))
    return ret
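is_stop_word and lemmatize_word are project helpers not shown here; with simple stand-ins (assumptions for illustration only, requiring the stopwords and wordnet NLTK data), the function returns (lemma, character offset) pairs:

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

_sw = set(stopwords.words('english'))
_wnl = WordNetLemmatizer()

def is_stop_word(word):    # stand-in, assumed behaviour
    return word.lower() in _sw

def lemmatize_word(word):  # stand-in, assumed behaviour
    return _wnl.lemmatize(word)

print(tokenize("The mice were eating cheese"))
# e.g. [('mouse', 4), ('eating', 14), ('cheese', 21)]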
def tokenize(x):
    return [w for w in wordpunct_tokenize(x) if len(w)>=3]
Example #11
def inject_term_list(in_lemma_list, in_text, lemmatizer):
    "Replaces each term with its lemma, prefixed by an underscore, if the term is included in in_lemma_list"
    return ' '.join(lemma_replace(in_lemma_list, t, lemmatizer) for t in wordpunct_tokenize(in_text))
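lemma_replace is not shown on this page; based on the docstring, a stand-in could look like the following sketch (an assumption, not the original implementation):

def lemma_replace(in_lemma_list, token, lemmatizer):
    # Stand-in guessed from the docstring above: if the token's lemma appears in
    # in_lemma_list, return the lemma prefixed with an underscore, else keep the token.
    lemma = lemmatizer.lemmatize(token.lower())
    return "_" + lemma if lemma in in_lemma_list else token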