def text_lsi(new_text, num=10):
    # convert the query text to a bag-of-words vector and project it into LSI space
    new_vec = dictionary.doc2bow(process(new_text))
    vec_lsi = lsi[new_vec]
    # similarity of the query against every indexed document
    sims = index[vec_lsi]
    # sort (doc_index, similarity) pairs by descending similarity
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    # return the top `num` matches together with the matching review text
    return [(s, movie_reviews[s[0]]) for s in sims[:num]]
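
A quick usage sketch, assuming the dictionary, lsi model, similarity index, and movie_reviews list have already been built (the later examples on this page show those steps):

for (doc_id, score), review in text_lsi("a thrilling film with great acting", num=3):
    print(doc_id, round(float(score), 3), review[:80])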


#
# # load the document
# filename = 'data/txt_sentoken/neg/cv000_29416.txt'
#
# movie_reviews = load_doc(filename)
# sent_text = nltk.sent_tokenize(movie_reviews)  # this gives us a list of sentences
#
# tokens = [process(sentence) for sentence in sent_text]
#
# frequencies = Counter()
# for t in tokens: frequencies.update(t)
#
# print(frequencies)
# # split into tokens by white space
# tokens = text.split()
# # remove punctuation from each token
# table = str.maketrans('', '', string.punctuation)
# tokens = [w.translate(table) for w in tokens]
# # remove remaining tokens that are not alphabetic
# tokens = [word for word in tokens if word.isalpha()]
# # filter out stop words
# stop_words = set(stopwords.words('english'))
# tokens = [w for w in tokens if not w in stop_words]
# # filter out short tokens
# tokens = [word for word in tokens if len(word) > 1]
# print(tokens)
Example #2
 def add_text(self, key, text):
     """
     Add text to the corpus, computing explicit sentiment of tokens along the way.
     
     Arguments
     key: name of the speaker, or Twitter handle, or unique ID for a speaker (any hashable type) 
     text: just that, text. Plain-old (str) is best, but it will handle reasonable Unicode. Unreasonable Unicode shall be mangled into submission before adding.
     """
     tokens = cl.process(text)
     self.wb.add_tokens(key, tokens)
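
A hedged usage sketch; SentimentTracker is a hypothetical stand-in for whatever class actually exposes add_text (the class itself is not shown here):

tracker = SentimentTracker()  # hypothetical class name
tracker.add_text("some_handle", "What a wonderful, happy day!")
tracker.add_text("other_handle", "This is awful and disappointing.")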
Example #4
def process_text(corpus):
    path = corpus.file
    wordcount, vocab, important, pairs = linguist.process(corpus, path)
    json_corpus(corpus, wordcount, "wordcount")
    json_corpus(corpus, important, "important")
    json_corpus(corpus, vocab, "vocab")
    # TODO: incorporate `pairs` into the visual output
    # print(repr(pairs))
    collocations = linguist.context(corpus, important)
    profile_corpus(corpus, collocations)
    path.close()
Example #5
 def _load_corpus(self):
     l.debug("Loading corpus")
     for key in can.candidates.keys():
         try:
             # read this candidate's raw corpus file
             with open(corpusPath + key + '.txt', 'rb') as f:
                 text = f.read()
         except OSError:
             # skip candidates with no corpus file on disk
             continue

         tokens = cl.process(text)
         self.wb.add_tokens(key, tokens)

     self.wb.prune()
Example #6
 def load_corpus(self, key, filename):
     """
     If we have an initial corpus of text for the speakers, pre-load it from a file. 
     
     Using an initial corpus is highly recommended for highly vociferous speakers -- journalists, media outlets, politicians, etc. 
     Using corporate press releases for initial corpus will leave you rather disappointed. 
     
     Arguments:
     key: name of the speaker, or Twitter handle, or unique ID for a speaker (any hashable type)
     filename: name or fully qualified path for a file containing text for this speaker (string)
     """
     l.debug("Loading corpus")
     with open(filename, 'rb') as f:
         text = f.read()
     tokens = cl.process(text)
     self.wb.add_tokens(key, tokens)
     self.wb.prune()
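
A short pre-loading sketch; the tracker instance, speaker key, and file path below are hypothetical placeholders:

tracker.load_corpus("nytimes", "data/corpus/nytimes.txt")  # hypothetical instance, key, and path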
Example #8
def sentiment(text):
    """
    Takes a bunch of text, computes total sentiment for it. 
    
    Args:
    text: (string) or clean unicode
    
    Returns:
    sentiment: -5 ("f**k shit horrible awful death") to +5 ("awesome happy jumping for joy")
    """

    # normalize unusual Unicode down to plain ASCII before tokenizing
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    tokens = linguist.process(text)
    if len(tokens) == 0:
        return 0

    #dm.add_to_corpus('twitter', tokens)
    # average the per-token scores (second element of each token tuple)
    ts = sum(t[1] for t in tokens) / float(len(tokens))
    return ts
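
A minimal usage sketch, using the anchor phrases from the docstring above:

print(sentiment("awesome happy jumping for joy"))  # should land near +5
print(sentiment("horrible awful death"))           # should land near -5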
Example #9
# map each candidate / media-outlet ID to tweets from its associated screen names
direct_speech = defaultdict(list)

for cid, c in can.items():
    for sn in c:
        tweets = get_tweets(sn)
        direct_speech[cid].extend(tweets)

for cid, names in media.media.items():
    for sn in names:
        tweets = get_tweets(sn)
        direct_speech[cid].extend(tweets)

# tokenize every collected tweet and feed it into the corpus
for cid, tweets in direct_speech.items():
    for tweet in tweets:
        tokens = linguist.process(tweet)
        print(">>>", tokens)
        dm.add_to_corpus(cid, tokens)


"""zdata=zipfile.ZipFile(open('data/idf.json.zip','rb'))
jfile=zdata.namelist()[0]

for i,tweet in enumerate(zdata.open(jfile)):
    if i > 10000: break
    
    js=json.loads(tweet)
    
    author=js['author'][0]['name'].lower()
    try:
        text = clean_text(js['object']['content']['text'])
    if not filename.endswith(".txt"):
        continue
    # create the full path of the file to open
    path = directory + '/' + filename
    # load document
    doc = load_doc(path)
    movie_reviews.append(doc)

# for review in movie_reviews:
sent_text = [nltk.sent_tokenize(review)
             for review in movie_reviews]  # this gives us a list of sentences
# print(len(sent_text))
tokens = []
for review in sent_text:
    for sentence in review:
        tokens.append(process(sentence))
        # print(process(sentence))

frequencies = Counter()
for t in tokens:
    frequencies.update(t)

# print(frequencies.most_common(20))

# Remove words that occur only once
tokens = [[word for word in token if frequencies[word] > 1]
          for token in tokens]

dictionary = corpora.Dictionary(tokens)
dictionary.save('data/movie_reviews.dict')
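
The dictionary alone is not enough for text_lsi at the top of this page; a minimal sketch of the remaining gensim steps (LsiModel and MatrixSimilarity), assuming one bag-of-words vector per full review so that hit indices line up with movie_reviews, with an arbitrary topic count:

from gensim import models, similarities

# one bag-of-words vector per review, so similarity hits map back to movie_reviews
corpus = [dictionary.doc2bow(process(review)) for review in movie_reviews]
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)  # topic count is an arbitrary choice
index = similarities.MatrixSimilarity(lsi[corpus])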
Example #11
direct_speech = defaultdict(list)

for cid, c in can.items():
    for sn in c:
        tweets = get_tweets(sn)
        direct_speech[cid].extend(tweets)

for cid, names in media.items():
    for sn in names:
        tweets = get_tweets(sn)
        direct_speech[cid].extend(tweets)

for cid, tweets in direct_speech.items():
    for tweet in tweets:
        #sent = explicit_sentiment.sentiment_sentence(tweet)
        tokens = linguist.process(tweet)
        if len(tokens) == 0:
            continue

        dm.add_to_corpus(cid, tokens)
        # average per-token sentiment gives a quick score for the whole tweet
        ts = sum(t[1] for t in tokens) / float(len(tokens))
        print(ts, '>>>', tweet)

mm.plot_multimode
"""zdata=zipfile.ZipFile(open('data/idf.json.zip','rb'))
jfile=zdata.namelist()[0]

for i,tweet in enumerate(zdata.open(jfile)):
    if i > 10000: break
    
    js=json.loads(tweet)