def text_lsi(new_text, num=10):
    # Project the query text into LSI space and return the `num` most
    # similar movie reviews as ((doc_id, score), review_text) pairs.
    new_vec = dictionary.doc2bow(process(new_text))
    vec_lsi = lsi[new_vec]
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    return [(s, movie_reviews[s[0]]) for s in sims[:num]]

# load the document
# filename = 'data/txt_sentoken/neg/cv000_29416.txt'
# movie_reviews = load_doc(filename)
# sent_text = nltk.sent_tokenize(movie_reviews)  # this gives us a list of sentences
# tokens = [process(sentence) for sentence in sent_text]
#
# frequencies = Counter()
# for t in tokens:
#     frequencies.update(t)
# print(frequencies)

# split into tokens by white space
# tokens = text.split()
# remove punctuation from each token
# table = str.maketrans('', '', string.punctuation)
# tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
# tokens = [word for word in tokens if word.isalpha()]
# filter out stop words
# stop_words = set(stopwords.words('english'))
# tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens
# tokens = [word for word in tokens if len(word) > 1]
# print(tokens)
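# For orientation, a minimal sketch (not from the original file) of how
# the globals text_lsi() depends on -- dictionary, lsi, index -- could be
# built with gensim. num_topics=100 is an assumed value, and `tokens` is
# assumed to hold the per-review token lists built elsewhere in this repo.
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary(tokens)
bow_corpus = [dictionary.doc2bow(t) for t in tokens]
lsi = models.LsiModel(bow_corpus, id2word=dictionary, num_topics=100)
index = similarities.MatrixSimilarity(lsi[bow_corpus])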
def add_text(self, key, text):
    """
    Add text to the corpus, computing explicit sentiment of tokens
    along the way.

    Arguments
        key: name of the speaker, or Twitter handle, or unique ID for
            a speaker (any hashable type)
        text: just that, text. Plain-old (str) is best, but it will
            handle reasonable Unicode. Unreasonable Unicode shall be
            mangled into submission before adding.
    """
    tokens = cl.process(text)
    self.wb.add_tokens(key, tokens)
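# Hedged usage sketch (not from the original): assumes `dm` is an
# instance of the class defining add_text(); the key and text are
# made-up examples.
dm.add_text('some_speaker', "What a wonderful, happy day for a launch!")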
def process_text(corpus):
    # Run the full linguistic pipeline over the corpus file and write
    # the word counts, vocabulary, and important terms out as JSON.
    path = corpus.file
    wordcount, vocab, important, pairs = linguist.process(corpus, path)
    json_corpus(corpus, wordcount, "wordcount")
    json_corpus(corpus, important, "important")
    json_corpus(corpus, vocab, "vocab")
    # todo: incorporate pairs into visual output
    # print repr(pairs)
    collocations = linguist.context(corpus, important)
    profile_corpus(corpus, collocations)
    path.close()
def _load_corpus(self):
    l.debug("Loading corpus")
    for key in can.candidates.keys():
        try:
            text = open(corpusPath + key + '.txt', 'rb').read()
        except IOError:
            # no corpus file for this candidate; skip it
            continue
        tokens = cl.process(text)
        self.wb.add_tokens(key, tokens)
    self.wb.prune()
def load_corpus(self, key, filename):
    """
    If we have an initial corpus of text for the speakers, pre-load it
    from a file. Using an initial corpus is highly recommended for
    highly vociferous speakers -- journalists, media outlets,
    politicians, etc. Using corporate press releases as the initial
    corpus will leave you rather disappointed.

    Arguments:
        key: name of the speaker, or Twitter handle, or unique ID for
            a speaker (any hashable type)
        filename: name or fully qualified path for a file containing
            text for this speaker (string)
    """
    l.debug("Loading corpus")
    text = open(filename, 'rb').read()
    tokens = cl.process(text)
    self.wb.add_tokens(key, tokens)
    self.wb.prune()
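# Hedged usage sketch (not from the original): assumes `dm` is an
# instance of the class defining load_corpus(); the key and the corpus
# file path are made-up examples.
dm.load_corpus('nytimes', 'data/corpus/nytimes.txt')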
def sentiment(text):
    """
    Takes a bunch of text, computes total sentiment for it.

    Args:
        text: (string) or clean unicode

    Returns:
        sentiment: -5 ("f**k shit horrible awful death") to
            +5 ("awesome happy jumping for joy")
    """
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    tokens = linguist.process(text)
    if len(tokens) == 0:
        return 0
    # dm.add_to_corpus('twitter', tokens)
    ts = sum([t[1] for t in tokens]) / float(len(tokens))
    return ts
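# Hedged usage sketch (not from the original): assumes linguist.process()
# returns (token, score) pairs, making sentiment() the mean token score.
# Exact values depend on the underlying sentiment wordlist.
print(sentiment(u"awesome happy jumping for joy"))  # expected positive
print(sentiment(u"horrible awful death"))           # expected negative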
# Collect each candidate's direct speech: every screen name associated
# with a candidate contributes its tweets to that candidate's list.
direct_speech = defaultdict(list)
for cid, c in can.iteritems():
    for sn in c:
        tweets = get_tweets(sn)
        direct_speech[cid].extend(tweets)

# Do the same for the media outlets covering each candidate.
for cid, names in media.media.iteritems():
    for sn in names:
        tweets = get_tweets(sn)
        direct_speech[cid].extend(tweets)

# Tokenize every collected tweet and add it to that speaker's corpus.
for cid, tweets in direct_speech.iteritems():
    for tweet in tweets:
        tokens = linguist.process(tweet)
        print ">>>", tokens
        dm.add_to_corpus(cid, tokens)

# Commented-out scratch code below was triple-quoted in the source and
# is truncated mid-block:
# zdata = zipfile.ZipFile(open('data/idf.json.zip', 'rb'))
# jfile = zdata.namelist()[0]
# for i, tweet in enumerate(zdata.open(jfile)):
#     if i > 10000:
#         break
#     js = json.loads(tweet)
#     author = js['author'][0]['name'].lower()
#     try:
#         text = clean_text(js['object']['content']['text'])
for filename in os.listdir(directory):  # loop header reconstructed; the excerpt began mid-loop
    if not filename.endswith(".txt"):
        continue
    # create the full path of the file to open
    path = directory + '/' + filename
    # load document
    doc = load_doc(path)
    movie_reviews.append(doc)

# split each review into sentences; this gives us a list of sentence lists
sent_text = [nltk.sent_tokenize(review) for review in movie_reviews]
# print(len(sent_text))

# tokenize every sentence of every review
tokens = []
for review in sent_text:
    for sentence in review:
        tokens.append(process(sentence))
        # print(process(sentence))

frequencies = Counter()
for t in tokens:
    frequencies.update(t)
# print(frequencies.most_common(20))

# Remove words that occur only once
tokens = [[word for word in token if frequencies[word] > 1] for token in tokens]

dictionary = corpora.Dictionary(tokens)
dictionary.save('data/movie_reviews.dict')
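# Hedged follow-up (not in the original): the saved dictionary can be
# reloaded later with gensim, e.g. when rebuilding the LSI model that
# text_lsi() queries.
dictionary = corpora.Dictionary.load('data/movie_reviews.dict')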
# Same collection pass as above, but also computing and printing the
# mean token sentiment of each tweet as it is added to the corpus.
direct_speech = defaultdict(list)
for cid, c in can.iteritems():
    for sn in c:
        tweets = get_tweets(sn)
        direct_speech[cid].extend(tweets)

for cid, names in media.iteritems():
    for sn in names:
        tweets = get_tweets(sn)
        direct_speech[cid].extend(tweets)

for cid, tweets in direct_speech.iteritems():
    for tweet in tweets:
        #sent = explicit_sentiment.sentiment_sentence(tweet)
        tokens = linguist.process(tweet)
        if len(tokens) == 0:
            continue
        dm.add_to_corpus(cid, tokens)
        # mean sentiment score over the tweet's tokens
        ts = sum([t[1] for t in tokens]) / float(len(tokens))
        print ts, '>>>', tweet

mm.plot_multimode()  # assuming a call was intended; the source had no parentheses (a no-op)

# Commented-out scratch code below was triple-quoted in the source and
# is truncated mid-block:
# zdata = zipfile.ZipFile(open('data/idf.json.zip', 'rb'))
# jfile = zdata.namelist()[0]
# for i, tweet in enumerate(zdata.open(jfile)):
#     if i > 10000:
#         break
#     js = json.loads(tweet)