def emotionWith(name, soup, emotion):
    """Print every message in the two-person thread with `name` whose
    VADER score for `emotion` exceeds 0.2.

    Args:
        name: substring identifying the other thread participant.
        soup: parsed BeautifulSoup document of the message archive.
        emotion: key into the vs() polarity dict — presumably one of
            'pos'/'neg'/'neu'/'compound'; TODO confirm against vs().
    """
    threads = soup.find_all(attrs={"class": "thread"})
    for t in threads:
        messages = t.children
        # The first child of a thread is its comma-separated member list.
        members = next(messages)
        members_list = members.split(',')
        #print(members)
        # Stop at the first two-person thread involving `name`; the
        # `messages` iterator is left positioned just past the member line.
        # NOTE(review): if no thread matches, the last thread is used.
        if len(members_list) == 2 and name in members_list[0]:
            break
    # BUG FIX: the original opened 'helloworld.html' for writing, never
    # wrote to it, and never closed it (handle leak + file truncation on
    # every call); it and the unused html_text accumulator are removed.
    for header in messages:
        user = header.find(attrs={"class": "user"})
        date_text = header.find(attrs={"class": "meta"}).text
        # Append '00' so the trailing UTC offset would parse with %z.
        date_text = date_text + '00'
        #date = datetime.strptime(date_text, "%A, %B %d, %Y at %I:%M%p %Z%z")
        # Header elements and message bodies alternate in the child stream.
        message = next(messages)
        try:
            if vs(message.string)[emotion] > 0.2:
                print("----------")
                print(user.string)
                print(date_text)
                print(message.string)
                print(vs(message.string))
                print('')
        # Narrowed from a bare `except:`: an empty node makes
        # message.string None (TypeError in vs), and a bad `emotion`
        # key raises KeyError — these were the silently skipped cases.
        except (TypeError, KeyError, AttributeError):
            pass
def messagesWith(name, soup=soup):
    """Print the two-person thread with `name` and write it to
    'messagesWith_<name>.html'.

    Args:
        name: substring identifying the other thread participant.
        soup: parsed BeautifulSoup archive. NOTE(review): the default
            binds the global `soup` at definition time.
    """
    threads = soup.find_all(attrs={"class": "thread"})
    for t in threads:
        messages = t.children
        # First child of a thread is the comma-separated member list.
        members = next(messages)
        members_list = members.split(',')
        #print(members)
        # NOTE(review): only members_list[0] is checked for `name`;
        # if no thread matches, the last thread is used.
        if len(members_list) == 2 and name in members_list[0]:
            break
    filename = 'messagesWith_' + name + '.html'
    html_text = "<html><head></head><body><h1>Message Log</h1>"
    for header in messages:
        user = header.find(attrs={"class": "user"})
        date_text = header.find(attrs={"class": "meta"}).text
        # Append '00' so the trailing UTC offset would parse with %z.
        date_text = date_text + '00'
        #date = datetime.strptime(date_text, "%A, %B %d, %Y at %I:%M%p %Z%z")
        # Headers and message bodies alternate in the child stream.
        message = next(messages)
        print(user.string)
        print(date_text)
        print(message.string)
        try:
            print(vs(message.string))
        # Narrowed from a bare `except:`; keep the original best-effort
        # fallback message rather than crashing on one bad message.
        except Exception:
            print("Couldn't get sentiment")
        print("-------------")
        html_text += "<hr>"
        html_text += p(user.string)
        html_text += p(date_text)
        # NOTE(review): the element (not message.string) is passed here,
        # unlike the two lines above — confirm p() accepts an element.
        html_text += p(message)
    html_text += "</body></html>"
    # BUG FIX: open via `with` so the handle is closed even if the
    # write raises (the original left it open on error).
    with open(filename, 'w') as f:
        f.write(html_text)
# Convert time from Tue Mar 29 04:04:22 +0000 2016 to 2016-3-29 time = document["created_at"].split() month = months[time[1]] day = time[2] year = time[5] date = str(year) + "-" + str(month) + "-" + str(day) docs.append({"text": document["text"], "date": '"' + date + '"'}) aggregate = {} count = {} for doc in docs: text = doc["text"].encode('utf-8') sentiment = vs(text) value = (sentiment['neg'] * -1) + (sentiment['pos']) if doc["date"] not in aggregate: aggregate[doc["date"]] = value count[doc["date"]] = 1 else: aggregate[doc["date"]] += value count[doc["date"]] += 1 # normalize f.write("[ \n") for date in aggregate: aggregate[date] = aggregate[date]/count[date] f.write('{ \t "date": ' + str(date) + ',\n \t "value": ' + str(aggregate[date]) + "\n }")
# --- POS-feature selection + auxiliary text features (script fragment) ---
# NOTE(review): `pos_vocab`, `pos`, `target`, `cleaner`, `textstat`,
# `word_tokenize`, and `vs` are defined elsewhere in the file.
# Collect the POS vocabulary terms as feature names.
feature_name_pos = []
for k, v in pos_vocab.items():
    feature_name_pos.append(k)
# L1-penalized logistic regression drives sparse feature selection below.
LR_pos = LogisticRegression(class_weight="balanced", penalty="l1", C=0.01).fit(pos, np.reshape(target, target.shape[0]))
select2 = SelectFromModel(LR_pos, prefit=True)
# Reduced POS feature matrix (columns surviving the L1 selection).
y = select2.transform(pos)
save_pos = open("pos.pickle", 'wb')
# NOTE(review): this pickles the raw `pos` matrix, but the message below
# says a *vectorizer* was pickled — confirm which object was intended.
pickle.dump(pos, save_pos)
save_pos.close()
pprint("Pos_vectorizer has been pickled")
# Here `vs` is instantiated rather than called — presumably it is the
# SentimentIntensityAnalyzer class in this file; TODO confirm.
sentiment_analyzer = vs()
def more_feats(sent):
    """Compute surface-level and sentiment features for one sentence.

    NOTE(review): the function body continues past this fragment (no
    return statement is visible here).
    """
    text = cleaner.basic_cleaning(sent)
    # Sentiment is computed on the raw sentence, not the cleaned text.
    sentiment = sentiment_analyzer.polarity_scores(sent)
    syllables = textstat.syllable_count(text)
    # 0.001 terms guard against division by zero on empty input.
    avg_syl_per_word = ( 0.001 + float(syllables)) / float(0.001 + len(word_tokenize(text)))
    num_terms = len(sent.split())
    num_words = len(text.split())
    num_unique_words = len(set(text.split()))
    num_char = len(text)
    total_char = len(sent)
    sent = cleaner.preprocessing_stage2(sent)
    # Count placeholder tokens substituted for URLs during preprocessing
    # (presumably by preprocessing_stage2 — TODO confirm).
    urlcount = sent.count("URLHERE")
# --- Review sentiment scoring (script fragment) ---
# Accumulators; only scorelist/ratinglist are filled in this fragment.
reviewlist=list()
ratinglist=list()
featureset=set()
featurelist=list()
reviewTextlist=list()
scorelist=list()
counter=0
# NOTE(review): the handle is never closed in this fragment.
g = open("./smaller.json", 'r')
for l in g:
    # SECURITY NOTE(review): eval() executes arbitrary code from the data
    # file — json.loads/ast.literal_eval would be the safe choice.
    review = eval(l)
    reviewText = review["reviewText"]
    rating = review["overall"]
    # VADER compound score for the review text.
    score=vs(reviewText)["compound"]#length=len(reviewText)#featurelist.append([score,length])
    scorelist.append(score)
    ratinglist.append(rating)
#fvector=np.array(featurelist)
# NOTE(review): the triple-quoted string below disables a second
# vocabulary-building pass; it is NOT closed within this fragment, so the
# string must terminate later in the file — do not reflow it.
'''g = open("./smaller.json", 'r') for l in g: review = eval(l) reviewText = review["reviewText"] rating = review["overall"] score=vs(reviewText)["compound"] s = [lmtzr.lemmatize(i) for i in word_tokenize(reviewText) if i not in stop and i not in string.punctuation and i.isalpha()] featureset=featureset.union(set(s)) featurelist=list(featureset)
# --- Build lemma vocabulary from test data (script fragment; appears
# truncated — the second loop's body ends mid-logic) ---
reviewlist = list()
ratinglist = list()
featureset = set()
featurelist = list()
reviewTextlist = list()
scorelist = list()
counter = 0
# NOTE(review): the handle is never closed in this fragment.
g = open("path/to/test_data", 'r')
# Pass 1: accumulate the vocabulary of lemmatized alphabetic non-stopwords.
for l in g:
    # SECURITY NOTE(review): eval() executes arbitrary code from the data
    # file — ast.literal_eval/json.loads would be the safe choice.
    review = eval(l)
    reviewText = review["reviewText"]
    rating = review["overall"]
    # VADER compound score; assigned but unused in this fragment.
    score = vs(reviewText)["compound"]
    s = [
        lmtzr.lemmatize(i) for i in word_tokenize(reviewText)
        if i not in stop and i not in string.punctuation and i.isalpha()
    ]
    featureset = featureset.union(set(s))
featurelist = list(featureset)
# BUG NOTE(review): `g` was exhausted by the loop above; without
# g.seek(0) or reopening the file, this second loop never executes.
for l in g:
    review = eval(l)
    reviewText = review["reviewText"]
    rating = review["overall"]
    s = [
        lmtzr.lemmatize(i) for i in word_tokenize(reviewText)
        if i not in stop and i not in string.punctuation and i.isalpha()
    ]
# --- Python 2 script fragment: append VADER compound scores to a CSV ---
# NOTE(review): this block uses Python 2 syntax (print statements,
# str.decode on a text line); it will not run under Python 3.
start = time.time()
with open("reviews.csv") as infile:
    with open("reviews_sentiments.csv", "w") as outfile:
        index = -1
        for line in infile:
            index += 1
            if index == 0:
                # First line is the CSV header: copy it through unchanged.
                outfile.write(line)
                continue
            if index % 100 == 0:
                # Progress ticker; Py2 trailing comma suppresses newline.
                print index / 100,
            # Split into 4 leading fields + the review text.
            # NOTE(review): a plain split mis-parses quoted commas in the
            # leading fields — the csv module would be robust.
            text_split = line.strip().split(',', 4)
            text = text_split[4].decode('ascii', 'ignore')
            everything_else = ','.join(text_split[:4])
            score = str(vs(text)['compound'])
            new_line = everything_else + "," + score + "," + text + "\n"
            outfile.write(new_line)
            # NOTE(review): `d` and `max_reviews` are defined elsewhere.
            d[text] = score
            if max_reviews and index >= max_reviews:
                break
# Terminate the progress ticker line.
print
if max_reviews:
    duration = time.time() - start
    print "%d reviews took %f seconds for an average of %f seconds per review" % (index, duration, duration/index)
# NOTE(review): scores are *strings*, so this sort is lexicographic and
# misorders negative scores (e.g. "-0.5" < "-0.9") — confirm intent.
sorted_reviews = sorted(d.items(), key=lambda x: x[1])
worst = sorted_reviews[:10]
# --- Build bag-of-words count vectors from test data (script fragment) ---
# Accumulators filled below; names/values kept identical to the original.
reviewlist = list()
ratinglist = list()
featureset = set()
featurelist = list()
reviewTextlist = list()
scorelist = list()
counter = 0
# Manage the handle with `with` so it is always closed (the original
# leaked it).
with open("path/to/test_data", 'r') as g:
    # Pass 1: accumulate the vocabulary of lemmatized alphabetic
    # non-stopwords across all reviews.
    for l in g:
        # SECURITY NOTE(review): eval() executes arbitrary code from the
        # data file — ast.literal_eval/json.loads would be the safe
        # choice if the data format permits; flagged, not changed.
        review = eval(l)
        reviewText = review["reviewText"]
        rating = review["overall"]
        # VADER compound score; assigned but unused here (kept so the
        # name stays bound for any later code).
        score = vs(reviewText)["compound"]
        s = [
            lmtzr.lemmatize(i) for i in word_tokenize(reviewText)
            if i not in stop and i not in string.punctuation and i.isalpha()
        ]
        featureset = featureset.union(set(s))
    featurelist = list(featureset)
    # BUG FIX: the original reused the exhausted iterator `g` here, so the
    # second pass never ran and reviewlist/ratinglist stayed empty; rewind
    # the file before re-reading.
    g.seek(0)
    # O(1) term -> column lookup instead of O(n) list.index per token.
    feature_index = {term: col for col, term in enumerate(featurelist)}
    # Pass 2: one count vector per review over the fixed vocabulary.
    for l in g:
        review = eval(l)
        reviewText = review["reviewText"]
        rating = review["overall"]
        s = [
            lmtzr.lemmatize(i) for i in word_tokenize(reviewText)
            if i not in stop and i not in string.punctuation and i.isalpha()
        ]
        slist = [0] * len(featurelist)
        for item in s:
            slist[feature_index[item]] += 1
        #print len(featurelist), len(slist), item
        reviewlist.append(slist)
        ratinglist.append(rating)