from flask import request, jsonify
from rank_bm25 import BM25Okapi


def api_BM25():
    # Score every candidate response against the query with BM25.
    query = request.json['query']
    corpus = request.json['responses']
    tokenized_corpus = [preprocess(sentence) for sentence in corpus]
    tokenized_query = preprocess(query)
    bm25 = BM25Okapi(tokenized_corpus)
    weights = bm25.get_scores(tokenized_query)
    # get_scores returns a numpy array, which jsonify cannot serialize directly.
    return jsonify(weights.tolist())
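# --- Hypothetical usage sketch (not from the original): register api_BM25 on a
# --- Flask app at a made-up "/bm25" route and exercise it with the test client.
# --- Assumes a preprocess() tokenizer is importable from the surrounding project.
from flask import Flask

app = Flask(__name__)
app.add_url_rule("/bm25", view_func=api_BM25, methods=["POST"])

with app.test_client() as client:
    resp = client.post("/bm25", json={
        "query": "good battery life",
        "responses": ["the battery lasts all day", "the screen is sharp"],
    })
    print(resp.get_json())  # one BM25 score per candidate response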
def comment_comment_similarity(comment1, comment2):
    # Similarity = size of the shared term set, normalised by D and capped at 1.
    comment1 = nlp.preprocess(comment1)
    comment2 = nlp.preprocess(comment2)
    term_vector1 = set(get_term_vector(comment1))
    term_vector2 = set(get_term_vector(comment2))
    intersection = term_vector1 & term_vector2
    sim = float(len(intersection))
    if sim > D:  # D is a module-level normalisation constant
        val = 1
    else:
        val = sim / float(D)
    return val
import os
import sys


def main(filepath):
    emotions = []
    if not os.path.isfile(filepath):
        print("File path {} does not exist. Exiting...".format(filepath))
        sys.exit()
    # make it txt
    with open(filepath, encoding="utf-8") as fp:
        ln_count = 0
        blob = []
        for line in fp:
            ln_count += 1
            if ln_count % 10 == 0:
                # Every 10 lines, score the accumulated block of text.
                emotions.append(emotion.get(blob))
                blob = []
            else:
                blob.append(line)
            tokens = nlp.preprocess(line)
            tokens = utils.unique(tokens)
            print(tokens)
            sel.getImage(' '.join(tokens))
            image.join(line, ln_count)
    if len(blob) != 0:
        emotions.append(emotion.get(blob))
    mood = emotion.process(emotions)
    print(mood)
    song = music.getSong(mood)
    song = "./music/" + mood + "/" + song
    video.generate(song, "Awesomevideo", "mp4")
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from nlp import preprocess


def fun():
    data = pd.read_csv("product_review.csv")
    review = data['review_body']
    sentiment = data['Sentiment']
    n_size = 1500
    obj = preprocess()
    clean = []
    for i in range(0, n_size):
        clean.append(obj.tokenize(data['review_body'][i], i))
    vectorizer = CountVectorizer()
    trainset = vectorizer.fit_transform(clean).toarray()
    vocabulary = vectorizer.get_feature_names()
    joblib.dump(vocabulary, 'a.txt')
    # Down-weight negated terms recorded by the tokenizer.
    for i in obj.nottrack:
        try:
            a = vocabulary.index(i[0])
            trainset[i[1]][a] -= 2
        except ValueError:
            continue
    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(trainset[0:n_size], data['Sentiment'][0:n_size])
    joblib.dump(forest, 'b.txt')
import json

import pandas as pd


def preprocess(file_name):
    data = pd.read_json("./origin/{}.json".format(file_name))
    # remove useless properties
    del data['html']
    del data['timestamp']
    del data['url']
    del data['user']
    del data['retweets']
    del data['replies']
    del data['fullname']
    del data['id']
    del data['likes']
    data['text'] = data['text'].map(lambda x: nlp.preprocess(x))
    data = data.to_json(orient='records', force_ascii=False)
    data = json.loads(data)
    with open("./processed/{}.json".format(file_name), "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)
def analyze(text):
    global topic_dist
    global topics_per_word
    global phis
    global topic_words
    grams = nlp.preprocess(text)
    assert len(grams) > 0
    nlp.mass_vectorize(grams)
    assert len(nlp.vectors) > 0
    assert nlp.corpus is not None and len(nlp.corpus) > 0
    # tops = file_upload.num
    nlp.train_lda()
    topic_dist, topics_per_word, phis, topic_words = nlp.do_lda(grams)
    paras = nlp.split_paragraphs(text)
    bow_grams = []
    assert len(grams) == len(paras)
    for i in range(len(grams)):
        bow_grams.append([gram for sentence in grams[i] for gram in sentence])
    word_list = []
    for i in range(len(paras)):
        word_list.append(nlp.stable_matching(paras[i], bow_grams[i], topics_per_word))
    return [word for para in word_list for word in para]
import numpy as np
import spacy

nlp = spacy.load('en')
df = load_csv("data/output.csv", "|")
# print(df)
industry_json = load_json_file("sasb_mm_industry.json")
threat_json = load_json_file("sasb_mm_threats.json")
sub_df, sec_data_list = get_data_with_code("sasb", df, "Internet Media & Services")
# print(sec_data_list[0])

# Preprocess each document for the selected industry.
text_data = []
for data in sec_data_list:
    tokens = preprocess(data)
    # print(tokens)
    text_data.append(tokens)

# Collect the preprocessed description and name of every sub-threat.
threat_desc = []
threat_name = []
for threat in threat_json:
    # print(threat["Threat"])
    for obj in threat["SubThreats"]:
        # doc2 = nlp(obj["Description"])
        # print(obj["SubThreat"])
        threat_desc.append(preprocess(obj["Description"]))
        threat_name.append(obj["SubThreat"])
        # print(doc1.similarity(doc2))
        # break
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from nlp import preprocess
from sklearn import metrics
from sklearn.externals import joblib
import pandas as pd
import numpy as np

obj1 = preprocess()


def getsentiment(inp):
    clean = obj1.tokenize(inp, 0)
    final = clean.split(' ')
    # Build a bag-of-words vector over the vocabulary saved at training time.
    testset = []
    vocabulary = joblib.load('a.txt')
    length = len(vocabulary)
    for i in range(0, length):
        testset.append(0)
    for i in vocabulary:
        if i in final:
            count = 0
            a = vocabulary.index(i)
            count = final.count(i)
            testset[a] = testset[a] + count
    # Down-weight negated terms recorded by the tokenizer.
    for i in obj1.nottrack:
        try:
            a = vocabulary.index(i[0])
            testset[a] -= 2
        except ValueError:
            continue
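# --- Hypothetical completion (assumption, not in the original snippet) ---
# The training routine above dumps the fitted RandomForestClassifier to 'b.txt',
# so a plausible final step is to load it and classify the bag-of-words vector.
def getsentiment_predict(testset):
    forest = joblib.load('b.txt')  # mirrors joblib.dump(forest, 'b.txt') in the training code
    return forest.predict([testset])[0]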
def increSTS(new_comment, clusters):
    # Incrementally assign a new comment to a cluster based on shared terms
    # with each cluster's center; TH_TERMS is the minimum shared-term count.
    new_comment = nlp.preprocess(new_comment)
    term_vector = set(metrics.get_term_vector(new_comment))
    if len(clusters) == 0:
        c = Cluster()
        c.add_comment(new_comment)
        clusters.add(c)
        return
    ca = []
    cb = []
    for cluster in clusters:
        dist, terms = cluster.get_distance_from_center(new_comment, term_vector)
        if dist != float("inf"):
            ca.append(cluster)
            # if dist < radius_threshold:
            if terms > TH_TERMS:
                cb.append(cluster)
    # ca = [cluster for cluster in clusters if cluster.get_distance_from_center(new_comment, term_vector) != float("inf")]
    # cb = [cluster for cluster in ca if cluster.get_distance_from_center(new_comment, term_vector) < radius_threshold]
    if len(cb) != 0:
        # Add the comment to the largest qualifying cluster.
        cb.sort(key=lambda c: len(c.comments), reverse=True)
        cadded = cb[0]
        cadded.add_comment(new_comment)
        # Pull comments from other candidate clusters that now fit the chosen one better.
        cchanged = set()
        for cluster in ca:
            if cluster == cadded:
                continue
            for i, comment in enumerate(cluster.comments):
                tv = set(cluster.comment_term_vectors[i])
                # if cadded.get_distance_from_center(comment, tv)[0] < radius_threshold:
                if cadded.get_distance_from_center(comment, tv)[1] > TH_TERMS:
                    cadded.add_comment(comment)
                    cluster.remove_comment(comment, i)
                    cchanged.add(cluster)
        # Re-check clusters that lost comments and reassign members that no longer fit.
        for cluster in cchanged:
            # V = [comment for comment in cluster.comments if cluster.get_distance_from_center(comment) > radius_threshold]
            V = []
            Vindex = []
            Vtv = []
            for i, comment in enumerate(cluster.comments):
                tv = set(cluster.comment_term_vectors[i])
                # if cluster.get_distance_from_center(comment, tv)[0] >= radius_threshold:
                if cluster.get_distance_from_center(comment, tv)[1] <= TH_TERMS:
                    V.append(comment)
                    Vindex.append(i)
                    Vtv.append(tv)
            while len(V) > 0:
                for i, excluded_comment in enumerate(V):
                    excluded_comment_tv = Vtv[i]
                    cluster.remove_comment(excluded_comment, Vindex[i])
                    clusters_list = list(clusters)
                    clusters_list.sort(key=lambda c: len(c.comments), reverse=True)
                    added = False
                    for candidate_cluster in clusters_list:
                        # if candidate_cluster.get_distance_from_center(excluded_comment, excluded_comment_tv)[0] < radius_threshold:
                        if candidate_cluster.get_distance_from_center(
                                excluded_comment, excluded_comment_tv)[1] > TH_TERMS:
                            candidate_cluster.add_comment(excluded_comment)
                            added = True
                            break
                    if not added:
                        # No existing cluster accepted it, so it forms its own cluster.
                        c = Cluster()
                        c.add_comment(excluded_comment)
                        clusters.add(c)
                # Rebuild the set of comments that still do not fit this cluster.
                V = []
                Vindex = []
                Vtv = []
                for i, comment in enumerate(cluster.comments):
                    tv = set(cluster.comment_term_vectors[i])
                    # if cluster.get_distance_from_center(comment, tv)[0] >= radius_threshold:
                    if cluster.get_distance_from_center(comment, tv)[1] <= TH_TERMS:
                        V.append(comment)
                        Vindex.append(i)
                        Vtv.append(tv)
    else:
        c = Cluster()
        c.add_comment(new_comment)
        clusters.add(c)
    return
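# --- Interface sketch (assumption, not from the original project) ---
# increSTS only relies on the Cluster surface below; the real distance
# computation lives elsewhere in the project, so it is stubbed here.
class ClusterSketch:
    def __init__(self):
        self.comments = []
        self.comment_term_vectors = []

    def add_comment(self, comment):
        self.comments.append(comment)
        self.comment_term_vectors.append(metrics.get_term_vector(comment))

    def remove_comment(self, comment, index):
        del self.comments[index]
        del self.comment_term_vectors[index]

    def get_distance_from_center(self, comment, term_vector):
        # Expected to return (distance, number_of_shared_terms); the concrete
        # definition is project-specific and omitted here.
        raise NotImplementedError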
from pathlib import Path
import pickle

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences  # assumption: pad_sequences comes from Keras

data_path = Path('../datasets/full_dataset.csv').resolve()  # this dataset does not exist yet
data = pd.read_csv(data_path)
data.head()
data.replace('', np.nan, inplace=True)
data.dropna(inplace=True)
data.shape
data["full_text"] = data["full_text"].astype(str)
data.head()

# Load the label encoder fitted during training.
encoder_path = Path('../emotion_recognition/encoder.pickle').resolve()
with encoder_path.open('rb') as file:
    encoder = pickle.load(file)

cleaned_data = preprocess(data.full_text)
sequences = [text.split() for text in cleaned_data]
list_tokenized = tokenizer.texts_to_sequences(sequences)
x_data = pad_sequences(list_tokenized, maxlen=100)
y_pred = model.predict(x_data)

# Mean predicted probability per emotion class.
for index, value in enumerate(np.sum(y_pred, axis=0) / len(y_pred)):
    print(encoder.classes_[index] + ": " + str(value))

# Share of samples assigned to each predicted class.
y_pred_argmax = y_pred.argmax(axis=1)
data_len = len(y_pred_argmax)
for index, value in enumerate(np.unique(y_pred_argmax)):
    print(encoder.classes_[value] + ": " + str(len(y_pred_argmax[y_pred_argmax == value]) / data_len))