Example #1
# Imports assumed for this snippet: Flask's request/jsonify and rank_bm25's BM25Okapi.
from flask import jsonify, request
from rank_bm25 import BM25Okapi


def api_BM25():
    # Expects a JSON body with a "query" string and a "responses" list of documents.
    query = request.json['query']
    corpus = request.json['responses']

    tokenized_corpus = [preprocess(sentence) for sentence in corpus]
    tokenized_query = preprocess(query)

    bm25 = BM25Okapi(tokenized_corpus)
    weights = bm25.get_scores(tokenized_query)

    # get_scores returns a numpy array; convert it so jsonify can serialize it.
    return jsonify(weights.tolist())
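As a quick illustration of how a client might call this endpoint, here is a hedged sketch using requests; the route path, host, and port are assumptions, since the snippet does not show the @app.route decorator.

import requests

# Hypothetical route and host; the original snippet does not show the Flask route.
resp = requests.post(
    "http://localhost:5000/api_BM25",
    json={
        "query": "how do I reset my password",
        "responses": [
            "Click 'Forgot password' on the login page.",
            "Our office hours are 9 to 5.",
        ],
    },
)
print(resp.json())  # one BM25 score per candidate response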
Example #2
def comment_comment_similarity(comment1, comment2):
    comment1 = nlp.preprocess(comment1)
    comment2 = nlp.preprocess(comment2)
    term_vector1 = set(get_term_vector(comment1))
    term_vector2 = set(get_term_vector(comment2))
    intersection = term_vector1 & term_vector2
    sim = float(len(intersection))
    if sim > D:
        val = 1
    else:
        val = sim / float(D)
    return val
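A hedged usage sketch: the function counts shared terms and scales them by the threshold D, saturating at 1. The example strings are mine, and it assumes D, nlp.preprocess and get_term_vector are defined as in the surrounding module.

# Hypothetical usage; assumes D, nlp.preprocess and get_term_vector exist as above.
score = comment_comment_similarity("fast delivery, great phone",
                                   "great phone and really fast delivery")
print(score)  # 1.0 once the comments share at least D terms, else overlap / D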
Example #3
def main(filepath):
    emotions = []
    if not os.path.isfile(filepath):
        print("File path {} does not exist. Exiting...".format(filepath))
        sys.exit()
    # make it txt
    with open(filepath) as fp:
        ln_count = 0
        blob = []
        for line in fp:
            ln_count += 1
            if ln_count % 10 == 0:
                # Every ten lines, score the accumulated block of text.
                emotions.append(emotion.get(blob))
                blob = []
            else:
                blob.append(line)  # text-mode files already yield unicode str
            input = nlp.preprocess(line)
            input = utils.unique(input)
            print(input)
            sel.getImage(' '.join(input))
            image.join(line, ln_count)
    if len(blob) != 0:
        emotions.append(emotion.get(blob))
    mood = emotion.process(emotions)
    print(mood)
    song = music.getSong(mood)
    song = "./music/" + mood + "/" + song
    video.generate(song, "Awesomevideo", "mp4")
Example #4
def fun():
    data = pd.read_csv("product_review.csv")
    review = (data['review_body'])
    sentiment = (data['Sentiment'])
    n_size = 1500

    obj = preprocess()
    clean = []
    for i in range(0, n_size):
        clean.append(obj.tokenize(data['review_body'][i], i))

    vectorizer = CountVectorizer()
    trainset = vectorizer.fit_transform(clean).toarray()
    vocabulary = vectorizer.get_feature_names()
    joblib.dump(vocabulary, 'a.txt')

    # Down-weight terms the preprocessor flagged (stored as (term, row) pairs).
    for i in obj.nottrack:
        try:
            a = vocabulary.index(i[0])
            trainset[i[1]][a] -= 2
        except ValueError:
            continue

    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(trainset[0:n_size], data['Sentiment'][0:n_size])
    joblib.dump(forest, 'b.txt')
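The two joblib dumps are what make this model reusable outside the function. A minimal sketch of reloading them, assuming 'a.txt' and 'b.txt' were written by the run above:

from sklearn.externals import joblib

vocabulary = joblib.load('a.txt')  # feature names from the CountVectorizer
forest = joblib.load('b.txt')      # trained RandomForestClassifier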
Example #5
def preprocess(file_name):
  data = pd.read_json("./origin/{}.json".format(file_name))
  # remove useless properties
  del data['html']
  del data['timestamp']
  del data['url']
  del data['user']
  del data['retweets']
  del data['replies']
  del data['fullname']
  del data['id']
  del data['likes']

  data['text'] = data['text'].map(lambda x: nlp.preprocess(x))

  data = data.to_json(orient='records', force_ascii=False)

  data = json.loads(data)

  with open("./processed/{}.json".format(file_name), "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False)
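A minimal usage sketch, assuming a raw dump at ./origin/<name>.json and an existing ./processed/ directory; the file name here is hypothetical.

# Hypothetical file name; expects ./origin/tweets.json and a ./processed/ directory.
preprocess("tweets")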
Example #6
def analyze(text):
    global topic_dist
    global topics_per_word
    global phis
    global topic_words
    grams = nlp.preprocess(text)
    assert len(grams) > 0
    nlp.mass_vectorize(grams)
    assert len(nlp.vectors) > 0
    assert nlp.corpus is not None and len(nlp.corpus) > 0
    #tops = file_upload.num
    nlp.train_lda()
    topic_dist, topics_per_word, phis, topic_words = nlp.do_lda(grams)
    paras = nlp.split_paragraphs(text)
    bow_grams = []
    assert(len(grams) == len(paras))
    for i in range(len(grams)):
        bow_grams.append([gram for sentence in grams[i] for gram in sentence])
    word_list = []
    for i in range(len(paras)):
        word_list.append(nlp.stable_matching(paras[i], bow_grams[i], topics_per_word))
    return [word for para in word_list for word in para]
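A hedged usage sketch; it assumes the project's nlp module is importable and that the input is plain prose with at least one paragraph, since analyze() asserts one gram group per paragraph.

# Hypothetical input; analyze() returns one flat list of matched topic words.
with open("essay.txt") as f:
    text = f.read()
matched_words = analyze(text)
print(matched_words)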
Example #7
import numpy as np
import spacy

nlp = spacy.load('en')

df = load_csv("data/output.csv", "|")
# print(df)

industry_json = load_json_file("sasb_mm_industry.json")
threat_json = load_json_file("sasb_mm_threats.json")

sub_df, sec_data_list = get_data_with_code("sasb", df,
                                           "Internet Media & Services")
# print(sec_data_list[0])
text_data = []

for data in sec_data_list:
    tokens = preprocess(data)
    # print(tokens)
    text_data.append(tokens)

threat_desc = []
threat_name = []

for threat in threat_json:
    # print(threat["Threat"])
    for obj in threat["SubThreats"]:
        # doc2 = nlp(obj["Description"])
        # print(obj["SubThreat"])
        threat_desc.append(preprocess(obj["Description"]))
        threat_name.append(obj["SubThreat"])
        # print(doc1.similarity(doc2))
# break
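The commented-out doc1.similarity(doc2) calls point at the intended comparison. A hedged sketch of that step, assuming preprocess() returns a list of tokens that can be re-joined into a string for spaCy:

# Score each SASB sub-threat description against the first section text.
doc1 = nlp(' '.join(text_data[0]))
for name, desc_tokens in zip(threat_name, threat_desc):
    doc2 = nlp(' '.join(desc_tokens))
    print(name, doc1.similarity(doc2))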
Example #8
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from nlp import preprocess
from sklearn import metrics
from sklearn.externals import joblib
import pandas as pd
import numpy as np
obj1 = preprocess()


def getsentiment(inp):
    clean = (obj1.tokenize(inp, 0))
    final = clean.split(' ')

    testset = []
    vocabulary = joblib.load('a.txt')
    length = len(vocabulary)
    for i in range(0, length):
        testset.append(0)

    for i in vocabulary:
        if i in final:
            a = vocabulary.index(i)
            testset[a] += final.count(i)

    # Down-weight flagged terms, mirroring the adjustment used at training time.
    for i in obj1.nottrack:
        try:
            a = vocabulary.index(i[0])
            testset[a] -= 2
        except ValueError:
            continue
Example #9
def increSTS(new_comment, clusters):
    new_comment = nlp.preprocess(new_comment)
    term_vector = set(metrics.get_term_vector(new_comment))
    if len(clusters) == 0:
        c = Cluster()
        c.add_comment(new_comment)
        clusters.add(c)
        return

    ca = []
    cb = []
    for cluster in clusters:
        dist, terms = cluster.get_distance_from_center(new_comment,
                                                       term_vector)
        if dist != float("inf"):
            ca.append(cluster)
        # if dist < radius_threshold:
        if terms > TH_TERMS:
            cb.append(cluster)
    # ca = [cluster for cluster in clusters if cluster.get_distance_from_center(new_comment, term_vector) != float("inf")]
    # cb = [cluster for cluster in ca if cluster.get_distance_from_center(new_comment,term_vector) < radius_threshold]

    if len(cb) != 0:
        cb.sort(key=lambda c: len(c.comments), reverse=True)
        cadded = cb[0]
        cadded.add_comment(new_comment)
        cchanged = set()
        for cluster in ca:
            if cluster == cadded:
                continue
            for i, comment in enumerate(cluster.comments):
                tv = set(cluster.comment_term_vectors[i])
                # if cadded.get_distance_from_center(comment,tv)[0] < radius_threshold:
                if cadded.get_distance_from_center(comment, tv)[1] > TH_TERMS:
                    cadded.add_comment(comment)
                    cluster.remove_comment(comment, i)
                    cchanged.add(cluster)
        for cluster in cchanged:
            # V = [comment for comment in cluster.comments if cluster.get_distance_from_center(comment) > radius_threshold]
            V = []
            Vindex = []
            Vtv = []
            for i, comment in enumerate(cluster.comments):
                tv = set(cluster.comment_term_vectors[i])
                # if cluster.get_distance_from_center(comment, tv)[0] >= radius_threshold:
                if cluster.get_distance_from_center(comment,
                                                    tv)[1] <= TH_TERMS:
                    V.append(comment)
                    Vindex.append(i)
                    Vtv.append(tv)
            while len(V) > 0:
                for i, excluded_comment in enumerate(V):
                    excluded_comment_tv = Vtv[i]
                    cluster.remove_comment(excluded_comment, Vindex[i])
                    clusters_list = list(clusters)
                    clusters_list.sort(key=lambda c: len(c.comments),
                                       reverse=True)
                    added = False
                    for candidate_cluster in clusters_list:
                        # if candidate_cluster.get_distance_from_center(excluded_comment,excluded_comment_tv)[0] < radius_threshold:
                        if candidate_cluster.get_distance_from_center(
                                excluded_comment,
                                excluded_comment_tv)[1] > TH_TERMS:
                            candidate_cluster.add_comment(excluded_comment)
                            added = True
                            break
                    if not added:
                        # Start a new cluster for the excluded comment itself.
                        c = Cluster()
                        c.add_comment(excluded_comment)
                        clusters.add(c)

                V = []
                Vindex = []
                Vtv = []
                for i, comment in enumerate(cluster.comments):
                    tv = set(cluster.comment_term_vectors[i])
                    # if cluster.get_distance_from_center(comment, tv)[0] >= radius_threshold:
                    if cluster.get_distance_from_center(comment,
                                                        tv)[1] <= TH_TERMS:
                        V.append(comment)
                        Vindex.append(i)
                        Vtv.append(tv)

    else:
        c = Cluster()
        c.add_comment(new_comment)
        clusters.add(c)
        return
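A hedged sketch of driving the incremental clustering, assuming Cluster, metrics, nlp and TH_TERMS are defined as in this project; clusters is a plain set because the function calls clusters.add(). The sample comments are mine.

# Hypothetical comment stream; clusters grows incrementally as comments arrive.
clusters = set()
for comment in ["great video", "awful sound quality", "really great video"]:
    increSTS(comment, clusters)
for cluster in clusters:
    print(len(cluster.comments))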
Example #10
data_path = Path(
    '../datasets/full_dataset.csv').resolve()  # this dataset does not exist yet
data = pd.read_csv(data_path)
data.head()
data.replace('', np.nan, inplace=True)
data.dropna(inplace=True)
data.shape
data["full_text"].astype(str)
data.head()

encoder_path = Path('../emotion_recognition/encoder.pickle').resolve()
with encoder_path.open('rb') as file:
    encoder = pickle.load(file)

cleaned_data = preprocess(data.full_text)
sequences = [text.split() for text in cleaned_data]
list_tokenized = tokenizer.texts_to_sequences(sequences)
x_data = pad_sequences(list_tokenized, maxlen=100)

y_pred = model.predict(x_data)

for index, value in enumerate(np.sum(y_pred, axis=0) / len(y_pred)):
    print(encoder.classes_[index] + ": " + str(value))

y_pred_argmax = y_pred.argmax(axis=1)
data_len = len(y_pred_argmax)
for value in np.unique(y_pred_argmax):
    print(encoder.classes_[value] + ": " +
          str(len(y_pred_argmax[y_pred_argmax == value]) / data_len))
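The final loop reports the share of predictions per class. An equivalent tally with collections.Counter, shown only as a sketch reusing y_pred_argmax, encoder and data_len from the script above:

from collections import Counter

# Count predictions per class index and print each class's share.
counts = Counter(y_pred_argmax)
for cls, n in sorted(counts.items()):
    print("{}: {}".format(encoder.classes_[cls], n / data_len))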