import pickle

import numpy as np
from flask import jsonify, request
from scipy import spatial
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Project-local helpers (Embeddings, preprocess, remove_stopwords,
# to_vector_single_nonzeros, load_data, detect_language, bad_request,
# emb_dict) are assumed to be defined or imported elsewhere in this module.


def one_classifier(text, lang, embedding_name, model_path, model_file):
    #--------------------------------------------------------------------------
    #--- LOAD MODEL AND EMBEDDING
    #--------------------------------------------------------------------------
    print(model_file)
    cls = pickle.load(open(model_path + model_file, 'rb'))
    embedding = Embeddings(embedding_name)

    #--------------------------------------------------------------------------
    #--- PROCESSING
    #--------------------------------------------------------------------------
    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        # Mean of the token embeddings, reshaped to a single sample
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        #print(cls.classes_)  # check that class at second position is L1
        prob = cls.predict_proba(vectorized_text2)[:, 1]
    else:
        # No token had an embedding: fall back to probability zero
        prob = [0.0]
    return prob[0]
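
# Usage sketch for one_classifier. The directory follows the layout used by
# classifier() below; the user/case/name values in the model file name are
# illustrative assumptions, not files shipped with the repo:
#
#   p = one_classifier('text to score', 'en', 'embedding-EN',
#                      './data/probability/insikt/',
#                      'user1_case1_topicA_classifier.model')
#   print('Probability of the positive class: %.3f' % p)
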
def get_topics(text, lang, topics_path):
    # Initialization
    embeddings = Embeddings(emb_dict[lang])

    # Get the per-language topics dictionary from the path
    topics_dicts = load_data(topics_path)
    topics_dict = topics_dicts[lang]
    topics = list(topics_dict.keys())

    # Cosine-distance threshold under which a topic counts as "close"
    if lang == 'en':
        #cl = 0.7
        cl = 0.5
    else:
        cl = 0.5

    # Vectorize each topic as the mean embedding of its keyword list
    vect_dict_topics = [(w,
                         np.mean(to_vector_single_nonzeros(
                             topics_dict[w], embeddings, len(topics_dict[w])),
                                 axis=0)) for w in topics]
    #print(vect_dict_topics)

    # Vectorize the (tokenized) input text
    assigned_topics = []
    dists = []
    vectorized_tokens = to_vector_single_nonzeros(text, embeddings, len(text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
    else:
        vectorized_text = np.zeros((300,))

    # Measure the cosine distance to every topic
    for v in vect_dict_topics:
        dists.append(spatial.distance.cosine(vectorized_text, v[1]))

    # Choose the close topics; fall back to 'OTHER' if none qualifies
    good_topics = [
        topics[i].upper() for i in range(len(topics)) if dists[i] < cl
    ]
    if not good_topics:
        good_topics.append('OTHER')
    assigned_topics.append(good_topics)
    return assigned_topics
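
# Sketch of the topics dictionary that get_topics() expects load_data() to
# return; the topic names, keyword lists, and file path are illustrative
# assumptions:
#
#   topics_dicts = {
#       'en': {'politics': ['election', 'parliament', 'vote'],
#              'religion': ['faith', 'prayer', 'mosque']},
#       'es': {...},
#   }
#
#   get_topics(['election', 'vote'], 'en', './data/topics.json')
#   # -> [['POLITICS']], or [['OTHER']] if no topic is within distance cl
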
def make_vectorize():
    try:
        # Load the request payload
        data = request.get_json()
    except Exception as e:
        raise e
    if data == {}:
        return bad_request()

    # Get the language (detect it from the text if the field is missing)
    try:
        lang = data['lang']
    except KeyError:
        try:
            lang = detect_language(data['text'])
            print(lang)
        except Exception:
            return jsonify("Error in vectorize: language field is missing")

    # Get the text
    try:
        text = data['text']
    except KeyError:
        return jsonify("Error in vectorize: text is missing")

    if lang not in ['en', 'es', 'ar', 'ro', 'fr']:
        return jsonify(
            "Language not available. Language must be in "
            "['en','es','ar','ro','fr']")

    # Preprocess and vectorize the text
    print("Vectorize...")
    embeddings = Embeddings(emb_dict[lang])
    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embeddings,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
    else:
        vectorized_text = np.zeros((300,))
    print(vectorized_text)

    # Send the response with its status code
    responses = jsonify(vector=vectorized_text.tolist())
    responses.status_code = 200
    return responses
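
# Example request, assuming make_vectorize() is registered on a /vectorize
# route of the Flask app (route name and port are assumptions):
#
#   curl -X POST http://localhost:5000/vectorize \
#        -H 'Content-Type: application/json' \
#        -d '{"text": "some text to embed", "lang": "en"}'
#
# The response is {"vector": [...]} with the 300-dimensional mean embedding;
# if "lang" is omitted, the language is detected from the text.
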
def two_classifier(text, lang, embedding_name, model_path, model_file_JIH,
                   model_file_EXR):
    #--------------------------------------------------------------------------
    #--- LOAD MODELS AND EMBEDDING
    #--------------------------------------------------------------------------
    cls_JIH = pickle.load(open(model_path + model_file_JIH, 'rb'))
    cls_EXR = pickle.load(open(model_path + model_file_EXR, 'rb'))
    embedding = Embeddings(embedding_name)

    #--------------------------------------------------------------------------
    #--- PROCESSING
    #--------------------------------------------------------------------------
    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        prob_JIH = cls_JIH.predict_proba(vectorized_text2)[:, 1]
        prob_EXR = cls_EXR.predict_proba(vectorized_text2)[:, 1]
    else:
        # No token had an embedding: both probabilities default to zero
        prob_JIH = [0.0]
        prob_EXR = [0.0]

    # Return the larger of the two class probabilities
    prob = prob_JIH if prob_JIH[0] > prob_EXR[0] else prob_EXR
    return prob[0]
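
# Usage sketch for two_classifier: the same vector is scored by both models
# and the larger probability wins. The model file names are illustrative
# assumptions:
#
#   p = two_classifier('text to score', 'en', 'embedding-EN',
#                      './data/probability/insikt/',
#                      'JIH_classifier.model', 'EXR_classifier.model')
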
def classifier(annotated_data, lang, user_id, case_id, clas_name):
    #--------------------------------------------------------------------------
    #--- DEFINE FILES AND LANGUAGE
    #--------------------------------------------------------------------------
    model_path = './data/probability/insikt/'
    model_file = user_id + '_' + case_id + '_' + clas_name + '_classifier.model'
    embedding_names = {
        'en': 'embedding-EN',
        'ar': 'embedding-AR',
        'es': 'embedding-ES',
        'ro': 'embedding-RO',
        'fr': 'embedding-FR'
    }
    embedding = Embeddings(embedding_names[lang])

    #--------------------------------------------------------------------------
    #--- GENERAL SCRIPT
    #--------------------------------------------------------------------------
    ########## Tokenize + stopwords
    x_train = [i[0] for i in annotated_data]
    y_train = [i[1] for i in annotated_data]  # TODO: replace N0 with L0
    x_train_DL = []
    print('Data training with ' + str(len(x_train)) + ' texts')
    for text in x_train:
        processed_text = preprocess(text)
        no_stpw_text = remove_stopwords(processed_text, lang)
        vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                      len(no_stpw_text))
        if len(vectorized_tokens) > 0:
            vectorized_text = np.mean(vectorized_tokens, axis=0)
        else:
            vectorized_text = np.zeros((300,))
        x_train_DL.append(vectorized_text)

    ########## Build and test the classifier with 10-fold cross-validation
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    # Stochastic Gradient Descent with logistic loss (on scikit-learn >= 1.1
    # the loss is named "log_loss" instead of "log")
    cls = SGDClassifier(loss="log", penalty="l2",
                        max_iter=500).fit(x_train_DL, y_train)
    scores = cross_val_score(cls, x_train_DL, y_train, cv=skf,
                             scoring='accuracy')
    print("Accuracy C-10V %s: %2.1f (+/- %2.1f)" %
          (lang.upper(), 100 * scores.mean(), scores.std() * 200))
    print(cls.classes_)  # check that the class at the second position is 'Yes'
    accuracy = round(100 * scores.mean(), 2)

    ########## Save the model
    pickle.dump(cls, open(model_path + model_file, 'wb'))
    return accuracy
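
# Usage sketch for classifier(): annotated_data is a list of (text, label)
# pairs; the labels and IDs below are illustrative assumptions. Note that
# 10-fold stratified cross-validation needs at least 10 examples per class.
# The model is saved as '<user_id>_<case_id>_<clas_name>_classifier.model'
# under model_path, and the mean CV accuracy (in percent) is returned:
#
#   data = [('first annotated text', 'Yes'), ('second annotated text', 'No')]
#   acc = classifier(data, 'en', 'user1', 'case1', 'topicA')
#   print('CV accuracy: %.2f%%' % acc)
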