# Imports assumed by the examples below; helper utilities such as preprocess,
# remove_stopwords, to_vector_single_nonzeros, Embeddings, emb_dict, load_data,
# detect_language and bad_request come from the surrounding project.
import pickle

import numpy as np
from scipy import spatial
from flask import request, jsonify
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score


def one_classifier(text, lang, embedding_name, model_path, model_file):

    #--------------------------------------------------------------------------------------------
    #--- LOAD MODEL AND EMBEDDING
    #--------------------------------------------------------------------------------------------
    print(model_file)
    with open(model_path + model_file, 'rb') as f:
        cls = pickle.load(f)

    embedding = Embeddings(embedding_name)

    #--------------------------------------------------------------------------------------------
    #--- PROCESSING
    #--------------------------------------------------------------------------------------------

    processed_text = preprocess(text)

    no_stpw_text = remove_stopwords(processed_text, lang)

    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        # probability of the positive class (second column of predict_proba)
        prob = cls.predict_proba(vectorized_text2)[:, 1]
    else:
        # no token had a non-zero embedding: fall back to zero probability
        prob = np.zeros(1)
    #print(cls.classes_) # check that class at second position is L1

    return float(prob[0])
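
A minimal usage sketch for one_classifier; the embedding alias, path and model
file name below are placeholders, not artefacts shipped with the project:

    prob_yes = one_classifier("text to score", 'en', 'embedding-EN',
                              './data/probability/insikt/',
                              'user1_case1_topic_classifier.model')
    print(prob_yes)  # probability that the text belongs to the positive class
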
def get_topics(text, lang, topics_path):
    #initialization
    embeddings = Embeddings(emb_dict[lang])

    # get the topics dictionary from the path
    topics_dicts = load_data(topics_path)
    topics_dict = topics_dicts[lang]

    topics = list(topics_dict.keys())

    # cosine-distance threshold under which a topic counts as "close"
    # (an earlier version used 0.7 for English)
    cl = 0.5
    # now vectorize the topics
    vect_dict_topics = [
        (w,
         np.mean(to_vector_single_nonzeros(topics_dict[w], embeddings,
                                           len(topics_dict[w])),
                 axis=0)) for w in topics
    ]
    #print(vect_dict_topics)

    # get topics
    assigned_topics = []
    dists = []

    vectorized_tokens = to_vector_single_nonzeros(text, embeddings, len(text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
    else:
        vectorized_text = np.zeros(300)

    for v in vect_dict_topics:
        dists.append(spatial.distance.cosine(
            vectorized_text, v[1]))  # measure distance to all topics

    good_topics = [
        topics[i].upper() for i in range(len(topics)) if dists[i] < cl
    ]  # choose close topics
    if not good_topics:
        good_topics.append('OTHER')

    assigned_topics.append(good_topics)

    return assigned_topics
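
A usage sketch for get_topics; the topics path is a placeholder, and the text
is passed as a token list to match how len(text) is used above:

    assigned = get_topics(['attack', 'police', 'bomb'], 'en',
                          './data/topics/topics.json')
    print(assigned)  # a single-element list of close topics, or [['OTHER']]
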
def make_vectorize():
    #Load the JSON body of the request (this view is meant to run inside
    #a Flask request context)
    data = request.get_json()

    if data == {}:
        return (bad_request())
    else:
        #Get the text and the language
        try:
            lang = data['lang']
        except KeyError:
            # no explicit language: try to detect it from the text
            try:
                lang = detect_language(data['text'])
                print(lang)
            except Exception:
                responses = jsonify(
                    "Error in vectorize: language field is missing")
                return responses
        try:
            text = data['text']
        except KeyError:
            responses = jsonify("Error in vectorize: text is missing")
            return responses

        if lang not in ['en', 'es', 'ar', 'ro', 'fr']:
            responses = jsonify(
                "Language not available. Language must be in ['en','es','ar','ro','fr']"
            )
            return responses
        #Preprocess the text
        print("Vectorize...")

        embeddings = Embeddings(emb_dict[lang])

        processed_text = preprocess(text)
        no_stpw_text = remove_stopwords(processed_text, lang)
        vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embeddings,
                                                      len(no_stpw_text))

        if len(vectorized_tokens) > 0:
            vectorized_text = np.mean(vectorized_tokens, axis=0)
        else:
            # no token had a non-zero embedding: return a zero vector
            vectorized_text = np.zeros(300)

        #Send the response codes
        responses = jsonify(vector=vectorized_text.tolist())
        responses.status_code = 200
        return responses
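
make_vectorize reads from Flask's request object, so it only works inside a
request context; a minimal wiring sketch, with a hypothetical app and route
name:

    from flask import Flask

    app = Flask(__name__)
    app.add_url_rule('/vectorize', 'vectorize', make_vectorize,
                     methods=['POST'])
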
def two_classifier(text, lang, embedding_name, model_path, model_file_JIH,
                   model_file_EXR):
    #--------------------------------------------------------------------------------------------
    #--- LOAD MODEL AND EMBEDDING
    #--------------------------------------------------------------------------------------------

    with open(model_path + model_file_JIH, 'rb') as f:
        cls_JIH = pickle.load(f)
    with open(model_path + model_file_EXR, 'rb') as f:
        cls_EXR = pickle.load(f)

    embedding = Embeddings(embedding_name)

    #--------------------------------------------------------------------------------------------
    #--- PROCESSING
    #--------------------------------------------------------------------------------------------

    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        prob_JIH = cls_JIH.predict_proba(vectorized_text2)[:, 1]
        prob_EXR = cls_EXR.predict_proba(vectorized_text2)[:, 1]
    else:
        # no token had a non-zero embedding: fall back to zero probabilities
        prob_JIH = np.zeros(1)
        prob_EXR = np.zeros(1)

    # keep the higher of the two class probabilities
    prob = prob_JIH if prob_JIH > prob_EXR else prob_EXR

    return float(prob[0])
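
A usage sketch for two_classifier, which returns the higher of the two binary
classifiers' positive-class probabilities; the model file names are
placeholders:

    prob = two_classifier("text to score", 'en', 'embedding-EN',
                          './data/probability/insikt/',
                          'JIH_classifier.model', 'EXR_classifier.model')
    print(prob)
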
def classifier(annotated_data, lang, user_id, case_id, clas_name):

    #--------------------------------------------------------------------------------------------
    #--- DEFINE FILES AND LANGUAGE
    #--------------------------------------------------------------------------------------------

    model_path = './data/probability/insikt/'
    model_file = user_id + '_' + case_id + '_' + clas_name + '_classifier.model'

    # map language codes to embedding aliases
    embedding_names = {
        'en': 'embedding-EN',
        'ar': 'embedding-AR',
        'es': 'embedding-ES',
        'ro': 'embedding-RO',
        'fr': 'embedding-FR',
    }
    embedding_name = embedding_names[lang]

    embedding = Embeddings(embedding_name)
    #--------------------------------------------------------------------------------------------
    #--- GENERAL SCRIPT
    #--------------------------------------------------------------------------------------------

    ########## Tokenize + stopwords
    #print(annotated_data)
    #raw_data=np.array(annotated_data)
    x_train = [i[0] for i in annotated_data]  # texts
    #print(x_train)
    y_train = [i[1] for i in annotated_data]  # labels (replace N0 for L0...!!!)
    #print(y_train)
    x_train_DL = []

    print('Data training with ' + str(len(x_train)) + ' texts')

    for text in x_train:
        processed_text = preprocess(text)
        no_stpw_text = remove_stopwords(processed_text, lang)
        vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                      len(no_stpw_text))
        if len(vectorized_tokens) > 0:
            vectorized_text = np.mean(vectorized_tokens, axis=0)
        else:
            vectorized_text = np.zeros(300)
        x_train_DL.append(vectorized_text)

    ########## Build and test the classifier with 10-fold cross-validation

    skf = StratifiedKFold(n_splits=10, shuffle=True)

    # Stochastic Gradient Descent

    # logistic-loss SGD; on scikit-learn >= 1.1 the loss is spelled "log_loss"
    cls = SGDClassifier(loss="log", penalty="l2",
                        max_iter=500).fit(x_train_DL, y_train)
    scores = cross_val_score(cls,
                             x_train_DL,
                             y_train,
                             cv=skf,
                             scoring='accuracy')
    print("Accuracy C-10V EN: %2.1f (+/- %2.1f)" %
          (100 * scores.mean(), scores.std() * 200))
    print(cls.classes_)  # check that class at the second position is 'Yes'
    accuracy = round((100 * scores.mean()), 2)
    ########## Save the model

    with open(model_path + model_file, 'wb') as f:
        pickle.dump(cls, f)
    return accuracy
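
A training sketch for classifier; the labelled data is fabricated to show the
expected (text, label) shape, and 10-fold stratified CV needs at least 10
examples per class:

    annotated = [("sample text %d" % i, 'Yes' if i % 2 else 'No')
                 for i in range(40)]
    acc = classifier(annotated, 'en', 'user1', 'case1', 'topic')
    print(acc)  # mean 10-fold cross-validation accuracy, in percent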