Example #1
import math

import data   # project-local data-processing module
import utils  # project-local model/metric helpers


def classify(path, model_file):

    classes = ['truthful_positive', 'truthful_negative',
               'deceptive_positive', 'deceptive_negative']
    bag_of_words, count_of_each_word_matrix, probability_for_each_class = utils.readFromModelFile(model_file, classes)

    Val_data, Val_labela, Val_labelb, filepaths = data.new_data_processing(path, True)
    # For unlabeled test data, use instead:
    # Val_data, filepaths = data.new_data_processing_test(path)

    class_indexes = utils.createIndexMappingForClass(classes)  # unused below
    words_indexes = utils.createIndexMappingForClass(bag_of_words)

    # List of dictionaries holding the term frequency of each word per document.
    Val_data_dict = data.convertToDictionary(Val_data)

    predicted_labela = list()
    predicted_labelb = list()
    predicted_labels = list()
    for i in range(len(Val_data_dict)):
        max_probability = float("-inf")
        max_probability_class = None
        for j in range(len(classes)):
            # Log-likelihood of the document under class j; each distinct word
            # contributes once, and words unseen in training are skipped.
            posterior_probability = 0
            for key in Val_data_dict[i]:
                idx = words_indexes.get(key)
                if idx is not None:
                    posterior_probability += math.log(count_of_each_word_matrix[j][idx])
            prior_probability = probability_for_each_class.get(classes[j])
            total_probability = posterior_probability + math.log(prior_probability)
            if total_probability > max_probability:
                max_probability = total_probability
                max_probability_class = classes[j]
        predicted_labela.append(utils.extractLabelA(max_probability_class))
        predicted_labelb.append(utils.extractLabelB(max_probability_class))
        predicted_labels.append(max_probability_class)

    # To test the F1 score locally, join the gold label pair the same way
    # the class names are joined ('truthful_positive', ...).
    joined_label = list()
    for i in range(len(Val_labela)):
        joined_label.append(Val_labela[i] + '_' + Val_labelb[i])
    print(utils.f1_score_from_sklearn(Val_labela, predicted_labela, classes))
    print(utils.f1_score_from_sklearn(Val_labelb, predicted_labelb, classes))
    print(utils.f1_score_from_sklearn(joined_label, predicted_labels, classes))

    return predicted_labela, predicted_labelb, filepaths
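The inner loops above implement the standard log-space Naive Bayes decision rule: pick the class maximizing log P(c) + sum over document words of log P(w|c). A minimal standalone sketch of that rule, with hypothetical `priors` and `word_probs` dicts standing in for the model file:

import math

def naive_bayes_predict(doc_words, priors, word_probs):
    """priors: {class: P(c)}; word_probs: {class: {word: P(w|c)}}, pre-smoothed."""
    best_class, best_score = None, float("-inf")
    for c, prior in priors.items():
        score = math.log(prior)
        for w in doc_words:
            if w in word_probs[c]:   # words unseen in training are skipped, as above
                score += math.log(word_probs[c][w])
        if score > best_score:
            best_class, best_score = c, score
    return best_class

# Toy usage (made-up numbers):
priors = {'truthful_positive': 0.5, 'deceptive_positive': 0.5}
word_probs = {'truthful_positive': {'great': 0.02, 'view': 0.01},
              'deceptive_positive': {'great': 0.001, 'view': 0.002}}
print(naive_bayes_predict(['great', 'view'], priors, word_probs))  # truthful_positive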
Example #2

import data
import utils


def classify(path, model_file):

    classes_labela = ['deceptive', 'truthful']
    classes_labelb = ['negative', 'positive']
    learned_weights_labela, selected_features_labela, learned_weights_labelb, selected_features_labelb = utils.readFromModelFile(model_file)

    Val_data, Val_labela, Val_labelb, filepaths = data.new_data_processing(path, True)
    # For unlabeled test data, use instead:
    # Val_data, filepaths = data.new_data_processing_test(path)

    Val_data_dict = data.convertToDictionary(Val_data)

    selected_features_labela_dict = utils.createIndexMappingForClass(selected_features_labela)
    selected_features_labelb_dict = utils.createIndexMappingForClass(selected_features_labelb)

    # Score label "a" (deceptive/truthful) with the learned perceptron weights.
    X_labela = utils.populateFeatureValuesWithoutLabelsForEachDocument(Val_data_dict, selected_features_labela_dict)
    y_labela = utils.predicted_labels(X_labela, classes_labela, learned_weights_labela)

    finalOutput = dict()
    finalOutput = utils.prepareFinalOutput(filepaths, y_labela, finalOutput, "a")

    # Score label "b" (negative/positive) the same way.
    X_labelb = utils.populateFeatureValuesWithoutLabelsForEachDocument(Val_data_dict, selected_features_labelb_dict)
    y_labelb = utils.predicted_labels(X_labelb, classes_labelb, learned_weights_labelb)

    finalOutput = utils.prepareFinalOutput(filepaths, y_labelb, finalOutput, "b")

    labela, labelb = utils.extractLabelsFromDict(finalOutput)

    # To test the F1 score locally: average each of the three scores over
    # the two labels.
    score1, score2, score3 = utils.f1_score_from_sklearn(Val_labela, labela, classes_labela)
    score1a, score2a, score3a = utils.f1_score_from_sklearn(Val_labelb, labelb, classes_labelb)
    print((score1 + score1a) / 2, (score2 + score2a) / 2, (score3 + score3a) / 2)

    return labela, labelb, filepaths
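prepareFinalOutput and extractLabelsFromDict are project-local helpers whose source isn't shown here. A plausible sketch of the merge they appear to perform, assuming finalOutput maps each file path to its pair of per-task labels (insertion order keeps the output lists aligned with filepaths):

def prepare_final_output(filepaths, predictions, final_output, which):
    # Hypothetical stand-in for utils.prepareFinalOutput: record each file's
    # prediction under task slot "a" or "b".
    for fp, pred in zip(filepaths, predictions):
        final_output.setdefault(fp, {})[which] = pred
    return final_output

def extract_labels_from_dict(final_output):
    # Hypothetical stand-in for utils.extractLabelsFromDict: split the merged
    # dict back into two parallel label lists (dicts preserve insertion order).
    labela = [slots["a"] for slots in final_output.values()]
    labelb = [slots["b"] for slots in final_output.values()]
    return labela, labelb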
Example #3
def calculateF1scoreOnValidationSet(path, selected_features_label,
                                    learned_weights_label, classes_label,
                                    label):
    # Lives alongside the helpers it calls unqualified
    # (createIndexMappingForClass, predicted_labels, ...).

    Val_data, Val_labela, Val_labelb, filepaths = data.new_data_processing(
        path, True)

    Val_data_dict = data.convertToDictionary(Val_data)

    selected_features_label_dict = createIndexMappingForClass(
        selected_features_label)

    X_label = populateFeatureValuesWithoutLabelsForEachDocument(
        Val_data_dict, selected_features_label_dict)

    y_label = predicted_labels(X_label, classes_label, learned_weights_label)

    finalOutput = dict()

    finalOutput = prepareFinalOutput(filepaths, y_label, finalOutput, label)

    labela, labelb = extractLabelsFromDict(finalOutput)

    # Pick the predicted and gold labels for whichever task ("a" or "b") is
    # being evaluated; use a fresh name instead of overwriting the label
    # parameter.
    if label == "a":
        predicted_label = labela
        Val_label = Val_labela
    else:
        predicted_label = labelb
        Val_label = Val_labelb
    score1, score2, score3 = f1_score_from_sklearn(Val_label, predicted_label,
                                                   classes_label)

    return score1, score2, score3
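f1_score_from_sklearn returns three scores everywhere it is called in these examples, but its source isn't shown. One plausible reading, sketched here purely as an assumption, is a thin wrapper over sklearn.metrics.f1_score with the three averaging modes:

from sklearn.metrics import f1_score

def f1_score_from_sklearn(y_true, y_pred, classes):
    # Hypothetical wrapper: macro-, micro-, and weighted-averaged F1.
    return (f1_score(y_true, y_pred, labels=classes, average='macro'),
            f1_score(y_true, y_pred, labels=classes, average='micro'),
            f1_score(y_true, y_pred, labels=classes, average='weighted'))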
Example #4

import numpy as np

import data
import utils


def learn(path='/Users/nishatiwari/Downloads/op_spam_training_data/'):

    Train_data, Train_labela, Train_labelb, bag_of_words, bag_of_words_for_each_file = data.new_data_processing(
        path, False)

    # List of dictionaries holding the term frequency of each word per document.
    Train_data_dict = data.convertToDictionary(Train_data)

    word_counts_for_classes = utils.countWordsForEachClass(
        Train_data_dict, Train_labela, Train_labelb)

    # Feature selection: keep only the high-frequency words.
    word_counts_for_classes = utils.keepHighFrequencyWords(
        word_counts_for_classes)

    classes_labela = ['deceptive', 'truthful']
    classes_labelb = ['negative', 'positive']

    selected_features_labela = utils.createFeaturesMatrix(
        classes_labela, word_counts_for_classes)
    selected_features_labela_dict = utils.createIndexMappingForClass(
        selected_features_labela)

    selected_features_labelb = utils.createFeaturesMatrix(
        classes_labelb, word_counts_for_classes)
    selected_features_labelb_dict = utils.createIndexMappingForClass(
        selected_features_labelb)

    X_labela = utils.populateFeratureValuesForEachDocument(
        Train_data_dict, selected_features_labela_dict, Train_labela,
        classes_labela)
    X_labelb = utils.populateFeratureValuesForEachDocument(
        Train_data_dict, selected_features_labelb_dict, Train_labelb,
        classes_labelb)

    # Averaged-perceptron training for label "a": w is the working weight
    # vector, u the accumulated weights for averaging, c the update counter.
    w = np.zeros(len(selected_features_labela_dict) + 1)
    u = np.zeros(len(selected_features_labela_dict) + 1)
    c = 1
    maxIter = 120
    for i in range(1, maxIter):
        # Reshuffle the training examples each epoch (seeded for reproducibility).
        np.random.seed(i)
        np.random.shuffle(X_labela)
        y = X_labela[:, -1]                          # last column: gold label
        X_label = X_labela[:, :-1]                   # remaining columns: features
        X_label = np.insert(X_label, 0, 1, axis=1)   # prepend the bias term
        w, u, c = utils.trainDataGivenWeights(w, u, y, X_label, c)
        # Uncomment to monitor F1 on the validation set per epoch:
        # print("average labela", i, utils.calculateF1scoreOnValidationSet(path, selected_features_labela, u, classes_labela, "a"))

    # Same loop for label "b".
    w = np.zeros(len(selected_features_labelb_dict) + 1)
    u = np.zeros(len(selected_features_labelb_dict) + 1)
    c = 1
    maxIter = 100
    for i in range(1, maxIter):
        np.random.seed(i)
        np.random.shuffle(X_labelb)
        y = X_labelb[:, -1]
        X_label = X_labelb[:, :-1]
        X_label = np.insert(X_label, 0, 1, axis=1)
        w, u, c = utils.trainDataGivenWeights(w, u, y, X_label, c)
        # print("average labelb", i, utils.calculateF1scoreOnValidationSet(path, selected_features_labelb, u, classes_labelb, "b"))

    learned_weights_labela, averaged_weights_labela = utils.trainData(
        selected_features_labela_dict, X_labela)
    learned_weights_labelb, averaged_weights_labelb = utils.trainData(
        selected_features_labelb_dict, X_labelb)

    return learned_weights_labela, averaged_weights_labela, selected_features_labela, learned_weights_labelb, averaged_weights_labelb, selected_features_labelb
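utils.trainDataGivenWeights is project code, but the (w, u, c) triple it threads through matches the usual averaged-perceptron bookkeeping. A sketch of one such epoch, offered as an assumption about what the helper does (labels y in {-1, +1}, bias column already prepended to X):

import numpy as np

def train_epoch(w, u, y, X, c):
    # On each mistake, update the working weights w and the counter-scaled
    # accumulator u; c counts every example seen across all epochs.
    for xi, yi in zip(X, y):
        if yi * np.dot(w, xi) <= 0:  # misclassified
            w = w + yi * xi
            u = u + c * yi * xi
        c += 1
    return w, u, c

# After the final epoch, the averaged weights are w - u / c.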
Example #5
import data
import utils


def learn(path='/Users/nishatiwari/Downloads/op_spam_training_data/'):

    Train_data, Train_labela, Train_labelb, bag_of_words, bag_of_words_for_each_class = data.new_data_processing(
        path, False)

    # List of dictionaries holding the term frequency of each word per document.
    Train_data_dict = data.convertToDictionary(Train_data)

    # IDF (the number of documents each word appears in) and tf-idf; computed
    # here but not used by the count-based Naive Bayes model below.
    train_words_idf = data.calculateIdfforEachWord(Train_data_dict,
                                                   bag_of_words)
    Train_words_tf_idf = data.calculateTfIDFforEachWord(
        Train_data_dict, train_words_idf, Train_data)

    classes = [
        'truthful_positive', 'truthful_negative', 'deceptive_positive',
        'deceptive_negative'
    ]

    classes_data_set = dict()
    dict_of_words_for_each_class = dict()

    # Join the two binary labels into one of the four joint classes.
    for i in range(len(Train_data)):
        if Train_labela[i] == 'truthful' and Train_labelb[i] == 'positive':
            current_class = 'truthful_positive'
        elif Train_labela[i] == 'truthful' and Train_labelb[i] == 'negative':
            current_class = 'truthful_negative'
        elif Train_labela[i] == 'deceptive' and Train_labelb[i] == 'positive':
            current_class = 'deceptive_positive'
        else:
            current_class = 'deceptive_negative'

        classes_data_set, dict_of_words_for_each_class = utils.updateClassDictionary(
            current_class, Train_data_dict[i], classes_data_set,
            dict_of_words_for_each_class)

    # Priors P(c) from the class frequencies in the training data.
    probability_for_each_class = utils.calculateProbabiltyForEachClass(
        classes, classes_data_set)

    count_of_each_word_matrix, count_of_total_words = utils.calculateTheCountOfWords(
        classes, classes_data_set, bag_of_words)

    # Add-one (Laplace) smoothing so unseen (word, class) pairs keep a
    # nonzero probability.
    count_of_each_word_matrix = utils.applyingLaplacesmoothing(
        count_of_each_word_matrix, count_of_total_words, len(bag_of_words))

    utils.findoutTopWords(count_of_each_word_matrix, bag_of_words)

    return bag_of_words, count_of_each_word_matrix, probability_for_each_class, classes
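utils.applyingLaplacesmoothing is project code; add-one smoothing itself is just P(w|c) = (count(w, c) + 1) / (N_c + |V|). A minimal sketch of that step, assuming the row-per-class count matrix and per-class word totals used above:

def apply_laplace_smoothing(count_matrix, total_words_per_class, vocab_size):
    # Hypothetical stand-in for utils.applyingLaplacesmoothing: turn raw
    # counts into smoothed probabilities P(w | c) = (count + 1) / (N_c + |V|),
    # so classify() can take math.log of every entry safely.
    return [[(count + 1) / (total_words_per_class[j] + vocab_size)
             for count in count_matrix[j]]
            for j in range(len(count_matrix))]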