Example #1
0
    def process(self, use_chi2=True, chi2_param=5000, nb_max=1000):
        """Vectorize the stored training data and fit the naive-Bayes predictor.

        Parameters
        ----------
        use_chi2 : bool
            Whether the WordVectorizer applies chi-squared feature selection.
        chi2_param : int
            Number of features kept by the chi-squared selector.
        nb_max : int
            Maximum number of training rows to use (performance cap).

        Side effects: sets ``self.wv``, ``self.predictor``, the ``*_internal``
        debug attributes, and ``self.has_processed``.
        """
        if not self.silent:
            print_color("Frodo is processing data.", COLORS.GREEN)

        # Cap the training set size for performance.
        training_data = np.array(self.training_data[:nb_max])

        # Column 1 holds the raw interview text, column 2 the integer label.
        interview_text = training_data[:, 1]
        Y = training_data[:, 2].astype(int)
        df_train = np.array([interview_text, Y]).T

        # Fit the vectorizer on the training frame, then convert the text
        # column to a binary (indicator) bag-of-words matrix.
        self.wv = WordVectorizer(df_train, contains_prediction=True,
                                 use_chi2=use_chi2, chi2_param=chi2_param)
        X = self.wv.convert_to_word_vector(df_train[:, 0])
        X[X > 0] = 1  # term counts -> presence indicators

        # Kept for inspection / debugging from outside this method.
        self.X_internal = X
        self.Y_internal = Y
        self.df_train_internal = df_train
        self.interview_text_internal = interview_text

        try:
            self.predictor = multiple_naive_bayes(X, Y)
        except Exception as e:
            # NOTE(review): best-effort — a training failure is reported but
            # not re-raised, so self.predictor may be left unset.
            print(str(e))
        # don't remove this: downstream code checks this flag.
        self.has_processed = True
Example #2
0
def run_nb(train_size=2000,
           ngram_range=(1, 1),
           smoothing=1,
           learn_code='0000',
           kbest=None):
    """Train and evaluate a naive-Bayes classifier on one cleaned dataset.

    Loads ``./clean/ml_dataset_train-<learn_code>.csv``, samples a training
    subset from the first 80% of rows and a 2000-row test subset from the
    last 20%, vectorizes the text column into binary indicators, trains, and
    predicts on the test subset.

    Parameters
    ----------
    train_size : int
        Number of rows sampled (with replacement) for training.
    ngram_range : tuple[int, int]
        N-gram range forwarded to WordVectorizer.
    smoothing : int
        NOTE(review): accepted but never used — kept for interface
        compatibility; confirm whether it should be forwarded to
        ``multiple_naive_bayes``.
    learn_code : str
        Suffix selecting which cleaned CSV to load.
    kbest : int | None
        If given, enables chi-squared feature selection with this many
        features.

    Returns
    -------
    (Y_test, predicted) : the true labels and the model's predictions.
    """
    df_complete = pd.read_csv('./clean/ml_dataset_train-' + learn_code +
                              '.csv',
                              index_col=0)

    n = len(df_complete)

    train_split = 0.8

    # Subset of the training set for performance purposes.
    # NOTE: np.random.choice samples with replacement by default, so rows
    # may repeat — presumably intentional here; verify if not.
    subset_indicies = np.random.choice(np.arange(int(n * train_split)),
                                       size=train_size)
    df_train = df_complete.iloc[subset_indicies]

    # Subset of the testing set for performance purposes.
    subset_indicies = np.random.choice(np.arange(int(n * train_split), n),
                                       size=2000)
    df_test = df_complete.iloc[subset_indicies]

    # Create a word vector using our training set.
    if kbest:
        wv = WordVectorizer(df_train.values,
                            contains_prediction=True,
                            ngram_range=ngram_range,
                            use_chi2=True,
                            chi2_param=kbest)
    else:
        wv = WordVectorizer(df_train.values,
                            contains_prediction=True,
                            ngram_range=ngram_range)

    # Convert term counts to presence indicators.
    X_train = wv.convert_to_word_vector(df_train.values.T[0])
    X_train[X_train > 0] = 1
    Y_train = df_train.values.T[1]
    assert len(X_train) == len(Y_train)

    # Train the model.
    predictor = multiple_naive_bayes(X_train, Y_train)

    # Convert the test set to indicators with the same vocabulary.
    X_test = wv.convert_to_word_vector(df_test.values.T[0])
    X_test[X_test > 0] = 1
    Y_test = df_test.values.T[1]

    # Get predictions.
    predicted = predictor(X_test)

    return Y_test, predicted
def run_nb(train_size=2000,ngram_range=(1,1),smoothing=1, learn_code='0000', kbest=None):
	"""Train and evaluate a naive-Bayes classifier on one cleaned dataset.

	Loads ./clean/ml_dataset_train-<learn_code>.csv, samples a training
	subset from the first 80% of rows and a 2000-row test subset from the
	remaining 20%, vectorizes the text column into binary indicators,
	trains, and returns (Y_test, predicted).

	NOTE(review): this redefines ``run_nb`` and shadows the earlier
	definition in this file; ``smoothing`` is accepted but never used —
	confirm both are intentional.
	"""
	df_complete = pd.read_csv('./clean/ml_dataset_train-'+learn_code+'.csv',index_col=0)

	n = len(df_complete)

	train_split = 0.8

	# subset of the training set for performance purposes
	# NOTE(review): np.random.choice samples with replacement by default,
	# so rows may repeat — presumably intentional; verify.
	subset_indicies = np.random.choice([x for x in range(0, int(n*train_split)) ],size=train_size)
	df_train = df_complete.iloc[subset_indicies]


	# subset of testing set for performance purposes
	subset_indicies = np.random.choice([x for x in range(int(n*train_split), n) ],size=2000)
	df_test = df_complete.iloc[subset_indicies]

	# create a word vector using our training set
	if kbest:
		# kbest enables chi-squared feature selection with kbest features
		wv = WordVectorizer(df_train.values, contains_prediction=True, ngram_range=ngram_range, use_chi2=True, chi2_param=kbest) 
	else:
		wv = WordVectorizer(df_train.values, contains_prediction=True, ngram_range=ngram_range) 

	# make improvement here, convert to indicator
	X_train = wv.convert_to_word_vector(df_train.values.T[0])
	X_train[X_train > 0] = 1 
	Y_train = df_train.values.T[1]
	assert len(X_train) == len(Y_train)


	# train the model
	predictor = multiple_naive_bayes(X_train,Y_train)

	# conver to indicator (same vocabulary as the training set)
	X_test = wv.convert_to_word_vector(df_test.values.T[0])
	X_test[X_test > 0] = 1
	Y_test = df_test.values.T[1]

	# get predictions
	predicted = predictor(X_test)
	
	return Y_test, predicted