import numpy as np
import pandas as pd

# WordVectorizer, multiple_naive_bayes, print_color, and COLORS are assumed
# to be defined elsewhere in this project.


def process(self, use_chi2=True, chi2_param=5000, nb_max=1000):
    if not self.silent:
        print_color("Frodo is processing data.", COLORS.GREEN)

    # cap the training set at nb_max rows for performance
    training_data = np.array(self.training_data[:nb_max])
    interview_text = training_data[:, 1]
    Y = training_data[:, 2].astype(int)
    df_train = np.array([interview_text, Y]).T

    # fit the word vectorizer on the training set
    self.wv = WordVectorizer(df_train, contains_prediction=True,
                             use_chi2=use_chi2, chi2_param=chi2_param)

    # convert word counts to a binary (indicator) representation
    X = self.wv.convert_to_word_vector(df_train[:, 0])
    X[X > 0] = 1

    assert len(X) == len(Y), str(X.shape) + ' is not equal to ' + str(Y.shape)

    # kept for internal inspection
    self.X_internal = X
    self.Y_internal = Y
    self.df_train_internal = df_train
    self.interview_text_internal = interview_text

    try:
        self.predictor = multiple_naive_bayes(X, Y)
    except Exception as e:
        print(str(e))

    # don't remove this:
    self.has_processed = True
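# A minimal usage sketch (hypothetical): `process` above is a method, so it
# assumes an enclosing class (called `Frodo` here purely for illustration,
# after the message it prints) that provides `silent` and `training_data`
# (rows of [id, text, label]).
#
#   frodo = Frodo(training_data=rows, silent=False)
#   frodo.process(use_chi2=True, chi2_param=5000, nb_max=1000)
#   X_new = frodo.wv.convert_to_word_vector(np.array(new_texts))
#   X_new[X_new > 0] = 1          # same indicator conversion as in training
#   y_hat = frodo.predictor(X_new)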
def run_nb(train_size=2000, ngram_range=(1, 1), smoothing=1,
           learn_code='0000', kbest=None):
    # NOTE: `smoothing` is currently unused in this function.
    df_complete = pd.read_csv('./clean/ml_dataset_train-' + learn_code + '.csv',
                              index_col=0)
    n = len(df_complete)
    train_split = 0.8

    # subset of the training set for performance purposes
    # (np.random.choice samples with replacement by default)
    subset_indices = np.random.choice(
        np.arange(0, int(n * train_split)), size=train_size)
    df_train = df_complete.iloc[subset_indices]

    # subset of the testing set for performance purposes
    subset_indices = np.random.choice(
        np.arange(int(n * train_split), n), size=2000)
    df_test = df_complete.iloc[subset_indices]

    # create a word vector using our training set
    if kbest:
        wv = WordVectorizer(df_train.values, contains_prediction=True,
                            ngram_range=ngram_range, use_chi2=True,
                            chi2_param=kbest)
    else:
        wv = WordVectorizer(df_train.values, contains_prediction=True,
                            ngram_range=ngram_range)

    # convert word counts to a binary (indicator) representation
    X_train = wv.convert_to_word_vector(df_train.values.T[0])
    X_train[X_train > 0] = 1
    Y_train = df_train.values.T[1]
    assert len(X_train) == len(Y_train)

    # train the model
    predictor = multiple_naive_bayes(X_train, Y_train)

    # convert the test set to the same indicator representation
    X_test = wv.convert_to_word_vector(df_test.values.T[0])
    X_test[X_test > 0] = 1
    Y_test = df_test.values.T[1]

    # get predictions
    predicted = predictor(X_test)
    return Y_test, predicted
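# A hedged usage sketch: run one train/test split and report accuracy. The
# CSV path and the '0000' learn_code default are taken from the signature
# above; the ngram_range and kbest values here are illustrative.
if __name__ == '__main__':
    y_true, y_pred = run_nb(train_size=2000, ngram_range=(1, 2),
                            learn_code='0000', kbest=5000)
    accuracy = np.mean(np.asarray(y_true) == np.asarray(y_pred))
    print('accuracy: {:.3f}'.format(accuracy))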