def get_processed_data():
    # Assumes pandas (pd), the preprocessing module data_pp and
    # clean_body_labels() are available at module level.
    import features
    (train_data, X_train, val_data, X_val, test_data, X_test) = features.get_data()
    all_data = pd.concat([train_data, val_data, test_data], ignore_index=True)
    data = data_pp.process_data(all_data)
    clean_body_labels(data)
    # Split the preprocessed frame back into the original train/val/test partitions.
    train_data_pp = data[:len(train_data)]
    val_data_pp = data[len(train_data):len(train_data) + len(val_data)]
    test_data_pp = data[len(train_data) + len(val_data):]
    return (train_data_pp, X_train, val_data_pp, X_val, test_data_pp, X_test)
def predict_class(img, model):
    # display_image(img, " Prediction Module")
    # Relies on module-level imports: pickle, cv2, numpy as np, Keras'
    # model_from_json, and the project modules mp, fea and proba.
    # Load ANN
    with open('MLPClassifier.pkl', 'rb') as f:
        clf1 = pickle.load(f)
    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)
    # Load CNN
    json_file = open('CNNmodelFinal.json', 'r')
    loaded_model_json = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_model_json)
    # Load weights into the new model (raw string so the backslashes in the
    # Windows path are not treated as escape sequences).
    loaded_model.load_weights(
        r"F:\CODING\ProjectLatex\draft\models\.014-0.783.hdf5")
    loaded_model.compile(loss='categorical_crossentropy',
                         optimizer='adadelta',
                         metrics=['accuracy'])
    if model == "cnn":
        img = cv2.resize(img, (128, 128))
        # img = pre.filter_image(img)
        # img = pre.otsu_thresh(img)
        # print(img)
        immatrix = []
        # img_arr = array(np.asarray(img)).flatten()
        immatrix.append(img)
        inp = np.asarray(immatrix)
        Output = proba.prob(img)
        inp = inp.reshape(inp.shape[0], 128, 128, 1)
        inp = inp.astype('float32')
        inp /= 255
        # print(inp)
        output = loaded_model.predict_classes(inp)
        # print(output)
        z = mp.list[int(output[0])]
        # output = proba(Output, z)
        return Output
    else:
        x = fea.get_data(img)
        temp = []
        temp.append(x)
        temp = scaler.transform(temp)
        y = clf1.predict(temp)
        y = mp.list[int(y[0])]
        if y in mp.small:
            y = y.lower()
        # print(y + ' Got Predicted')
        return y
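# Hypothetical usage of predict_class, assuming the pickled classifier/scaler
# and the CNN JSON/weight files referenced above are present, and that
# 'sample_char.png' is a grayscale crop of a single character.
char_img = cv2.imread('sample_char.png', 0)
label = predict_class(char_img, model="ann")  # any value other than "cnn" uses the MLP path
print(label)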
# Fragment of the dataset-writing loop: assumes pos, path, get_data() and the
# open file handles train_in, test_in and test_out from the surrounding script.
for files in os.listdir(path):
    real_value = pos
    print(files)
    print(real_value)
    if pos < 95:
        pos = pos + 1
        continue
    path2 = os.listdir(path + files + '/')
    y = len(path2)
    cnt = 0
    boundary = (y * 90) / 100
    for images in path2:
        val = path + files + '/' + images
        img = cv2.imread(val, 0)
        cnt = cnt + 1
        new_list = get_data(img)
        if new_list == -1:
            continue
        # print(new_list)
        if cnt > boundary:
            for i in new_list:
                test_in.write(str(i))
                test_in.write(" ")
            test_in.write('\n')
            test_out.write(str(real_value))
            test_out.write('\n')
        else:
            for i in new_list:
                train_in.write(str(i))
                train_in.write(" ")
            train_in.write('\n')
def image_to_text(img_path):
    # Parameter renamed from `str` so it no longer shadows the built-in.
    img = cv2.imread(img_path, 0)
    # img = cv2.resize(img, (50, 50), interpolation=cv2.INTER_CUBIC)
    with open('KNNClassifier.pkl', 'rb') as f:
        clf2 = pickle.load(f)
    with open('ExtraTreesClassifier.pkl', 'rb') as f:
        clf1 = pickle.load(f)
    with open('MPLClassifier4.pkl', 'rb') as f:
        clf3 = pickle.load(f)
    with open('scaler4.pkl', 'rb') as f:
        scaler = pickle.load(f)
    list_chars = template.run(img)
    # print(len(list_chars))
    # print(len(list_chars[0]))

    def decode(out_vec):
        # Turn each one-hot prediction row into a character: positions 1-10 map
        # to the digits '0'-'9', 11-36 to upper-case letters, the rest to
        # lower-case letters. (Factored out of the three identical loops in the
        # original; behaviour is unchanged.)
        decoded = ""
        for vec in out_vec:
            cnt = 0
            for i in vec:
                cnt = cnt + 1
                if i == 1:
                    break
            val = ""
            if cnt < 11:
                cnt = cnt - 1
                val = chr(48 + cnt)
            elif cnt > 10 and cnt < 37:
                cnt = cnt - 11
                val = chr(65 + cnt)
            else:
                cnt -= 37
                val = chr(97 + cnt)
            decoded = decoded + val
        return decoded

    ret_str = ""
    for word_list in list_chars:
        chars = []
        for char_img in word_list:
            datas = get_data(char_img)
            # util.display_image(char_img)
            chars.append(datas)
        chars = scaler.transform(chars)
        # The same decoding is applied to each classifier's predictions.
        x1 = decode(clf1.predict(chars))
        x2 = decode(clf2.predict(chars))
        x3 = decode(clf3.predict(chars))
        # Take the first non-'z' vote per position; fall back to 'z'.
        finalx = ""
        for i in range(0, len(x1)):
            l = []
            if x1[i] != 'z':
                l.append(x1[i])
            if x2[i] != 'z':
                l.append(x2[i])
            if x3[i] != 'z':
                l.append(x3[i])
            if len(l) == 0:
                finalx += 'z'
            else:
                finalx += l[0]
        ret_str += finalx
        ret_str += " "
    return ret_str
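# Hypothetical usage: run the OCR pipeline on a page image and print the
# recognised text ('sample_page.png' is a placeholder path, and the pickled
# classifiers above must exist on disk).
text = image_to_text('sample_page.png')
print(text)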
# Author: Sebastian Law
# Date: 30-Mar-2016
# Revised: 30-Mar-2016

import features
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.learning_curve import learning_curve

data = features.get_data()
train = data.loc[data['Survived'].notnull()]
X = train.values[:, 2:]
y = train.values[:, 1]

# forest = RandomForestClassifier(n_estimators=1000,
#                                 max_depth=8,
#                                 criterion='entropy',
#                                 min_samples_split=5,
#                                 max_features=6)
forest = RandomForestClassifier(n_estimators=1000,
                                max_depth=9,
                                criterion='entropy',
                                min_samples_split=10,
                                max_features=6)
train_sizes, train_scores, test_scores = learning_curve(
    forest, X, y, cv=10)
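# The script is cut off after the learning_curve call; a minimal sketch of how
# the returned scores are commonly averaged and plotted (this continuation is
# an assumption, not the author's original code).
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
plt.plot(train_sizes, train_mean, 'o-', label='training score')
plt.plot(train_sizes, test_mean, 'o-', label='cross-validation score')
plt.xlabel('training examples')
plt.ylabel('score')
plt.legend(loc='best')
plt.show()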
# Imports assumed by this snippet (it uses the pre-0.18 sklearn grid_search API,
# whose grid scores expose mean_validation_score and cv_validation_scores).
from operator import itemgetter
import numpy as np
import features
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV


def report(grid_scores, n_top=6):
    params = None
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Rank: {0}".format(i + 1))
        print("Mean score: {0:.4f} (std: {1:.4f})".format(
            score.mean_validation_score,
            np.std(score.cv_validation_scores)))
        print("Parameters:", score.parameters)
        print("")
        if params is None:
            params = score.parameters
    return params


data = features.get_data()
train = data.loc[data['Survived'].notnull()]
X = train.values[:, 2:]
y = train.values[:, 1]
sqrtfeat = np.sqrt(X.shape[1]).astype(int)
grid_test = {"n_estimators": [1000, 2000, 3000, 4000, 5000],
             "criterion": ["gini", "entropy"],
             "max_features": [sqrtfeat, sqrtfeat + 1, sqrtfeat + 2, sqrtfeat + 3],
             "max_depth": [5, 7, 9, 11, 13],
             "min_samples_split": [2, 4, 6, 8, 10]}
forest = RandomForestClassifier(oob_score=True)
grid_search = GridSearchCV(forest, grid_test, n_jobs=-1, cv=10)
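# Hypothetical continuation: fit the grid search and use report() to print the
# top-ranked parameter sets (assumes the pre-0.18 grid_scores_ attribute).
grid_search.fit(X, y)
best_params = report(grid_search.grid_scores_)
forest = RandomForestClassifier(oob_score=True, **best_params)
forest.fit(X, y)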
# code to train the algorithms
# written by Rajat Arora 2013A7PS104P
from features import get_data, sentiment_score, add_sentiment_score
from vector import get_word_features, vectorize, get_words, naive_bayes_vector
import cPickle
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from maxent import Maxent
import numpy
from naive_bayes import NaiveBayesClassifier
from svm_classifier import SVM

data = get_data('shortdatabase.csv')
word_features = get_word_features(data['tweet'])
word_features = sorted(word_features)
word_vector = vectorize(word_features, data['tweet'], data['sentiment'])
vector = []
labels = []
for example in word_vector:
    vector = vector + [example[0]]
    labels = labels + [example[1]]

print "Stage 1: Word Polarity"
print "training bayesian network"
words = get_words("features.txt")
bayes_vector = naive_bayes_vector(words, data['tweet'], data['sentiment'])
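# The snippet ends before any model is fit; a hypothetical next step, fitting
# one of the imported scikit-learn classifiers on the vectorized tweets and
# pickling it (classifier choice and output file name are assumptions).
clf = BernoulliNB()
clf.fit(numpy.array(vector), numpy.array(labels))
with open('bernoulli_nb.pkl', 'wb') as f:
    cPickle.dump(clf, f)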
def run():
    f = 0
    test_data = []
    output = []
    train_data = []
    predicted_output = []
    x = 0
    y = 0
    cwd = os.getcwd()
    train_in = 'train_input6.txt'
    test_in = 'test_input6.txt'
    train_out = 'train_out6.txt'
    test_out = 'test_out6.txt'
    train_in = open(train_in, 'w')
    train_out = open(train_out, 'w')
    test_in = open(test_in, 'w')
    test_out = open(test_out, 'w')
    path = cwd + '/Fnt6/'
    for files in os.listdir(path):
        # Directory names end in the class index: 1-10 are digits,
        # 11-36 upper-case letters, 37-62 lower-case letters.
        character_value = int(files[6:len(files)])
        # print(files)
        # cnt += 1
        # print(character_value)
        real_value = 0
        if character_value > 0 and character_value < 11:
            real_value = str(character_value - 1)
        elif character_value > 10 and character_value < 37:
            character_value = character_value - 10
            real_value = chr(65 + character_value - 1)
        else:
            character_value = character_value - 36
            real_value = chr(97 + character_value - 1)
        # print(real_value)
        path2 = os.listdir(path + files + '/')
        y = len(path2)
        # print(path2)
        cnt = 0
        # Roughly the first 90% of each class goes to training, the rest to test.
        boundary = (y * 90) / 100
        for images in path2:
            val = path + files + '/' + images
            # print(val)
            img = cv2.imread(val, 0)
            # print(img)
            cnt = cnt + 1
            new_list = get_data(img)
            # print(new_list)
            if cnt > boundary:
                test_data.append(new_list)
                x = x + 1
                output.append(real_value)
                for i in new_list:
                    test_in.write(str(i))
                    test_in.write(" ")
                test_in.write('\n')
                test_out.write(str(real_value))
                test_out.write('\n')
            else:
                train_data.append(new_list)
                y = y + 1
                predicted_output.append(real_value)
                for i in new_list:
                    train_in.write(str(i))
                    train_in.write(" ")
                    # train_in.write(str(new_list))
                train_in.write('\n')
                train_out.write(str(real_value))
                train_out.write('\n')
    train_out.close()
    train_in.close()
    test_out.close()
    test_in.close()
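# Common entry-point guard so the split is only generated when the script is
# executed directly (an assumption; not part of the original snippet).
if __name__ == '__main__':
    run()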
def predict(in_fname, lin_n_cv_iters, n_cv_iters, regularizations, n_labs,
            age_index, gender_index, out_fname, nn_out_fname=None,
            verbose=False, emb_fnames=None):
    if verbose:
        print "loading data"
    X_train, Y_train, X_validation, Y_validation, X_test, Y_test = features.get_data(
        in_fname)

    emb_data_list = [None]
    emb_fname_list = ['']
    if emb_fnames is not None:
        for emb_fname in emb_fnames:
            emb_data_list.append(emb.get_emb_data(emb_fname))
            emb_fname_list.append(emb_fname)

    if verbose:
        print "training, validating and testing models"
    results = []
    for e, emb_data in enumerate(emb_data_list):
        if verbose:
            print str(e)

        if verbose:
            print "-->L2"
        model = models.L2(X_train, Y_train, X_validation, Y_validation,
                          X_test, Y_test, n_labs, emb_data)
        if lin_n_cv_iters == -1:
            params = [[False, True], regularizations]
        else:
            params = [['sample', False, True],
                      ['uniform', regularizations[0], regularizations[-1]]]
        model.crossvalidate(params=params,
                            param_names=['fit_intercept', 'C'],
                            n_cv_iters=lin_n_cv_iters)
        model.test()
        s = model.summarize()
        s['emb_fname'] = emb_fname_list[e]
        results.append(s)

        if verbose:
            print "-->L1"
        model = models.L1(X_train, Y_train, X_validation, Y_validation,
                          X_test, Y_test, n_labs, age_index, gender_index,
                          emb_data)
        if lin_n_cv_iters == -1:
            params = [[False, True], regularizations]
        else:
            params = [['sample', False, True],
                      ['uniform', regularizations[0], regularizations[-1]]]
        model.crossvalidate(params=params,
                            param_names=['fit_intercept', 'C'],
                            n_cv_iters=lin_n_cv_iters)
        model.test()
        s = model.summarize()
        s['emb_fname'] = emb_fname_list[e]
        results.append(s)

        if verbose:
            print "-->RandomForest"
        model = models.RandomForest(X_train, Y_train, X_validation,
                                    Y_validation, X_test, Y_test, emb_data)
        if n_cv_iters == -1:
            params = [[1, 10, 20], [1, 3, 10],
                      ['sqrt_n_features', 'n_features'],
                      [1, 3, 10], [1, 3, 10],
                      [True, False], ['gini', 'entropy']]
        else:
            params = [['randint', 1, 20], ['randint', 1, 10],
                      ['sample', 'sqrt_n_features', 'n_features'],
                      ['randint', 1, 10], ['randint', 1, 10],
                      ['sample', True, False], ['sample', 'gini', 'entropy']]
        param_names = ['n_estimators', 'max_depth', 'max_features',
                       'min_samples_split', 'min_samples_leaf',
                       'bootstrap', 'criterion']
        model.crossvalidate(params=params, param_names=param_names,
                            n_cv_iters=n_cv_iters)
        model.test()
        s = model.summarize()
        s['emb_fname'] = emb_fname_list[e]
        results.append(s)

        if emb_data is not None:
            if verbose:
                print "-->Only embeddings"
            model = models.L(emb_data[0], Y_train, emb_data[1], Y_validation,
                             emb_data[2], Y_test, None)
            if lin_n_cv_iters == -1:
                params = [['l1', 'l2'], [False, True], regularizations]
            else:
                params = [['sample', 'l1', 'l2'], ['sample', False, True],
                          ['uniform', regularizations[0], regularizations[-1]]]
            model.crossvalidate(params=params,
                                param_names=['penalty', 'fit_intercept', 'C'],
                                n_cv_iters=lin_n_cv_iters)
            model.test()
            s = model.summarize()
            s['emb_fname'] = emb_fname_list[e]
            results.append(s)

    with open(out_fname, 'w') as fout:
        fout.write(yaml.dump(results))

    if nn_out_fname is not None:
        best_model = nn.evaluate(nn_out_fname, n_cv_iters, 20,
                                 X_train, Y_train, X_validation, Y_validation,
                                 X_test, Y_test, 45,
                                 models=['cnn2'], random_seed=345,
                                 verbose=verbose)
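# Hypothetical invocation; every file name, index and hyper-parameter below is
# a placeholder, not taken from the original project.
predict(in_fname='data.h5',
        lin_n_cv_iters=20,
        n_cv_iters=50,
        regularizations=[0.01, 0.1, 1.0, 10.0],
        n_labs=20,
        age_index=0,
        gender_index=1,
        out_fname='results.yaml',
        verbose=True)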