def main():
    junk_param = sys.argv[1]
    classifier_param = sys.argv[2]
    train_param = sys.argv[3]
    # split_param = sys.argv[4]
    print('Main Program Begins : ')
    write_csv.generate_inkml_file_list()
    symbol_data_obj_list, junk_data_obj_list, test_data_obj_list = parse_data.parse_data(junk_param)
    print(len(symbol_data_obj_list))
    print(len(junk_data_obj_list))
    print(len(test_data_obj_list))
    print('object created')
    symbol_data_obj_list = feature_extraction.get_features(
        symbol_data_obj_list, 'symbol_feature_list.csv')
    junk_data_obj_list = feature_extraction.get_features(
        junk_data_obj_list, 'junk_feature_list.csv')
    test_data_obj_list = feature_extraction.get_features(
        test_data_obj_list, 'test_feature_list.csv')
    print('Features extracted')
    prediction_file, GT_file = classification_driver.classification(
        junk_param, classifier_param, train_param)
    # Feature Extraction follows
    if prediction_file is not None and GT_file is not None:
        command = ('python evalSymbIsole.py ' + data_folder + GT_file + ' ' +
                   data_folder + prediction_file + ' HTML > output.html')
        # After this we can save all features in one csv as a table with final column as output (GT)
        # This will also save time for parsing ISO files again and again.
        os.system(command)
    print('Done!')
def main():
    path_train = "./Train"
    path_test = "./Test"
    N = len(os.listdir(path_train))
    # this number can be determined algebraically: (0.025 + (0.01)(x - 1) = 3 (seconds)); D = x * 13 (cepstrums)
    N_test = len(os.listdir(path_test))
    D = 299 * 13
    X, N, D = get_features(path_train)
    X_test, N_test, _ = get_features(path_test)
    y = get_labels(path_train)
    y_test = get_labels(path_test)
    write_features("./X.txt", X, N, D)
    write_features("./X_test.txt", X_test, N_test, D)
    write_labels("./y.txt", y, N)
    write_labels("./y_test.txt", y_test, N_test)
    X = read_features("./X.txt", N, D)
    X_test = read_features("./X_test.txt", N_test, D)
    y = read_labels("./y.txt", N)
    y_test = read_labels("./y_test.txt", N_test)
    # run on linear svm model
    linear_svm(path_train, path_test, X=X, X_test=X_test, y=y, y_test=y_test)
    # run on rbf kernel svm model
    rbf(path_train, path_test, X=X, X_test=X_test, y=y, y_test=y_test)
def test_data(input_path, model_path):
    data = os.listdir(input_path)
    model = load_model(model_path)
    with open('running_time.txt', 'w') as f:
        pass
    for element in data:
        was = time()
        img_path = os.path.join(input_path, element)
        image = io.imread(img_path)
        words = get_char_images_pred(image)
        test_points = []
        for word in words:
            for char in word:
                test_points.append(get_features(char, False))
        predictions = model.predict(test_points)
        new_pred = []
        current = 0
        for i in range(len(words)):
            for _ in range(len(words[i])):
                new_pred.append(predictions[current])
                current += 1
            new_pred.append(0)
        element = element.split('.')[0]
        element = element + '.txt'
        path = os.path.join("output", "text", element)
        save_predictions(new_pred, path)
        time_taken = time() - was
        with open('output/running_time.txt', 'a') as f:
            f.write(str(time_taken))
            f.write('\n')
def get_samples_from_arr(arr, arr_len=220, step=20, n_samples=1):
    output = np.zeros((n_samples, n_features * n_channels))
    for i in range(n_samples):
        for j in range(n_channels):
            feat_list = get_features(arr[i * step:i * step + arr_len, j])
            output[i, j * n_features:(j + 1) * n_features] = feat_list
    return output
def perform_clustering(path, method, threshold):
    prefix = '/Users/lyudakopeikina/Documents/HSE_FaceRec_tf-master/facial_clustering/lfw_ytf2%s_features.npz'
    # prefix = '/Users/lyudakopeikina/Documents/HSE_FaceRec_tf-master/facial_clustering/faces/features%s.npz'
    crop_center = False
    features_file = os.path.join(path[0], prefix % (recognizer_list[recognizer_ind][1]))
    print(features_file)
    features, labels = get_features(features_file,
                                    recognizer_list[recognizer_ind][2],
                                    recognizer_list[recognizer_ind][0])
    print(len(features[0]))
    X_norm = preprocessing.normalize(features, norm='l2')
    pair_dist = pairwise_distances(X_norm)
    timer = time.time()
    clusters = clustering_results(pair_dist, method, threshold)
    timer = time.time() - timer
    print('clustering time for', method, timer)
    predictions = -np.ones(len(labels))
    for idx, cluster in enumerate(clusters):
        predictions[cluster] = idx
    idx = len(clusters)
    for i in range(len(predictions)):
        if predictions[i] == -1:
            idx += 1
            predictions[i] = idx
    num_of_classes = len(np.unique(labels))
    num_of_clusters = len(clusters)
    print('features shape:', X_norm.shape, '#classes:', num_of_classes, '#clusters:', num_of_clusters)
    return num_of_classes, num_of_clusters, labels, predictions
def features_in_path(folder_path, sentiment):
    """Return features from the files in the given folder path and set their sentiment."""
    all_features = []
    for filename in os.listdir(folder_path)[:1000]:
        path = folder_path + "/" + filename
        tokens = open_and_tokenize(path)
        features = get_features(tokens)
        features["filename"] = filename
        features["sentiment"] = sentiment
        all_features.append(features)
    return all_features
def features_from_label(audio_file, segment):
    """
    Using the label, extract the features from the segment defined by the label.
    """
    duration = segment['end'] - segment['start']
    audio, sample_rate = librosa.core.load(audio_file, duration=duration, offset=segment['start'])
    features = fe.get_features(audio, sample_rate)
    return features
def main():
    POPULATION = 100
    MAX_ITER = 15
    MUTATION_PROB = 0.1
    global df
    df = get_features()
    global n_speakers
    n_speakers = df['a'].cat.categories.size
    # Initial population
    pop = create_initial_population(POPULATION)
    pop_ci = population_ci(pop)
    print('-------------------------\tInitial population\t-------------------------')
    find_best_table(pop, pop_ci)
    for iteration in range(MAX_ITER):
        print('-------------------------\tIteration ' + str(iteration + 1) +
              ' of ' + str(MAX_ITER) + '\t-------------------------')
        # Initialization
        crossover_bag = np.zeros((1, 8), dtype=np.int16)
        # Compute fitness for each table in the population
        pop_fitness = pop_ci * -1
        leveler = np.min(pop_fitness)
        pop_fitness = pop_fitness - leveler
        total_fitness = np.sum(pop_fitness)
        # Generate the crossover_bag
        for i in range(POPULATION):
            perc = pop_fitness[i] / total_fitness
            n = int(round(perc * POPULATION))
            for j in range(n):
                crossover_bag = np.vstack((crossover_bag, pop[i]))
        crossover_bag = crossover_bag[1:]
        # Crossover
        for i in range(POPULATION):
            # Randomly select 2 parents (tables)
            couple = random.sample(range(crossover_bag.shape[0]), 2)
            father = crossover_bag[couple[0]]
            mother = crossover_bag[couple[1]]
            child = crossover(father, mother)
            # Mutation
            if random.random() <= MUTATION_PROB:
                child = mutate(child)
            if i == 0:
                new_pop = child
            else:
                new_pop = np.vstack((new_pop, child))
        pop = new_pop
        # Compute CI for each table in the new population
        pop_ci = population_ci(pop)
        find_best_table(pop, pop_ci)
def process_corpus(tr_in_filename, te_in_filename, u_in_filename,
                   tr_out_filename, te_out_filename, u_out_filename):
    input_f = open(tr_in_filename, 'r')
    tr_original_corpus = pickle.load(input_f)
    input_f.close()
    input_f = open(te_in_filename, 'r')
    te_original_corpus = pickle.load(input_f)
    input_f.close()
    input_f = open(u_in_filename, 'r')
    u_original_corpus = pickle.load(input_f)
    input_f.close()
    tr_instances = [d['question'] for d in tr_original_corpus if '' not in d['target']]
    te_instances = [d['question'] for d in te_original_corpus if '' not in d['target']]
    u_instances = [d['question'] for d in u_original_corpus
                   if (('target' not in d) or '' not in d['target'])]
    vect = get_features()
    vect.fit(tr_instances + te_instances + u_instances)
    v_instances = vect.transform(tr_instances + te_instances + u_instances)
    v_instances = csr_matrix(v_instances > 0, dtype=int8)
    print v_instances.shape
    # Vectorized rows are stacked in order: train, then test, then unlabeled.
    tr_corpus = Corpus()
    tr_corpus.instances = v_instances[:len(tr_instances)]
    tr_corpus.full_targets = [d['target'] for d in tr_original_corpus if '' not in d['target']]
    tr_corpus.representations = [_get_repr(i[0]) for i in tr_instances]
    tr_corpus._features_vectorizer = vect
    tr_corpus.save_to_file(tr_out_filename)
    te_corpus = Corpus()
    te_corpus.instances = v_instances[len(tr_instances):len(tr_instances) + len(te_instances)]
    te_corpus.full_targets = [d['target'] for d in te_original_corpus if '' not in d['target']]
    te_corpus.representations = [_get_repr(i[0]) for i in te_instances]
    te_corpus._features_vectorizer = vect
    te_corpus.save_to_file(te_out_filename)
    u_corpus = Corpus()
    u_corpus.instances = v_instances[len(tr_instances) + len(te_instances):]
    u_corpus.full_targets = [d['target'] if ('target' in d and '' not in d['target']) else []
                             for d in u_original_corpus]
    u_corpus.representations = [_get_repr(i[0]) for i in u_instances]
    u_corpus._features_vectorizer = vect
    u_corpus.save_to_file(u_out_filename)
def get_segment_labels(segs, inkml_obj, detector):
    """ Get classifications for segments. """
    labels = []
    for seg in segs:
        tracez = []
        for tr in seg:
            tracez.append(inkml_obj.get_trace(tr))
        features = fe.get_features(tracez)
        label, prob = detector.score_for_trace([features])
        if label == ',':
            label = 'COMMA'
        labels.append(label)
    return labels
def predict_arcs(conll_dict, model, feature_dict):
    buff = [i for i in range(len(conll_dict['FORM']))[::-1]]
    stack, dgraph = [], []
    while (len(buff) > 0 or len(stack) > 1):
        config = (stack, buff, dgraph)
        features = get_features(config, conll_dict)
        binary_features = one_hot_encoding([features], feature_dict)
        choice = model.predict(binary_features)
        try:
            if choice == 'shift':
                shift(stack, buff, stack)
            elif choice == 'left_arc':
                left_arc(stack, buff, dgraph)
            elif choice == 'right_arc':
                right_arc(stack, buff, dgraph)
            else:
                return None
        except IndexError:
            break
    return dgraph
def get_feature_vectors_for_training(data):
    projective_tree_count = 0
    projective_non_parsable = []
    X = []
    y = []
    for id, sent_dict in data.items():
        if len(sent_dict['FORM']) == 1:
            # Example: train file, line 97384. Text: '************************'
            continue
        gold_arcs = get_gold_arcs(sent_dict['HEAD'])
        buff = [i for i in range(len(sent_dict['FORM']))[::-1]]
        projective = is_projective(gold_arcs, len(sent_dict['FORM']))
        if not projective:
            continue
        try:
            dgraph, configurations = make_transitions(buff, oracle_std, gold_arcs)
        except IndexError:
            projective_non_parsable.append(sent_dict)
            continue
        for config in configurations:
            X.append(get_features(config[:2], sent_dict))
            y.append(config[2])
        # Root missing.
        if set(gold_arcs) - set(dgraph):
            print("Missing arcs", set(gold_arcs) - set(dgraph))
        projective_tree_count += 1
    feature_values = set([feature for row in X for feature in row])
    feature_dict = {feature: i for i, feature in enumerate(feature_values)}
    with open('feature_dict.pkl', 'wb') as f:
        pickle.dump(feature_dict, f)
    X_ = one_hot_encoding(X, feature_dict)
    a = 0.0
    for i in range(len(X_)):
        b = float(sum(X_[i]))
        c = float(len(X[i]))
        a += b
    y = np.array(y)
    print("Number of valid projection trees : " + str(projective_tree_count))
    return X_, y
def emotion_classifier(audio_source_path, storage_name, action):
    get_observed_emotions_codes = get_emotion_code_from_description(emotion_labels)(observed_emotions)
    [extraction_active, train_active] = parse_action(action)
    print("Starting...")
    print("Feature extraction: {0}".format(extraction_active))
    print("Network train: {0}".format(train_active))
    execute = (pipe
               | get_features(mfcc_required=True,
                              chroma_required=True,
                              mel_required=True,
                              storage_name=storage_name,
                              active=extraction_active)
               | partial(filter, filter_dataset(get_observed_emotions_codes))
               | list
               | train_network())
    execute(audio_source_path)
def avg(training_file, submission_file, output_file):
    data = utilities.read_file(training_file)
    train_data, cv_data = preprocess.get_train_cv_data_by_chunk(data)
    targets_train, targets_cv = preprocess.get_train_cv_targets(train_data, cv_data)
    (chunk_avg, hour_avg_by_chunk, weekday_avg_by_chunk,
     hour_avg, weekday_avg) = feature_extraction.get_avg_maps(train_data)
    x_train_all, x_cv_all = feature_extraction.get_x_by_avg(
        train_data, cv_data, chunk_avg, hour_avg_by_chunk,
        weekday_avg_by_chunk, hour_avg, weekday_avg)
    clfs = regression.linear_regression(x_train_all, x_cv_all, targets_train, targets_cv)
    clfs = regression.random_forest(x_train_all, x_cv_all, targets_train, targets_cv)
    print 'Filling submission file...'
    sub_data = utilities.read_file(submission_file, True)
    for i in range(1, len(sub_data)):
        chunk_id = sub_data[i][1]
        hour = sub_data[i][3]
        weekday = ''
        all_features = feature_extraction.get_features(
            chunk_id, weekday, hour, chunk_avg, hour_avg_by_chunk,
            weekday_avg_by_chunk, hour_avg, weekday_avg)
        for j in range(5, len(sub_data[i])):
            if sub_data[i][j] == '0':
                feature = []
                for f in all_features:
                    feature.append(f[j - 5])
                sub_data[i][j] = clfs[j - 5].predict([feature])[0]
    utilities.write_file(output_file, sub_data)
def get_ft(dataset, id):
    print("Extracting features for: %s" % id)
    img = dataset[id].combined((Color.blue, Color.yellow, Color.red))
    return get_features(img, method=Feature.dct)
# Hyperparameter variables go here
k = 5
k_lim = 20

# Dummy data set until we get the feature fully solved
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']
dataset = pa.read_csv(url, names=names)
# print(dataset.head())
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# Real data for X and y
y = fe.read_instruments()
X = [fe.get_features(filename) for filename in fe.get_wav_files()]
print("X length:")
print(len(X))
print("and Y length:")
print(len(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Input data plotting
# fig = plt.figure(figsize=(6,6))
# plt.scatter(X_train, y_train, color='c', label='train')
# plt.scatter(X_test, y_test, color='m', label='test')
# plt.xlabel('x')
def rbf(path_train, path_test, pca_comp=200, rbf_gamma=0.0003, rbf_C=10,
        X=None, X_test=None, y=None, y_test=None):
    '''
    This function trains and tests an svm with rbf kernel according to the data files
    specified in path_train and path_test
    :param path_train: str: path to folder with train files
    :param path_test: str: path to folder with test files
    :param pca_comp: int: number of components that pca will reduce the feature dimensions to
    :param rbf_gamma: float: gamma parameter for rbf kernel
    :param rbf_C: float: C parameter for rbf kernel
    :param X: ndarray (N, D): feature matrix to train rbf kernel with; default is None
    :param X_test: ndarray (number of test samples, D): test feature matrix to test rbf kernel; default is None
    :param y: ndarray (N,): labels for training samples; default is None
    :param y_test: ndarray (number of test samples,): labels for test samples; default is None
    '''
    print("\n__________RBF__________")
    pca = PCA(pca_comp)
    rbf = SVC(gamma=rbf_gamma, C=rbf_C, kernel='rbf')
    print("Extracting features for training...")
    if X is None:
        # extract mfcc features for training
        X = get_features(path_train)
    # normalize the features
    x_train_mean, x_train_std = train_normalize(X)
    X = (X - x_train_mean) / x_train_std
    # reduce to pca_comp dimensions using pca
    X = pca.fit_transform(X)
    if y is None:
        y = get_labels(path_train)
    print("Training with RBF...")
    # we train using an svm with an rbf kernel
    rbf.fit(X, y)
    y_pred_train = rbf.predict(X)
    print("Extracting features for testing...")
    if X_test is None:
        X_test = get_features(path_test)
    # apply same normalization and pca dimensionality reduction to test feature matrix
    X_test = (X_test - x_train_mean) / x_train_std
    X_test = pca.transform(X_test)
    print("Testing with RBF...")
    if y_test is None:
        y_test = get_labels(path_test)
    y_pred = rbf.predict(X_test)
    print("\nTrain accuracy: ", np.mean(y_pred_train == y) * 100, "%", sep='')
    print("Test accuracy: ", np.mean(y_pred == y_test) * 100, "%", sep='')
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
def linear_svm(path_train, path_test, svm_alpha=0.0001, lr=1e-4, num_iter=150,
               X=None, X_test=None, y=None, y_test=None):
    '''
    This function trains and tests a linear svm according to the data files
    specified in path_train and path_test
    :param path_train: str: path to folder with train files
    :param path_test: str: path to folder with test files
    :param svm_alpha: float: regularization parameter for the linear svm model
    :param lr: float: learning rate parameter for the svm model
    :param num_iter: int: maximum number of iterations for the linear svm model
    :param X: ndarray (N, D): feature matrix to train the linear svm with; default is None
    :param X_test: ndarray (number of test samples, D): test feature matrix to test the linear svm; default is None
    :param y: ndarray (N,): labels for training samples; default is None
    :param y_test: ndarray (number of test samples,): labels for test samples; default is None
    '''
    print("\n__________Linear SVM__________")
    num_classes = 6
    svm = sk.linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=svm_alpha,
                                        learning_rate='constant', eta0=lr, tol=1e-5,
                                        max_iter=num_iter, early_stopping=True)
    print("Extracting features for training...")
    if X is None:
        # extract mfcc features for training
        X = get_features(path_train)
    # normalize the features
    x_train_mean, x_train_std = train_normalize(X)
    X = (X - x_train_mean) / x_train_std
    if y is None:
        y = get_labels(path_train)
    print("Training with Linear SVM...")
    # initialize W and b to small random weights
    W = np.random.uniform(-0.001, 0.001, (num_classes, 299 * 13))
    b = np.random.uniform(-0.001, 0.001, (num_classes))
    # we train using a linear svm
    svm.fit(X, y, W, b)
    y_pred_train = svm.predict(X)
    print("Extracting features for testing...")
    if X_test is None:
        X_test = get_features(path_test)
    # apply same normalization to test feature matrix
    X_test = (X_test - x_train_mean) / x_train_std
    print("Testing with Linear SVM...")
    if y_test is None:
        y_test = get_labels(path_test)
    y_pred = svm.predict(X_test)
    print("\nTrain accuracy: ", np.mean(y_pred_train == y) * 100, "%", sep='')
    print("Test accuracy: ", np.mean(y_pred == y_test) * 100, "%", sep='')
import os

import numpy as np
import keras
from tqdm import tqdm as tqdm
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD

import feature_extraction

if __name__ == '__main__':
    img_root = r'G:\Workspace\DS&Alg-Project1-Release\data\image'
    images, features = feature_extraction.get_features(img_root)
    images_categories = list({img.split('_')[0] for img in images})
    category_map = {
        images_categories[i]: i
        for i in range(len(images_categories))
    }
    x_train = features
    labels = np.array(list(map(lambda x: category_map[x.split('_')[0]], images)))
    y_train = keras.utils.to_categorical(labels.reshape((x_train.shape[0], 1)),
                                         num_classes=len(images_categories))

    model = Sequential()
    #! Length hard coded.
    model.add(Dense(512, activation='relu', input_dim=7 * 7 * 512))
    model.add(Dropout(0.5))
def parallel(i):
    features = get_features(*i)
    features.pop("label")
    return pd.DataFrame(features, index=[0])
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
import pickle

import feature_extraction

pos_data = np.load('sarcasm_array.npy')
neg_data = np.load('non_sarcasm_array.npy')
#print 'Number of sarcastic tweets :', len(pos_data)
#print 'Number of non-sarcastic tweets :', len(neg_data)

cls_set = ['Non-Sarcastic', 'Sarcastic']
features = []
for tweet in pos_data:
    features.append((feature_extraction.get_features(tweet), cls_set[1]))
for tweet in neg_data:
    features.append((feature_extraction.get_features(tweet), cls_set[0]))
features = np.array(features)
targets = (features[0::, 1] == 'Sarcastic').astype(int)

vec = DictVectorizer()
features_vec = vec.fit_transform(features[0::, 0])
pickle.dump(vec, open("features_dict.pkl", 'wb'))

order = shuffle(range(len(features)))
targets = targets[order]
features_vec = features_vec[order, 0::]
def train_and_test(data_dir=DATA_DIR, results_dir=RESULTS_DIR, feature_types=FEATURE_TYPES):
    p = Path(data_dir)
    y_train_ = pd.read_csv(os.path.join(DATA_DIR, 'y_train.csv'), index_col=0, header=None, squeeze=True)
    y_train_ = y_train_.map(LANGUAGES)
    y_test_ = pd.read_csv(os.path.join(DATA_DIR, 'y_test.csv'), index_col=0, header=None, squeeze=True)
    y_test_ = y_test_.map(LANGUAGES)
    results = _recursive_defaultdict()
    for dir_ in [d for d in p.iterdir() if d.is_dir()]:
        # load the relevant preprocessed data
        print(f"loading X_train for {dir_.name}")
        X_train = pd.read_csv(os.path.join(dir_, 'X_train.csv'), index_col=0)
        print(f"indexing y_train for {dir_.name}")
        y_train = y_train_.loc[X_train.index]
        results[dir_.name]['y_train'] = y_train
        print(f"loading X_test for {dir_.name}")
        X_test = pd.read_csv(os.path.join(dir_, 'X_test.csv'), index_col=0)
        print(f"indexing y_test for {dir_.name}")
        y_test = y_test_.loc[X_test.index]
        results[dir_.name]['y_test'] = y_test
        for feature_type in feature_types:
            # select features to use in training
            features = get_features(X_train, type_=feature_type)
            print(f"using features `{feature_type}` with preprocessing `{dir_.name}`")
            train_data = lgb.Dataset(features, label=y_train)
            # tune hyper parameters and save them
            best_score, best_hyperparams = tune_hyperparams(train_data)
            results[dir_.name][feature_type]['cv_error'] = best_score
            results[dir_.name][feature_type]['params'] = best_hyperparams
            # train
            print(f"training using features `{feature_type}` with preprocessing `{dir_.name}`")
            print(f"training params:\n {best_hyperparams}")
            booster = lgb.train(best_hyperparams, train_data)
            # calculate training metrics
            train_predictions = booster.predict(features).argmax(axis=1)
            train_predictions = pd.Series(train_predictions, index=X_train.index)
            results[dir_.name][feature_type]['train_pred'] = train_predictions
            results[dir_.name][feature_type]['train_error'] = np.mean(train_predictions != y_train)
            print(f"training error is {results[dir_.name][feature_type]['train_error']}")
            # calculate test metrics
            test_features = get_features(X_test, type_=feature_type)
            test_predictions = booster.predict(test_features).argmax(axis=1)
            test_predictions = pd.Series(test_predictions, index=X_test.index)
            results[dir_.name][feature_type]['test_pred'] = test_predictions
            results[dir_.name][feature_type]['test_error'] = np.mean(test_predictions != y_test)
            print(f"test error is {results[dir_.name][feature_type]['test_error']}")
            # save the model
            models_dir = os.path.join(dir_, MODELS_DIR)
            os.makedirs(models_dir, exist_ok=True)
            booster.save_model(os.path.join(models_dir, f"{feature_type}_model.txt"))
    # save the results
    os.makedirs(results_dir, exist_ok=True)
    with open(os.path.join(results_dir, 'results.pkl'), 'wb') as handle:
        pickle.dump(results, handle)
    return results
        print(original_data.question1[index])
        counter = counter + 1
        if counter == count:
            return


if __name__ == '__main__':
    best_iteration = 111
    question = "What are the best ways to loose weight?"
    prepaire_traindata = True
    #prepaire_traindata = False
    start = clock()
    if prepaire_traindata:
        # prepare traindata
        #feature_extraction.extract_features(False, True)
        feature_selection.select_features_at(best_iteration)
        model_training.train_and_save_model()
    generate_tmp_data(question)
    train_features = feature_extraction.get_features(data_paths.tmp)
    train_features.to_csv(data_paths.tmp_features, index=False)
    prediction.predict_and_write_data(data_paths.tmp, data_paths.tmp_features, data_paths.tmp_submission)
    get_top_questions(30)
    print('Overall duration: ', round(clock() - start, 0), 'seconds')
def predictions(url):
    ff = fe.get_features(url)
    prob = RF_clf.predict_proba(ff)[0][1] * 100
    return prob, ff
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# Importing data cleaning and feature extraction functions
from data_cleaning import clean_words
from feature_extraction import get_features

# Datasets path
true_dataset_path = 'dataset/True.csv'
fake_dataset_path = 'dataset/Fake.csv'

# Data pre-processing
df = get_features(true_dataset_path, fake_dataset_path)

# Cleaning text data
df['total'] = df['total'].apply(clean_words)

# Defining x and y as feature and label respectively
x = df['total']
y = df['label']

# Train-Test splitting
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# Logistic Regression
LR = Pipeline([('TFIDF_Vectorizer', TfidfVectorizer()),
               ('Logistic_Regression', LogisticRegression(n_jobs=-1))])