def extract_features(segment):
    # Extract the raw feature vector for one segment and scale it with
    # both pre-fitted scalers.
    data = np.asarray(extract(np.asarray(segment)))
    data = np.array([data])  # reshape to (1, n_features) for the scalers
    data_chu = scaler_chu.transform(data)
    data_df = scaler_df.transform(data)
    return data_chu, data_df
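A runnable sketch of extract_features in isolation; the stand-in extract and the StandardScaler fits are assumptions, since the real feature extractor and pre-fitted scalers live elsewhere in the project:

import numpy as np
from sklearn.preprocessing import StandardScaler

def extract(signal):
    # Hypothetical stand-in: four summary statistics per segment
    return np.array([signal.mean(), signal.std(), signal.min(), signal.max()])

dummy = np.random.rand(100, 4)            # assumed 4-dim feature space
scaler_chu = StandardScaler().fit(dummy)
scaler_df = StandardScaler().fit(dummy)

data_chu, data_df = extract_features(np.random.rand(16000))
print(data_chu.shape, data_df.shape)      # each (1, 4)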
def prepareData(ratings, reviews):
    # Call the extract method to retrieve the most relevant words (features)
    features = extract(reviews)
    # For each review, keep only the words that are features
    # (`filter` is presumably the project's token filter, not the builtin)
    filtered_tokens = [filter(review, features) for review in reviews]
    filtered_reviews = [" ".join(tokens) for tokens in filtered_tokens]
    # Create a column for each feature:
    # 1 if the feature is mentioned in the review, 0 otherwise
    cv = CountVectorizer(binary=True)
    x = cv.fit_transform(filtered_reviews)
    x_df = pd.DataFrame(x.toarray(),
                        columns=cv.get_feature_names_out())  # get_feature_names() on scikit-learn < 1.0
    # To simplify classification, high ratings become positive (1)
    # and low ones negative (0)
    ratings = transform_rating(ratings)
    return ratings, x_df
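A hedged sketch of how the (ratings, x_df) pair returned by prepareData could feed a classifier; the feature columns and the model choice below are illustrative, not from the original project:

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
# Stand-in for prepareData output: binary feature columns plus binarized ratings
x_df = pd.DataFrame(rng.integers(0, 2, size=(200, 4)),
                    columns=['price', 'quality', 'service', 'taste'])
ratings = rng.integers(0, 2, size=200)

x_tr, x_te, y_tr, y_te = train_test_split(x_df, ratings, random_state=0)
clf = LogisticRegression().fit(x_tr, y_tr)
print(clf.score(x_te, y_te))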
def ext_feat():
    for i, file in enumerate(files):
        # Read the recording and compute its feature vector
        _, sig = wav.read(os.path.join(DATA_SET_PATH, file))
        feats = extract(sig)
        features.append(feats)
        # The first two characters of the file name encode the word
        # and gender labels
        words_labels.append(Label_Map[file[0]])
        gender_lables.append(Label_Map[file[1]])
        bar.update(i + 1)
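ext_feat relies entirely on module-level state; a minimal sketch of how those globals might be set up, with the file-name convention and Label_Map contents inferred from the indexing above (all assumptions):

import os
import progressbar  # assumed source of `bar` (progressbar2 package)

DATA_SET_PATH = 'data'                     # hypothetical dataset directory
files = sorted(os.listdir(DATA_SET_PATH))  # e.g. '1F_sample.wav': word '1', gender 'F'
Label_Map = {'1': 'one', '2': 'two', 'F': 'female', 'M': 'male'}  # assumed encoding
features, words_labels, gender_lables = [], [], []
bar = progressbar.ProgressBar(max_value=len(files))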
def read(path):
    global count, thresh, split_count
    X = []
    y = []
    with open(path, "r") as file:
        data = file.read().split('.')
    for i in tqdm(range(len(data))):
        d = data[i]
        thresh -= 1
        if thresh > 0 or len(d) == 0:
            continue
        # Normalize the XML-like tags so every tag becomes its own token
        d = re.sub(r".\s\)", "", d).replace('<', ' <').replace('>', '> ')
        d = re.sub(r">", "> ", d)
        d = re.sub(r"(<[^><\s]+)", r"\1>", d)
        d = re.sub(r">>", ">", d)
        d = re.sub(r"<<", "<", d)
        d = re.sub(r"</", " </", d).split()
        # Derive a per-token label from the tags, then strip the tags
        sentence, label = gen_label(d)
        sentence = remove_tags(sentence)
        sentence = re.sub(r'<.*?>', '', ' '.join(sentence)).split()
        sentence = np.array(sentence)
        label = np.array(label)
        if label.shape != sentence.shape:
            print(len(label), len(sentence))
            print(label)
            print(sentence)
            print(data[i])
        assert label.shape == sentence.shape
        # Split long sentences into fixed-size windows
        list_sent = windowing(sentence)
        list_label = windowing(label)
        assert len(list_label) == len(list_sent)
        assert len(list_sent[0]) == len(list_label[0])
        for j in range(len(list_sent)):
            sentence = ' '.join(list_sent[j])
            l = list_label[j]
            # manual + fasttext features
            manual_feat = extract(sentence.split())
            # character encoding
            char_list = sent_to_char(sentence.split())
            # phobert embedding
            sentence = extract_bert(sentence)
            sentence = np.hstack((sentence, manual_feat, char_list))
            # Pad labels and the feature matrix up to SENT_LENGTH
            pad_len = SENT_LENGTH - len(l)
            l += ['pad'] * pad_len
            l = np.array(l, dtype='<U12')
            sentence = np.append(sentence, np.zeros((pad_len, sentence.shape[1])), axis=0)
            X.append(sentence)
            y.append(l)
    return np.array(X), np.array(y, dtype='<U12')
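windowing is not shown above; a plausible minimal implementation, inferred from the padding logic (chunks of at most SENT_LENGTH, returned as lists so the += padding works), not the project's actual code:

SENT_LENGTH = 128  # assumed fixed window length

def windowing(tokens):
    # Chop a token (or label) sequence into chunks of at most SENT_LENGTH;
    # lists, not arrays, so the caller can pad with `l += ['pad'] * pad_len`
    return [list(tokens[i:i + SENT_LENGTH])
            for i in range(0, len(tokens), SENT_LENGTH)]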
def keep_predicting():
    # Remove files left over from previous runs
    for file in os.listdir(TEST_PATH):
        os.remove(os.path.join(TEST_PATH, file))
    while True:
        try:
            for file in os.listdir(TEST_PATH):
                rate, sig = wav.read(os.path.join(TEST_PATH, file))
                # Extract features and reduce their dimensionality with
                # the fitted PCA before classifying word and gender
                feat = extract(sig)
                pca_feats = pca_transform([feat])
                result = words_clf.predict(pca_feats)
                result2 = gender_clf.predict(pca_feats)
                # Play back both predictions, then discard the recording
                play(result[0])
                play(result2[0])
                os.remove(os.path.join(TEST_PATH, file))
        except Exception as e:
            print(str(e))
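play is project-specific, presumably audio feedback of the predicted label; a trivial stand-in for exercising the loop without audio, purely an assumption:

def play(label):
    # Stand-in for the project's audio feedback: just report the prediction.
    print('predicted:', label)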
def classify(train, test):
    vect1, vect2 = feature_extraction.extract(train, test)
    classifiers = [MultinomialNB(), SGDClassifier(random_state=0)]
    for classifier in classifiers:
        # Cross-validated predictions on the training data
        predict = cross_val_predict(classifier, vect1, train.Label_Cat)
        print(predict)
        print(precision_score(train.Label_Cat, predict, average='micro'))
        print(recall_score(train.Label_Cat, predict, average='micro'))
        print(f1_score(train.Label_Cat, predict, average='micro'))
        print(classification_report(train.Label_Cat, predict))
        print(accuracy_score(train.Label_Cat, predict))
        # Fit on the full training set and evaluate on held-out test data
        print('on test data')
        classifier.fit(vect1, train.Label_Cat)
        y_pred = classifier.predict(vect2)
        print(classification_report(test.Label_Cat, y_pred))
        print(accuracy_score(test.Label_Cat, y_pred))
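classify expects two DataFrames with a Label_Cat column; a hedged, self-contained call sketch where the Text column and the bag-of-words stand-in for feature_extraction.extract are both invented:

from types import SimpleNamespace

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

def _extract(train, test):
    # Bag-of-words stand-in for the project's feature_extraction.extract
    cv = CountVectorizer()
    return cv.fit_transform(train.Text), cv.transform(test.Text)

feature_extraction = SimpleNamespace(extract=_extract)

train = pd.DataFrame({
    'Text': ['good service', 'late delivery', 'great price', 'rude staff'] * 10,
    'Label_Cat': ['pos', 'neg', 'pos', 'neg'] * 10,  # enough rows for 5-fold CV
})
test = pd.DataFrame({'Text': ['friendly staff'], 'Label_Cat': ['pos']})
classify(train, test)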
segmentation_test.segment(patients, path_to_data, path_to_results)

#-------------------Feature Extraction Tsfresh---------------------
patients = [
    "Subject_1", "Subject_3", "Subject_4", "Subject_7", "Subject_10",
    "Subject_11", "Subject_12", "Subject_13", "Subject_14", "Subject_15",
    "Subject_16", "Subject_17"
]
path_data = "Sliding_Window_Data" + os.sep + "Sensor_Data"
path_result_features = "Features"
if not os.path.exists(path_result_features):
    os.mkdir(path_result_features)
feature_extraction.extract(patients, path_data, path_result_features)

#----------------------Extract Labels--------------------
path_data = "Sliding_Window_Data" + os.sep + "Labels"
subjects = [
    "Subject_1", "Subject_3", "Subject_4", "Subject_7", "Subject_10",
    "Subject_11", "Subject_12", "Subject_13", "Subject_14"
]
extract_labels.labels(path_data, subjects)

#-------------------Remove Unnecessary Features------------
path_result_selected_features = "Selected_features_Data"
path_data = "Features"
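The section header suggests feature_extraction.extract wraps tsfresh; a minimal sketch of the underlying tsfresh call on hypothetical sliding-window sensor data (column names are assumptions, and the project's wrapper may differ):

import pandas as pd
from tsfresh import extract_features

# One window per `id`, samples ordered by `time`
windows = pd.DataFrame({
    'id':    [0, 0, 0, 1, 1, 1],
    'time':  [0, 1, 2, 0, 1, 2],
    'acc_x': [0.1, 0.3, 0.2, 0.9, 1.1, 1.0],
})
features = extract_features(windows, column_id='id', column_sort='time')
print(features.shape)  # one row of features per window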
labels = np.concatenate(
    (epo_eeg_p1_tr_cl.events[:, 2], epo_eeg_p2_tr_cl.events[:, 2]))
epoch = Datmat.transpose(2, 0, 1)
df = pd.DataFrame()
# Extract the full feature set channel by channel and stack the results
for ch in range(epoch.shape[2]):
    feature = extract(
        epoch[:, :, ch], fs, 0.1,
        amplitude=True,
        amplitude_P300=True,
        kurtosis=True,
        skewness=True,
        std=True,
        sampen=True,
        rms=True,
        hurst=True,
        gradient=True,
        alfa=True,
        beta=True,
        theta=True,
        delta=True,
        broad_band=True,
    )
    current = pd.DataFrame(feature)
    current['class'] = labels - 1  # shift event codes to 0-based classes
    df = pd.concat([df, current], ignore_index=True)
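Once df is assembled, any tabular classifier can consume it; a hedged sketch with scikit-learn, where the LDA choice is illustrative (a common pick for P300 features) rather than this project's model:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score

X = df.drop(columns=['class'])
y = df['class']
print(cross_val_score(LinearDiscriminantAnalysis(), X, y, cv=5).mean())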
import os

import pandas as pd

from feature_extraction import extract

window_size = 50
step_size = 10  # 80% overlap seems to work for us

result = []
for csv_file in os.listdir('./13april_data/'):
    dataframe = pd.read_csv(os.path.join('13april_data', csv_file))
    dataset = dataframe.values
    # Slide a window over the recording, extracting one feature row per window
    for row in range(int((len(dataset) - window_size) / step_size)):
        processed = extract(dataset[row * step_size:row * step_size + window_size])
        # Label the window with the class of its first sample
        processed.append(dataset[row * step_size][-1])
        result.append(processed)

df = pd.DataFrame(result)
df.to_csv('preprocess_17april_logout.csv', header=False)  # no header row
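A quick sanity check on the file written above, reloading the windows to confirm their shape (sketch only):

check = pd.read_csv('preprocess_17april_logout.csv', header=None)
print(check.shape)  # rows = number of windows; columns = index + features + label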