def update_mapping(mapping, label_dir):
    """Add any unseen segment labels from label_dir to the label-to-index mapping."""
    label_files = os.listdir(label_dir)
    label_files = [os.path.join(label_dir, file) for file in label_files]
    for file in label_files:
        _, segment_labels = read_label_file(file)
        for label in segment_labels:
            if label not in mapping:
                mapping[label] = len(mapping)
    return mapping
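
# Illustrative usage sketch (not part of the original module): builds one mapping
# across several label directories. The directory paths below are assumptions
# chosen for demonstration only.
def _example_build_mapping():
    mapping = {}
    mapping = update_mapping(mapping, './Data/train/labels')
    mapping = update_mapping(mapping, './Data/test/labels')
    return mapping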
def truncNgramMLP():
    """Train and evaluate an MLP on TF-IDF n-gram features of padded/truncated sequences."""
    data_file = "./Data/train/real_train_data.csv"
    label_file = "./Data/train/real_train_label.csv"
    # data_file = "./Data/train/train.csv"
    # label_file = "./Data/train/train_label.csv"
    X = read_data_file(data_file)
    X = pad_sequences(X, maxlen=328, dtype='int32', padding='post', truncating='post')
    y = read_label_file(label_file)
    # print("Shape of train data (m):\n", X.shape)
    # print("Data:\n", X[0:5], "\n")
    # print("Shape of train label:", y.shape)
    # print("Label:\n", y[0:5], "\n")

    # Join each padded sequence into a comma-separated string so it can be vectorized.
    str_X = []
    for i in range(X.shape[0]):
        str_X.append(','.join([str(k) for k in X[i]]))
    df = pd.DataFrame(str_X, index=range(X.shape[0]), columns=['data'])

    # Apply word n-gram (2- to 4-grams) TF-IDF vectorization to the joined sequences.
    tfidf = TfidfVectorizer(analyzer="word", max_features=5000, ngram_range=(2, 4))
    # print(tfidf)
    X_transformed = tfidf.fit_transform(df.data)
    # test_transformed = tfidf.fit_transform()

    X_train, X_test, y_train, y_test = train_test_split(
        X_transformed, y, test_size=0.2, random_state=42)
    print("Training and testing split was successful.")
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    mlp_model = MLP(X_train.shape[1])
    mlp_model.summary()

    # write_graph expects a boolean; the original passed X.shape[1] here.
    tensorBoardCallback = TensorBoard(log_dir='./logs/trunc_ngram_mlp', write_graph=True)
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=1e-4, amsgrad=False)
    # optimizer = SGD(lr=0.01, momentum=0.9, decay=1e-6, nesterov=False)
    mlp_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    mlp_model.fit(X_train, y_train, callbacks=[tensorBoardCallback], epochs=20, batch_size=128)

    score, acc = mlp_model.evaluate(X_test, y_test, verbose=2, batch_size=128)
    print("score: %.2f" % score)
    print("acc: %.2f" % acc)
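
# A minimal sketch of what the MLP builder used above might look like. The real
# MLP() is defined elsewhere in this repository; the layer sizes, dropout rate,
# sigmoid output, and the standalone-Keras import paths below are assumptions,
# not the original architecture.
def _example_mlp(input_dim):
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    model = Sequential()
    model.add(Dense(256, activation='relu', input_dim=input_dim))  # dense layer over TF-IDF features
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # single unit to pair with binary_crossentropy
    return model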
def generate_segments(video_dir, label_dir, segment_dir):
    # The frames are in one-based indexing.
    if not os.path.exists(segment_dir):
        os.makedirs(segment_dir)
    files = sorted(os.listdir(video_dir))
    action_to_logit_dict, _ = read_mapping_file(MAPPING_FILE)
    all_dict = dict()
    pbar = tqdm(files)
    for file in pbar:
        pbar.set_postfix({'video': file})
        vid_file = os.path.join(video_dir, file)
        label_file = str(file)[:-len(VIDEO_EXT)] + LABEL_EXT
        label_file = os.path.join(label_dir, label_file)
        segment_windows, segment_actions = read_label_file(label_file)
        for i, segment_action in enumerate(segment_actions):
            segment_window = segment_windows[i]
            segment_logit = action_to_logit_dict[segment_action]
            segment_name = '.'.join([file[:-len(VIDEO_EXT)], str(i)])
            segment_file = os.path.join(segment_dir, segment_name + '.npy')
            segment_frames = read_segment_from_video(vid_file, segment_window)
            np.save(segment_file, segment_frames)
            segment_dict = {
                'action': segment_action,
                'label': segment_logit,
                'window': segment_window,
                'vid-file': vid_file,
                'label-file': label_file,
                'segment-file': segment_file,
                'n-frames': len(segment_frames)
            }
            all_dict[segment_name] = segment_dict
    return all_dict
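
# Illustrative usage sketch (directory names are assumptions): extract segments
# for a hypothetical dataset layout and persist the returned metadata dict.
def _example_generate_segments():
    import json
    all_dict = generate_segments('./Data/videos', './Data/labels', './Data/segments')
    with open(os.path.join('./Data/segments', 'segments.json'), 'w') as f:
        # default=str covers non-JSON-serializable values such as numpy scalars/arrays
        json.dump(all_dict, f, indent=2, default=str)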
def extract_label(in_file_path, out_file_path, short_index):
    y = read_label_file(in_file_path)
    output = pd.DataFrame(data=y)
    # drop() returns a new frame; reassign so the rows in short_index are actually removed.
    output = output.drop(short_index, axis=0)
    output.to_csv(out_file_path, index=False, header=False)
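
# Illustrative usage sketch: the indices in short_index are hypothetical; in the
# pipeline they would come from whichever step flags sequences that are too short.
def _example_extract_label():
    short_index = [3, 17, 42]  # hypothetical row indices to drop
    extract_label('./Data/train/train_label.csv',
                  './Data/train/real_train_label.csv',
                  short_index)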
# %% [markdown]
# ## Plot PE Header length distribution - Test data
# Credit: [@MengdanCode](https://github.com/MengdanCode)

# %%
from data import read_data_file, read_label_file
import numpy as np
from matplotlib import pyplot as plt

data_file = "./Data/train/train.csv"
label_file = "./Data/train/train_label.csv"
X = read_data_file(data_file)
y = read_label_file(label_file)

# %%
X_len = []
for i in X:
    X_len.append(len(i))
print('X_len generated')
X_len = np.array(X_len)
print(X_len.min())
print(X_len.max())

fig_per_hour = plt.figure()
per_hour = fig_per_hour.add_subplot(111)
counts, bins, patches = per_hour.hist(X_len, bins=100, normed=False,