# NOTE: the imports below are assumed for this excerpt (the original file starts at the
# first function definition); the `log` helper used in initialize_local_experiment() is a
# project-specific utility assumed to be imported elsewhere.
import datetime
import logging
import os
import pickle
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.preprocessing
import tqdm
from torch.utils.tensorboard import SummaryWriter  # or tensorboardX, depending on the project setup

from utils.path_utils import project_root


def setup_local_results(exp_time):
    # Create a per-experiment log directory and snapshot the training scripts into it.
    log_root = os.path.join(project_root(), 'data', 'logs', exp_time)
    os.mkdir(log_root)

    logging.basicConfig(filename=os.path.join(log_root, exp_time + '.log'),
                        level=logging.DEBUG)

    shutil.copy(os.path.join(project_root(), 'pytorch_classifier.py'), log_root)
    shutil.copy(os.path.join(project_root(), 'train.py'), log_root)

    return log_root
def initialize_local_experiment():
    exp_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    results_path = setup_local_results(exp_time)

    data_name = 'training_filled.pickle'
    log(message="Data name: {}", value=data_name)

    training_examples = pd.read_pickle(
        os.path.join(project_root(), 'data', 'processed', data_name))

    with open(os.path.join(project_root(), 'data', 'processed', 'lengths.txt')) as f:
        lengths_list = [int(l) for l in f.read().splitlines()]

    with open(os.path.join(project_root(), 'data', 'processed', 'is_sepsis.txt')) as f:
        is_sepsis = [int(l) for l in f.read().splitlines()]

    writer = SummaryWriter(log_dir=os.path.join(project_root(), 'data', 'logs', exp_time),
                           comment='')

    return training_examples, lengths_list, is_sepsis, writer, results_path
def calculate_sepsis():
    # occ = 0
    # for training_example in tqdm.tqdm(training_examples):
    #     if 1 in training_example['SepsisLabel'].values:
    #         occ += 1
    # print(occ)

    data_path = os.path.join(project_root(), 'data', 'processed', 'training')
    training_files = [os.path.join(data_path, f) for f in os.listdir(data_path)
                      if f.endswith('.csv')]
    training_files.sort()

    for training_file in tqdm.tqdm(training_files):
        # The processed .csv files are comma-separated (see the .psv -> .csv
        # conversion script), so read them with ',' rather than '|'.
        training_example = pd.read_csv(training_file, sep=',')
        if 1 in training_example['SepsisLabel'].values:
            print(training_file)
def rewrite_data_to_better_formats(training_files):
    lengths = []
    is_sepsis = []
    # Preallocate a single array for all rows of every patient file
    # (42 columns: the raw columns plus the seg_id added below).
    all_data = np.zeros((1552210, 42))
    ind = 0
    training_examples = []

    for i, training_file in enumerate(tqdm.tqdm(training_files)):
        example = pd.read_csv(training_file, sep=',')
        example['seg_id'] = i

        training_examples.append(example)
        is_sepsis.append(1 if 1 in example['SepsisLabel'].values else 0)
        lengths.append(len(example))

        all_data[ind:ind + len(example), :] = example.values
        ind += len(example)

    all_data = pd.DataFrame(all_data, columns=example.columns.values, index=None)
    all_data.to_hdf(os.path.join(project_root(), 'data', 'processed',
                                 'training_concatenated.hdf'), key='df')
    all_data.to_csv(os.path.join(project_root(), 'data', 'processed',
                                 'training_concatenated.csv'), index=False)

    # Note: the standardized copy below is computed but not saved in this excerpt.
    ss = sklearn.preprocessing.StandardScaler()
    all_data = pd.DataFrame(ss.fit_transform(all_data), columns=all_data.columns.values)

    with open(os.path.join(project_root(), 'data', 'processed', 'lengths.txt'), 'w') as f:
        for l in lengths:
            f.write('{}\n'.format(l))

    with open(os.path.join(project_root(), 'data', 'processed', 'is_sepsis.txt'), 'w') as f:
        for l in is_sepsis:
            f.write('{}\n'.format(l))

    with open(os.path.join(project_root(), 'data', 'processed', 'training_raw.pickle'), 'wb') as f:
        pickle.dump(training_examples, f)

    # Second pass: forward/backward fill missing values per patient, zero-fill the
    # rest, and pickle the filled examples.
    training_examples = []
    for training_file in tqdm.tqdm(training_files):
        example = pd.read_csv(training_file, sep=',')
        example.ffill(inplace=True)
        example.bfill(inplace=True)
        example.fillna(0, inplace=True)
        training_examples.append(example)

    with open(os.path.join(project_root(), 'data', 'processed', 'training_filled.pickle'), 'wb') as f:
        pickle.dump(training_examples, f)
def check_if_exists_empty_after_non_empty(training_examples):
    # Check whether any patient's SepsisLabel drops back to 0 after it has been 1.
    occurred = False
    for training_example in tqdm.tqdm(training_examples):
        training_example.fillna(0, inplace=True)
        if 1 in training_example['SepsisLabel'].values:
            is_sepsis = False
            for i in training_example['SepsisLabel'].values:
                if i == 1:
                    is_sepsis = True
                if is_sepsis and i == 0:
                    plt.figure()
                    plt.plot(training_example['SepsisLabel'], c="g")
                    plt.show(block=False)
                    occurred = True
    if occurred:
        print('Occurred')


if __name__ == '__main__':
    with open(os.path.join(project_root(), 'data', 'processed', 'training.pickle'), 'rb') as fp:
        training_examples = pickle.load(fp)

    # plot_sepsis_label(training_examples)
    # plot_length_hist(training_examples)
    # check_if_exists_empty_after_non_empty(training_examples)
    calculate_sepsis()
    plot_start_sepsis_hist(training_examples)
import os

from sklearn.manifold import TSNE
import plotly.express as px
import pandas as pd

from utils.path_utils import project_root

if __name__ == '__main__':
    path = os.path.join(project_root(), 'data', 'processed',
                        'librispeech-gender-feats-test-clean.csv')
    data = pd.read_csv(path)

    columns = [
        'mean', 'std', 'median', 'kurt', 'skew', 'p25', 'p75', 'iqr', 'ent',
        'meanfun', 'maxfun', 'minfun'
    ]
    data = data.dropna()

    perplexity = 20
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    feats2d = tsne.fit_transform(data[columns].values)

    data['x'] = feats2d[:, 0]
    data['y'] = feats2d[:, 1]

    fig = px.scatter(data, x='x', y='y',
                     title=f'TSNE 2d projection of data<br>{path}',
                     color="label", hover_data=['path'])
import pandas as pd
import numpy as np
import os
import pickle

import tqdm

from utils.path_utils import project_root

data_path = os.path.join(project_root(), 'data', 'raw', 'training')
data_path2 = os.path.join(project_root(), 'data', 'processed', 'training')
os.makedirs(data_path2, exist_ok=True)  # make sure the output directory exists

training_examples = []
training_files = [f for f in os.listdir(data_path) if f.endswith('.psv')]
training_files.sort()

# Convert the raw pipe-separated .psv files to comma-separated .csv files.
for training_file in tqdm.tqdm(training_files):
    example = pd.read_csv(os.path.join(data_path, training_file), sep='|')
    example.to_csv(os.path.join(data_path2, training_file[:-4] + '.csv'),
                   sep=',', index=False, header=example.columns)
import os

from sklearn.manifold import TSNE
import plotly.express as px
import pandas as pd

from utils.path_utils import project_root

if __name__ == '__main__':
    path = os.path.join(project_root(), 'data', 'processed',
                        'librispeech-gender-feats-test-clean.csv')
    data = pd.read_csv(path)

    columns = ['mean', 'std', 'median', 'kurt', 'skew', 'p25', 'p75', 'iqr', 'ent']
    data = data.dropna()

    perplexity = 20
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    feats2d = tsne.fit_transform(data[columns].values)

    data['x'] = feats2d[:, 0]
    data['y'] = feats2d[:, 1]

    fig = px.scatter(data, x='x', y='y',
                     title=f'TSNE 2d projection of data<br>{path}',
                     color="label", hover_data=['path'])
    fig.show()
    fig.write_html(os.path.join(project_root(), "data", "processed", f"2d_{perplexity}.html"))
import os

import pandas as pd

from utils.path_utils import project_root
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

if __name__ == '__main__':
    train_data = pd.read_csv(
        os.path.join(project_root(), 'data', 'processed',
                     'librispeech-gender-feats-train-clean-100.csv'))
    dev_data = pd.read_csv(
        os.path.join(project_root(), 'data', 'processed',
                     'librispeech-gender-feats-dev-clean.csv'))

    train_data = train_data.dropna()
    dev_data = dev_data.dropna()

    columns = [
        'mean', 'std', 'median', 'kurt', 'skew', 'p25', 'p75', 'iqr', 'ent'
    ]

    train_y = train_data['label']

    le = LabelEncoder()
    le.fit(train_data['label'])
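    # -------------------------------------------------------------------------
    # The original file is cut off after fitting the LabelEncoder. The lines below
    # are a hedged sketch of how training/evaluation could continue with the
    # estimators already imported above; they are an illustration under assumed
    # conventions, not the author's original code.
    # -------------------------------------------------------------------------
    train_y = le.transform(train_data['label'])
    dev_y = le.transform(dev_data['label'])

    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_data[columns].values)
    dev_x = scaler.transform(dev_data[columns].values)

    for clf in (LogisticRegression(max_iter=1000),
                RandomForestClassifier(n_estimators=100, random_state=42)):
        clf.fit(train_x, train_y)
        dev_pred = clf.predict(dev_x)
        print(type(clf).__name__, 'dev F1:', f1_score(dev_y, dev_pred))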
    ],
}
aggs.update(aggs_clinical)
aggs.update(aggs_vital)

# rolling
# expanding


def agg_feats(df):
    # Placeholder: the aggregation features are not implemented yet.
    return None


def extract_features(df):
    df = agg_feats(df)
    return df


if __name__ == '__main__':
    data_name = 'training_concatenated.hdf'
    training_examples_path = os.path.join(project_root(), 'data', 'processed', data_name)

    df = pd.read_hdf(training_examples_path, key='df')
    df = df.iloc[:1000, :]

    # NOTE: extract_features currently returns None (agg_feats is a stub), so the
    # to_csv call below will fail until agg_feats is implemented.
    df = extract_features(df)
    df.to_csv(os.path.join(project_root(), 'data', 'processed', 'training_features.csv'))
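# ---------------------------------------------------------------------------
# Hedged sketch (not in the original file) of what agg_feats might look like once
# implemented: group the concatenated table by patient (`seg_id`, added by
# rewrite_data_to_better_formats) and apply the per-column aggregations collected
# in `aggs` above. The groupby/agg call and the column flattening are assumptions
# about the intended design, not the author's implementation.
# ---------------------------------------------------------------------------
def agg_feats_sketch(df):
    # One row per patient: aggregate every column according to the spec in `aggs`.
    grouped = df.groupby('seg_id').agg(aggs)
    # Flatten the (column, aggregation) MultiIndex created by .agg().
    grouped.columns = ['_'.join(map(str, col)) for col in grouped.columns]
    return grouped.reset_index()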
import os

import pandas as pd

from utils.path_utils import project_root
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.manifold import TSNE
import plotly.express as px

if __name__ == '__main__':
    train_data = pd.read_csv(os.path.join(project_root(), 'data', 'processed',
                                          'librispeech-gender-feats-train-clean-100.csv'))
    dev_data = pd.read_csv(os.path.join(project_root(), 'data', 'processed',
                                        'librispeech-gender-feats-dev-clean.csv'))

    data = dev_data.dropna()

    columns = ['mean', 'std', 'median', 'kurt', 'skew', 'p25', 'p75', 'iqr', 'ent',
               'meanfun', 'maxfun', 'minfun']
    # columns = ['iqr', 'meanfun']

    for i in range(7, len(columns) - 1):
        for j in range(i + 1, len(columns)):
            tmp_columns = [columns[i], columns[j]]

            values = data[tmp_columns].values
            scaler = StandardScaler()
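            # -----------------------------------------------------------------
            # The original file is cut off here. A hedged sketch of a possible
            # continuation: scale the two selected features, cluster them with
            # the imported KMeans/DBSCAN, and plot the result. The parameter
            # values and the px.scatter call are assumptions, not the author's code.
            # -----------------------------------------------------------------
            values = scaler.fit_transform(values)

            kmeans_labels = KMeans(n_clusters=2, random_state=42, n_init=10).fit_predict(values)
            dbscan_labels = DBSCAN(eps=0.5, min_samples=5).fit_predict(values)

            plot_df = pd.DataFrame(values, columns=tmp_columns)
            plot_df['kmeans'] = kmeans_labels.astype(str)
            plot_df['dbscan'] = dbscan_labels.astype(str)

            fig = px.scatter(plot_df, x=tmp_columns[0], y=tmp_columns[1],
                             color='kmeans',
                             title=f'KMeans clusters for {tmp_columns}')
            fig.show()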
        # Lag features: difference between observations 6 time steps apart (first 35 columns).
        lag_features = training_example.values[:-6, :35] - training_example.values[6:, :35]

        training_example = pd.concat(
            [training_example, pd.DataFrame(columns=lag_columns)])
        training_example.loc[6:, lag_columns] = lag_features

        # training_example.ffill(inplace=True)
        # training_example.bfill(inplace=True)
        # training_example.fillna(0, inplace=True)

        training_examples_lag.append(training_example)

    return training_examples_lag


if __name__ == '__main__':
    data_name = 'training_filled.pickle'
    training_examples = pd.read_pickle(
        os.path.join(project_root(), 'data', 'processed', data_name))

    training_examples = add_lag_features(training_examples)

    with open(os.path.join(project_root(), 'data', 'processed',
                           'training_filled_lag.pickle'), 'wb') as f:
        pickle.dump(training_examples, f)
    std = bin_means.std()
    median = np.median(bin_means)
    kurt = stats.kurtosis(bin_means)
    skew = stats.skew(bin_means)
    p25 = np.percentile(bin_means, 25)
    p75 = np.percentile(bin_means, 75)
    iqr = p75 - p25
    ent = stats.entropy(bin_means)

    return mean, std, median, kurt, skew, p25, p75, iqr, ent


if __name__ == '__main__':
    chosen_set = 'train-clean-100'

    raw_data_root = os.path.join(project_root(), 'data', 'raw', 'LibriSpeech')
    speakers_filepath = os.path.join(raw_data_root, 'SPEAKERS.TXT')
    results_filepath = os.path.join(project_root(), 'data', 'processed',
                                    f'librispeech-gender-feats-{chosen_set}.csv')

    audio_paths, labels = get_librispeech_paths(raw_data_root, speakers_filepath,
                                                contains=chosen_set)

    tq = tqdm.tqdm(enumerate(zip(audio_paths, labels)), total=len(audio_paths))
    feats_rows = []
    for i, (path, label) in tq:
        audio, fs = sf.read(path)
        row = extract_features(audio, fs)
        feats_rows.append(row)

    results = pd.DataFrame(feats_rows,
                           columns=['mean', 'std', 'median', 'kurt', 'skew',
                                    'p25', 'p75', 'iqr', 'ent'])
    results['path'] = ['/'.join(p.split('/')[-4:]) for p in audio_paths]
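    # The excerpt ends here; presumably the gender labels are attached and the table
    # is written to results_filepath. A hedged sketch of that step (an assumption,
    # not shown in the original):
    results['label'] = labels
    results.to_csv(results_filepath, index=False)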
    duration = len(audio) / fs

    return mean, std, median, kurt, skew, p25, p75, iqr, ent, meanfun, maxfun, minfun, duration


def extract_speaker_id(rec_path):
    return rec_path.split(os.sep)[-3]


if __name__ == '__main__':
    # chosen_set = 'train-clean-100'
    # chosen_set = 'test-clean'
    chosen_set = 'dev-clean'

    if os.path.isdir(os.path.join(project_root(), 'data', 'raw', 'LibriSpeech')):
        raw_data_root = os.path.join(project_root(), 'data', 'raw', 'LibriSpeech')
    else:
        raw_data_root = os.path.join(project_root(), 'data', 'raw', chosen_set, 'LibriSpeech')

    speakers_filepath = os.path.join(raw_data_root, 'SPEAKERS.TXT')
    results_filepath = os.path.join(
        project_root(), 'data', 'processed',
        f'librispeech-gender-feats-{chosen_set}.csv')

    audio_paths, labels = get_librispeech_paths(raw_data_root, speakers_filepath,
                                                contains=chosen_set)
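    # -------------------------------------------------------------------------
    # The excerpt stops here. A hedged sketch of the likely continuation, mirroring
    # the loop in the companion feature-extraction script: read each recording,
    # extract the 13 features returned above, attach path/speaker/label, and write
    # the results. It assumes `sf` (soundfile), `tqdm`, and `pd` are imported at the
    # top of the file; this is an illustration, not the author's original code.
    # -------------------------------------------------------------------------
    feats_rows = []
    for path, label in tqdm.tqdm(zip(audio_paths, labels), total=len(audio_paths)):
        audio, fs = sf.read(path)
        feats_rows.append(extract_features(audio, fs))

    results = pd.DataFrame(feats_rows,
                           columns=['mean', 'std', 'median', 'kurt', 'skew', 'p25', 'p75',
                                    'iqr', 'ent', 'meanfun', 'maxfun', 'minfun', 'duration'])
    results['path'] = ['/'.join(p.split('/')[-4:]) for p in audio_paths]
    results['speaker_id'] = [extract_speaker_id(p) for p in audio_paths]
    results['label'] = labels
    results.to_csv(results_filepath, index=False)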