import os
from random import shuffle

import numpy as np
from colorama import Fore
from tqdm import tqdm

# Project-local helpers assumed to be importable from the surrounding
# package: config, preprocess, mel_spectogram, write_hdf5.


def preprocess_TIMIT(data_root, out_file):
    """Convert the TIMIT corpus into mel spectrograms grouped by category.

    Args:
        data_root: Root directory of the TIMIT data, laid out as
            category/speaker/*.wav.
        out_file: Path of the HDF5 file the spectrograms are written to.

    Returns:
        None. Results are written to ``out_file`` via ``write_hdf5``.
    """
    categories = os.listdir(data_root)
    speakers = [
        os.path.join(data_root, c, s)
        for c in categories
        for s in os.listdir(os.path.join(data_root, c))
    ]
    wavs = [
        os.path.join(s, w)
        for s in speakers
        for w in os.listdir(s)
        if config['AUDIO_READ_FORMAT_TIMIT'] in w
    ]

    # Shuffle the file list so the categories are interleaved.
    shuffle_ixs = list(range(len(wavs)))
    shuffle(shuffle_ixs)
    wavs = np.array(wavs)[shuffle_ixs].tolist()

    mels = {c: [] for c in categories}
    for wav_fname in tqdm(wavs,
                          bar_format="{l_bar}%s{bar}%s{r_bar}" %
                          (Fore.GREEN, Fore.RESET)):
        try:
            aud = preprocess(wav_fname)
        except AssertionError as e:
            print(e, "Couldn't process", wav_fname)
            continue
        mel = mel_spectogram(aud)
        # Skip clips shorter than one sliding window.
        if mel.shape[1] <= config['SLIDING_WIN_SIZE']:
            print("Couldn't process", mel.shape, wav_fname)
            continue
        # category/speaker/file.wav -> category key, "speaker_file" name.
        c = wav_fname.split('/')[-3]
        s = '_'.join(wav_fname.split('/')[-2:]).split('.')[0]
        mels[c].append((s, mel))
    write_hdf5(out_file, mels)
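# Hedged sketch of a write_hdf5 compatible with the {category: [(name,
# mel), ...]} structure built above; the project's real helper may lay
# out the file differently.
import h5py

def write_hdf5_sketch(out_file, mels):
    with h5py.File(out_file, 'w') as f:
        for category, items in mels.items():
            grp = f.create_group(category)
            for name, mel in items:
                grp.create_dataset(name, data=mel)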
def setup(self, stage=None):
    # Load both map datasets and stack them into a single frame.
    # (The original read inferno from the mirage CSV and vice versa.)
    inferno = pd.read_csv(self.inferno_csv_file, index_col=0)
    mirage = pd.read_csv(self.mirage_csv_file, index_col=0)
    raw_data = pd.concat([inferno, mirage])
    X, y = preprocess(raw_data)
    # Stratify on y so both splits keep the same class balance.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=self.test_size, random_state=42)
    self.train_dataset = GrenadeDataset(X_train, y_train)
    self.val_dataset = GrenadeDataset(X_test, y_test)
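# Hedged usage sketch: the DataModule class name and its constructor
# arguments below are assumptions for illustration; only setup() above
# comes from the original code.
from torch.utils.data import DataLoader

dm = GrenadeDataModule(inferno_csv_file='data/inferno.csv',
                       mirage_csv_file='data/mirage.csv',
                       test_size=0.2)
dm.setup()
train_loader = DataLoader(dm.train_dataset, batch_size=64, shuffle=True)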
# Legacy TF 1.x / TF Serving beta-gRPC client imports; data_utils and
# root_dir are project-local.
import os

import numpy as np
import tensorflow as tf
from grpc.beta import implementations
from tensorflow_serving.apis import predict_pb2, prediction_service_pb2


def predict(order):
    """Query the served seq2seq model and parse start/end times from its output."""
    order = data_utils.preprocess(order)

    # Load the source and target vocabularies.
    data_dir = os.path.join(root_dir, 'data')
    source_vocab_path = os.path.join(data_dir, "vocab/vocab_source.txt")
    target_vocab_path = os.path.join(data_dir, "vocab/vocab_target.txt")
    target_int_to_letter, target_letter_to_int = data_utils.getvocab(
        target_vocab_path)
    source_int_to_letter, source_letter_to_int = data_utils.getvocab(
        source_vocab_path)

    # Encode the order both as character ids and as pinyin ids.
    source = [
        data_utils.sentenceofhan_to_token_ids(order, source_letter_to_int)
    ]
    source_pin = [
        data_utils.sentenceofpin_to_token_ids2(order, source_letter_to_int)
    ]
    # Dummy decoder inputs and a placeholder learning rate; only their
    # shapes matter at inference time.
    target = [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    lr = 0
    source_len = [len(source[0])]
    target_len = [len(target[0])]

    # Open a gRPC channel to the TF Serving instance.
    hostport = '192.168.31.186:6000'
    host, port = hostport.split(':')
    channel = implementations.insecure_channel(host, int(port))
    stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

    # Build the request with every tensor the exported 'seq2seq'
    # signature expects.
    request = predict_pb2.PredictRequest()
    request.model_spec.name = 'seq2seq'
    request.model_spec.signature_name = 'seq2seq'
    request.inputs['input_data'].CopyFrom(
        tf.contrib.util.make_tensor_proto(source, dtype=tf.int32))
    request.inputs['input_data_pin'].CopyFrom(
        tf.contrib.util.make_tensor_proto(source_pin, dtype=tf.int32))
    request.inputs['targets'].CopyFrom(
        tf.contrib.util.make_tensor_proto(target, dtype=tf.int32))
    request.inputs['lr'].CopyFrom(
        tf.contrib.util.make_tensor_proto(lr, dtype=tf.float32))
    request.inputs['source_sequence_length'].CopyFrom(
        tf.contrib.util.make_tensor_proto(source_len, dtype=tf.int32))
    request.inputs['target_sequence_length'].CopyFrom(
        tf.contrib.util.make_tensor_proto(target_len, dtype=tf.int32))
    model_result = stub.Predict(request, 60.0)  # 60-second timeout

    # Decode the output ids, drop the stop symbol 'S', and split the
    # remainder on 'E' into start and end times.
    output = np.array(model_result.outputs['output'].int_val)
    ans_ = ''.join(target_int_to_letter.get(i) for i in output)
    ans_ = ans_.replace('S', '')
    st = 'NN:NN'
    et = 'NN:NN'
    if 'E' in ans_:
        tmp = ans_.split('E')
        if len(tmp) == 2:
            st, et = tmp
    print(st, et)
    return st, et
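# Hedged usage sketch: the order string is an illustrative placeholder;
# predict() falls back to ('NN:NN', 'NN:NN') whenever the decoded output
# contains no 'E' separator.
start_time, end_time = predict('an example order string')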
- filter out possible duplicates and cycles (repeated routes)
- add price as the main attribute of each combination
- print the combinations in a nice way, as a list of lists for each
  bags option
- a sample combination list consists of flight_numbers with the total
  price at the end (see the illustration after this block)

Comments:
- run the code as `cat flights.csv | python find_combinations.py`
- most functions assume that flight_numbers are unique within the dataset
- the data-loading function is not optimised for real-time streaming;
  it simply waits for the complete static stdin input
- tree searching is limited to at most 2 stopovers
- to search combinations from a certain airport only, the origin can
  easily be fixed before the tree search
'''

time_start = time()
flights = stdin_flights()
flights = preprocess(flights)

# Consider taking 0, 1 or 2 bags.
for bags in range(3):
    flights_tree = make_tree(flights, num_bags=bags)
    combinations = search_combinations(flights_tree)
    combinations = filter_cycles(combinations, flights)
    combinations = filter_duplicates(combinations)
    combinations = add_prices(combinations, flights, num_bags=bags)
    print("\nFlight combinations and prices for number of bags: {}".format(bags))
    pprint(combinations)

print('Time elapsed: {} ms'.format(round(1000 * (time() - time_start), 2)))
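# Hedged illustration of the combination structure described in the
# docstring above; the flight numbers and prices are made-up assumptions,
# not real data.
sample_combinations = [
    ['PV404', 'PV755', 145.0],  # one-stopover route, total price last
    ['PV042', 38.0],            # direct flight
]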
# Initialize parameters
df_path = 'datasets/raw_data.csv'
save_train = 'datasets/train.csv'
save_val = 'datasets/validation.csv'
save_test = 'datasets/test.csv'
save_wordidx = 'datasets/vocab.csv'
save_idxword = 'datasets/idxword.csv'
n_subset = 100000
train_size = 0.67      # fraction of the subset used for training
valtest_size = 0.165   # fraction used for validation and for test (0.165 each)

# Load raw data
df = pd.read_csv(df_path)

# Preprocess raw/text data
df = preprocess(df)
df['reviewText'] = cleanText(df['reviewText'])

# Subset data
subset = subset_df(df, n_samples=n_subset)

# Create train, validation and test sets
train, validation, test = split_df(subset,
                                   size_train=train_size,
                                   size_valtest=valtest_size)

# Compute main target class weights against a uniform 1/3 expectation
target_weights = class_weights(train, target='overall', p_expect=(1 / 3))
np.savetxt("train_class_weights.csv", target_weights, delimiter=",")

# Compute conditional independent sample weights
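# Hedged sketch of what class_weights above might compute, assuming it
# returns one weight per class equal to p_expect over the observed class
# frequency; the project's real implementation may differ.
def class_weights_sketch(df, target, p_expect):
    freqs = df[target].value_counts(normalize=True).sort_index()
    return (p_expect / freqs).to_numpy()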
# Requires, in addition to the imports above: pandas as pd,
# collections.Counter and random.sample.
def preprocess_GMU():
    """Convert the GMU speech-accent archive into mel spectrograms.

    Speakers of the GMU_ACCENT_COUNT most common native languages are
    split 80/20 into train and validation sets by speaker name, and the
    spectrograms are written to two HDF5 files.

    Args:
        None. Paths and settings come from module-level constants.

    Returns:
        None. Results are written via ``write_hdf5``.
    """
    speakers_info = pd.read_csv(GMU_DATA_INFO)

    # Keep only the most common native languages.
    categories = Counter(
        speakers_info['native_language']).most_common(GMU_ACCENT_COUNT)
    categories = [c[0] for c in categories]
    speakers_info = speakers_info[speakers_info['native_language'].isin(
        categories)]
    speakers_info = speakers_info[['filename', 'native_language']]
    speakers_info['name'] = speakers_info['filename']
    speakers_info['filename'] = speakers_info['filename'].apply(
        lambda fname: os.path.join(GMU_DATA, fname + AUDIO_READ_FORMAT_GMU))

    # NOTE: count = -15 keeps only the last 15 shuffled files, apparently
    # a debugging shortcut; set count = 0 to process the whole dataset.
    count = -15
    shuffle_ixs = list(range(len(speakers_info['filename'].tolist())))
    shuffle(shuffle_ixs)
    fnames = np.array(
        speakers_info['filename'].tolist())[shuffle_ixs][count:].tolist()
    langs = np.array(
        speakers_info['native_language'].tolist())[shuffle_ixs][count:].tolist()
    names = np.array(
        speakers_info['name'].tolist())[shuffle_ixs][count:].tolist()

    # 80/20 train/validation split by speaker name.
    train_names = sample(names, int(0.8 * len(names)))
    val_names = set(names) - set(train_names)

    mels_train = {lang: [] for lang in langs}
    mels_val = {lang: [] for lang in langs}
    for name, fname, lang in tqdm(zip(names, fnames, langs),
                                  total=len(langs),
                                  bar_format="{l_bar}%s{bar}%s{r_bar}" %
                                  (Fore.GREEN, Fore.RESET)):
        try:
            aud = preprocess(fname)
        except AssertionError as e:
            print(e, "Couldn't process", fname)
            continue
        mel = mel_spectogram(aud)
        # Skip clips shorter than one sliding window.
        if mel.shape[1] <= config['SLIDING_WIN_SIZE']:
            print("Couldn't process", mel.shape, fname)
            continue
        if name in val_names:
            mels_val[lang].append((name, mel))
        else:
            mels_train[lang].append((name, mel))
    write_hdf5(GMU_PROC_OUT_FILE_T, mels_train)
    write_hdf5(GMU_PROC_OUT_FILE_VAL, mels_val)
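# Hedged usage sketch: preprocess_GMU takes no arguments and reads
# everything from module-level constants (GMU_DATA_INFO, GMU_DATA,
# GMU_PROC_OUT_FILE_T, GMU_PROC_OUT_FILE_VAL); the TIMIT paths below are
# illustrative assumptions.
preprocess_TIMIT('data/raw/TIMIT', 'data/processed/timit_mels.h5')
preprocess_GMU()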
        filter_sizes=FLAGS.filter_sizes,
        max_sent_len=FLAGS.max_sentence_len,
        embedding_mat=embedding_mat,
        word_nums=len(vocab),
        filter_nums=FLAGS.filter_nums,
        label_nums=FLAGS.label_nums,
        learning_rate=FLAGS.learning_rate,
        model_path=FLAGS.model_path,
        epoch=FLAGS.num_epochs,
        batch_size=FLAGS.batch_size,
        dropout_prob=FLAGS.dropout_keep_prob)

    # Load the id-encoded train and validation sets, padded or cut to
    # max_sentence_len.
    train_data = data_utils.generate_data('./data/train_data.ids',
                                          FLAGS.max_sentence_len, vocab)
    valid_data = data_utils.generate_data('./data/valid_data.ids',
                                          FLAGS.max_sentence_len, vocab)
    print('train data size is {}, valid data size is {}.'.format(
        len(train_data[0]), len(valid_data[0])))
    model.train(train_data, valid_data)


if __name__ == '__main__':
    # Preprocess: build the vocabulary and the GloVe-initialised
    # embedding matrix before training.
    data_utils.preprocess(
        data_paths=[FLAGS.train_data_file, FLAGS.valid_data_file],
        vocab_path=FLAGS.vocabulary_file,
        glove_path=FLAGS.glove_file,
        embed_mat_path=FLAGS.save_embedding_file)
    train()
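# Hedged sketch of a few flag definitions this fragment assumes, in the
# TF 1.x tf.app.flags style; the flag names match the FLAGS fields read
# above, but every default value is an illustrative assumption.
flags = tf.app.flags
flags.DEFINE_string('train_data_file', './data/train_data.txt', 'raw training data')
flags.DEFINE_string('valid_data_file', './data/valid_data.txt', 'raw validation data')
flags.DEFINE_string('vocabulary_file', './data/vocab.txt', 'vocabulary file path')
flags.DEFINE_string('glove_file', './data/glove.840B.300d.txt', 'pre-trained GloVe vectors')
flags.DEFINE_string('save_embedding_file', './data/embedding.npy', 'embedding matrix path')
flags.DEFINE_integer('max_sentence_len', 100, 'maximum sentence length')
FLAGS = flags.FLAGS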