def main1():
    # Puzzle solver: "value N goes to bot B" lines seed bots with chips;
    # the remaining lines tell a ready bot where its low/high chip goes
    # (either to another bot or into an `output` bin).
    setup = [line for line in load_data() if line.startswith('value')]
    instructions = [
        line for line in load_data() if not line.startswith('value')
    ]
    # Seed phase: hand each starting chip to its bot.
    for line in setup:
        text = line.split()
        chip = int(text[1])
        number = text[-1]
        Bot.get_or_create(number).receives(chip)
    # Simulation phase: keep dispatching until every instruction is consumed.
    # A bot acts only once it is `ready` (presumably holding two chips -
    # confirm against the Bot class).
    while instructions:
        for bot in list(Bot.all_bots.values()):
            if bot.ready:
                # `extract` pops this bot's instruction out of the list,
                # which is what eventually empties `instructions`.
                sentinel = 'bot {} gives low'.format(bot.number)
                instruction = extract(instructions, sentinel)
                text = instruction.split()
                # words 5/6 describe the low-chip recipient
                if text[5] == 'bot':
                    low_recv = Bot.get_or_create(text[6])
                    bot.gives_low_to(low_recv)
                else:  # receiver is an output bin
                    position = int(text[6])
                    output[position] = bot.extract_lowest()
                # words 10/11 describe the high-chip recipient
                if text[10] == 'bot':
                    high_recv = Bot.get_or_create(text[11])
                    bot.gives_high_to(high_recv)
                else:
                    position = int(text[11])
                    output[position] = bot.extract_high()
def test_defects_module(filename, plot):
    """Train the defects module on the canned training file, then run it on
    the data in *filename*, optionally plotting the decision surface.

    plot -- the literal string "yes" enables plotting; anything else skips it.
    """
    # initialize defect module
    train_values = 5
    train_trees = 15
    filename_train = "train_data/defects_acc_data.output"
    init_server.init_defects_module(filename_train, train_values, train_trees)
    # load test data (column 3 = accz); aver_std_array yields interleaved
    # mean/std values, hence the reshape into (n, 2)
    test_values = 5
    test_data = cmn.aver_std_array(cmn.load_data(filename, (3,)), test_values)
    # Use floor division: under Python 3, `len(x)/2` is a float and
    # reshape() would raise TypeError. `//` behaves identically on Python 2.
    test_data = test_data.reshape(len(test_data) // 2, 2)
    test_times = cmn.label_array(cmn.load_data(filename, (0,)), test_values)
    # plot results
    if plot == "yes":
        train_data = cmn.aver_std_array(cmn.load_data(filename_train, (3,)),
                                        train_values)
        train_data = train_data.reshape(len(train_data) // 2, 2)
        # evaluate the classifier over a grid to draw the decision surface
        xx, yy = cmn.get_grid(train_data[:, [0, 1]])
        train_predicted = df.predicted(
            np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
        test_predicted = df.predicted(test_data)
        cmn.plot_2D_data(test_data, test_predicted, train_data,
                         train_predicted, [6.8, 12.8], [-0.1, 3.0])
    # start finding by time
    df.find_actions(test_data, test_times)
    return
def init_defects_module(filename, values, trees):
    """Train the defects module from accelerometer data.

    The defects module uses the default accelerometer.output file whose
    columns are: time,accx,accy,accz,label*.
    """
    # column 3 = accz; aver_std_array interleaves mean/std, hence (n, 2)
    train_data = cmn.aver_std_array(cmn.load_data(filename, (3,)), values)
    # Floor division keeps the reshape valid under Python 3, where `/` on
    # ints produces a float that reshape() rejects. Identical on Python 2.
    train_data = train_data.reshape(len(train_data) // 2, 2)
    train_predicted = cmn.label_array(cmn.load_data(filename, (4,)), values)
    df.init_defects_module(values, trees, train_data, train_predicted)
    return
def init_turns_module(filename, values, trees):
    """Train the turns module from compass data.

    The turns module uses the default compass.output file whose columns
    are: time,magn,label*.
    """
    # differentiate the magnetometer signal, then compute mean/std pairs
    train_data = cmn.get_diff_array(cmn.load_data(filename, (1,)))
    train_data = cmn.aver_std_array(train_data, values)
    # Floor division keeps the reshape valid under Python 3, where `/` on
    # ints produces a float that reshape() rejects. Identical on Python 2.
    train_data = train_data.reshape(len(train_data) // 2, 2)
    train_predicted = cmn.label_array(cmn.load_data(filename, (2,)), values)
    tr.init_turns_module(values, trees, train_data, train_predicted)
    return
def main(self):
    """Run the full 10-fold train/predict pipeline: per-fold models,
    out-of-fold validation predictions, test predictions, then the combined
    train.csv and the fold-averaged test.csv."""
    t_start = datetime.now()
    logger.info(' {} / {} '.format(self.name, self.random_seed).center(62, '='))
    logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
    # test.csv is the last artifact written, so its presence means a
    # previous run already completed - skip instead of redoing the work
    if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
        logger.info('Output already exists - skipping')
        return
    # Initialize the random number generator (numpy seeded from the
    # seeded RandomState so both streams derive from self.random_seed)
    self.random_state = RandomState(self.random_seed)
    np.random.seed(int.from_bytes(self.random_state.bytes(4),
                                  byteorder=sys.byteorder))
    train_df = common.load_data('train')
    # strip non-ASCII before vectorization
    train_df['comment_text'] = train_df['comment_text'].apply(unidecode)
    test_df = common.load_data('test')
    test_df['comment_text'] = test_df['comment_text'].apply(unidecode)
    vectorizer = self.build_vectorizer(train_df, test_df)
    folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
    for fold_num, train_ids, val_ids in folds:
        logger.info(f'Fold #{fold_num}')
        fold_train_df = train_df[train_df['id'].isin(train_ids)]
        fold_val_df = train_df[train_df['id'].isin(val_ids)]
        models = self.train(fold_num, vectorizer, fold_train_df, fold_val_df)
        logger.info('Generating the out-of-fold predictions')
        path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
        self.predict(models, vectorizer, fold_val_df, path)
        logger.info('Generating the test predictions')
        path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
        self.predict(models, vectorizer, test_df, path)
    logger.info('Combining the out-of-fold predictions')
    # NOTE(review): assumes exactly 10 folds - confirm stratified_kfold
    # always yields fold numbers 1..10
    df_parts = []
    for fold_num in range(1, 11):
        path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
        df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
        df_parts.append(df_part)
    train_pred = pd.concat(df_parts)
    path = os.path.join(self.output_dir, 'train.csv')
    train_pred.to_csv(path, index=False)
    logger.info('Averaging the test predictions')
    df_parts = []
    for fold_num in range(1, 11):
        path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
        df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
        df_parts.append(df_part)
    # each test id appears once per fold; the groupby-mean averages them
    test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
    path = os.path.join(self.output_dir, 'test.csv')
    test_pred.to_csv(path, index=False)
    logger.info('Total elapsed time - {}'.format(datetime.now() - t_start))
def init_behavior_defects_module(filename, values, trees):
    """Train the behavior-defects module from a generated file whose
    columns are: time,speed,turns,defects,lat,lon,labels*."""
    def _feature_column(index):
        # Load one labelled column and shape it as a single feature column.
        labelled = cmn.label_array(cmn.load_data(filename, (index,)), values)
        return labelled.reshape(len(labelled), 1)

    # columns 1..3 are speed, turns and defects; column 6 holds the labels
    train_data = np.hstack((_feature_column(1),
                            _feature_column(2),
                            _feature_column(3)))
    train_predicted = cmn.label_array(cmn.load_data(filename, (6,)), values)
    bd.init_behavior_defects_module(values, trees, train_data, train_predicted)
    return
def main():
    """Stack the no-data-corrected input maps depth-wise and save them as
    one row per cell with a column per map."""
    layers = [correct_nodata(common.load_data(path, skip=6))
              for path in INPUT_PATHS]
    stacked = np.dstack(layers)
    flattened = stacked.reshape(-1, len(MAP_NAMES))
    common.save_data(flattened, OUTPUT_PATH)
def load_errors():
    """Load the report file named on the command line.

    Returns the loaded data, or None (after printing a message) when the
    named file does not exist.
    """
    args = parsed_args()
    filepath = common.get_file_storage_location() + "/" + args.file
    if os.path.isfile(filepath):
        return common.load_data(filepath)
    print(filepath + " is not a file, provide an existing file")
    return
def main1():
    """Filter the loaded addresses down to the valid ones, print the count
    and return the valid list."""
    candidates = reject(is_invalid, load_data())
    valids = [address for address in candidates if is_valid(address)]
    print('total valid ips:', len(valids))
    return valids
def test_sqs_policies(self):
    # Each policy in the fixture is paired with the expected
    # cross-account-violation verdict, in order.
    policies = load_data('iam/sqs-policies.json')
    expected_flags = [False, True, True, False, False, False, False, False]
    allowed_accounts = set(['221800032964'])
    for policy, expected in zip(policies, expected_flags):
        violations = check_cross_account(policy, allowed_accounts)
        self.assertEqual(bool(violations), expected)
def main():
    """Drive the 50x6 screen with the puzzle instructions, then report the
    lit-pixel count and render the screen."""
    screen = Screen(50, 6)
    for instruction in load_data():
        screen.execute_instruction(instruction)
    print('Part 1: total pixels on:', sum(screen))
    # Part 2 is just reading the output
    print(screen)
def get_fuzzing(target, ck, data):
    # Reflected-XSS probe over GET parameters: append the `flag` marker to
    # each query-string value in turn and report via get_detect() when the
    # marker (or the marked value) is echoed back in the response body.
    # Python 2 code (print statements, urllib2/urlparse modules).
    print '[*]now demo test get xss......'
    parsed_tuple = urlparse.urlparse(urllib.unquote(target))
    url_query = urlparse.parse_qs(parsed_tuple.query, True)
    print url_query
    for i in url_query.keys():
        # rebuild "key=value" and substitute "key=value<flag>" into the URL
        query = str(i) + '=' + str(url_query[i][0])
        tmp = query + flag
        location = str(url_query[i][0]) + flag
        now_target = target.replace(query, tmp)
        try:
            # `data` parameter is shadowed here - the request body always
            # comes from load_data(), not from the caller's argument
            data = load_data(agent_list, ck)
            req = urllib2.Request(now_target, data=data)
            res = urllib2.urlopen(req)
            content_html = res.read()
            if flag in content_html or location in content_html:
                get_detect(now_target, ck, flag, True)
            else:
                # first non-reflecting parameter aborts the whole scan
                return False
        except:
            # NOTE(review): bare except silently swallows every error,
            # network failures included - consider narrowing.
            pass
def main(argv):
    """Evaluate a trained model on the test set: print overall metrics, then
    one predicted-labels/text line per example."""
    opts = argparser().parse_args()
    model, tokenizer, labels, config = load_trained_model(opts.model_dir)
    test_texts, test_labels = load_data(
        opts.test_data, opts.input_format, config.multiclass
    )
    label_encoder = MultiLabelBinarizer(classes=labels)
    tokenize = make_tokenization_function(tokenizer, config.seq_len)
    # overall metrics on the gold labels
    encoded_gold = label_encoder.fit_transform(test_labels)
    metrics_values = model.evaluate(
        tokenize(test_texts), encoded_gold, batch_size=opts.batch_size
    )
    for name, value in zip(model.metrics_names, metrics_values):
        print(f'{name}\t{value}')
    predictions = model.predict(
        tokenize(test_texts), verbose=1, batch_size=opts.batch_size
    )
    assert len(test_texts) == len(predictions)
    # per-example output: multiclass takes the argmax, multilabel takes
    # every label whose score clears 0.5
    for text, gold, preds in zip(test_texts, test_labels, predictions):
        if config.multiclass:
            pred_labels = [labels[preds.argmax()]]
        else:
            pred_labels = [labels[i] for i, v in enumerate(preds) if v > 0.5]
        print('{}\t{}'.format(','.join(pred_labels), text))
def main():
    """Classify the uncertain rows of the full dataset, then save one output
    row per input row: its type followed by class probabilities, the final
    prediction and the decision order."""
    print("Loading...")
    fullset = common.load_data(FULLSET_PATH, sep=',')
    types = get_types(fullset)
    print("Predicting...")
    # rows whose type is still undecided get model probabilities
    uncertain_mask = (types == UNCERTAIN_LABEL)
    uncertainset = fullset[uncertain_mask]
    probs = get_probs_for_uncertain(uncertainset)
    # map original row index -> that row's probability vector
    linenum_to_probs = {
        idx: prob
        for idx, prob in zip(np.nonzero(uncertain_mask)[0], probs)
    }
    print("Deciding...")
    probs_and_predictions = []
    for i, (row, type_) in enumerate(zip(fullset, types)):
        if type_ == UNCERTAIN_LABEL:
            probs = linenum_to_probs[i].tolist()
            prediction, order = check_and_decide(row[:common.N_DISASTER], probs)
            # order is reported 1-based in the output
            probs_and_predictions.append(probs + [prediction] + [order + 1])
        elif type_ == -99:
            # -99 marks a missing/invalid row; propagate across all columns
            probs_and_predictions.append([-99] * (common.N_CLASS + 2))
        else:
            # already-certain row: one-hot probabilities, known type, order 0
            probs = [0.0] * common.N_CLASS
            probs[type_] = 1.0
            probs_and_predictions.append(probs + [type_] + [0])
    print("Saving...")
    common.save_data(
        np.concatenate((types[:, np.newaxis], probs_and_predictions), axis=1),
        OUTPUT_PATH)
def load_errors():
    """Load the found-errors file named on the command line.

    Returns the loaded data, or None (after printing a message) when the
    named file does not exist.
    """
    args = parsed_args()
    filepath = common.found_errors_storage_location() + "/" + args.file
    if os.path.isfile(filepath):
        return common.load_data(filepath)
    print(filepath + " is not a file, provide an existing file")
    return
def main2():
    # same as before but with register c starting at 1
    registers = dict.fromkeys('abd', 0)
    registers['c'] = 1
    program = list(load_data())
    final_state = execute(program, registers)
    print("register a:", final_state['a'])
def read_tags(text_file, fact_file, fact_type):
    """Returns the text as a unicode string as well as a dictionary with
    the various kinds of tags."""
    (text, tags) = load_data(text_file, fact_file, fact_type)
    if fact_type != 'BAE':
        return (text, read_tags_basic(tags))
    # BAE facts carry their tags inside STRUCTURE elements
    structures = tags_with_name(tags, 'STRUCTURE')
    return (text, read_tags_bae(structures))
def main():
    """Listen for barks addressed to a known user on the given host."""
    parser = ArgumentParser(prog="listen")
    parser.add_argument("user")
    parser.add_argument("-n", "--host", default="127.0.0.1",
                        help="hostname of inboxes")
    args = parser.parse_args()
    # only registered users may listen
    known_users = common.load_data("users.pickle")
    assert args.user in known_users
    Reactor(GetBarks(args.user, args.host)).run()
def main(argv):
    """Train a classifier on the train split, evaluate it on the dev split,
    and optionally save the trained model. Returns 0 on success."""
    options = argparser().parse_args(argv[1:])
    train_texts, train_labels = load_data(options.train, options.input_format,
                                          options.multiclass)
    dev_texts, dev_labels = load_data(options.dev, options.input_format,
                                      options.multiclass)
    num_train_examples = len(train_texts)
    # fit the binarizer on training labels only; dev labels are transformed
    # with the same classes
    label_encoder = MultiLabelBinarizer()
    label_encoder.fit(train_labels)
    train_Y = label_encoder.transform(train_labels)
    dev_Y = label_encoder.transform(dev_labels)
    num_labels = len(label_encoder.classes_)
    classifier, tokenizer, optimizer, config = prepare_classifier(
        num_train_examples, num_labels, options)
    # remember the task type so evaluation can decode predictions correctly
    config.multiclass = options.multiclass
    tokenize = make_tokenization_function(tokenizer, options.seq_len)
    train_X = tokenize(train_texts)
    dev_X = tokenize(dev_texts)
    history = classifier.fit(
        train_X, train_Y,
        epochs=options.epochs,
        batch_size=options.batch_size,
        validation_data=(dev_X, dev_Y),
    )
    metrics_values = classifier.evaluate(dev_X, dev_Y,
                                         batch_size=options.batch_size)
    for name, value in zip(classifier.metrics_names, metrics_values):
        print(f'{name}\t{value}')
    if options.save_model is not None:
        save_trained_model(options.save_model, classifier, tokenizer,
                           label_encoder.classes_, config)
    return 0
def main():
    """Compute the uncertain-count map for the full dataset and save it as
    a grid with N_ROWS rows."""
    print('Loading...')
    fullset = common.load_data(FULLSET_PATH, sep=',')
    print('Processing...')
    counts = get_uncertain_count_map(fullset)
    print('Saving...')
    common.save_map(counts.reshape(common.N_ROWS, -1), OUTPUT_PATH)
    print('Done!')
def post_fuzzing(target, ck, data):
    # POST-based XSS probe (Python 2, interactive): fetch the target, then
    # either auto-list the page's <input> fields or take user-supplied post
    # data, inject the `flag` marker into one parameter at a time and call
    # post_detect() when the marker is reflected in the response.
    print '[*]now demo test post xss......'
    try:
        # `data` parameter is shadowed - the request body always comes
        # from load_data(), not from the caller's argument
        data = load_data(agent_list, ck)
        req = urllib2.Request(target, data=data)
        res = urllib2.urlopen(req)
        content = res.read()
        if res.code == 301 or res.code == 302:
            print '[*]we get a 301/302 when connect to the target,please recheck it'
            exit()
        elif res.code == 200:
            param_method = raw_input(
                '[*]you can provide params for accuracy or auto find by xssee: [0]provide [1]auto'
            )
            if param_method == '1':
                # auto mode: only lists the form inputs found in the page
                print 'now detect params......'
                content = BeautifulSoup(content, 'html.parser')
                input_list = content.select('input')
                for i in range(0, len(input_list)):
                    name = input_list[i]['name']
                    in_type = input_list[i]['type']
                    print name, in_type
            elif param_method == '0':
                # manual mode: parse "k=v&k2=v2" into a dict
                print '[*]post data like: id=1&name=2 etc'
                post_str = raw_input('[*]please input post data:')
                param_list = post_str.strip().split('&')
                param_dict = {}
                for i in range(0, len(param_list)):
                    param_dict[param_list[i].strip().split('=')
                               [0]] = param_list[i].strip().split('=')[1]
                for i in param_dict.keys():
                    # mark one parameter, probe, then restore it before
                    # moving on to the next parameter
                    param_dict[i] = param_dict[i] + flag
                    try:
                        post_data = load_post_data(agent_list, ck, param_dict)
                        req = urllib2.Request(target, data=post_data)
                        res = urllib2.urlopen(req)
                        content_html = res.read()
                        if flag in content_html or param_list[
                                i] in content_html:
                            print 'ok!'
                            post_detect(target, ck, flag, param_dict, True)
                        param_dict[i] = param_dict[i].replace(flag, '')
                    except:
                        # NOTE(review): bare except hides connection errors
                        # and also skips the marker-restore above on failure
                        pass
    except:
        print '[*]connect failed to target'
        exit()
def on_timer_task(self, event):
    """Periodic tick: refresh the cached user list when stale, send one
    random bark to its user's outbox, then reschedule this task."""
    now = time.time()
    # re-read users.pickle only after the reread period has elapsed
    if now - self.last_user_reread > self.user_reread_period:
        self.users = common.load_data("users.pickle")
        self.last_user_reread = now
    bark = self.make_random_bark()
    address = "//%s/outbox/%s" % (self.hostname, bark.user)
    self.linker.sender(address).send(tuple(bark))
    event.reactor.schedule(self.bark_period, self)
def main():
    """Derive the upward trainset from the full dataset and save it."""
    print('Loading...')
    full = common.load_data(FULLSET_PATH, sep=',')
    print('Processing...')
    result = get_trainset(full, upward=True)
    print('Saving...')
    common.save_data(result, OUTPUT_PATH)
    print('Done!')
def main_report_count_in_report_file(reports_filepath):
    """Count the reports in *reports_filepath* whose error_id is one of the
    for-review error types.

    Raises ValueError when the file holds no data.
    """
    reports_data = common.load_data(reports_filepath)
    # `is None` rather than `== None` (the original compared with ==)
    if reports_data is None:
        print(reports_filepath + " has no data")
        raise ValueError(reports_filepath + " has no data")
    # Membership test against a set makes this a single O(n) pass instead
    # of the original O(types * reports) nested loop; the count is the same
    # as long as for_review() yields distinct ids.
    review_ids = set(generate_webpage_with_error_output.for_review())
    return sum(1 for e in reports_data if e['error_id'] in review_ids)
def main():
    """Post a bark, joined from the command-line words, for a known user."""
    parser = ArgumentParser(prog="bark")
    parser.add_argument("user")
    parser.add_argument("content", nargs="+")
    parser.add_argument("-n", "--host", default="127.0.0.1",
                        help="hostname of outboxes")
    args = parser.parse_args()
    # only registered users may bark
    known_users = common.load_data("users.pickle")
    assert args.user in known_users
    message = " ".join(args.content)
    Reactor(PutBark(args.user, message, args.host)).run()
def main():
    """Walk OUTPUT_DIR/<name>/<seed>/<params>/ and score every model's
    out-of-fold validation predictions against the train labels, writing
    one evaluation.csv per seed directory."""
    train_df = common.load_data('train')
    # `path` acts as a stack of directory components while walking; every
    # append below is matched by a pop at the end of its loop level
    path = [common.OUTPUT_DIR]
    for name in os.listdir(os.path.join(*path)):
        if not os.path.isdir(os.path.join(*path, name)):
            continue
        path.append(name)
        for random_seed in os.listdir(os.path.join(*path)):
            if not os.path.isdir(os.path.join(*path, random_seed)):
                continue
            path.append(random_seed)
            results = []
            for params_str in os.listdir(os.path.join(*path)):
                if not os.path.isdir(os.path.join(*path, params_str)):
                    continue
                path.append(params_str)
                model_results = OrderedDict({'name': name})
                # the directory name encodes hyperparameters as
                # underscore-separated k=v pairs; tokens without '=' are
                # silently skipped
                for param in sorted(params_str.split('_')):
                    try:
                        k, v = param.split('=')
                        k = k.replace('-', '_')
                        model_results[k] = v
                    except ValueError:
                        pass
                scores = []
                for fold_num in range(1, 11):
                    fold_csv = os.path.join(*path,
                                            f'fold{fold_num}_validation.csv')
                    if os.path.isfile(fold_csv):
                        output = pd.read_csv(fold_csv).sort_values('id')
                        target = train_df[train_df['id'].isin(
                            output['id'])].sort_values('id')
                        # the ids must line up exactly before scoring
                        assert (
                            output['id'].values == target['id'].values).all()
                        output = output[common.LABELS].values
                        target = target[common.LABELS].values
                        score = roc_auc_score(target, output, average='macro')
                        model_results[f'fold{fold_num}'] = score
                        scores.append(score)
                if scores:
                    model_results['mean'] = np.mean(scores)
                    model_results['std'] = np.std(scores)
                results.append(model_results)
                path.pop()
            if results:
                results = pd.DataFrame(results).sort_values('mean',
                                                            ascending=False)
                results.to_csv(os.path.join(*path, 'evaluation.csv'),
                               index=False)
            path.pop()
        path.pop()
def main():
    """Build a new-releases playlist for the current user from their
    favorite artists' recent albums and singles."""
    ids = load_data(favorite_artists[CURRENT_USER])
    artist_names = load_data_at(favorite_artists[CURRENT_USER], 1)
    new_songs = []
    index = 0
    num_ids = str(len(ids))
    for artist_id in ids:
        artist_name = artist_names[index]
        index = index + 1
        # progress indicator: "k/N: artist name"
        print(str(index) + '/' + num_ids + ': ' + artist_name)
        time.sleep(PAUSE_TIME)  # throttle the Spotify API calls
        results = sp.artist_albums(artist_id, album_type='album')
        new_songs.extend(get_recent_tracks(results, artist_id))
        time.sleep(PAUSE_TIME)
        results = sp.artist_albums(artist_id, album_type='single')
        new_songs.extend(get_recent_tracks(results, artist_id))
    print('New songs: ' + str(len(new_songs)))
    print('Removing duplicates...')
    new_songs = remove_duplicates(new_songs)
    print('New songs: ' + str(len(new_songs)))
    # history filtering is only configured for this one user
    if CURRENT_USER == ALEJANDRO:
        print('Removing tracks in history...')
        new_songs = remove_history_tracks(new_songs)
        print('New songs: ' + str(len(new_songs)))
    # split remixes from everything else so they land after the main tracks
    main_bucket = []
    remix_bucket = []
    for track in new_songs:
        name = track['name']
        add_to_main = True
        for indicator in REMIX_INDICATORS:
            if indicator in name.lower():
                # NOTE(review): a track matching several indicators is
                # appended to remix_bucket once per match
                remix_bucket.append(track)
                add_to_main = False
        if add_to_main:
            main_bucket.append(track)
    print('main_bucket: ' + str(len(main_bucket)))
    print('remix_bucket: ' + str(len(remix_bucket)))
    playlist_id = create_playlist(sp, playlist_title[CURRENT_USER],
                                  'All new music released after last friday')
    add_to_playlist(playlist_id, tracks_to_ids(main_bucket))
    add_to_playlist(playlist_id, tracks_to_ids(remix_bucket))
def main1():
    '''find most common char in each column'''
    rows = load_data()
    answer = []
    for column in zip(*rows):
        # every element of `column` is a single character, so Counter
        # tallies characters directly
        tally = Counter(column)
        answer.append(tally.most_common(1)[0][0])
    print('final answer:', ''.join(answer))
def main():
    """Train N_MODEL models, one per cross-validation split, evaluating and
    saving each model and its training history."""
    trainset = common.onehot_encode(
        common.load_data(TRAINSET_PATH, sep=','), 0)
    for i in range(N_MODEL):
        x_train, x_test, y_train, y_test = common.split(trainset, i)
        x_train, x_test = common.normalize(x_train, x_test)
        model, history = train(x_train, y_train, N_EPOCH)
        model.evaluate(x_test, y_test)
        model.save(common.numbering(MODEL_PATH, i))
        save_history(history, common.numbering(HISTORY_PATH, i))
        print(i, ' is done.')
def main():
    """Cluster the samples with k-modes, dump the label->codes mapping as
    JSON and save the clustering cost plus centroids."""
    fullset = common.load_data(FULLSET_PATH, sep=',')
    clust_samples = get_clust_samples(fullset)
    km = KModes(n_clusters=N_CLUST, n_init=N_INIT, init='Huang', verbose=True)
    clust_labels = km.fit_predict(clust_samples)
    label_to_codes = get_label_to_codes(clust_samples, clust_labels)
    with open(JSON_PATH, 'w') as f:
        json.dump(label_to_codes, f, sort_keys=True)
    # first row is the clustering cost, the rest are the centroids
    result_rows = [[km.cost_]] + km.cluster_centroids_.tolist()
    common.save_data(result_rows, RESULT_PATH)
def main():
    """Interactive bikeshare-statistics loop; repeats until the user
    answers anything other than 'yes' to the restart prompt."""
    while True:
        city, month, day, raw_data = ms.new_get_filters()
        df = cf.load_data(city, month, day)
        ms.show_sample_data(df, raw_data)
        # default_data yields the city's data without month/day filtering,
        # so the time stats cover the whole period
        ms.time_stats(cf.default_data(city))
        ms.station_stats(df)
        ms.trip_duration_stats(df)
        ms.user_stats(df, city)
        answer = input('\nWould you like to restart? Enter yes or no.\n')
        if answer.lower() != 'yes':
            break
def main2():
    ''' now the least common '''
    rows = load_data()
    answer = []
    for column in zip(*rows):
        # every element of `column` is a single character, so Counter
        # tallies characters directly; the last entry is the rarest
        tally = Counter(column)
        answer.append(tally.most_common()[-1][0])
    print('final answer part 2:', ''.join(answer))
def main():
    """Run the extended Kalman filter (Q2) on one KITTI drive, writing
    results into a freshly created timestamped directory."""
    basedir = '/home/nadav/studies/mapping_and_perception_autonomous_robots/kitti_data/orginaized_data'
    date = '2011_09_30'
    dataset_number = '0033'
    result_dir = r'/home/nadav/studies/mapping_and_perception_autonomous_robots/project_2/results'
    stamp = time.strftime("%Y.%m.%d-%H.%M")
    result_dir_timed = os.path.join(result_dir, f'{stamp}')
    print(f'saving to: {result_dir_timed}')
    os.makedirs(result_dir_timed, exist_ok=True)
    data = load_data(basedir, date, dataset_number)
    # Q2
    extended_kalman_filter(result_dir_timed, data)
def add_recent_tracks(results, artist_id):
    """Add this artist's qualifying recent releases to the playlist,
    skipping black-listed artists entirely."""
    if artist_id in load_data('black_list_artists.csv'):
        return
    for release in results['items']:
        if not should_add_to_list(release):
            continue
        time.sleep(PAUSE_TIME)  # throttle the Spotify API calls
        print('release_date: ' + release['release_date'] + ' id: ' +
              release['id'] + ' name: ' + release['name'] +
              ' album_group: ' + release['album_group'] +
              ' album_type: ' + release['album_type'])
        # remember every release date we have seen
        if release['release_date'] not in accepted_dates:
            accepted_dates.append(release['release_date'])
        time.sleep(PAUSE_TIME)
        track_ids = get_album_track_ids(sp.album(release['uri']), artist_id)
        add_to_playlist(track_ids)
def get_probs_for_uncertain(uncertainset):
    """Average the class probabilities predicted for the uncertain rows
    across the N_MODEL-model ensemble."""
    trainset = common.load_data(TRAINSET_PATH, sep=',')
    encoded_uncertainset = common.onehot_encode(
        uncertainset[:, common.N_DISASTER:], 0)
    encoded_trainset = common.onehot_encode(trainset, 0)
    prob_sums = np.zeros((len(uncertainset), common.N_CLASS))
    for i in range(N_MODEL):
        # normalization statistics come from this model's own training split
        x_train, _, _, _ = common.split(encoded_trainset, i)
        _, normalized = common.normalize(x_train, encoded_uncertainset)
        model = tf.keras.models.load_model(common.numbering(MODEL_PATH, i))
        prob_sums += model.predict(normalized)
        print(f'{i} is done.')
    return prob_sums / N_MODEL
def read_file():
    """Reads and returns childs and last_upd from the data file."""
    data_lst = common.load_data()
    if not data_lst:
        # no stored data yet
        return None, None
    if common.DATA_FORMAT == common.JSON:
        childs, last_upd_str = data_lst
        # JSON stores the date as an ISO-format string
        last_upd = dt.datetime.strptime(last_upd_str, '%Y-%m-%d').date()
    elif common.DATA_FORMAT == common.PKL:
        # pickle stores the date object directly
        childs, last_upd = data_lst
    else:
        # unknown format: nothing assigned (as in the original, the
        # return below would then raise NameError)
        pass
    return childs, last_upd
def preprocessMain(stopword=False, basic_word = True, lemmatize=True):
    """Preprocess the review records and write them out as a TSV file with
    an id/review/sentiment header."""
    # Preprocess train data
    records = load_data('data/reviews.tsv')
    preprocess_records = [preprocess(record, stopword=stopword, basic_word = basic_word, lemmatize=lemmatize) for record in records]
    with open('data/preprocessed_reviews.tsv', 'w') as preprocess_file:
        header = 'id\treview\tsentiment\n'
        preprocess_file.write(header)
        # NOTE(review): the loop below writes the ORIGINAL `records`, not
        # `preprocess_records` - unless preprocess() mutates each record in
        # place, the preprocessing result is discarded. Confirm intent.
        for record in records:
            try:
                # fields are bytes here (decode), suggesting Python 2 input
                preprocess_file.write('%s\t%s\t%i\n' % (record['id'].decode('UTF-8'), record['review'].decode('UTF-8'), record['sentiment']))
            except UnicodeEncodeError:
                print("unicode encode error")
                continue
    import sys
    sys.stdout.flush()
def make_query_to_reload_only_affected_objects(input_filename_with_reports, output_query_filename):
    """Generate a JOSM reload query covering every error category present
    in the given report file, archiving any previously generated query
    before writing the new one."""
    input_filepath = common.get_file_storage_location() + "/" + input_filename_with_reports
    output_filepath = root() + 'reload_querries/' + output_query_filename
    if not os.path.isfile(input_filepath):
        print("file not found")
        return
    # make sure the output directory exists before writing into it
    directory_path = os.path.split(output_filepath)[0]
    pathlib.Path(directory_path).mkdir(parents=True, exist_ok=True)
    # archive the previous query (timestamped) instead of overwriting it
    archived_filepath = output_filepath + "-archived-" + str(datetime.datetime.now()) + ".query"
    try:
        move_file(output_filepath, archived_filepath)
    except FileNotFoundError:
        pass  # it is OK, it just means that we are running for the first time or cache was deleted
    with open(output_filepath, 'w') as query_file:
        # collect the distinct error ids from the report file, preserving
        # first-seen order
        all_errors = []
        for e in common.load_data(input_filepath):
            if e['error_id'] not in all_errors:
                all_errors.append(e['error_id'])
        query = common.get_query_for_loading_errors_by_category(filename = input_filename_with_reports, printed_error_ids = all_errors, format = "josm")
        query_file.write(query)
def main(params=None): if params is None: params = { 'dataset': 'DRLD', 'exp_name': 'char_test', 'test_fold': 0, 'n_dev_folds': 1, 'min_doc_thresh': 1, 'initialize_word_vectors': True, 'vectors': 'chars_word2vec_25', # default_word2vec_300, anes_word2vec_300, chars_word2vec_25, eye_1 ... 'init_scale': 0.2, 'add_OOV_dim': True, 'win': 1, # size of context window 'add_DRLD': True, 'rnn_type': 'basic', # basic, GRU, or LSTM 'n_hidden': 50, # size of hidden units 'pooling_method': 'max', # max, mean, or attention1/2 'bidirectional': True, 'bi_combine': 'concat', # concat, max, or mean 'train_embeddings': True, 'lr': 0.1, # learning rate 'lr_emb_fac': 1, # factor to modify learning rate for embeddings 'decay_delay': 10, # number of epochs with no improvement before decreasing learning rate 'decay_factor': 0.5, # factor by which to multiply learning rate in case of delay 'n_epochs': 300, 'add_OOV_noise': True, 'OOV_noise_prob': 0.01, 'minibatch_size': 16, 'classify_minibatch_size': 64, 'ensemble': False, 'save_model': True, 'seed': 42, 'verbose': 1, 'reuse': False, 'orig_T': 0.04, 'tau': 0.01, 'clip_gradients': False } #params = fh.read_json('/Users/dcard/Projects/CMU/ARK/guac/experiments/best_mod.json') #params['exp_name'] += '_best' #params['n_hidden'] = int(params['n_hidden']) keys = params.keys() keys.sort() for key in keys: print key, ':', params[key] # seed the random number generators np.random.seed(params['seed']) random.seed(params['seed']) vector_type = params['vectors'].split('_')[0] params['word2vec_dim'] = int(params['vectors'].split('_')[-1]) reuser = None if params['reuse']: reuser = reusable_holdout.ReuseableHoldout(T=params['orig_T'], tau=params['tau']) if params['dataset'] == 'DRLD': datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes'] elif params['dataset'] == 'MIP': datasets = ['MIP-Personal-1', 'MIP-Personal-2', 'MIP-Political-1', 'MIP-Political-2'] elif params['dataset'] == 'MOLD': datasets = ['McCain-Likes', 
'McCain-Dislikes', 'Obama-Likes', 'Obama-Dislikes'] elif params['dataset'] == 'Primary': datasets = ['Obama-Primary', 'Clinton-Primary'] elif params['dataset'] == 'General': datasets = ['Obama-General', 'McCain-General'] else: datasets = [params['dataset']] np.random.seed(params['seed']) random.seed(params['seed']) best_valid_f1s = [] best_true_valid_f1s = [] best_test_f1s = [] best_train_f1s = [] test_prediction_arrays = [] output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name']) output_filename = fh.make_filename(output_dir, 'params', 'txt') fh.write_to_json(params, output_filename) for dev_fold in range(params['n_dev_folds']): print "dev fold =", dev_fold output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'], 'fold' + str(dev_fold)) if vector_type == 'chars': all_data, words2idx, items, all_labels = common.load_char_data(datasets, params['test_fold'], dev_fold) else: all_data, words2idx, items, all_labels = common.load_data(datasets, params['test_fold'], dev_fold, params['min_doc_thresh']) train_xy, valid_xy, test_xy = all_data train_lex, train_y = train_xy valid_lex, valid_y = valid_xy test_lex, test_y = test_xy #if params['minibatch_size'] > 1 or params['classify_minibatch_size'] > 1: print "padding input with zeros" all_data, all_masks = common.prepare_data(train_lex, valid_lex, test_lex) train_lex, valid_lex, test_lex = all_data train_masks, valid_masks, test_masks = all_masks #else: # train_masks = [np.ones(len(x)).astype('int32') for x in train_lex] # valid_masks = [np.ones(len(x)).astype('int32') for x in valid_lex] # test_masks = [np.ones(len(x)).astype('int32') for x in test_lex] print "expanding x with context win dows" # Rejigger to convert x to contex win in advance train_x_win = expand_x_with_context_win(train_lex, params['win']) valid_x_win = expand_x_with_context_win(valid_lex, params['win']) test_x_win = expand_x_with_context_win(test_lex, params['win']) order = range(len(train_lex)) print "done" train_items, dev_items, 
test_items = items vocsize = len(words2idx.keys()) idx2words = dict((k, v) for v, k in words2idx.iteritems()) best_test_predictions = None n_sentences = len(train_lex) print "vocsize = ", vocsize, 'n_train', n_sentences codes = all_labels.columns n_items, n_codes = all_labels.shape # get the words in the sentences for the test and validation sets words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex] groundtruth_test = test_y[:] words_test = [map(lambda x: idx2words[x], w) for w in test_lex] #if vector_type == 'eye': # initial_embeddings = np.eye(vocsize) # emb_dim = initial_embeddings.shape[1] if params['initialize_word_vectors']: initial_embeddings = common.load_embeddings(params, words2idx) emb_dim = initial_embeddings.shape[1] else: initial_embeddings = None emb_dim = params['word2vec_dim'] print "embedding dim =", emb_dim temp_output = fh.make_filename(output_dir, 'embedding_labels', 'json') fh.write_to_json(idx2words, temp_output) extra_input_dims = 0 if params['add_DRLD']: extra_input_dims = 2 print "Building RNN" rnn = RNN(nh=params['n_hidden'], nc=n_codes, ne=vocsize, de=emb_dim, cs=params['win'], extra_input_dims=extra_input_dims, initial_embeddings=initial_embeddings, init_scale=params['init_scale'], rnn_type=params['rnn_type'], train_embeddings=params['train_embeddings'], pooling_method=params['pooling_method'], bidirectional=params['bidirectional'], bi_combine=params['bi_combine'], clip_gradients=params['clip_gradients'] ) temp_filename = fh.make_filename(output_dir, 'initial_embeddings', 'npy') rnn.save_embeddings(temp_filename) train_likes = [1 if re.search('Likes', i) else 0 for i in train_items] dev_likes = [1 if re.search('Likes', i) else 0 for i in dev_items] test_likes = [1 if re.search('Likes', i) else 0 for i in test_items] train_dem = [1 if re.search('Democrat', i) else 0 for i in train_items] dev_dem = [1 if re.search('Democrat', i) else 0 for i in dev_items] test_dem = [1 if re.search('Democrat', i) else 0 for i in test_items] 
train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)] dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)] test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)] ### LOAD #rnn.load(output_dir) # train with early stopping on validation set best_f1 = -np.inf params['clr'] = params['lr'] for e in xrange(params['n_epochs']): # shuffle #shuffle([train_lex, train_y, train_extra, train_masks], params['seed']) # shuffle the input data shuffle([order, train_lex, train_y, train_extra, train_masks], params['seed']) # shuffle the input data params['ce'] = e # store the current epoch tic = timeit.default_timer() ms = params['minibatch_size'] n_train = len(train_lex) nll = 0 #for i, orig_x in enumerate(train_lex): for iteration, i in enumerate(range(0, n_train, ms)): #orig_x = train_lex[i] #n_words = len(orig_x) #if params['add_OOV_noise']: # draws = np.random.rand(n_words) # x = [OOV_index if draws[i] < params['OOV_noise_prob'] else orig_x[i] for i in range(n_words)] #else: # x = orig_x #y = train_y[i] extra = train_extra[i] #mask = train_masks[i] minibatch_x, minibatch_mask,\ minibatch_extra, minibatch_y= select_minibatch(train_x_win, train_masks, train_extra, train_y, params['win'], i, ms, order, params['add_OOV_noise'], params['OOV_noise_prob']) #if i == 0: # print '\n'.join([' '.join([idx2words[idx] for idx in minibatch_x[:, k, 0].tolist()]) for # k in range(ms)]) nll_i, a_sum = rnn.train(minibatch_x, minibatch_mask, minibatch_y, params['win'], params['clr'], params['lr_emb_fac'], extra_input_dims, minibatch_extra) nll += nll_i #rnn.train(x, mask, y, params['win'], params['clr'], params['lr_emb_fac'], # extra_input_dims, extra) print '[learning] epoch %i >> %2.2f%%' % ( e, (i + 1) * 100. 
/ float(n_sentences)), print 'completed in %.2f (sec), nll = %.2f, a_sum = %.1f <<\r' % (timeit.default_timer() - tic, nll, np.max(a_sum)), sys.stdout.flush() if np.isnan(nll) or np.isinf(nll): if best_f1 > 0: break else: return {'loss': 1.0, 'final_test_f1': 0, 'valid_f1s': 0, 'true_valid_f1s': 0, 'train_f1s': 0, 'test_f1s': 0, 'status': STATUS_OK } # evaluation // back into the real world : idx -> words print "" #print "true y", train_y[-1] #y_pred = rnn.classify(np.array(train_x_win[-1]).reshape((1, len(train_x_win[-1]))), # train_masks[-1], params['win'], extra_input_dims, train_extra[-1])[0] #print "pred y", y_pred #if params['pooling_method'] == 'attention1' or params['pooling_method'] == 'attention2': # if extra_input_dims == 0: # r = np.random.randint(0, len(train_lex)) # print r, rnn.a_sum_check(np.asarray(contextwin(train_lex[r], params['win'])).astype('int32')) predictions_train = predict(n_train, params['classify_minibatch_size'], train_x_win, train_masks, train_y, params['win'], extra_input_dims, train_extra, rnn, order) n_valid = len(valid_lex) n_test = len(test_lex) predictions_valid = predict(n_valid, params['classify_minibatch_size'], valid_x_win, valid_masks, valid_y, params['win'], extra_input_dims, dev_extra, rnn) predictions_test = predict(n_test, params['classify_minibatch_size'], test_x_win, test_masks, test_y, params['win'], extra_input_dims, test_extra, rnn) """ predictions_train = [rnn.classify(x, train_masks[i], params['win'], extra_input_dims, train_extra[i])[0] for i, x in enumerate(train_lex)] predictions_valid = [rnn.classify(x, valid_masks[i], params['win'], extra_input_dims, dev_extra[i])[0] for i, x in enumerate(valid_lex)] predictions_test = [rnn.classify(x, test_masks[i], params['win'], extra_input_dims, test_extra[i])[0] for i, x in enumerate(test_lex)] """ train_f1 = common.calc_mean_f1(predictions_train, train_y) test_f1 = common.calc_mean_f1(predictions_test, test_y) valid_f1 = common.calc_mean_f1(predictions_valid, valid_y) 
question_f1s = [] question_pps = [] print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1 if valid_f1 > best_f1: best_rnn = copy.deepcopy(rnn) best_f1 = valid_f1 best_test_predictions = predictions_test if params['verbose']: print('NEW BEST: epoch', e, 'valid f1', valid_f1, 'best test f1', test_f1) params['tr_f1'] = train_f1 params['te_f1'] = test_f1 params['v_f1'] = valid_f1 params['be'] = e # store the current epoch as a new best # learning rate decay if no improvement in a given number of epochs if abs(params['be']-params['ce']) >= params['decay_delay']: params['clr'] *= params['decay_factor'] params['be'] = params['ce'] print "Reverting to current best; new learning rate = ", params['clr'] # also reset to the previous best rnn = best_rnn if params['clr'] < 1e-5: break if best_f1 == 1.0: break if best_f1 == 0 and e > 7: break if params['save_model']: predictions_test = predict(len(test_y), params['classify_minibatch_size'], test_x_win, test_masks, test_y, params['win'], extra_input_dims, test_extra, best_rnn) best_rnn.save(output_dir) common.write_predictions(datasets, params['test_fold'], dev_fold, predictions_test, test_items, output_dir) print('BEST RESULT: epoch', params['be'], 'train F1 ', params['tr_f1'], 'valid F1', params['v_f1'], 'best test F1', params['te_f1'], 'with the model', output_dir) best_true_valid_f1s.append(params['v_f1']) best_test_f1s.append(params['te_f1']) best_train_f1s.append(params['tr_f1']) if reuser is not None: best_valid_f1 = reuser.mask_value(params['v_f1'], params['tr_f1']) else: best_valid_f1 = params['v_f1'] best_valid_f1s.append(best_valid_f1) test_prediction_arrays.append(np.array(best_test_predictions, dtype=int)) params['ensemble'] = False if params['ensemble']: test_predictions_stack = np.dstack(test_prediction_arrays) final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0] predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes) true_df = 
pd.DataFrame(np.array(test_y), index=test_items, columns=codes) final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df) else: final_test_f1 = np.median(best_test_f1s) return {'loss': -np.median(best_valid_f1s), 'final_test_f1': final_test_f1, 'valid_f1s': best_valid_f1s, 'train_f1s': best_train_f1s, 'true_valid_f1s': best_true_valid_f1s, 'test_f1s': best_test_f1s, 'status': STATUS_OK }
def test_behavior_defects_module(filename, plot, test_type): # initialize modules train_values = 1 train_trees = 10 filename_train = "train_data/behavior_defects_data.output" init_server.init_behavior_defects_module(filename_train, train_values, train_trees) if test_type == "full": # generate new dataset print bcolors.HEADER + "initialize dependent modules" + bcolors.ENDC init_server.init_speed_module ("train_data/speed_acc_data.output", 10, 15) init_server.init_turns_module ("train_data/turns_com_data.output", 5, 10) init_server.init_defects_module ("train_data/defects_acc_data.output", 5, 15) print bcolors.OKGREEN + "Done! " + bcolors.ENDC # load dependent test data and classify actions # structure: time,accx,accy,accz,compass,lat,lon,speed test_values = 5 print bcolors.HEADER + "Start getting speed data" + bcolors.ENDC test_speed_data = cmn.aver_std_array(cmn.load_data(filename, (2,)), test_values) test_speed_data = test_speed_data.reshape(len(test_speed_data)/2, 2) predicted_speed = sp.predicted(test_speed_data) predicted_speed = predicted_speed.reshape(len(predicted_speed), 1) print bcolors.OKGREEN + "Done! " + bcolors.ENDC#, predicted_speed print bcolors.HEADER + "Start getting turns data" + bcolors.ENDC test_turns_data = cmn.get_diff_array(cmn.load_data(filename, (4,))) test_turns_data = cmn.aver_std_array(test_turns_data, test_values) test_turns_data = test_turns_data.reshape(len(test_turns_data)/2, 2) predicted_turns = tr.predicted(test_turns_data) predicted_turns = predicted_turns.reshape(len(predicted_turns), 1) print bcolors.OKGREEN + "Done! " + bcolors.ENDC#, predicted_turns print bcolors.HEADER + "Start getting defects data" + bcolors.ENDC test_defects_data = cmn.aver_std_array(cmn.load_data(filename, (3,)), test_values) test_defects_data = test_defects_data.reshape(len(test_defects_data)/2, 2) predicted_defects = df.predicted(test_defects_data) predicted_defects = predicted_defects.reshape(len(predicted_defects), 1) print bcolors.OKGREEN + "Done! 
" + bcolors.ENDC#, predicted_defects print bcolors.HEADER + "Start generating test data" + bcolors.ENDC test_times = cmn.label_array(cmn.load_data(filename, (0,)), test_values) test_data = np.hstack((predicted_speed, predicted_turns, predicted_defects)) np.savetxt('generated_behavior_defects_data.output', test_data, delimiter=',', fmt='%i') print bcolors.OKGREEN + "Done! " + bcolors.ENDC#, test_data elif test_type == "express": # use default dataset # structure: time,speed,turn,defect,lat,lon print bcolors.HEADER + "Start getting test data" + bcolors.ENDC test_values = 1 test_speed_data = cmn.label_array(cmn.load_data(filename, (1,)), values) test_speed_data = test_speed_data.reshape(len(test_speed_data), 1) test_turns_data = cmn.label_array(cmn.load_data(filename, (2,)), values) test_turns_data = test_turns_data.reshape(len(test_turns_data), 1) test_defects_data = cmn.label_array(cmn.load_data(filename, (3,)), values) test_defects_data = test_defects_data.reshape(len(test_defects_data), 1) test_data = np.hstack((test_speed_data, test_turns_data, test_defects_data)) test_times = cmn.label_array(cmn.load_data(filename, (6,)), test_values) print bcolors.OKGREEN + "Done! 
test_data:\n" + bcolors.ENDC, test_data else : print bcolor.FAIL + "behavior_defects_module: invalid test type, exit" + bcolors.ENDC return # plot result is not used currently #if plot == "yes" : # train_speed_data = cmn.label_array(cmn.load_data(filename_train, (1,)), train_values) # train_speed_data = train_speed_data.reshape(len(train_speed_data), 1) # train_turns_data = cmn.label_array(cmn.load_data(filename_train, (2,)), train_values) # train_turns_data = train_turns_data.reshape(len(train_turns_data), 1) # train_defects_data = cmn.label_array(cmn.load_data(filename_train, (3,)), train_values) # train_defects_data = train_defects_data.reshape(len(train_defects_data), 1) # train_data = np.hstack((train_speed_data, train_turns_data, train_defects_data)) # xx, yy = cmn.get_grid(train_data[:, [0, 1]]) # train_predicted = bd.predicted(np.c_[xx.ravel(), yy.ravel()], 1).reshape(xx.shape) # #print "Train data\n", train_data # #print "Train predicted\n", train_predicted # test_predicted = bd.predicted(test_data) # cmn.plot_2D_data(test_data, test_predicted, train_data, train_predicted, [0, 5.0], [0, 5.0]); # skip waiting (speed ~ 0) # check is arrays is empty # get new types for defects raw_input(bcolors.OKBLUE + "Ready to start! Press Enter to continue..." + bcolors.ENDC) bd.find_actions(test_data, test_times) # writing defects to DB is not used in test module #bd.add_defects() return
def main(params=None):
    """Train and evaluate an RNN text classifier with early stopping.

    Runs one training experiment per dev fold: loads data, builds an RNN,
    trains one example at a time with early stopping on validation F1 and
    learning-rate decay, and writes per-fold predictions and results to disk.

    params -- experiment configuration dict; if None a default dict is built,
              but note it is unconditionally overwritten below by a JSON file.
    Returns a hyperopt-style dict: negative median valid F1 as 'loss', plus
    final/per-fold F1 lists and STATUS_OK.
    """
    if params is None:
        params = {
            'exp_name': 'minibatch_test',
            'test_fold': 0,
            'n_dev_folds': 1,
            'min_doc_thresh': 1,
            'initialize_word_vectors': True,
            'vectors': 'anes_word2vec',  # default_word2vec, anes_word2vec ...
            'word2vec_dim': 300,
            'init_scale': 0.2,
            'add_OOV': True,
            'win': 3,  # size of context window
            'add_DRLD': False,
            'rnn_type': 'basic',  # basic, GRU, or LSTM
            'n_hidden': 3,  # size of hidden units
            'pooling_method': 'max',  # max, mean, or attention1/2
            'bidirectional': False,
            'bi_combine': 'mean',  # concat, max, or mean
            'train_embeddings': True,
            'lr': 0.1,  # learning rate
            'lr_emb_fac': 0.2,  # factor to modify learning rate for embeddings
            'decay_delay': 5,  # number of epochs with no improvement before decreasing learning rate
            'decay_factor': 0.5,  # factor by which to multiply learning rate in case of delay
            'n_epochs': 10,
            'add_OOV_noise': False,
            'OOV_noise_prob': 0.01,
            'minibatch_size': 1,
            'ensemble': False,
            'save_model': True,
            'seed': 42,
            'verbose': 1,
            'reuse': False,
            'orig_T': 0.04,
            'tau': 0.01
        }

    # load params from a previous experiment
    # NOTE(review): this unconditionally replaces the dict built above (and any
    # caller-supplied params) with a hard-coded, machine-specific path.
    params = fh.read_json('/Users/dcard/Projects/CMU/ARK/guac/experiments/best_mod.json')
    params['exp_name'] += '_minibatch_16'
    params['n_hidden'] = int(params['n_hidden'])
    params['orig_T'] = 0.02
    params['tau'] = 0.005

    # optional reusable-holdout masking of validation scores
    reuser = None
    if params['reuse']:
        reuser = reusable_holdout.ReuseableHoldout(T=params['orig_T'], tau=params['tau'])

    # echo the full configuration, sorted by key
    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ':', params[key]

    # seed the random number generators
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes']

    np.random.seed(params['seed'])
    random.seed(params['seed'])

    # per-fold bests, aggregated for the final return value
    best_valid_f1s = []
    best_test_f1s = []
    test_prediction_arrays = []

    output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'])
    output_filename = fh.make_filename(output_dir, 'params', 'json')
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params['n_dev_folds']):
        print "dev fold =", dev_fold

        output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'], 'fold' + str(dev_fold))
        results = []  # (train_f1, valid_f1, test_f1) per epoch

        all_data, words2idx, items, all_labels = common.load_data(datasets, params['test_fold'], dev_fold, params['min_doc_thresh'])
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy
        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        # invert word -> index to index -> word for debug printing
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, 'n_train', n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        initial_embeddings = common.load_embeddings(params, words2idx)
        OOV_index = words2idx['__OOV__']
        emb_dim = initial_embeddings.shape[1]
        print 'emb_dim =', emb_dim

        # two extra input dims encode the (Likes, Democrat) item flags below
        extra_input_dims = 0
        if params['add_DRLD']:
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(nh=params['n_hidden'],
                  nc=n_codes,
                  ne=vocsize,
                  de=emb_dim,
                  cs=params['win'],
                  extra_input_dims=extra_input_dims,
                  initial_embeddings=initial_embeddings,
                  init_scale=params['init_scale'],
                  rnn_type=params['rnn_type'],
                  train_embeddings=params['train_embeddings'],
                  pooling_method=params['pooling_method'],
                  bidirectional=params['bidirectional'],
                  bi_combine=params['bi_combine']
                  )

        # binary item-level features derived from item names
        train_likes = [1 if re.search('Likes', i) else 0 for i in train_items]
        dev_likes = [1 if re.search('Likes', i) else 0 for i in dev_items]
        test_likes = [1 if re.search('Likes', i) else 0 for i in test_items]

        train_dem = [1 if re.search('Democrat', i) else 0 for i in train_items]
        dev_dem = [1 if re.search('Democrat', i) else 0 for i in dev_items]
        test_dem = [1 if re.search('Democrat', i) else 0 for i in test_items]

        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]

        # train with early stopping on validation set
        best_f1 = -np.inf
        params['clr'] = params['lr']  # current (decayable) learning rate
        for e in xrange(params['n_epochs']):
            # shuffle
            shuffle([train_lex, train_y, train_extra], params['seed'])  # shuffle the input data
            params['ce'] = e  # store the current epoch
            tic = timeit.default_timer()

            #for i, (x, y) in enumerate(zip(train_lex, train_y)):
            for i, orig_x in enumerate(train_lex):
                n_words = len(orig_x)
                # optionally corrupt input words to OOV as noise regularization
                if params['add_OOV_noise']:
                    draws = np.random.rand(n_words)
                    x = [OOV_index if draws[idx] < params['OOV_noise_prob'] else orig_x[idx] for idx in range(n_words)]
                else:
                    x = orig_x
                y = train_y[i]
                extra = train_extra[i]
                if i == 0:
                    print ' '.join([idx2words[w] for w in train_lex[i]])
                if i == 0:
                    print x
                    print y
                nll = rnn.train(x, y, params['win'], params['clr'], params['lr_emb_fac'], extra_input_dims, extra)
                # print the loss every 100 examples
                if float(i/100.0) == float(i//100):
                    print nll
                print '[learning] epoch %i >> %2.2f%%' % (e, (i + 1) * 100. / float(n_sentences)),
                print 'completed in %.2f (sec) <<\r' % (timeit.default_timer() - tic),
                sys.stdout.flush()

                #if i == 0:
                #    print ' '.join([idx2words[idx] for idx in orig_x])
                #    print rnn.classify(orig_x, params['win'], extra_input_dims, extra)

            # abort the experiment if training diverged
            if np.isnan(nll) or np.isinf(nll):
                return {'loss': nll,
                        'final_test_f1': 0,
                        'valid_f1s': [0],
                        'test_f1s': [0],
                        'status': STATUS_OK }

            # evaluation // back into the real world : idx -> words
            print ""

            #print rnn.classify((np.asarray(contextwin(train_lex[0], params['win'])).astype('int32')), train_likes[0], params['win'])
            #print rnn.classify(train_lex[0], params['win'], extra_input_dims, train_extra[0])
            #print rnn.get_element_weights(np.asarray(contextwin(train_lex[0], params['win'])).astype('int32'))
            #if params['pooling_method'] == 'attention1' or params['pooling_method'] == 'attention2':
            #    if extra_input_dims == 0:
            #        r = np.random.randint(0, len(train_lex))
            #        print r, rnn.a_sum_check(np.asarray(contextwin(train_lex[r], params['win'])).astype('int32'))

            """
            predictions_train = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0) for x in train_lex]
            predictions_test = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0) for x in test_lex]
            predictions_valid = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0) for x in valid_lex]
            """
            #predictions_train = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in train_lex]
            #predictions_test = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in test_lex]
            #predictions_valid = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in valid_lex]

            # classify every split with the current model
            predictions_train = [rnn.classify(x, params['win'], extra_input_dims, train_extra[i]) for i, x in enumerate(train_lex)]
            predictions_test = [rnn.classify(x, params['win'], extra_input_dims, test_extra[i]) for i, x in enumerate(test_lex)]
            predictions_valid = [rnn.classify(x, params['win'], extra_input_dims, dev_extra[i]) for i, x in enumerate(valid_lex)]

            train_f1 = common.calc_mean_f1(predictions_train, train_y)
            test_f1 = common.calc_mean_f1(predictions_test, test_y)
            valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)
            # reusable holdout masks the valid score to limit overfitting to it
            if reuser is not None:
                valid_f1 = reuser.mask_value(valid_f1, train_f1)

            question_f1s = []
            question_pps = []

            print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1
            results.append((train_f1, valid_f1, test_f1))

            if valid_f1 > best_f1:
                # keep a snapshot of the best model so far
                best_rnn = copy.deepcopy(rnn)
                best_f1 = valid_f1
                best_test_predictions = predictions_test

                if params['verbose']:
                    print('NEW BEST: epoch', e,
                          'valid f1', valid_f1,
                          'best test f1', test_f1)

                params['tr_f1'] = train_f1
                params['te_f1'] = test_f1
                params['v_f1'] = valid_f1
                params['be'] = e  # store the current epoch as a new best

            # learning rate decay if no improvement in a given number of epochs
            if abs(params['be']-params['ce']) >= params['decay_delay']:
                params['clr'] *= params['decay_factor']
                params['be'] = params['ce']
                print "Reverting to current best; new learning rate = ", params['clr']
                # also reset to the previous best
                rnn = best_rnn

            if params['clr'] < 1e-5:
                break

            if best_f1 == 1.0:
                break

            if best_f1 == 0 and e > 10:
                break

        if params['save_model']:
            predictions_valid = [rnn.classify(x, params['win'], extra_input_dims, dev_extra[i]) for i, x in enumerate(valid_lex)]
            #predictions_valid = [best_rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')) for x in valid_lex]
            best_rnn.save(output_dir)
            common.write_predictions(datasets, params['test_fold'], dev_fold, predictions_valid, dev_items, output_dir)

        print('BEST RESULT: epoch', params['be'],
              'train F1 ', params['tr_f1'],
              'valid F1', params['v_f1'],
              'best test F1', params['te_f1'],
              'with the model', output_dir)

        best_valid_f1s.append(params['v_f1'])
        best_test_f1s.append(params['te_f1'])
        test_prediction_arrays.append(np.array(best_test_predictions, dtype=int))

        # write per-epoch F1s for this fold to a text file
        output_filename = fh.make_filename(output_dir, 'results', 'txt')
        with codecs.open(output_filename, 'w') as output_file:
            for e, result in enumerate(results):
                output_file.write('epoch=' + str(e) + '; train_f1=' + str(result[0]) +
                                  '; valid_f1=' + str(result[1]) + '; test_f1=' + str(result[2]) + '\n')

    if params['ensemble']:
        # majority vote across folds' best test predictions
        test_predictions_stack = np.dstack(test_prediction_arrays)
        final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0]
        predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes)
        true_df = pd.DataFrame(np.array(test_y), index=test_items, columns=codes)
        final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df)
    else:
        final_test_f1 = np.median(best_test_f1s)

    return {'loss': -np.median(best_valid_f1s),
            'final_test_f1': final_test_f1,
            'valid_f1s': best_valid_f1s,
            'test_f1s': best_test_f1s,
            'status': STATUS_OK }
Theta1 = Theta[1] - alpha * delta # 値は同時に更新します Theta = [Theta0, Theta1] # 表示 if i % 10 == 0 or i == iteration-1: cost = compute_cost(x_vals, y_vals, Theta, hypothesis_func) print("itr=%d, cost=%f, Theta0=%f, Theta1=%f" % (i, cost, Theta[0], Theta[1])) return Theta if __name__ == "__main__": # 01. データを読み込む #--------------------------------------------- # 今回利用するデータを読み込みます data, x_vals, y_vals = cmn.load_data() # 上10件ほど、見てみましょう print('-----------------\n#今回利用するデータ(上10件)') pprint(data[:10]) # データをグラフに表示します cmn.show(data) # 02. (最適化前)予測とコストを計算する #--------------------------------------------- # 初期値のシータは10にしておきます(別の値でも良いです) # 「Theta0 = 10, Theta1 = 10」の意味です。 Theta = [10, 10] # このシータを使って、上位10件のデータの予測を作ってみましょう hypo = hypothesis(x_vals, Theta) # 上位3件の予測結果を表示します(最適化前)
def main(params=None):
    """Train and evaluate an RNN text classifier with early stopping.

    Duplicate (re-formatted, double-quoted) variant of the experiment driver:
    per dev fold, loads data, builds an RNN, trains example-by-example with
    early stopping on validation F1 and learning-rate decay, and writes
    predictions/results to disk.

    params -- experiment configuration dict; if None a default dict is built,
              but note it is unconditionally overwritten below by a JSON file.
    Returns a hyperopt-style dict with negative median valid F1 as "loss".
    """
    if params is None:
        params = {
            "exp_name": "minibatch_test",
            "test_fold": 0,
            "n_dev_folds": 1,
            "min_doc_thresh": 1,
            "initialize_word_vectors": True,
            "vectors": "anes_word2vec",  # default_word2vec, anes_word2vec ...
            "word2vec_dim": 300,
            "init_scale": 0.2,
            "add_OOV": True,
            "win": 3,  # size of context window
            "add_DRLD": False,
            "rnn_type": "basic",  # basic, GRU, or LSTM
            "n_hidden": 3,  # size of hidden units
            "pooling_method": "max",  # max, mean, or attention1/2
            "bidirectional": False,
            "bi_combine": "mean",  # concat, max, or mean
            "train_embeddings": True,
            "lr": 0.1,  # learning rate
            "lr_emb_fac": 0.2,  # factor to modify learning rate for embeddings
            "decay_delay": 5,  # number of epochs with no improvement before decreasing learning rate
            "decay_factor": 0.5,  # factor by which to multiply learning rate in case of delay
            "n_epochs": 10,
            "add_OOV_noise": False,
            "OOV_noise_prob": 0.01,
            "minibatch_size": 1,
            "ensemble": False,
            "save_model": True,
            "seed": 42,
            "verbose": 1,
            "reuse": False,
            "orig_T": 0.04,
            "tau": 0.01,
        }

    # load params from a previous experiment
    # NOTE(review): this unconditionally replaces the dict built above (and any
    # caller-supplied params) with a hard-coded, machine-specific path.
    params = fh.read_json("/Users/dcard/Projects/CMU/ARK/guac/experiments/best_mod.json")
    params["exp_name"] += "_minibatch_16"
    params["n_hidden"] = int(params["n_hidden"])
    params["orig_T"] = 0.02
    params["tau"] = 0.005

    # optional reusable-holdout masking of validation scores
    reuser = None
    if params["reuse"]:
        reuser = reusable_holdout.ReuseableHoldout(T=params["orig_T"], tau=params["tau"])

    # echo the full configuration, sorted by key
    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ":", params[key]

    # seed the random number generators
    np.random.seed(params["seed"])
    random.seed(params["seed"])

    datasets = ["Democrat-Likes", "Democrat-Dislikes", "Republican-Likes", "Republican-Dislikes"]

    np.random.seed(params["seed"])
    random.seed(params["seed"])

    # per-fold bests, aggregated for the final return value
    best_valid_f1s = []
    best_test_f1s = []
    test_prediction_arrays = []

    output_dir = fh.makedirs(defines.exp_dir, "rnn", params["exp_name"])
    output_filename = fh.make_filename(output_dir, "params", "json")
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params["n_dev_folds"]):
        print "dev fold =", dev_fold

        output_dir = fh.makedirs(defines.exp_dir, "rnn", params["exp_name"], "fold" + str(dev_fold))
        results = []  # (train_f1, valid_f1, test_f1) per epoch

        all_data, words2idx, items, all_labels = common.load_data(
            datasets, params["test_fold"], dev_fold, params["min_doc_thresh"]
        )
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy
        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        # invert word -> index to index -> word for debug printing
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, "n_train", n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        initial_embeddings = common.load_embeddings(params, words2idx)
        OOV_index = words2idx["__OOV__"]
        emb_dim = initial_embeddings.shape[1]
        print "emb_dim =", emb_dim

        # two extra input dims encode the (Likes, Democrat) item flags below
        extra_input_dims = 0
        if params["add_DRLD"]:
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(
            nh=params["n_hidden"],
            nc=n_codes,
            ne=vocsize,
            de=emb_dim,
            cs=params["win"],
            extra_input_dims=extra_input_dims,
            initial_embeddings=initial_embeddings,
            init_scale=params["init_scale"],
            rnn_type=params["rnn_type"],
            train_embeddings=params["train_embeddings"],
            pooling_method=params["pooling_method"],
            bidirectional=params["bidirectional"],
            bi_combine=params["bi_combine"],
        )

        # binary item-level features derived from item names
        train_likes = [1 if re.search("Likes", i) else 0 for i in train_items]
        dev_likes = [1 if re.search("Likes", i) else 0 for i in dev_items]
        test_likes = [1 if re.search("Likes", i) else 0 for i in test_items]

        train_dem = [1 if re.search("Democrat", i) else 0 for i in train_items]
        dev_dem = [1 if re.search("Democrat", i) else 0 for i in dev_items]
        test_dem = [1 if re.search("Democrat", i) else 0 for i in test_items]

        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]

        # train with early stopping on validation set
        best_f1 = -np.inf
        params["clr"] = params["lr"]  # current (decayable) learning rate
        for e in xrange(params["n_epochs"]):
            # shuffle
            shuffle([train_lex, train_y, train_extra], params["seed"])  # shuffle the input data
            params["ce"] = e  # store the current epoch
            tic = timeit.default_timer()

            # for i, (x, y) in enumerate(zip(train_lex, train_y)):
            for i, orig_x in enumerate(train_lex):
                n_words = len(orig_x)
                # optionally corrupt input words to OOV as noise regularization
                if params["add_OOV_noise"]:
                    draws = np.random.rand(n_words)
                    x = [OOV_index if draws[idx] < params["OOV_noise_prob"] else orig_x[idx] for idx in range(n_words)]
                else:
                    x = orig_x
                y = train_y[i]
                extra = train_extra[i]
                if i == 0:
                    print " ".join([idx2words[w] for w in train_lex[i]])
                if i == 0:
                    print x
                    print y
                nll = rnn.train(x, y, params["win"], params["clr"], params["lr_emb_fac"], extra_input_dims, extra)
                # print the loss every 100 examples
                if float(i / 100.0) == float(i // 100):
                    print nll
                print "[learning] epoch %i >> %2.2f%%" % (e, (i + 1) * 100.0 / float(n_sentences)),
                print "completed in %.2f (sec) <<\r" % (timeit.default_timer() - tic),
                sys.stdout.flush()

                # if i == 0:
                #    print ' '.join([idx2words[idx] for idx in orig_x])
                #    print rnn.classify(orig_x, params['win'], extra_input_dims, extra)

            # abort the experiment if training diverged
            if np.isnan(nll) or np.isinf(nll):
                return {"loss": nll, "final_test_f1": 0, "valid_f1s": [0], "test_f1s": [0], "status": STATUS_OK}

            # evaluation // back into the real world : idx -> words
            print ""

            # print rnn.classify((np.asarray(contextwin(train_lex[0], params['win'])).astype('int32')), train_likes[0], params['win'])
            # print rnn.classify(train_lex[0], params['win'], extra_input_dims, train_extra[0])
            # print rnn.get_element_weights(np.asarray(contextwin(train_lex[0], params['win'])).astype('int32'))
            # if params['pooling_method'] == 'attention1' or params['pooling_method'] == 'attention2':
            #    if extra_input_dims == 0:
            #        r = np.random.randint(0, len(train_lex))
            #        print r, rnn.a_sum_check(np.asarray(contextwin(train_lex[r], params['win'])).astype('int32'))

            """
            predictions_train = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0) for x in train_lex]
            predictions_test = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0) for x in test_lex]
            predictions_valid = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0) for x in valid_lex]
            """
            # predictions_train = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in train_lex]
            # predictions_test = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in test_lex]
            # predictions_valid = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in valid_lex]

            # classify every split with the current model
            predictions_train = [
                rnn.classify(x, params["win"], extra_input_dims, train_extra[i]) for i, x in enumerate(train_lex)
            ]
            predictions_test = [
                rnn.classify(x, params["win"], extra_input_dims, test_extra[i]) for i, x in enumerate(test_lex)
            ]
            predictions_valid = [
                rnn.classify(x, params["win"], extra_input_dims, dev_extra[i]) for i, x in enumerate(valid_lex)
            ]

            train_f1 = common.calc_mean_f1(predictions_train, train_y)
            test_f1 = common.calc_mean_f1(predictions_test, test_y)
            valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)
            # reusable holdout masks the valid score to limit overfitting to it
            if reuser is not None:
                valid_f1 = reuser.mask_value(valid_f1, train_f1)

            question_f1s = []
            question_pps = []

            print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1
            results.append((train_f1, valid_f1, test_f1))

            if valid_f1 > best_f1:
                # keep a snapshot of the best model so far
                best_rnn = copy.deepcopy(rnn)
                best_f1 = valid_f1
                best_test_predictions = predictions_test

                if params["verbose"]:
                    print ("NEW BEST: epoch", e, "valid f1", valid_f1, "best test f1", test_f1)

                params["tr_f1"] = train_f1
                params["te_f1"] = test_f1
                params["v_f1"] = valid_f1
                params["be"] = e  # store the current epoch as a new best

            # learning rate decay if no improvement in a given number of epochs
            if abs(params["be"] - params["ce"]) >= params["decay_delay"]:
                params["clr"] *= params["decay_factor"]
                params["be"] = params["ce"]
                print "Reverting to current best; new learning rate = ", params["clr"]
                # also reset to the previous best
                rnn = best_rnn

            if params["clr"] < 1e-5:
                break

            if best_f1 == 1.0:
                break

            if best_f1 == 0 and e > 10:
                break

        if params["save_model"]:
            predictions_valid = [
                rnn.classify(x, params["win"], extra_input_dims, dev_extra[i]) for i, x in enumerate(valid_lex)
            ]
            # predictions_valid = [best_rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')) for x in valid_lex]
            best_rnn.save(output_dir)
            common.write_predictions(datasets, params["test_fold"], dev_fold, predictions_valid, dev_items, output_dir)

        print (
            "BEST RESULT: epoch",
            params["be"],
            "train F1 ",
            params["tr_f1"],
            "valid F1",
            params["v_f1"],
            "best test F1",
            params["te_f1"],
            "with the model",
            output_dir,
        )

        best_valid_f1s.append(params["v_f1"])
        best_test_f1s.append(params["te_f1"])
        test_prediction_arrays.append(np.array(best_test_predictions, dtype=int))

        # write per-epoch F1s for this fold to a text file
        output_filename = fh.make_filename(output_dir, "results", "txt")
        with codecs.open(output_filename, "w") as output_file:
            for e, result in enumerate(results):
                output_file.write(
                    "epoch=" + str(e) + "; train_f1=" + str(result[0]) + "; valid_f1=" + str(result[1]) + "; test_f1=" + str(result[2]) + "\n"
                )

    if params["ensemble"]:
        # majority vote across folds' best test predictions
        test_predictions_stack = np.dstack(test_prediction_arrays)
        final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0]
        predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes)
        true_df = pd.DataFrame(np.array(test_y), index=test_items, columns=codes)
        final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df)
    else:
        final_test_f1 = np.median(best_test_f1s)

    return {
        "loss": -np.median(best_valid_f1s),
        "final_test_f1": final_test_f1,
        "valid_f1s": best_valid_f1s,
        "test_f1s": best_test_f1s,
        "status": STATUS_OK,
    }
def create_setup():
    """Copy files from template and update them with user input.

    Interactively collects project metadata (name, version, license, author,
    email, URL, keywords), persists the answers as new defaults, backs up any
    existing files in the working directory, copies the project template over,
    and rewrites the copied files with the collected metadata.

    Side effects: prompts on stdin, creates/moves/removes files and
    directories in the current working directory, and mutates the module-level
    globals listed below.
    """
    global app_name, app_version, app_license, app_author, app_email, \
        app_url, app_keywords, DEFAULT_AUTHOR, DEFAULT_EMAIL, \
        DEFAULT_LICENSE, DEFAULT_URL, DEFAULT_VERSION

    # previously saved answers become the new defaults, if present
    data_lst = common.load_data()
    if data_lst:
        (DEFAULT_AUTHOR, DEFAULT_EMAIL, DEFAULT_LICENSE, DEFAULT_URL,
         DEFAULT_VERSION) = data_lst

    # the application name is mandatory; everything else falls back to defaults
    while not app_name:
        app_name = input(lcl.Q_APP_NAME).decode(lcl.INPUT_ENC)
    app_version = input(lcl.Q_APP_VERSION + '[' + DEFAULT_VERSION +
                        '] ').decode(lcl.INPUT_ENC)
    if not app_version:
        app_version = DEFAULT_VERSION
    app_license = input(lcl.Q_APP_LICENSE + '[' + DEFAULT_LICENSE +
                        '] ').decode(lcl.INPUT_ENC)
    if not app_license:
        app_license = DEFAULT_LICENSE
    app_author = input(lcl.Q_APP_AUTHOR + '[' + DEFAULT_AUTHOR +
                       '] ').decode(lcl.INPUT_ENC)
    if not app_author:
        app_author = DEFAULT_AUTHOR
    app_email = input(lcl.Q_APP_EMAIL + '[' + DEFAULT_EMAIL +
                      '] ').decode(lcl.INPUT_ENC)
    if not app_email:
        app_email = DEFAULT_EMAIL
    app_url = input(lcl.Q_APP_URL + '[' + DEFAULT_URL +
                    '] ').decode(lcl.INPUT_ENC)
    if not app_url:
        app_url = DEFAULT_URL
    app_keywords = input(lcl.Q_APP_KEYWORDS).decode(lcl.INPUT_ENC)
    if not app_keywords:
        app_keywords = app_name

    # persist the answers as defaults for the next run
    data_lst = [app_author, app_email, app_license, app_url, app_version]
    common.save_data(data_lst)

    # project URL is the base URL plus the application name
    app_url += app_name

    # backup existing files
    backup = False
    filenames = glob.glob('*')
    filenames += glob.glob('.*')
    if filenames:
        backup = True
        os.mkdir(BAK_DIR)
        for filename in filenames:
            dest = BAK_DIR + '/' + filename.split(os.sep)[-1]
            shu.move(filename, dest)

    filenames = glob.glob(common.DATA_PATH + 'template/*')
    filenames += glob.glob(common.DATA_PATH + 'template/.*')

    # remove doc dir
    filenames = [filename for filename in filenames
                 if 'template' + os.sep + 'doc' not in filename]

    # copy files and dirs
    for filename in filenames:
        if os.path.isfile(filename):
            shu.copyfile(filename, filename.split(os.sep)[-1])
        else:
            shu.copytree(filename, filename.split(os.sep)[-1])

    # NOTE(review): presumably the sleep lets the copies settle before the
    # rename on slow filesystems -- confirm it is still needed
    common.sleep(2)
    os.rename('APPLICATION_NAME', app_name)  # rename application dir

    # collect all filenames, including from 1st level subdirs
    filenames = glob.glob('*')
    filenames = [filename for filename in filenames if BAK_DIR not in filename]
    filenames += glob.glob('.*')
    new_filenames = []
    for filename in filenames:
        if os.path.isdir(filename):
            new_filenames += glob.glob(filename + '/*')
    filenames += new_filenames

    # files that must keep their template content verbatim
    exceptions = ['__init__.py', 'build.cmd', 'requirements.txt',
                  'requirements-dev.txt', 'setup.py', 'setup_py2exe.py',
                  'setup_utils.py']

    # delete .pyc files and update files
    for filename in filenames:
        if os.path.isfile(filename):
            if '.pyc' in filename:
                os.remove(filename)
            else:
                if filename.split(os.sep)[-1] not in exceptions:
                    update_file(filename)

    create_redir2rtd_zip()

    if backup:
        os.remove(app_name + APPLICATION_TEMPLATE_FILE)  # remove app template
        # restore files from backup, but only if they don't already exist
        filenames = glob.glob(BAK_DIR + '/*')
        for filename in filenames:
            dest = app_name + '/' + filename.split(os.sep)[-1]
            if not os.path.isfile(dest):
                shu.copyfile(filename, dest)
    else:
        os.rename(app_name + APPLICATION_TEMPLATE_FILE,
                  app_name + '/' + app_name + '.py')  # rename app template

    print(lcl.REMINDERS)
def test_road_quality_module(filename, plot, test_type): # initialize module for testing train_values = 1 train_trees = 10 filename_train = "train_data/road_quality_data.output" init_server.init_road_quality_module(filename_train, train_values, train_trees) if test_type == "full": # generate new dataset print bcolors.HEADER + "initialize dependent modules" + bcolors.ENDC init_server.init_speed_module ("train_data/speed_acc_data.output", 10, 10) init_server.init_turns_module ("train_data/turns_com_data.output", 5, 10) init_server.init_defects_module ("train_data/speed_acc_data.output", 5, 20) init_server.init_behavior_defects_module("train_data/behavior_defects_data.output", 1, 10) print bcolors.OKGREEN + "Done! " + bcolors.ENDC # load dependent test data and classify actions # structure: time,accx,accy,accz,compass,lat,lon,speed test_values = 10 print bcolors.HEADER + "Start getting speed data" + bcolors.ENDC test_speed_data = cmn.aver_std_array(cmn.load_data(filename, (2,)), test_values) test_speed_data = test_speed_data.reshape(len(test_speed_data)/2, 2) predicted_speed = df.predicted(test_speed_data) predicted_speed = predicted_speed.reshape(len(predicted_speed), 1) print bcolors.OKGREEN + "Done! speed data:\n" + bcolors.ENDC, predicted_speed print bcolors.HEADER + "Start getting turns data" + bcolors.ENDC test_turns_data = cmn.get_diff_array(cmn.load_data(filename, (4,))) test_turns_data = cmn.aver_std_array(test_turns_data, test_values) test_turns_data = test_turns_data.reshape(len(test_turns_data)/2, 2) predicted_turns = df.predicted(test_turns_data) predicted_turns = predicted_turns.reshape(len(predicted_turns), 1) print bcolors.OKGREEN + "Done! 
turns data:\n" + bcolors.ENDC, predicted_turns print bcolors.HEADER + "Start getting defects data" + bcolors.ENDC test_defects_data = cmn.aver_std_array(cmn.load_data(filename, (3,)), test_values) test_defects_data = test_defects_data.reshape(len(test_defects_data)/2, 2) predicted_defects = df.predicted(test_defects_data) predicted_defects = predicted_defects.reshape(len(predicted_defects), 1) print bcolors.OKGREEN + "Done! defects data:\n" + bcolors.ENDC, predicted_defects print bcolors.HEADER + "Start getting behavior defects data" + bcolors.ENDC test_behavior_defects_data = np.hstack((predicted_speed, predicted_turns, predicted_defects)) predicted_behavior_defects = bd.predicted(test_behavior_defects_data) predicted_behavior_defects = predicted_behavior_defects.reshape(len(predicted_behavior_defects), 1) print bcolors.OKGREEN + "Done! defects data:\n" + bcolors.ENDC, predicted_defects print bcolors.HEADER + "Start generating test data" + bcolors.ENDC test_times = cmn.label_array(cmn.load_data(filename, (0,)), test_values) test_data = cmn.sum_array(predicted_behavior_defects, test_values) print bcolors.OKGREEN + "Done! 
test_data:\n" + bcolors.ENDC, test_data return elif test_type == "express": # use default dataset # structure: time,low_defects,high_defects,lat,lon,label print bcolors.HEADER + "Start getting test data" + bcolors.ENDC test_values = 1 test_low_defects_data = cmn.label_array(cmn.load_data(filename, (1,)), values) test_low_defects_data = test_low_defects_data.reshape(len(test_low_defects_data), 1) test_high_defects_data = cmn.label_array(cmn.load_data(filename, (2,)), values) test_high_defects_data = test_high_defects_data.reshape(len(test_high_defects_data), 1) test_tent_defects_data = cmn.label_array(cmn.load_data(filename, (3,)), values) test_tent_defects_data = test_tent_defects_data.reshape(len(test_tent_defects_data), 1) test_data = np.hstack((test_low_defects_data, test_high_defects_data, test_tent_defects_data)) test_times = cmn.label_array(cmn.load_data(filename, (0,)), test_values) print bcolors.OKGREEN + "Done! test_data:\n" + bcolors.ENDC, test_data else : print bcolor.FAIL + "road_quality_module: invalid test type, exit" + bcolors.ENDC return # choose test data sources by test type if test_type == "full": # generate new dataset init_server.init_speed_module ("train_data/speed_acc_data.output", 10, 10) init_server.init_turns_module ("train_data/turns_acc_data.output", 10, 10) init_server.init_defects_module ("train_data/defects_acc_data.output", 5, 20) init_server.init_behavior_defects_module("train_data/behavior_defects_data.output", 10, 10) elif test_type == "express": # use default dataset # structure: time,low_defects,high_defects test_values = 1 test_data = cmn.aver_std_array(cmn.load_data(filename, (2,)), test_values) test_times = cmn.label_array(cmn.load_data(filename, (0,)), test_values) test_data = test_data.reshape(len(test_data)/2, 2) rq.find_actions(test_data, test_times) else : print "road_quality_module: invalid test type, exit" return # analysis of set of defects and determine road quality # compare with previous results of road quality 
analysis # update road quality using voting procedure return
def main(params=None):
    """Rebuild a trained RNN experiment and dump per-item responses.

    Loads a saved parameter file, reconstructs the dataset folds and the
    RNN they describe, loads the trained weights from the fold's output
    directory, computes train/valid/test predictions and mean-F1 scores,
    and writes per-item sigmoid responses (csv) plus gate/state traces
    (npy) under a 'responses' subdirectory.

    :param params: optional hyperparameter dict; NOTE(review): whatever is
        passed is unconditionally replaced by the read_json call below, so
        the default dict mainly documents the expected keys.
    """
    if params is None:
        params = {
            'dataset': 'DRLD',
            'exp_name': 'best_minibatch_mod',
            'test_fold': 0,
            'n_dev_folds': 1,
            'min_doc_thresh': 1,
            'initialize_word_vectors': False,
            'vectors': 'anes_word2vec_300',  # default_word2vec_300, anes_word2vec_300, chars_word2vec_25, eye_1 ...
            'init_scale': 0.2,
            'add_OOV_dim': False,
            'win': 1,  # size of context window
            'add_DRLD': False,
            'rnn_type': 'LSTM',  # basic, GRU, or LSTM
            'n_hidden': 50,  # size of hidden units
            'pooling_method': 'last',  # max, mean, or attention1/2
            'bidirectional': False,
            'bi_combine': 'concat',  # concat, max, or mean
            'train_embeddings': False,
            'lr': 0.025,  # learning rate
            'lr_emb_fac': 0.2,  # factor to modify learning rate for embeddings
            'decay_delay': 5,  # number of epochs with no improvement before decreasing learning rate
            'decay_factor': 0.5,  # factor by which to multiply learning rate in case of delay
            'n_epochs': 100,
            'add_OOV_noise': False,
            'OOV_noise_prob': 0.01,
            'minibatch_size': 1,
            'classify_minibatch_size': 1,
            'ensemble': False,
            'save_model': True,
            'seed': 42,
            'verbose': 1,
            'reuse': False,
            'orig_T': 0.04,
            'tau': 0.01,
            'xavier_init': True
        }

    # NOTE(review): hard-coded absolute path to one experiment's params
    # file -- machine-specific; this clobbers any params passed in.
    params = fh.read_json('/Users/dcard/Projects/CMU/ARK/guac/experiments/rnn/bayes_opt_rnn_LSTM_reuse_mod_34_rerun/params.txt')
    params['n_hidden'] = int(params['n_hidden'])

    # Print the effective configuration sorted by key (Python 2).
    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ':', params[key]

    # seed the random number generators
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    # e.g. 'anes_word2vec_300' -> type 'anes...', dimensionality 300.
    vector_type = params['vectors'].split('_')[0]
    params['word2vec_dim'] = int(params['vectors'].split('_')[-1])

    reuser = None
    if params['reuse']:
        reuser = reusable_holdout.ReuseableHoldout(T=params['orig_T'], tau=params['tau'])

    # Map a dataset family name to its concrete sub-dataset list.
    if params['dataset'] == 'DRLD':
        datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes']
    elif params['dataset'] == 'MIP':
        datasets = ['MIP-Personal-1', 'MIP-Personal-2', 'MIP-Political-1', 'MIP-Political-2']
    elif params['dataset'] == 'MOLD':
        datasets = ['McCain-Likes', 'McCain-Dislikes', 'Obama-Likes', 'Obama-Dislikes']
    elif params['dataset'] == 'Primary':
        datasets = ['Obama-Primary', 'Clinton-Primary']
    elif params['dataset'] == 'General':
        datasets = ['Obama-General', 'McCain-General']
    else:
        datasets = [params['dataset']]

    # Re-seeded a second time (also done above); kept as-is.
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    best_valid_f1s = []
    best_true_valid_f1s = []
    best_test_f1s = []
    best_train_f1s = []
    test_prediction_arrays = []

    output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'])
    output_filename = fh.make_filename(output_dir, 'params', 'txt')
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params['n_dev_folds']):
        print "dev fold =", dev_fold

        output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'], 'fold' + str(dev_fold))

        # all_data: ((train_lex, train_y), (valid_lex, valid_y), (test_lex, test_y))
        all_data, words2idx, items, all_labels = common.load_data(datasets, params['test_fold'], dev_fold, params['min_doc_thresh'])
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy
        train_lengths = [len(x) for x in train_lex]
        length_order = np.argsort(train_lengths)

        #if params['minibatch_size'] > 1 or params['classify_minibatch_size'] > 1:
        print "padding input with zeros"
        #all_data, all_masks = common.prepare_data(train_lex, valid_lex, test_lex, preset_max=100)
        all_data, all_masks = common.prepare_data(train_lex, valid_lex, test_lex)
        train_lex, valid_lex, test_lex = all_data
        train_masks, valid_masks, test_masks = all_masks
        #else:
        #    train_masks = [np.ones(len(x)).astype('int32') for x in train_lex]
        #    valid_masks = [np.ones(len(x)).astype('int32') for x in valid_lex]
        #    test_masks = [np.ones(len(x)).astype('int32') for x in test_lex]

        print "expanding x with context win dows"
        # Rejigger to convert x to contex win in advance
        train_x_win = expand_x_with_context_win(train_lex, params['win'])
        valid_x_win = expand_x_with_context_win(valid_lex, params['win'])
        test_x_win = expand_x_with_context_win(test_lex, params['win'])
        order = range(len(train_lex))
        print "done"

        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        # Invert the vocabulary: index -> word (Python 2 iteritems).
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, 'n_train', n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        #if vector_type == 'eye':
        #    initial_embeddings = np.eye(vocsize)
        #    emb_dim = initial_embeddings.shape[1]
        if params['initialize_word_vectors']:
            initial_embeddings = common.load_embeddings(params, words2idx)
            emb_dim = initial_embeddings.shape[1]
        else:
            # Embeddings will be initialized by the RNN itself.
            initial_embeddings = None
            emb_dim = params['word2vec_dim']
        print "embedding dim =", emb_dim

        extra_input_dims = 0
        if params['add_DRLD']:
            #extra_input_dims = 4
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(nh=params['n_hidden'], nc=n_codes, ne=vocsize, de=emb_dim,
                  cs=params['win'], extra_input_dims=extra_input_dims,
                  initial_embeddings=initial_embeddings,
                  init_scale=params['init_scale'], rnn_type=params['rnn_type'],
                  train_embeddings=params['train_embeddings'],
                  pooling_method=params['pooling_method'],
                  bidirectional=params['bidirectional'],
                  bi_combine=params['bi_combine'],
                  xavier_init=params['xavier_init']
                  )

        # add extra dimensions to differentiate between paired datasets
        train_likes = [1 if re.search('Likes', i) else 0 for i in train_items]
        dev_likes = [1 if re.search('Likes', i) else 0 for i in dev_items]
        test_likes = [1 if re.search('Likes', i) else 0 for i in test_items]
        train_dem = [1 if re.search('Democrat', i) else 0 for i in train_items]
        dev_dem = [1 if re.search('Democrat', i) else 0 for i in dev_items]
        test_dem = [1 if re.search('Democrat', i) else 0 for i in test_items]
        """
        train_obama = [1 if re.search('Obama', i) else 0 for i in train_items]
        dev_obama = [1 if re.search('Obama', i) else 0 for i in dev_items]
        test_obama = [1 if re.search('Obama', i) else 0 for i in test_items]
        train_personal = [1 if re.search('Personal', i) else 0 for i in train_items]
        dev_personal = [1 if re.search('Personal', i) else 0 for i in dev_items]
        test_personal = [1 if re.search('Personal', i) else 0 for i in test_items]
        train_extra = [[train_likes[i], train_dem[i], train_obama[i], train_personal[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i], dev_obama[i], dev_personal[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i], test_obama[i], test_personal[i]] for i, t in enumerate(test_items)]
        """
        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]

        ### LOAD
        # Restore previously trained weights for this fold.
        rnn.load(output_dir)

        # train with early stopping on validation set
        best_f1 = -np.inf
        params['clr'] = params['lr']

        n_train = len(order)
        predictions_train = predict(n_train, params['classify_minibatch_size'], train_x_win, train_masks, train_y, params['win'], extra_input_dims, train_extra, rnn, order)
        n_valid = len(valid_lex)
        n_test = len(test_lex)
        predictions_valid = predict(n_valid, params['classify_minibatch_size'], valid_x_win, valid_masks, valid_y, params['win'], extra_input_dims, dev_extra, rnn)
        predictions_test = predict(n_test, params['classify_minibatch_size'], test_x_win, test_masks, test_y, params['win'], extra_input_dims, test_extra, rnn)
        """
        predictions_train = [rnn.classify(x, train_masks[i], params['win'], extra_input_dims, train_extra[i])[0] for i, x in enumerate(train_lex)]
        predictions_valid = [rnn.classify(x, valid_masks[i], params['win'], extra_input_dims, dev_extra[i])[0] for i, x in enumerate(valid_lex)]
        predictions_test = [rnn.classify(x, test_masks[i], params['win'], extra_input_dims, test_extra[i])[0] for i, x in enumerate(test_lex)]
        """
        train_f1 = common.calc_mean_f1(predictions_train, train_y)
        test_f1 = common.calc_mean_f1(predictions_test, test_y)
        valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)

        # Step through every item one at a time (ms = minibatch size 1)
        # and save the sigmoid response trace plus gate/state arrays.
        output_dir = fh.makedirs(output_dir, 'responses')
        ms = 1
        for i in range(n_train):
            mb_x, mb_masks, mb_extra, mb_y = select_minibatch(train_x_win, train_masks, train_extra, train_y, params['win'], i, ms, order=range(len(train_y)))
            h, W, b, p_y, s, i_f, i_r, \
            f_f, f_r, o_f, o_r, c = rnn.step_through(mb_x, mb_masks, params['win'], extra_input_dims, mb_extra)
            # Recompute the output sigmoid from the hidden states.
            temp = np.dot(h, W) + b
            s = 1.0/(1.0 + np.exp(-temp))
            output_filename = fh.make_filename(output_dir, train_items[i], 'csv')
            np.savetxt(output_filename, s[:, 0, :], delimiter=',')
            output_npy_files(output_dir, train_items[i], i_f, i_r, f_f, f_r, o_f, o_r, h, c)
        for i in range(n_valid):
            mb_x, mb_masks, mb_extra, mb_y = select_minibatch(valid_x_win, valid_masks, dev_extra, valid_y, params['win'], i, ms, order=range(len(valid_y)))
            h, W, b, p_y, s, i_f, i_r, \
            f_f, f_r, o_f, o_r, c = rnn.step_through(mb_x, mb_masks, params['win'], extra_input_dims, mb_extra)
            temp = np.dot(h, W) + b
            s = 1.0/(1.0 + np.exp(-temp))
            output_filename = fh.make_filename(output_dir, dev_items[i], 'csv')
            np.savetxt(output_filename, s[:, 0, :], delimiter=',')
            output_npy_files(output_dir, dev_items[i], i_f, i_r, f_f, f_r, o_f, o_r, h, c)
        for i in range(n_test):
            mb_x, mb_masks, mb_extra, mb_y = select_minibatch(test_x_win, test_masks, test_extra, test_y, params['win'], i, ms, order=range(len(test_y)))
            h, W, b, p_y, s, i_f, i_r,\
            f_f, f_r, o_f, o_r, c = rnn.step_through(mb_x, mb_masks, params['win'], extra_input_dims, mb_extra)
            temp = np.dot(h, W) + b
            s = 1.0/(1.0 + np.exp(-temp))
            output_filename = fh.make_filename(output_dir, test_items[i], 'csv')
            np.savetxt(output_filename, s[:, 0, :], delimiter=',')
            output_npy_files(output_dir, test_items[i], i_f, i_r, f_f, f_r, o_f, o_r, h, c)

        print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1
def setUp(self): data = load_data('report.json') self.records = data['asg']['records'] self.headers = data['asg']['headers'] self.rows = data['asg']['rows']
def preprocess(record, stopword=False, filtered_post_tag=False, basic_word = False, lemmatize=True): review_str = record['review'].decode('UTF-8') review_str = review_str.replace('.', '. ') tokens = word_tokenize(review_str) preprocessed_string = [preprocess_word(word, pos_tag, stopword, filtered_post_tag, lemmatize, basic_word) for (word, pos_tag) in nltk.tag._pos_tag(tokens, None, tagger)] preprocessed_string = [word for word in preprocessed_string if word != ""] record['review'] = u' '.join(preprocessed_string).encode('utf-8').strip() return record if __name__ == '__main__': # Preprocess train data records = load_data('data/reviews.tsv') preprocess_records = [preprocess(record, stopword=False, basic_word = True, lemmatize=True) for record in records] with open('data/preprocessed_reviews.tsv', 'w') as preprocess_file: header = 'id\treview\tsentiment\n' preprocess_file.write(header) for record in records: try: preprocess_file.write('%s\t%s\t%i\n' % (record['id'].decode('UTF-8'), record['review'].decode('UTF-8'), record['sentiment'])) except UnicodeEncodeError: print("unicode encode error") continue
import common '''tutaj leci kod''' def calc_price(word): price = {} for i in word: if(price.has_key(i)): price[i] += 1 else: price[i] = 1 return price; i = 2; data = common.load_data('../dane/sets/inter0' + str(i) + '.in') alph = 'abcdefghijklmnopqrstuvwxyz' al = {} left_data = data[0] left_data = left_data.split(' ') for i in range(0,26): al[alph[i]] = int(left_data[i]) lines = int(data[1]) words = [] sents = [] #read lines
pickup = np.array([row["pickup_longitude"], row["pickup_latitude"]]) dropoff = np.array([row["dropoff_longitude"], row["dropoff_latitude"]]) _, p_label = stations_kd.query(pickup) _, d_label = stations_kd.query(dropoff) freqs[interval][wday][p_label][d_label] += 1 del df return freqs if __name__ == "__main__": n_workers = 8 chunksize = 100000 pool = Pool(n_workers, maxtasksperchild=1) dfs = common.load_data(chunksize) freqs = np.zeros((intervals_per_day, 7, n_stations, n_stations), dtype=np.int) print "Computing probabilities..." stuff_to_do = True pbar = tqdm.tqdm(total=n_lines) while stuff_to_do: sub_dfs = list() for i in xrange(n_workers): try: sub_dfs.append(next(dfs)) except StopIteration: stuff_to_do = False if len(sub_dfs) > 0: freqs += sum(pool.map(calc_freqs, sub_dfs))
def on_timer_task(self, event): self.users = common.load_data("users.pickle") event.reactor.schedule(self.user_reread_period, self)