def find_gun(idx):
    dl = Data_loader(labeled_only=True)
    if idx is None:
        for idx in range(100, 200):
            print(idx, dl.convert2unicode([idx]))
    else:
        print(idx, dl.convert2unicode([idx]))

def get_pair_to_tids():
    print('Initializing Data Loader...')
    dl = Data_loader()
    test_ids = set(tweet['tweet_id'] for tweet in dl.test_data())  # set for O(1) membership tests
    pair2tids = {}
    for record in dl.all_data():
        if record['tweet_id'] not in test_ids:
            # collect every user involved in this tweet: poster, retweeted user, mentions
            involved = set()
            involved.add(record['user_post'])
            if 'user_retweet' in record:
                involved.add(record['user_retweet'])
            if 'user_mentions' in record:
                for user in record['user_mentions']:
                    involved.add(user)
            involved = sorted(involved)
            # map every unordered pair of involved users to this tweet id
            for i, u1 in enumerate(involved):
                for u2 in involved[i + 1:]:
                    pair_id = str(u1) + '_' + str(u2)
                    if pair_id in pair2tids:
                        pair2tids[pair_id].append(record['tweet_id'])
                    else:
                        pair2tids[pair_id] = [record['tweet_id']]
    return pair2tids

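# Usage sketch: build the pair -> tweet-ids index once and inspect its size.
# Assumes Data_loader and its underlying data files are available as above.
pair2tids = get_pair_to_tids()
print('Number of user pairs that co-occur in a tweet:', len(pair2tids))
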
def make_word_emb_for_nn(extension):
    size = 300
    window = 5
    min_count = 5
    epochs = 20
    w2v_file = '../data/{0}_w2v_s{1}_w{2}_mc{3}_ep{4}.bin'.format(
        extension, size, window, min_count, epochs)
    wv = KeyedVectors.load_word2vec_format(w2v_file, binary=True)
    print('Number of embeddings in {}: {}'.format(w2v_file, len(wv.vocab)))

    unicode2idx_pkl = 'unicode2idx_' + extension + '.pkl'
    unicode2idx = pickle.load(open(unicode2idx_pkl, 'rb'))  # complete vocab
    print('Size of complete vocab:', len(unicode2idx))

    dl = Data_loader(labeled_only=True)
    vocab_size = 40000
    dim = 300
    embeds = np.zeros((vocab_size, dim), dtype=np.float64)  # np.float is deprecated; idx 0 (padding) stays zero
    embeds[1] = np.random.uniform(-0.25, 0.25, dim)

    not_in_vocab = 0
    not_in_w2v = 0
    unknown_idx = set()
    avg_vocab = np.zeros(dim)
    known_vocab = 0
    for dl_idx in range(2, vocab_size):
        unicode = dl.convert2unicode([dl_idx]).encode('utf-8')
        if unicode in unicode2idx:
            ext_idx = unicode2idx[unicode]
            if str(ext_idx) in wv.vocab:
                known_vocab += 1
                embeds[dl_idx] = wv[str(ext_idx)]
                avg_vocab += wv[str(ext_idx)]
            else:
                # this word is in the training corpus of the pretrained embedding
                # but was thrown away because its frequency is below min_count = 5
                not_in_w2v += 1
                unknown_idx.add(dl_idx)
                # embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)
        else:
            # this word is not in the training corpus of the pretrained embedding at all
            not_in_vocab += 1
            unknown_idx.add(dl_idx)
            # embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)

    # assign unknown words the average of the known embeddings
    avg_vocab /= known_vocab
    for unk_idx in unknown_idx:
        embeds[unk_idx] = avg_vocab

    print(not_in_vocab, 'not in vocab')
    print(not_in_w2v, 'not in word2vec (min_count=5)')
    missed = not_in_vocab + not_in_w2v
    print('Total: got {} embeddings, missed {}, out of {}'.format(
        vocab_size - missed, missed, vocab_size))

    save_file = 'word_emb_' + extension + '.np'
    np.savetxt(save_file, embeds)  # embeds is the final embedding matrix, indexed by vocab idx
    print('Saved embeddings in', save_file)

def __init__(self, dataset='labeled'):
    bilm_args = pkl.load(
        open('../experiments/ELMo_weights/4-23-9pm.param', 'rb'))
    bilm_args['experiment_path'] = 'ELMo_weights/4-23-9pm'
    self.bilm = create_bilm_from_args(bilm_args)
    self.dataset = dataset
    if dataset == 'labeled':
        self.dl = Data_loader(labeled_only=True, option='both')
    else:
        self.dl = Data_loader(labeled_only=False, option='both')

def __init__(self):
    #########################################################################
    # Generator Hyper-parameters
    #########################################################################
    self.PRE_EMB_DIM = 32
    self.PRE_HIDDEN_DIM = 32
    self.SEQ_LENGTH = 64
    self.PRE_START_TOKEN = 0
    # self.PRE_EMB_DIM = 16
    # self.PRE_HIDDEN_DIM = 32
    # self.SEQ_LENGTH = 64
    self.PRE_EPOCH_NUM = 1
    self.PRE_TRAIN_ITER = 1  # generator
    self.PRE_SEED = 88
    self.batch_size = 16
    #########################################################################
    self.TOTAL_BATCH = 300
    # TOTAL_BATCH = 800
    #########################################################################
    # Discriminator Hyper-parameters
    #########################################################################
    self.dis_embedding_dim = 64
    self.dis_filter_sizes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20]
    self.dis_num_filters = [
        100, 200, 200, 200, 200, 100, 100, 100, 100, 100, 160, 160
    ]
    self.dis_dropout_keep_prob = 0.75
    self.dis_l2_reg_lambda = 0.2
    # Training parameters
    # self.dis_num_epochs = 20
    self.dis_num_epochs = 1
    # dis_alter_epoch = 50
    self.dis_alter_epoch = 25
    self.positive_file = 'save/midi_trans.pkl'
    self.negative_file = 'target_generate/pretrain_small.pkl'
    # eval_file = 'target_generate/midi_trans_eval.pkl'
    self.generated_num = 40
    self.melody_size = 68
    self.RL_update_rate = 0.8
    self.data_loader = Data_loader()
    self.positive_x, self.positive_y = self.data_loader.load_data(
        self.positive_file, self.batch_size)

def visualize_labeled_dataset():
    print('Initializing Data Loader')
    dl = Data_loader()
    tr, val, tst = dl.cv_data(fold_idx=0)
    labeled_tweets = tr + val + tst
    labeled_tweets = [(x['tweet_id'], x['label']) for x in labeled_tweets]
    print('Number of labeled tweets:', len(labeled_tweets))
    # plot_tweets(labeled_tweets, emb_type='splex', rep_mode='sum', include_sub=False, force_TSNE=True)
    plot_tweets(labeled_tweets, emb_type='w2v', rep_mode='avg')

def __init__(self, mode):
    assert mode in ['train', 'eval']
    if mode == 'train':
        dl = Data_loader(labeled_only=True, option='both')
        tr, val, tst = dl.cv_data(0)  # call cv_data once instead of three times
        self.train_test_val_data = tr + val + tst
        self.train()
    else:
        model_dict = pkl.load(open("../data/logistic_regression.pkl", 'rb'))
        self.thresholds = model_dict['thresholds']
        self.classifiers = model_dict['models']

class Texture_dataset_val(Dataset):
    def __init__(self, data_size, textures_path, max_region=10):
        self.data_size = data_size
        self.data = Data_loader(textures_path, 1, max_region)
        self.preload = []
        for i in range(self.data_size):
            x, y, x_ref = self.data.get_batch_data()
            x, y, x_ref = x[0], y[0], x_ref[0]
            # channels-last (H, W, C) -> channels-first (C, H, W),
            # equivalent to the original pair of swapaxes calls
            x = np.transpose(x, (2, 0, 1))
            y = np.transpose(y, (2, 0, 1))
            x_ref = np.transpose(x_ref, (2, 0, 1))
            x, y, x_ref = (x.astype('float32'), y.astype('float32'),
                           x_ref.astype('float32'))
            self.preload.append((x, y, x_ref))

    def __len__(self):
        return self.data_size

    def __getitem__(self, idx):
        return self.preload[idx]

def add_context_level_inputs(all_inputs, labeled_tweets, emb_to_sizes):
    print('Pre-popping cl:', len(all_inputs))
    no_cl_inputs = {}
    for input_name, arr in all_inputs.items():  # loop var renamed from `np`, which shadowed numpy
        if not input_name.endswith('cl'):
            no_cl_inputs[input_name] = arr
    all_inputs = no_cl_inputs
    print('Post-popping cl:', len(all_inputs))

    sorted_tids = sorted([tweet['tweet_id'] for tweet in labeled_tweets])
    print('Initializing complete Data Loader...')
    complete_dl = Data_loader()

    tweet_dict, user_ct_tweets, id_to_location = None, None, None
    for emb_type in emb_to_sizes:
        for size in emb_to_sizes[emb_type]:
            cl = init_context(emb_type, size, complete_dl,
                              tweet_dict=tweet_dict,
                              user_ct_tweets=user_ct_tweets,
                              id_to_location=id_to_location)
            combine_modes = ['avg'] if emb_type == 'w2v' else ['sum']
            sorted_reps = [
                cl.get_representation(tid, modes=combine_modes)
                for tid in sorted_tids
            ]
            sorted_reps = StandardScaler().fit_transform(sorted_reps)
            all_inputs['{}_{}_cl'.format(emb_type, str(size))] = dict(
                (sorted_tids[i], sorted_reps[i])
                for i in range(len(sorted_tids)))
            if tweet_dict is None:
                tweet_dict, user_ct_tweets, id_to_location = cl.get_params()
    print('Post-adding cl:', len(all_inputs))

def main(args):
    # params for data loader
    option = args['option']
    print('Initializing Data Loader')
    dl = Data_loader(option=option)
    all_data = dl.all_data()
    print('Len of all data:', len(all_data))
    test_ids = set([tweet['tweet_id'] for tweet in dl.test_data()])
    print('Len of test data:', len(test_ids))
    ensemble_ids = get_ensemble_tids()
    print('Len of ensemble data:', len(ensemble_ids))

    mode = args['mode']
    assert mode in ('w2v', 'svd', 'd2v')
    if mode == 'w2v':
        sentences = []
        for tweet in all_data:  # need indices split
            if (tweet['tweet_id'] not in test_ids
                    and tweet['tweet_id'] not in ensemble_ids):
                sentences.append([str(x) for x in tweet['int_arr']])
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        generate_w2v_embs(sentences, option)
    elif mode == 'svd':
        sentences = []
        for i, tweet in enumerate(all_data):  # need indices joined
            if (tweet['tweet_id'] not in test_ids
                    and tweet['tweet_id'] not in ensemble_ids):
                sentences.append(' '.join([str(x) for x in tweet['int_arr']]))
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        generate_svd_embs(sentences, option)
    else:  # mode == 'd2v'
        sentences = []
        tags = []
        for tweet in all_data:
            if (tweet['tweet_id'] not in test_ids
                    and tweet['tweet_id'] not in ensemble_ids):
                # need indices split, and use tweet ids as tags
                sentences.append([str(x) for x in tweet['int_arr']])
                tags.append([str(tweet['tweet_id'])])
        print('Num sentences:', len(sentences))
        print('Check sentence0:', sentences[0])
        print('Check tag0:', tags[0])
        generate_d2v_embs(sentences, tags, option)

def __init__(self, model_predict, model_threshold, output_dir, input_format,
             tweet_records, truth_label, pad_elmo=False,
             unigram_observe_ids=None):
    # model_predict is a function that takes X and evaluates the score; it is
    # abstracted out to keep LIME decoupled from model architecture, input
    # format, and the use of context features.
    self.dl = Data_loader(labeled_only=True, option='both')
    self.model_predict = model_predict
    self.model_threshold = model_threshold
    self.output_dir = output_dir
    self.input_format = input_format
    self.pad_elmo = pad_elmo
    self.unigram_observe_ids = unigram_observe_ids

    self.tweet_records, self.truth_label = tweet_records, truth_label
    self.scores = self.model_predict(self.tweet_records).flatten()
    self.label_prediction = [
        1 if self.scores[idx] >= self.model_threshold else 0
        for idx in range(len(self.scores))
    ]
    # keep only the tweets predicted positive
    idx_considered = [
        idx for idx in range(len(self.label_prediction))
        if self.label_prediction[idx] == 1
    ]
    self.tweet_id_considered = [
        self.tweet_records['tweet_id'][idx] for idx in idx_considered
    ]
    included_tweet_records = {}
    for key in self.tweet_records.keys():
        if key == 'word_content_input_elmo' and pad_elmo is False:
            included_tweet_records[key] = [
                self.tweet_records[key][idx] for idx in idx_considered
            ]
        else:
            included_tweet_records[key] = np.array(
                [self.tweet_records[key][idx] for idx in idx_considered])
    self.tweet_records = included_tweet_records
    self.scores = np.array([self.scores[idx] for idx in idx_considered])

def make_word_emb_for_nn(extension):
    size = 300
    window = 5
    min_count = 5
    epochs = 20
    w2v_file = '../data/{0}_w2v_s{1}_w{2}_mc{3}_ep{4}.bin'.format(
        extension, size, window, min_count, epochs)
    wv = KeyedVectors.load_word2vec_format(w2v_file, binary=True)
    print('Number of embeddings in {}: {}'.format(w2v_file, len(wv.vocab)))

    unicode2idx_pkl = 'unicode2idx_' + extension + '.pkl'
    unicode2idx = pickle.load(open(unicode2idx_pkl, 'rb'))  # complete vocab
    print('Size of complete vocab:', len(unicode2idx))

    dl = Data_loader(labeled_only=True)
    vocab_size = 40000
    dim = 300
    embeds = np.zeros((vocab_size, dim), dtype=np.float64)  # np.float is deprecated
    embeds[1] = np.random.uniform(-0.25, 0.25, dim)
    not_in_vocab = 0
    not_in_w2v = 0
    for dl_idx in range(2, vocab_size):
        unicode = dl.convert2unicode([dl_idx]).encode('utf-8')
        if unicode in unicode2idx:
            ext_idx = unicode2idx[unicode]
            if str(ext_idx) in wv.vocab:
                embeds[dl_idx] = wv[str(ext_idx)]
            else:
                # in the pretrained corpus but dropped by min_count = 5
                not_in_w2v += 1
                embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)
        else:
            # not in the pretrained corpus at all
            not_in_vocab += 1
            embeds[dl_idx] = np.random.uniform(-0.25, 0.25, dim)
    print(not_in_vocab, 'not in vocab')
    print(not_in_w2v, 'not in word2vec (min_count=5)')
    missed = not_in_vocab + not_in_w2v
    print('Total: got {} embeddings, missed {}, out of {}'.format(
        vocab_size - missed, missed, vocab_size))
    save_file = 'word_emb_' + extension + '.np'
    np.savetxt(save_file, embeds)
    print('Saved embeddings in', save_file)

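# Usage sketch (hypothetical 'word' extension value): builds the 40000 x 300
# embedding matrix from a pretrained word2vec binary, assuming the
# ../data/*.bin and unicode2idx_*.pkl files exist with the names built above.
make_word_emb_for_nn('word')
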
def main(_):
    pp.pprint(flags.FLAGS.__flags)
    with tf.Session() as sess:
        data_loader = Data_loader(FLAGS.embedding_file, FLAGS.embedding_size)
        q_network = Q_network(sess, FLAGS.embedding_size, FLAGS.step_size,
                              FLAGS.target_frequency, FLAGS.hidden_units,
                              FLAGS.final_units, FLAGS.greedy_ratio,
                              data_loader)
        replay = Replay(q_network, FLAGS.minibatch_size, FLAGS.replay_size)
        model = DQL(FLAGS.budget, data_loader, q_network, replay)
        model.run()

def load(self):
    transform = transforms.Compose([
        transforms.Resize(28),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ])
    omniglot = Data_loader(path=self.path, train=True, transform=transform)  # 'Ominiglot' typo fixed
    characters = omniglot.characters
    return characters

def __init__(self, batch_size, image_size, lr, epoch):
    self.input_images = tf.placeholder(tf.float32,
                                       [None, image_size, image_size, 3])
    self.pos_images = tf.placeholder(tf.float32,
                                     [None, image_size, image_size, 3])
    self.neg_images = tf.placeholder(tf.float32,
                                     [None, image_size, image_size, 3])
    # sparse label
    self.input_cate = tf.placeholder(tf.int32, [None])
    self.input_attr = tf.placeholder(tf.float32, [None, 1000])
    self.num_cate = 50
    self.num_attr = 1000
    self.dropout_keep_prob = tf.placeholder(tf.float32)
    self.g_step = tf.Variable(0)
    self.lr = tf.train.exponential_decay(lr, self.g_step, 50000, 0.98)
    self.batch_size = batch_size
    self.image_size = image_size
    self.max_epoch = epoch
    self.d_loader = Data_loader(root, cate_path, attr_path, partition_path,
                                self.batch_size, image_size)
    (self.trainX, self.train_pos, self.train_neg, self.trainY1,
     self.trainY2) = self.d_loader.get_queue(1000, 'train', epoch, True)
    self.valX, self.valY1, self.valY2 = self.d_loader.get_queue(
        1000, 'val', None, False)
    self.testX, self.testY1, self.testY2 = self.d_loader.get_queue(
        1000, 'test', None, False)

def check_splex_top_k(mode, k=100, print_top=True):
    assert mode in ('loss', 'agg', 'sub')
    splex = pickle.load(
        open('../data/splex_minmax_svd_word_s300_seeds_hc.pkl', 'rb'))
    if mode == 'loss':
        mode_idx = 0
    elif mode == 'agg':
        mode_idx = 1
    else:
        mode_idx = 2
    # loop variable renamed from `k`, which shadowed the parameter
    tuples = [(w, splex[w][mode_idx]) for w in splex]
    tuples = sorted(tuples, key=lambda x: x[1], reverse=True)
    if print_top:
        dl = Data_loader(labeled_only=True)
        row_format = '{:<7}' * 2 + '{:<15}' * 2
        print(row_format.format(
            'Rank', 'Index', 'Unicode',
            'SPLex {} Score (minmax scaling)'.format(mode.capitalize())))
        for rank, (idx, score) in enumerate(tuples[:k]):
            print(row_format.format(rank, idx,
                                    dl.convert2unicode([int(idx)]),
                                    round(score, 5)))
    return tuples[:k]

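# Usage sketch: print and keep the top-50 aggression entries of the SPLex
# lexicon; assumes the pickle loaded above is present on disk.
top_agg = check_splex_top_k('agg', k=50)
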
def add_inputs():
    save_file = 'all_inputs.pkl'
    all_inputs = pickle.load(open(save_file, 'rb'))

    print('Initializing labeled Data Loader...')
    labeled_dl = Data_loader(labeled_only=True)
    labeled_tweets = labeled_dl.all_data()

    # TIME INPUT
    # add_time_input(all_inputs)
    # print('Added time input, shape =', np.array(list(all_inputs['time'].values())).shape)

    # TWEET-LEVEL INPUTS
    # add_tweet_level_input(all_inputs, labeled_tweets, emb_type='splex')
    # print('Added splex_tl input, shape =', np.array(list(all_inputs['splex_tl'].values())).shape)

    # CONTEXT-LEVEL INPUTS
    # emb_to_sizes = {'w2v': [30, 60], 'splex': [2, 30]}
    # add_context_level_inputs(all_inputs, labeled_tweets, emb_to_sizes=emb_to_sizes)
    # print('Added context inputs')
    # print('w2v shape =', np.array(list(all_inputs['30_w2v_cl'].values())).shape)
    # print('splex shape =', np.array(list(all_inputs['2_splex_cl'].values())).shape)

    # USER INPUTS
    # add_user_inputs(all_inputs, labeled_tweets, num_users=300)
    # add_user_inputs(all_inputs, labeled_tweets, num_users=50)
    # print('Added user inputs: 50 users shape =', np.array(list(all_inputs['50_post_user_index'].values())).shape)

    # PAIRWISE INPUT
    # add_pairwise_input(all_inputs, labeled_tweets, cutoff=1)
    # add_pairwise_input(all_inputs, labeled_tweets, cutoff=2)
    # add_pairwise_input(all_inputs, labeled_tweets, cutoff=3)
    # print('Added pairwise inputs')
    # print('splex shape =', np.array(list(all_inputs['pairwise_c1_splex'].values())).shape)
    # print('w2v shape =', np.array(list(all_inputs['pairwise_c1_w2v'].values())).shape)

    pickle.dump(all_inputs, open(save_file, 'wb'))
    print('Saved', save_file)

def make_user_embeds(num_users):
    dim = 300
    embeds = np.random.rand(num_users, dim)
    print('Initializing Data Loader...')
    dl = Data_loader()
    tl = init_tl('w2v')
    test_ids = set(tweet['tweet_id'] for tweet in dl.test_data())
    pretrained_count = 0
    # reserve 0 for padding (i.e. no user) and 1 for unknown user
    for user_idx in range(2, num_users):
        tweet_dicts = dl.tweets_by_user(user_idx)  # all tweets WRITTEN by this user
        if tweet_dicts is not None and len(tweet_dicts) > 0:
            tweet_count = 0
            all_tweets_sum = np.zeros(dim, dtype=np.float64)  # np.float is deprecated
            for tweet_dict in tweet_dicts:
                tid = tweet_dict['tweet_id']
                if tid not in test_ids:
                    tweet_count += 1
                    tweet_avg = tl.get_representation(tid, mode='avg')
                    all_tweets_sum += tweet_avg
            if tweet_count > 0:
                pretrained_count += 1
                embeds[user_idx] = all_tweets_sum / tweet_count
    print('Found tweets for {} out of {} users'.format(pretrained_count,
                                                       num_users - 2))
    embeds = StandardScaler().fit_transform(embeds)  # mean 0, variance 1
    embeds[0] = np.zeros(dim)  # make sure padding is all 0's
    save_file = str(num_users) + '_user_emb.np'
    np.savetxt(save_file, embeds)
    print('Saved embeddings in', save_file)

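# Usage sketch: build and save the 300-user embedding matrix (300 matches the
# num_users used elsewhere in this collection); assumes the Data_loader corpus
# and the w2v tweet-level representations are available.
make_user_embeds(300)
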
def generate(args):
    data_loader = Data_loader(batch_size=1, bias_init=args.bias_init,
                              train=False)
    model = Model(wemb_dim=args.wemb_dim,
                  hid_dim=args.hid_dim,
                  seq_len=data_loader.maxlen + 1,
                  learning_rate=args.learning_rate,
                  batch_size=1,
                  num_batches=data_loader.num_batches,
                  num_words=data_loader.num_words,
                  biivector=data_loader.biivector,
                  use_gru=args.use_gru,
                  inference=True)
    model.build()
    saver = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    if args.model_path is not None:
        print('Using model: {}'.format(args.model_path))
        saver.restore(sess, args.model_path)
    else:
        latest_ckpt = tf.train.latest_checkpoint(args.logdir)
        print('Did not provide model path, using latest: {}'.format(latest_ckpt))
        saver.restore(sess, latest_ckpt)
    feat = extract_single(sess, args.img_path, cnn='vgg')
    feed_dict = {
        model.ctx_ph: feat.reshape(-1, model.ctx_dim[0], model.ctx_dim[1])
    }
    captions_ix = sess.run(model.output_argmax, feed_dict=feed_dict)
    captions_wd = [data_loader.ixtoword[x] for x in captions_ix]
    # truncate at the first period if one was generated
    try:
        captions_wd = ' '.join(captions_wd[:captions_wd.index('.')])
    except ValueError:
        captions_wd = ' '.join(captions_wd)
    print(captions_wd)
    print('Sentence generated.')

class Texture_dataset_train(Dataset):
    def __init__(self, data_size, textures_path, max_region=10):
        self.data_size = data_size
        self.data = Data_loader(textures_path, 1, max_region)

    def __len__(self):
        return self.data_size

    def __getitem__(self, idx):
        x, y, x_ref = self.data.get_batch_data()
        x, y, x_ref = x[0], y[0], x_ref[0]
        # channels-last (H, W, C) -> channels-first (C, H, W),
        # equivalent to the original pair of swapaxes calls
        x = np.transpose(x, (2, 0, 1))
        y = np.transpose(y, (2, 0, 1))
        x_ref = np.transpose(x_ref, (2, 0, 1))
        return (x.astype('float32'), y.astype('float32'),
                x_ref.astype('float32'))

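# Usage sketch (hypothetical path and sizes): wrap the dataset in a standard
# PyTorch DataLoader; the default collate turns the float32 numpy arrays into
# batched tensors.
from torch.utils.data import DataLoader

train_set = Texture_dataset_train(data_size=1000, textures_path='textures/')
train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
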
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

from data_loader import Data_loader

# Init data_loader
data_loader = Data_loader()
Xs_lsi, Ys_lsi, Xa_lsi, Ya_lsi = data_loader.load_LSI()

X, y = [], []
p = 2 * data_loader.num_topics

# Fetch all duplicate data
print('\n Duplicate data \n')
for q1, q2 in zip(Xs_lsi, Ys_lsi):
    try:
        q12 = np.concatenate((q1, q2), axis=0)
        q12 = q12.reshape(p)
        X.append(q12)
        y.append(1)
    except:
        pass
        # print(q1.shape)
        # print(q2.shape)

# Fetch all NON duplicate data
print('\n Non Duplicate data \n')
for q1, q2 in zip(Xa_lsi, Ya_lsi):
    try:
        q12 = np.concatenate((q1, q2), axis=0)
        q12 = q12.reshape(p)
        X.append(q12)
        y.append(0)  # non-duplicate label, filled in by symmetry with the loop above
    except:
        pass

class Adversarial_generator():
    def __init__(self, dataset='labeled'):
        bilm_args = pkl.load(
            open('../experiments/ELMo_weights/4-23-9pm.param', 'rb'))
        bilm_args['experiment_path'] = 'ELMo_weights/4-23-9pm'
        self.bilm = create_bilm_from_args(bilm_args)
        self.dataset = dataset
        if dataset == 'labeled':
            self.dl = Data_loader(labeled_only=True, option='both')
        else:
            self.dl = Data_loader(labeled_only=False, option='both')

    def compute_log_prob(self, sentences_int_arr):
        tokens = self.bilm.dg.transform_sentences(sentences_int_arr)
        loss = self.bilm.compute_loss_on_data(tokens)
        return -loss

    def sanity_check(self):
        # For each two adjacent tweets, swap the words at every position and
        # check that both tweets' log probability decreases most of the time.
        tweet_ids = list(self.dl.data['data'].keys())
        count_prob_decrease = 0  # times the revised sentence has lower probability than the original
        count_prob_increase = 0  # times the revised sentence has higher probability than the original
        prob_increase_samples = {}
        prob_increase_samples['original'] = []
        prob_increase_samples['revised'] = []
        prob_increase_samples['original score'] = []
        prob_increase_samples['revised score'] = []
        for idx in range(len(tweet_ids) - 1):
            tweet_id1 = tweet_ids[idx]
            tweet_id2 = tweet_ids[idx + 1]
            sentence1 = trim(
                self.dl.data['data'][tweet_id1]['word_padded_int_arr'])
            sentence2 = trim(
                self.dl.data['data'][tweet_id2]['word_padded_int_arr'])
            log_prob_sentence1 = self.compute_log_prob([sentence1])
            log_prob_sentence2 = self.compute_log_prob([sentence2])

            for word_idx in range(min(len(sentence1), len(sentence2))):
                # swap the two sentences' words at this position
                sentence1[word_idx], sentence2[word_idx] = sentence2[
                    word_idx], sentence1[word_idx]
                log_prob_revised_sentence1 = self.compute_log_prob([sentence1])
                log_prob_revised_sentence2 = self.compute_log_prob([sentence2])

                if log_prob_revised_sentence1 <= log_prob_sentence1:
                    count_prob_decrease += 1
                else:
                    count_prob_increase += 1
                    prob_increase_samples['revised'].append(
                        self.dl.convert2unicode(sentence1))
                    prob_increase_samples['revised score'].append(
                        log_prob_revised_sentence1)
                    prob_increase_samples['original score'].append(
                        log_prob_sentence1)
                if log_prob_revised_sentence2 <= log_prob_sentence2:
                    count_prob_decrease += 1
                else:
                    count_prob_increase += 1
                    prob_increase_samples['revised'].append(
                        self.dl.convert2unicode(sentence2))
                    prob_increase_samples['revised score'].append(
                        log_prob_revised_sentence2)
                    prob_increase_samples['original score'].append(
                        log_prob_sentence2)

                # recover the original sentences
                sentence1[word_idx], sentence2[word_idx] = sentence2[
                    word_idx], sentence1[word_idx]

                if log_prob_revised_sentence1 > log_prob_sentence1:
                    prob_increase_samples['original'].append(
                        self.dl.convert2unicode(sentence1))
                if log_prob_revised_sentence2 > log_prob_sentence2:
                    prob_increase_samples['original'].append(
                        self.dl.convert2unicode(sentence2))

            if idx % 10 == 0:
                # labels were swapped in the original prints
                print("decrease: ", count_prob_decrease)
                print("increase: ", count_prob_increase)
            if idx > 100:
                break

        print("Probability decrease: ", count_prob_decrease)
        print("Probability increase: ", count_prob_increase)
        pd.DataFrame.from_dict(prob_increase_samples).to_csv(
            "../showable/ELMo_sanity_check.csv", index=False)

    def create_natural_sentences(self, mode, token, tweet_dicts):
        assert mode in ['insert', 'replace']
        token_id = self.dl.token2property[token.encode("utf-8")]['id']
        sentence_outputs = {}
        keys = [
            'original_sentence', 'generated_sentence', 'original_prob',
            'generated_prob', 'original_int_arr', 'generated_int_arr',
            'tweet_id'
        ]
        for key in keys:
            sentence_outputs[key] = []
        for tweet_id in tweet_dicts.keys():
            sentence = tweet_dicts[tweet_id]['word_padded_int_arr']
            num_words = sum([x != 0 for x in sentence])
            if mode == 'insert':
                if num_words == 50:  # already max length, cannot add more words
                    continue
                idx_range = range(num_words + 1)
            else:
                idx_range = range(num_words)

            sentence_outputs['original_int_arr'].append(np.array(sentence))
            original_sentence_unicode = self.dl.convert2unicode(trim(sentence))
            sentence_outputs['original_sentence'].append(
                original_sentence_unicode)
            original_sentence_prob = self.compute_log_prob([trim(sentence)])
            sentence_outputs['original_prob'].append(original_sentence_prob)
            sentence_outputs['tweet_id'].append(tweet_id)

            # try the token at every position and keep the most natural result
            max_generated_prob = -np.inf
            most_natural_generated_sentence = None
            for pos in idx_range:
                if mode == 'insert':
                    generated_sentence = insert_element(sentence, pos,
                                                        token_id)
                else:
                    generated_sentence = np.array(sentence)
                    generated_sentence[pos] = token_id
                new_sentence_prob = self.compute_log_prob(
                    [trim(generated_sentence)])
                if new_sentence_prob > max_generated_prob:
                    max_generated_prob = new_sentence_prob
                    most_natural_generated_sentence = generated_sentence

            most_natural_revised_sentence_unicode = self.dl.convert2unicode(
                trim(most_natural_generated_sentence))
            sentence_outputs['generated_sentence'].append(
                most_natural_revised_sentence_unicode)
            sentence_outputs['generated_prob'].append(max_generated_prob)
            sentence_outputs['generated_int_arr'].append(
                np.array(most_natural_generated_sentence))

            if len(sentence_outputs['generated_int_arr']) % 100 == 0:
                # periodic progress report and checkpoint
                print(len(sentence_outputs['generated_int_arr']))
                pkl.dump(
                    sentence_outputs,
                    open(
                        "../adversarial_data/%s_%s_natural_sentence_%s.pkl" %
                        (mode, token, self.dataset), 'wb'))

        # order the records from maximum to minimum probability increase
        prob_diff = np.array(sentence_outputs['generated_prob']) - np.array(
            sentence_outputs['original_prob'])
        sorted_idx = np.argsort(prob_diff)[::-1]
        for key in sentence_outputs.keys():
            sentence_outputs[key] = [
                sentence_outputs[key][idx] for idx in sorted_idx
            ]
        sentence_outputs['prob_change'] = np.array(
            sentence_outputs['generated_prob']) - np.array(
                sentence_outputs['original_prob'])
        pd.DataFrame.from_dict(sentence_outputs).to_csv(
            "../showable/%s_%s_natural_sentence_%s.csv" %
            (mode, token, self.dataset), index=False)
        pkl.dump(
            sentence_outputs,
            open(
                "../adversarial_data/%s_%s_natural_sentence_%s.pkl" %
                (mode, token, self.dataset), 'wb'))

    def generate_natural_tweets(self, mode, token):
        tweet_dicts = self.dl.data['data']
        self.create_natural_sentences(mode, token, tweet_dicts)

    def evaluate_logistic_regression_prediction(self, mode):
        assert mode in ['score', 'binary']
        lr = Logistic_regr(mode='eval')
        generated_sentences = pkl.load(
            open("../data/insert_a_natural_sentence.pkl", 'rb'))
        original_int_arrs = generated_sentences['original_int_arr']
        generated_int_arrs = generated_sentences['generated_int_arr']
        if mode == 'score':
            original_agg_scores, original_loss_scores = lr.predict(
                original_int_arrs, mode="score")
            generated_agg_scores, generated_loss_scores = lr.predict(
                generated_int_arrs, mode="score")
            return (original_agg_scores, original_loss_scores,
                    generated_agg_scores, generated_loss_scores)
        else:
            original_agg_labels, original_loss_labels = lr.predict(
                original_int_arrs, mode="binary")
            generated_agg_labels, generated_loss_labels = lr.predict(
                generated_int_arrs, mode="binary")
            # tweets whose predicted label flipped from 0 to 1
            new_agg_positive_tweet_ids = []
            for idx in range(len(original_agg_labels)):
                if (original_agg_labels[idx] == 0
                        and generated_agg_labels[idx] == 1):
                    new_agg_positive_tweet_ids.append(
                        generated_sentences['tweet_id'][idx])
            new_loss_positive_tweet_ids = []
            for idx in range(len(original_loss_labels)):
                if (original_loss_labels[idx] == 0
                        and generated_loss_labels[idx] == 1):
                    new_loss_positive_tweet_ids.append(
                        generated_sentences['tweet_id'][idx])
            return new_agg_positive_tweet_ids, new_loss_positive_tweet_ids

    def evaluate_model_prediction(self, token, model_id, run_idx, fold_idx,
                                  class_idx, mode='binary', top_num=800):
        generated_sentences = pkl.load(
            open(
                "../adversarial_data/insert_%s_natural_sentence_labeled.pkl" %
                token, 'rb'))
        original_int_arrs = generated_sentences['original_int_arr'][:top_num]
        revised_int_arrs = generated_sentences['generated_int_arr'][:top_num]
        tweet_ids = generated_sentences['tweet_id'][:top_num]
        all_tweets = self.dl.all_data()
        original_tweets = []
        generated_tweets = []
        tweetid2tweetidx = {}
        for idx in range(len(all_tweets)):
            tweetid2tweetidx[all_tweets[idx]['tweet_id']] = idx
        for idx in range(len(original_int_arrs)):
            tweet = all_tweets[tweetid2tweetidx[tweet_ids[idx]]]
            original_tweets.append(tweet)
            generated_tweet = deepcopy(tweet)
            assert np.all(generated_tweet['word_padded_int_arr'] ==
                          original_int_arrs[idx])
            generated_tweet['word_padded_int_arr'] = revised_int_arrs[idx]
            generated_tweet['word_int_arr'] = trim(
                generated_tweet['word_padded_int_arr'])
            generated_tweets.append(generated_tweet)

        generated_elmo_dir = None
        original_elmo_dir = None
        if model_id in (3, 4, 6, 7):  # DS ELMo
            generated_elmo_dir = (
                "../adversarial_data/DS_ELMo_adversarial_insert_%s" % token)
            original_elmo_dir = "../data/DS_ELMo_rep"
        if model_id == 5:  # NonDS ELMo
            generated_elmo_dir = (
                "../adversarial_data/NonDS_ELMo_adversarial_insert_%s" % token)
            original_elmo_dir = "../data/NonDS_ELMo_rep"

        load_model_tweet_dicts(model_id, generated_tweets,
                               elmo_dir=generated_elmo_dir)
        generated_tweet_X = pkl.load(
            open("../data/adversarial_tweet_X.pkl", 'rb'))
        load_model_tweet_dicts(model_id, original_tweets,
                               elmo_dir=original_elmo_dir)
        original_tweet_X = pkl.load(
            open("../data/adversarial_tweet_X.pkl", 'rb'))

        model = load_model(model_id, run_idx, fold_idx, class_idx)
        original_predictions = model.predict(original_tweet_X)
        generated_predictions = model.predict(generated_tweet_X)

        assert mode in ['score', 'binary']
        if mode == 'score':  # analyze change in the numerical prediction score
            return original_predictions, generated_predictions
        else:  # analyze label flipping
            threshold = get_model_info(
                num_runs=5, num_folds=5,
                num_models=model_id)['thresholds'][(
                    model_id, run_idx)][class_idx][fold_idx]
            original_pred_labels = [
                1 if x >= threshold else 0 for x in original_predictions
            ]
            generated_pred_labels = [
                1 if x >= threshold else 0 for x in generated_predictions
            ]
            new_positive_tweet_ids = []
            new_negative_tweet_ids = []
            for idx in range(len(original_predictions)):
                if (original_pred_labels[idx] == 0
                        and generated_pred_labels[idx] == 1):
                    new_positive_tweet_ids.append(
                        original_tweets[idx]['tweet_id'])
                if (original_pred_labels[idx] == 1
                        and generated_pred_labels[idx] == 0):
                    new_negative_tweet_ids.append(
                        original_tweets[idx]['tweet_id'])
            return len(new_positive_tweet_ids)

    def evaluate_all_models(self, token, class_idx):
        results = {}
        for model_id in [1, 2, 18, 19]:
            flipped_counts = []
            for fold_idx in range(5):
                counts = []
                for run_idx in range(5):
                    counts.append(
                        self.evaluate_model_prediction(token, model_id,
                                                       run_idx, fold_idx,
                                                       class_idx))
                flipped_counts.append(sum(counts) / len(counts))
            results[model_id] = sum(flipped_counts) / len(flipped_counts)
        pkl.dump(
            results,
            open(
                "../adversarial_data/insert_%s_model_stats_labeled_121819.pkl"
                % token, 'wb'))
        analysis_dict = {}
        analysis_dict['model_id'] = sorted([x for x in results.keys()])
        analysis_dict['num_flipped_adversarials'] = [
            results[x] for x in analysis_dict['model_id']
        ]
        pd.DataFrame.from_dict(analysis_dict).to_csv(
            "../showable/adversarial_%s_stats_labeled.csv" % token,
            index=False)

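# Usage sketch: generate 'a'-insertion adversarial tweets and count label
# flips across models. The token 'a' matches the insert_a_* file names above;
# class_idx=0 is an assumption, not confirmed by the source.
ag = Adversarial_generator(dataset='labeled')
ag.generate_natural_tweets('insert', 'a')
ag.evaluate_all_models('a', class_idx=0)
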
    type=int,
    default=5,
    help='min_count for word2vec; ignored if svd')
parser.add_argument('-ep', '--epochs',
                    type=int,
                    default=20,
                    help='iterations for word2vec; ignored if svd')
args = vars(parser.parse_args())
print(args)

# main(args)
option = args['option']
print('Initializing Data Loader')
dl = Data_loader(option=option)
all_data = dl.all_data()
all_tids = set([str(tweet['tweet_id']) for tweet in all_data])
print(list(all_tids)[:10])
print('Len of all data:', len(all_data))
test_ids = set([tweet['tweet_id'] for tweet in dl.test_data()])
print('Len of test data:', len(test_ids))
ensemble_ids = get_ensemble_tids()
print('Len of ensemble data:', len(ensemble_ids))
print(list(ensemble_ids)[:10])
assert len(ensemble_ids.intersection(all_tids)) == 0

# w2v_file = '../data/w2v_word_s300_w5_mc5_ep20.bin'
# svd_file = '../data/svd_word_s300.pkl'
# sample_usage(w2v_file, svd_file)

import torch

torch.manual_seed(args.seed)
use_cuda = torch.cuda.is_available() and not args.unuse_cuda

args.channel_dims = list(map(int, args.channel_dims.split(',')))

if use_cuda:
    torch.cuda.manual_seed(args.seed)

##############################################################################
# Load data
##############################################################################
from data_loader import Data_loader

real_datas = Data_loader('data/', args.img_size, args.batch_size, use_cuda)

##############################################################################
# Build model
##############################################################################
import model

G = model.Generator(args.img_size, args.img_size, args.channel_dims,
                    args.z_dim)
D = model.Discriminator(args.img_size, args.img_size, args.channel_dims,
                        args.relu_leak)

if use_cuda:
    G, D = G.cuda(), D.cuda()

optimizer_D = torch.optim.Adam(D.parameters(), lr=args.lr,
                               betas=(args.beta1, 0.999))
optimizer_G = torch.optim.Adam(G.parameters(), lr=args.lr,
                               betas=(args.beta1, 0.999))

criterion = torch.nn.BCELoss()

def __init__(self, data_size, textures_path, max_region=10):
    self.data_size = data_size
    self.data = Data_loader(textures_path, 1, max_region)

vertical_all = []

# vertical
f = plt.figure('vertical')
for i in range(0, len(model.weights) - 1):
    vertical = []
    for j in range(0, len(model.weights) - 1):
        dst = np.sum(model.weights[i, j, :] - model.weights[i, j + 1, :])
        vertical.append(dst)
    vertical_all.append(vertical)
plt.imshow(vertical_all, cmap=plt.cm.gray)
plt.colorbar()
f.savefig(graph_name + '_vertical_' + '.png')

## load data
d = Data_loader('dataset.txt')
inputs = d.dataset.T
(dim, count) = inputs.shape

## train model
rows = 30
cols = 30
metric = L_max
top_left = np.array((0, 0))
bottom_right = np.array((rows - 1, cols - 1))
lambda_s = metric(top_left, bottom_right) * 0.5  # there was *0.5
model = SOM(dim, rows, cols, inputs)
model.train(inputs,

            batch_size=batch_size,
            sample=True),
        create_data(input_name2id2np, val,
                    return_generators=return_generators,
                    batch_size=batch_size,
                    sample=False),
        create_data(input_name2id2np, test,
                    return_generators=return_generators,
                    batch_size=batch_size,
                    sample=False))


if __name__ == '__main__':
    from data_loader import Data_loader
    option = 'word'
    max_len = 50
    vocab_size = 40000
    dl = Data_loader(vocab_size=vocab_size, max_len=max_len, option=option)
    fold_idx = 0
    data_fold = dl.cv_data(fold_idx)
    tr, val, test = data_fold
    print(tr[0])
    '''
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = create_clf_data(
        simplest_tweet2data, data_fold)
    for key in X_train:
        print(X_train[key])
    '''

if use_cuda:
    torch.cuda.manual_seed(args.seed)

##############################################################################
# Load data
##############################################################################
from data_loader import Data_loader

data = torch.load(args.data)
args.max_len = data["max_word_len"]
args.dict = data["dict"]
args.vocab_size = data["vocab_size"]

training_data = Data_loader("data/train2017/",
                            data['train']['imgs'],
                            data['train']['captions'],
                            args.max_len,
                            batch_size=args.batch_size,
                            is_cuda=use_cuda)
validation_data = Data_loader("data/val2017/",
                              data['valid']['imgs'],
                              data['valid']['captions'],
                              args.max_len,
                              batch_size=args.batch_size,
                              is_cuda=use_cuda,
                              evaluation=True)

##############################################################################
# Build model
##############################################################################
import model

if options.mode == 'people':
    test = Test(config, options.ckpt_dir, id2word)
    test.init_test(Model(config), options.ckpt_index)
    print('Please enter input:')  # translated from '请输入:'
    line = ''
    is_continue = False
    while line != 'stop':
        line = input()
        pins = line.strip().split()
        query = np.ones([1, config.seq_len], dtype=np.int32)
        target_seq_len = [0]
        target_seq_len[0] = len(pins)
        for i, pin in enumerate(pins):
            if pin not in p2id:
                is_continue = True
                print('Invalid input!')
                break
            query[0][i] = p2id[pin]
        if is_continue:
            is_continue = False
            continue
        test.people_test_one_step(query, target_seq_len)
elif options.mode == 'computer':
    test_data_loader = Data_loader('test', config)
    test = Test(config, options.ckpt_dir, id2word, test_data_loader)
    test.init_test(Model(config))
    test._test()

def generate_samples(sess, model, inv_charmap):
    samples = sess.run(model.fake_inputs)
    samples = np.argmax(samples, axis=2)
    decoded_samples = []
    for i in range(len(samples)):  # batch_size
        decoded = []
        for j in range(len(samples[i])):  # seq_length
            decoded.append(inv_charmap[samples[i][j]])
        decoded_samples.append(tuple(decoded))
    return decoded_samples


if __name__ == "__main__":
    # check data_path before it is used (the original checked only after
    # load_datasets and the WGAN constructor had already consumed it)
    if len(pm.data_path) == 0:
        raise Exception("Please specify path to data directory in adver_train.py!")

    data_loader = Data_loader(pm.batch_size)
    lines, charmap, inv_charmap = data_loader.load_datasets(
        max_length=pm.seq_length,
        example_num=pm.example_num,
        vocab_size=pm.vocab_size,
        data_path=pm.data_path)

    model = WGAN(pm.data_path, pm.batch_size, pm.epochs, pm.vocab_size,
                 pm.seq_length, pm.embed_dims, pm.dis_epochs, pm.example_num,
                 pm.learning_rate, charmap, pm.lamb)

    # model.print_model_settings(locals().copy())