import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

from nn_weight_helper import NNWeightHelper  # project-local helper; module path assumed

# Domain labels: 0 for every example in the first set, 1 for every example
# in the second. (The assignment target was truncated in the source;
# `domain_labels` is an assumed name.)
domain_labels = np.concatenate(
    (np.zeros(y_train.shape[0]), np.ones(y_test.shape[0])), axis=0)

# Neural network architecture: the feature extractor whose weights are evolved
model = Sequential()
model.add(Conv2D(32, kernel_size=(5, 5), activation='relu',
                 input_shape=(28, 28, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
# The loss and optimizer are irrelevant here: the network is never trained
# with backprop, but Keras requires a compile step before predict()
model.compile(loss="mse", optimizer="adam")
print("compilation is over")

nnw = NNWeightHelper(model)
weights = nnw.get_weights()

# Second variant of the same architecture, differing only in the kernel
# size of the second convolutional layer (5x5 instead of 3x3)
model = Sequential()
model.add(Conv2D(32, kernel_size=(5, 5), activation='relu',
                 input_shape=(28, 28, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(32, (5, 5), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.compile(loss="mse", optimizer="adam")
print("compilation is over")
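# Nothing above shows the flat weight vector being written back into the
# model. A minimal sketch of the intended round trip, assuming set_weights
# accepts a vector of the same length that get_weights returned (as the
# evolution loop later in this project suggests); `sample_images` is a
# hypothetical batch, and the Gaussian perturbation is illustrative only:
candidate = weights + np.random.normal(0.0, 0.05, size=weights.shape)
nnw.set_weights(candidate)               # push the flat vector back into the model
features = model.predict(sample_images)  # sample_images: hypothetical (N, 28, 28, 3) array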
combined_test_domain_class = np.vstack(
    [mnist_test_domain_class, mnistm_test_domain_class])

model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu',
                 input_shape=[28, 28, 3]))
# model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
# model.summary()

# The loss is a placeholder: the network is only ever used via predict(),
# but Keras requires compile() first
model.compile(loss="mse", optimizer="adam")
print("compilation is over")

nnw = NNWeightHelper(model)
weights = nnw.get_weights()


def domain_classifier():
    """Train and evaluate a domain classifier on the extracted features."""
    print("Total number of weights to evolve is:", weights.shape)
    all_examples_indices = list(range(combined_train_imgs.shape[0]))
    clf, _ = train_classifier(model, combined_train_imgs,
                              combined_train_domain_class)
    print("combined_train_imgs shape is:", combined_train_imgs.shape)
    print("combined_train_labels shape is:", combined_train_domain_class.shape)
    y_pred = predict_classifier(model, clf, combined_test_imgs)
    print(combined_test_domain_class.shape, y_pred.shape)
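# `train_classifier` and `predict_classifier` are not defined in this
# excerpt. A minimal sketch consistent with how they are called above:
# treat the (untrained) Keras network as a feature extractor and fit a
# conventional classifier on top. The RandomForest choice is an assumption,
# not necessarily what the project used.
from sklearn.ensemble import RandomForestClassifier


def train_classifier(model, X, y):
    # Extract features with the model's current weights, then fit on them
    features = model.predict(X)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(features, y)
    return clf, features


def predict_classifier(model, clf, X):
    # Same feature extraction at prediction time
    features = model.predict(X)
    return clf.predict(features)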
import numpy as np
import word2vec
from keras.models import Sequential
from keras.layers import Dense

from nn_weight_helper import NNWeightHelper  # project-local helper; module path assumed


class Vector():
    def __init__(self):
        self.number_of_results = 10
        self.number_analogy_results = 20
        self.autoAddTags = True

        # Location of the pretrained, POS-tagged word embeddings
        word2vec_bin_loc = 'scholar/postagged_wikipedia_for_word2vec.bin'
        self.model = word2vec.load(word2vec_bin_loc)

        # Penn Treebank POS tags, in the order used by the tag file
        self.tag_list = [
            'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
            'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$',
            'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG',
            'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'
        ]
        self.load_tags()

        # Initialise the network whose weights neuroevolution will adapt
        self.network = Sequential()
        self.network.add(Dense(100, input_shape=(100,), activation='tanh'))
        # print(self.network.summary())  # debug
        self.nnw = NNWeightHelper(self.network)
        self.weights = self.nnw.get_weights()

        # Counts how many times get_results_for_words has been called
        self.counter = 0
        # Maps each word the class has seen to its word2vec index
        self.words_tags_last_seen = {}

    def load_tags(self):
        """Load the POS tag counts into a dictionary."""
        tag_distribution_loc = 'scholar/postag_distributions_for_scholar.txt'
        # Each line maps a word to its tag counts, delimited by '-'s
        with open(tag_distribution_loc) as f:
            word_tag_dist = f.read()
        # Save each word's list of tag counts in a global dictionary
        self.word_to_tags = {}
        for line in word_tag_dist.split():
            pieces = line.split('.')
            word = pieces[0]
            tag_counts = pieces[1].split('-')
            self.word_to_tags[word] = [int(c) for c in tag_counts]

    def get_verbs(self, noun, snes_weights, tags, number_of_user_results):
        return self.get_canonical_results_for_nouns(
            noun, 'VB', 'scholar/canon_verbs.txt', False, snes_weights,
            tags, number_of_user_results)

    def get_canonical_results_for_nouns(self, noun, query_tag,
                                        canonical_tag_filename, plural,
                                        snes_weights, tags,
                                        number_of_user_results):
        if self.autoAddTags:
            noun += '_NNS' if plural else '_NN'
        noun_tag = 'NNS' if plural else 'NN'
        result_map = {}
        with open(canonical_tag_filename) as canonical_pairs:
            # For every line in the file of canonical pairs...
            for line in canonical_pairs:
                # ...split it into separate words...
                words = line.split()
                if query_tag not in ('VB', 'JJ'):
                    continue
                query_string = (words[0] + '_' + query_tag + ' -' +
                                words[1] + '_' + noun_tag + ' ' + noun)
                # ...perform an analogy using the words...
                try:
                    result_list = self.analogy(query_string, snes_weights,
                                               tags)
                except Exception:
                    result_list = []
                # ...and tally the results in a map (poll method: results
                # are ranked by popularity below)
                for result in result_list:
                    result_map[result] = result_map.get(result, 0) + 1

        # Collect results in descending order of popularity until we have
        # the requested number, or run out of plausible counts
        final_results = []
        current_max = number_of_user_results
        while len(final_results) < number_of_user_results and current_max > 0:
            for key in result_map:
                if result_map[key] == current_max:
                    final_results.append(key)
            current_max -= 1
        if len(final_results) >= number_of_user_results:
            return final_results[0:number_of_user_results]
        return final_results

    def analogy(self, words_string, snes_weights, tags):
        """Return the analogy results for a string of words
        (input: "king -man woman")."""
        positives, negatives = self.get_positives_and_negatives(
            words_string.split())
        return self.get_results_for_words(positives, negatives,
                                          snes_weights, tags)

    def get_positives_and_negatives(self, words):
        positives = []
        negatives = []
        for word_arg in words:
            if word_arg.startswith('-'):
                negatives.append(word_arg[1:])
            else:
                positives.append(word_arg)
        return positives, negatives

    def get_results_for_words(self, positives, negatives, snes_weights,
                              tags):
        """Return the word2vec results for lists of positive and negative
        words."""
        # For the first 15 calls we leave the pretrained embeddings
        # untouched; 15 is the number of verb pairs in canon_verbs.txt, so
        # a full canonical pass is evaluated on the original vectors first
        if self.counter > 14:
            # From then on, re-transform the vectors on every call
            self.transform_word_vectors(snes_weights, tags)
        indexes, metrics = self.model.analogy(
            pos=positives, neg=negatives, n=self.number_analogy_results)
        results = self.model.generate_response(indexes, metrics).tolist()
        self.counter += 1
        return self.format_output(results)

    def format_output(self, output):
        """Flatten a list of (word, similarity) tuples, e.g.
        (u'man', 0.816015154188), into a list of single words."""
        return [str(word_value[0]) for word_value in output]

    def return_weights(self):
        return self.weights

    def return_words_tags_last_seen(self):
        return self.words_tags_last_seen

    def return_trained_word2vec(self):
        """Return the vectors and labels of the words seen in the game."""
        labels = []
        tokens = []
        for word in self.words_tags_last_seen:
            tokens.append(self.model.get_vector(word))
            labels.append(word)
        return tokens, labels

    def nnw_set_weights(self, weights):
        self.nnw.set_weights(weights)

    def transform_word_vectors(self, snes_weights=None, tags=None):
        """Pass the stored vectors of the observed words through the
        evolved network and write the transformed vectors back into the
        word2vec model."""
        self.nnw.set_weights(snes_weights)
        sentence_sequences = []
        sentence_word_vectors = []
        # Receive the (word, POS-tag) pairs from the tagger
        for words in tags:
            try:
                x = words[0].lower() + '_' + words[1]
                # Fetch the vector first: if the word is missing from the
                # vocabulary this raises and the word is skipped entirely
                sentence_word_vectors.append(self.model.get_vector(x))
                if x in self.words_tags_last_seen:
                    index = self.words_tags_last_seen[x]
                else:
                    # Look up and cache the word's index on first sight
                    index = self.model.ix(x)
                    self.words_tags_last_seen[x] = index
                sentence_sequences.append(index)
            except Exception:
                # Word is not in the word2vec vocabulary; skip it
                pass
        # Transform all collected vectors in a single batch
        changed_word2vec_vectors = self.network.predict(
            np.array(sentence_word_vectors))
        # Write the transformed vectors back into the embedding matrix
        for i, index in enumerate(sentence_sequences):
            self.model.vectors[index] = changed_word2vec_vectors[i]
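# Hypothetical usage of the class above: the noun, tag pairs, and result
# count are invented for illustration, and the scholar/ data files must be
# in place. `tags` is a list of (word, POS-tag) pairs as consumed by
# transform_word_vectors.
if __name__ == '__main__':
    vec = Vector()
    snes_weights = vec.return_weights()  # un-evolved starting weights
    tags = [('sword', 'NN'), ('swing', 'VB')]
    print(vec.get_verbs('sword', snes_weights, tags, 10))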
import pickle as pkl

# A second revision of the Vector class. Apart from the members reproduced
# below (an ELU activation in __init__, a pickle-based model dump in
# return_trained_word2vec, and a float32 cast in transform_word_vectors),
# it is identical to the class above, so the unchanged methods are omitted.


class Vector():
    def __init__(self):
        self.number_of_results = 10
        self.number_analogy_results = 20
        self.autoAddTags = True
        word2vec_bin_loc = 'scholar/postagged_wikipedia_for_word2vec.bin'
        self.model = word2vec.load(word2vec_bin_loc)
        # This is a list of the tags as organized in the text file
        self.tag_list = [
            'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
            'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$',
            'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG',
            'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'
        ]
        self.load_tags()
        # Init the network for neuroevolution, here with an ELU activation
        self.network = Sequential()
        self.network.add(Dense(100, input_shape=(100,), activation='elu'))
        self.nnw = NNWeightHelper(self.network)
        self.weights = self.nnw.get_weights()
        # Counts how many times get_results_for_words has been called
        self.counter = 0
        # Maps each word the class has seen to its word2vec index
        self.words_tags_last_seen = {}

    def return_trained_word2vec(self):
        # Save the model seen in the game by pickling it to disk
        with open('test_model.pkl', 'wb') as f:
            pkl.dump(self.model, f)

    def transform_word_vectors(self, snes_weights=None, tags=None):
        # Pass the stored vectors of the observed words through the evolved
        # network and write the transformed vectors back into word2vec.
        # Indices are used rather than words because some words do not
        # appear in the word2vec vocabulary.
        self.nnw.set_weights(snes_weights)
        sentence_sequences = []
        sentence_word_vectors = []
        for words in tags:
            try:
                x = words[0].lower() + '_' + words[1]
                # Fetch the vector first; a missing word raises here and
                # is skipped entirely
                sentence_word_vectors.append(self.model.get_vector(x))
                if x in self.words_tags_last_seen:
                    index = self.words_tags_last_seen[x]
                else:
                    # Look up and cache the word's index on first sight
                    index = self.model.ix(x)
                    self.words_tags_last_seen[x] = index
                sentence_sequences.append(index)
            except Exception:
                # Word is not in the word2vec vocabulary; skip it
                pass
        # Transform all collected vectors in one float32 batch
        changed_word2vec_vectors = self.network.predict(
            np.array(sentence_word_vectors, dtype='f'))
        for i, index in enumerate(sentence_sequences):
            self.model.vectors[index] = changed_word2vec_vectors[i]
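# Usage note: the embeddings dumped by return_trained_word2vec() can be
# restored later with the standard pickle round trip:
with open('test_model.pkl', 'rb') as f:
    adapted_model = pkl.load(f)
print(adapted_model.vectors.shape)  # the transformed embedding matrix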
def train(self):
    # Sample fixed-size batches for the domain classifier
    random_domainTrain_indices = np.random.choice(
        a=list(range(self.mixed_trainX.shape[0])), size=1024)
    mixed_domain_trainX = self.mixed_trainX[random_domainTrain_indices]
    mixed_domain_trainY = self.mixed_trainY[random_domainTrain_indices]

    random_domainValid_indices = np.random.choice(
        a=list(range(self.mixed_validationX.shape[0])), size=1024)
    mixed_domain_validX = self.mixed_validationX[random_domainValid_indices]
    mixed_domain_validY = self.mixed_validationY[random_domainValid_indices]

    source_validation_indices = np.random.choice(
        a=list(range(self.source_validationX.shape[0])), size=1024)
    validX = self.source_validationX[source_validation_indices]
    validY = self.source_validationY[source_validation_indices]

    # Baseline: label and domain classifiers on the un-evolved features
    self.label_clf = self.clf_train(self.feature_extractor,
                                    self.source_trainX, self.source_trainY)
    label_pred = self.clf_predict(self.feature_extractor, self.label_clf,
                                  validX)
    label_accuracy = accuracy_score(validY, label_pred)

    self.domain_clf = self.clf_train(self.feature_extractor,
                                     mixed_domain_trainX, mixed_domain_trainY)
    domain_pred = self.clf_predict(self.feature_extractor, self.domain_clf,
                                   mixed_domain_validX)
    domain_accuracy = accuracy_score(mixed_domain_validY, domain_pred)
    print('Baseline label-clf accuracy: %0.3f, baseline domain-clf '
          'accuracy: %0.3f' % (label_accuracy, domain_accuracy))

    weight_modifier = NNWeightHelper(self.feature_extractor)
    weights = weight_modifier.get_weights()
    print('total weights to evolve:', len(weights))
    snes = SSNES(weights, 1, 100)  # project-local SNES wrapper

    batchScores = []
    batch_clf_acc = []
    batch_domain_acc = []
    for generation in range(20):
        fitnesses = []
        domain_accuracys = []
        label_accuracys = []
        indices = []
        for _ in range(10):
            new_weights, index = snes.predict()
            weight_modifier.set_weights(new_weights)
            indices.append(index)

            self.label_clf = self.clf_train(
                self.feature_extractor,
                self.source_trainX[source_validation_indices],
                self.source_trainY[source_validation_indices])
            label_predictions = self.clf_predict(self.feature_extractor,
                                                 self.label_clf, validX)
            label_accuracy = accuracy_score(validY, label_predictions)

            self.domain_clf = self.clf_train(self.feature_extractor,
                                             mixed_domain_trainX,
                                             mixed_domain_trainY)
            domain_predictions = self.clf_predict(self.feature_extractor,
                                                  self.domain_clf,
                                                  mixed_domain_validX)
            domain_accuracy = accuracy_score(mixed_domain_validY,
                                             domain_predictions)
            domain_f1 = f1_score(mixed_domain_validY, domain_predictions)

            fitnesses.append(self.fitness(label_accuracy, domain_f1))
            domain_accuracys.append(domain_accuracy)
            label_accuracys.append(label_accuracy)

        batchScores.append(np.mean(fitnesses))
        batch_clf_acc.append(np.mean(label_accuracys))
        batch_domain_acc.append(np.mean(domain_accuracys))

        most_fit_model = np.argmax(fitnesses)
        print("Most fit model has label_accuracy: %0.3f and "
              "domain_accuracy: %0.3f and fitness_score: %0.3f"
              % (label_accuracys[most_fit_model],
                 domain_accuracys[most_fit_model],
                 fitnesses[most_fit_model]))
        snes.fit(fitnesses, indices)

    # Evaluate the evolved centre of the search distribution
    weight_modifier.set_weights(snes.snes.center)
    self.label_clf = self.clf_train(self.feature_extractor,
                                    self.source_trainX, self.source_trainY)

    random_domainTrain_indices = np.random.choice(
        a=list(range(self.mixed_trainX.shape[0])),
        size=self.mixed_trainX.shape[0])
    all_domain_trainX = self.mixed_trainX[random_domainTrain_indices]
    all_domain_trainY = self.mixed_trainY[random_domainTrain_indices]
    self.domain_clf = self.clf_train(self.feature_extractor,
                                     all_domain_trainX, all_domain_trainY)

    random_domainValid_indices = np.random.choice(
        a=list(range(self.mixed_validationX.shape[0])),
        size=self.mixed_validationX.shape[0])
    all_domain_testX = self.mixed_validationX[random_domainValid_indices]
    all_domain_testY = self.mixed_validationY[random_domainValid_indices]

    source_label_predictions = self.clf_predict(self.feature_extractor,
                                                self.label_clf,
                                                self.source_validationX)
    target_label_predictions = self.clf_predict(self.feature_extractor,
                                                self.label_clf,
                                                self.target_trainX)
    domain_predictions = self.clf_predict(self.feature_extractor,
                                          self.domain_clf, all_domain_testX)
    print('**-validation--**')
    print('label predictions on source data: %0.3f'
          % accuracy_score(self.source_validationY, source_label_predictions))
    print('label predictions on target data: %0.3f'
          % accuracy_score(self.target_trainY, target_label_predictions))
    print('domain predictions on mixed validation data: %0.3f'
          % accuracy_score(all_domain_testY, domain_predictions))
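# `self.fitness` is not shown in this excerpt. For domain-invariant
# features the usual objective is high label accuracy while the domain
# classifier stays near chance (0.5 for a balanced binary domain split),
# so one plausible form (an assumption, not the author's definition) is:
def fitness(self, label_accuracy, domain_f1):
    # Reward label accuracy; penalise any domain signal beyond chance
    return label_accuracy - abs(domain_f1 - 0.5)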