def get_embedding_params(self, dictionary, dim, pre_trained_embedding_file_name=None):
    """
    Return the embedding parameters of the network.

    Args:
        dictionary (obj): an instance of the class Dictionary containing
            terms and term IDs.
        dim (int): embedding dimensionality.
        pre_trained_embedding_file_name (str): the path to the pre-trained
            word embeddings used for initialization. This is optional. If a
            term in the dictionary does not appear in the pre-trained vector
            file, its embedding is initialized with a random vector. If this
            argument is None, the whole embedding matrix is initialized
            randomly from a uniform distribution.

    Returns:
        embedding_matrix (obj): a 2D TensorFlow Variable containing the
            embedding vector for each term ID. Unknown terms map to
            term ID zero.
    """
    if pre_trained_embedding_file_name is None:
        return tf.Variable(
            tf.random_uniform([dictionary.size(), dim], -1.0, 1.0))
    else:
        term_to_id, id_to_term, we_matrix = util.load_word_embeddings(
            pre_trained_embedding_file_name, dim)
        init_matrix = np.random.random((dictionary.size(), dim))
        for i in range(dictionary.size()):  # range, not the Python 2-only xrange
            if dictionary.id_to_term[i] in term_to_id:
                tid = term_to_id[dictionary.id_to_term[i]]
                init_matrix[i] = we_matrix[tid]
        return tf.get_variable(
            'embeddings', shape=[dictionary.size(), dim], trainable=True,
            initializer=tf.constant_initializer(init_matrix))
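# For reference, a minimal sketch of the loader contract that
# get_embedding_params above assumes: util.load_word_embeddings(file_name, dim)
# is taken to parse a GloVe-style text file (one term per line followed by
# `dim` floats) and return (term_to_id, id_to_term, we_matrix). This is
# inferred from the call site, not the project's actual implementation.
import numpy as np

def load_word_embeddings(file_name, dim):
    term_to_id, id_to_term, vectors = {}, {}, []
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) != dim + 1:
                continue  # skip header rows or malformed lines
            term_to_id[parts[0]] = len(vectors)
            id_to_term[len(vectors)] = parts[0]
            vectors.append(np.asarray(parts[1:], dtype=np.float32))
    return term_to_id, id_to_term, np.stack(vectors)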
def readEmbedding(self, pretrained_file):
    term_to_id, id_to_term, we_matrix = util.load_word_embeddings(
        pretrained_file, self.emb_dim)
    init_matrix = np.random.random((self.max_dic_size + 1, self.emb_dim))
    for i in range(self.max_dic_size):
        init_matrix[i] = we_matrix[i]
    # The extra row at index max_dic_size is reserved as the zero-padding vector.
    init_matrix[self.max_dic_size] = np.zeros(self.emb_dim)
    print(self.max_dic_size, 'embeddings are read from the pretrained file',
          file=sys.stderr)
    return torch.from_numpy(init_matrix)
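# A minimal usage sketch (hypothetical): feeding the matrix returned by
# readEmbedding into an nn.Embedding layer. `model` stands in for whatever
# object defines readEmbedding; note torch.from_numpy on the float64 matrix
# yields a double tensor, so cast to float32 before use.
import torch.nn as nn

weights = model.readEmbedding('glove.6B.300d.txt').float()
embedding = nn.Embedding.from_pretrained(
    weights,
    freeze=False,                    # keep the vectors trainable
    padding_idx=model.max_dic_size,  # the all-zero row appended above
)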
parser = argparse.ArgumentParser()
parser.add_argument('-output_file', action='store', dest='output_file',
                    help='Output csv file', default='output_' + now + '.csv')
args = parser.parse_args()

tqdm.pandas(desc='Progress bar')

in_phrase = input("Enter a phrase for the niche you are interested in "
                  "(e.g. 'job board software', 'resume builders'): ")
job_board_search = input("Do you want to check if these websites are job boards? (Y/N): ")
traffic_threshold = int(input("Please enter the upper organic traffic threshold "
                              "to filter results by (default 100000): "))
# exclusions = input("Are there any words/niches you don't want? Enter as keywords (e.g. 'music writing'). If no, type N: ")

print("Loading language model...")
language_model = util.language_model  # once this is ready to deploy, switch to a larger model
print("Loading word embeddings...")
# Use the 42B Common Crawl vectors when ready for prod; possibly import directly from spaCy.
word_embeddings = util.load_word_embeddings('word_embeddings/glove.6B.300d.txt')
print("Word embeddings and language model loaded!")

competitor_list_files = []
competitor_dataframes = []
path = "competitor_lists/"

# TODO: make this part smarter at pulling out a column of domains from any generic csv.
for filename in glob.glob(os.path.join(path, '*.csv')):
    competitor_list_files.append(filename)

print("There are " + str(len(competitor_list_files)) +
      " competitor list files to extract information from.")

for csv_file in competitor_list_files:
    competitor_dataframes.append(util.filter_competitor_list(csv_file, traffic_threshold))

print("Filtered out sites based on traffic and competitor relevance!")
df = pd.concat(competitor_dataframes)
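# A hypothetical sketch of util.filter_competitor_list, inferred from the call
# above: read one competitor CSV and drop sites above the traffic ceiling. The
# column names ('Domain', 'Organic Traffic') are assumptions, not the real
# export schema.
import pandas as pd

def filter_competitor_list(csv_file, traffic_threshold):
    df = pd.read_csv(csv_file)
    # Keep rows whose organic traffic falls under the user-supplied ceiling.
    df = df[df['Organic Traffic'] <= traffic_threshold]
    return df[['Domain', 'Organic Traffic']]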
def main():
    params = parse_arguments()

    if params.mode == 0:
        u = util.Utils(params)
        u.run()
    else:
        print("Reading embedding numpy files...")
        use_cuda = False
        if params.mode == 1:
            use_cuda = True
        src = params.src_lang
        tgt = params.tgt_lang
        suffix_str = src + '_' + tgt
        src_data = util.load_subword_embeddings(
            os.path.join(params.data_dir, params.src_file))
        tgt_data = util.load_word_embeddings(
            os.path.join(params.data_dir, params.tgt_file))
        print("Done.")

        if params.center_embeddings > 0:  # mean-center both embedding spaces
            src_data['E'].center()
            tgt_data['E'].center()

        if params.mode == 1:
            # Memorize the original word embeddings before training transforms them.
            src_data['vecs'].copy_(
                src_data['F'](src_data['seqs'], src_data['E'], transform=False).data)
            t = Trainer(params)
            g = t.train(src_data, tgt_data)
        elif params.mode == 2:
            params = _get_eval_params(params)
            # evaluator = Evaluator(params, src_emb.weight.data, tgt_emb.weight.data)
            model_file_path = os.path.join(params.model_dir, params.model_file_name)
            g = Generator(input_size=params.g_input_size,
                          hidden_size=params.g_hidden_size,
                          output_size=params.g_output_size,
                          hyperparams=get_hyperparams(params, disc=False))
            g.load_state_dict(torch.load(model_file_path, map_location='cpu'))
            try:
                knn_list = pickle.load(
                    open('full_knn_list_' + suffix_str + '.pkl', 'rb'))
            except FileNotFoundError:
                print("k-nn file not found!")
                return  # knn_list would be undefined below
            knn_emb = util.convert_to_embeddings(knn_list, use_cuda=False)
            attn = Attention(atype=params.atype)
            indices = torch.arange(params.top_frequent_words).type(torch.LongTensor)
            # NOTE: src_emb and evaluator must be defined for this branch to run;
            # the Evaluator construction above is still commented out.
            mapped_src_emb = g(src_emb.weight).data
            # print(mapped_src_emb)
            evaluator.get_all_precisions(mapped_src_emb)
            # print("Unsupervised criterion: ", evaluator.calc_unsupervised_criterion(mapped_src_emb))
            # unsupervised_criterion = []
            #
            # for i in range(40):
            #     model_file_path = os.path.join(params.model_dir, 'generator_weights_en_es_' + str(i + 1) + '.t7')
            #     g = Generator(input_size=g_input_size, output_size=g_output_size)
            #     g.load_state_dict(torch.load(model_file_path, map_location='cpu'))
            #     if torch.cuda.is_available():
            #         g = g.cuda()
            #     mapped_src_emb = g(src_emb.weight).data
            #     uc = evaluator.calc_unsupervised_criterion(mapped_src_emb)
            #     print("i: %d, uc: %f" % (i, uc))
            #     unsupervised_criterion.append(uc)
            #
            # np.save("uc.npy", np.array(unsupervised_criterion))
        else:
            # Raising a bare string is invalid in Python 3; raise an exception instead.
            raise ValueError("Invalid flag!")
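# A hypothetical sketch of parse_arguments(), inferred from the attributes
# main() reads. Flag names, types, and defaults are assumptions, not the
# project's real CLI; mode 2 additionally needs model/Generator flags
# (model_dir, model_file_name, g_input_size, g_hidden_size, g_output_size,
# atype, top_frequent_words).
import argparse

def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=int, default=0,
                        help='0: preprocess, 1: train, 2: evaluate')
    parser.add_argument('--src_lang', default='en')
    parser.add_argument('--tgt_lang', default='es')
    parser.add_argument('--data_dir', default='data/')
    parser.add_argument('--src_file', default='src_embeddings.txt')
    parser.add_argument('--tgt_file', default='tgt_embeddings.txt')
    parser.add_argument('--center_embeddings', type=int, default=1,
                        help='if > 0, mean-center both embedding spaces')
    return parser.parse_args()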