def get_embedding_params(self,
                             dictionary,
                             dim,
                             pre_trained_embedding_file_name=None):
        """
            Returns the embedding parameters of the network.
            Args:
                dictionary (obj): an instance of the class Dictionary containing terms and term IDs.
                dim (int): embedding dimensionality.
                pre_trained_embedding_file_name (str): the path to the pre-trained word embeddings for initialization.
                 This is optional. If a term in the dictionary does not appear in the pre-trained vector file, its
                 embedding will be initialized by a random vector. If this argument is 'None', the embedding matrix will
                 be initialized randomly with a uniform distribution.

            Returns:
                embedding_matrix (obj): a 2D TensorFlow Variable containing the embedding vector for each term ID. For
                 unknown terms, the term_id is zero.
         """
        if pre_trained_embedding_file_name is None:
            return tf.Variable(
                tf.random_uniform([dictionary.size(), dim], -1.0, 1.0))
        else:
            term_to_id, id_to_term, we_matrix = util.load_word_embeddings(
                pre_trained_embedding_file_name, dim)
            init_matrix = np.random.random((dictionary.size(), dim))
            for i in range(dictionary.size()):  # range() for Python 3 compatibility
                if dictionary.id_to_term[i] in term_to_id:
                    tid = term_to_id[dictionary.id_to_term[i]]
                    init_matrix[i] = we_matrix[tid]
            return tf.get_variable(
                'embeddings',
                shape=[dictionary.size(), dim],
                trainable=True,
                initializer=tf.constant_initializer(init_matrix))
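For context, here is a minimal, hypothetical usage sketch of the method above (not from the original project): it assumes a model object exposing get_embedding_params, a Dictionary instance, and the TF 1.x API used in the snippet.

# Hypothetical usage sketch: look up embeddings for a batch of term IDs
# produced by the same Dictionary instance ('model' and 'dictionary' are assumed names).
import tensorflow as tf  # TF 1.x, matching the snippet above

embedding_matrix = model.get_embedding_params(dictionary, dim=300)
term_ids = tf.placeholder(tf.int32, shape=[None, None])            # [batch, seq_len]
term_vectors = tf.nn.embedding_lookup(embedding_matrix, term_ids)  # [batch, seq_len, 300]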
Example #2
 def readEmbedding(self, pretrained_file):
     term_to_id, id_to_term, we_matrix = util.load_word_embeddings(
         pretrained_file, self.emb_dim)
     # rows [0, max_dic_size) come from the pre-trained file; the extra last row is the padding vector
     init_matrix = np.random.random((self.max_dic_size + 1, self.emb_dim))
     for i in range(self.max_dic_size):
         init_matrix[i] = we_matrix[i]
     init_matrix[self.max_dic_size] = np.zeros([1, self.emb_dim])  # zero-padding row
     print(self.max_dic_size,
           'embeddings are read from the pretrained file',
           file=sys.stderr)
     return torch.from_numpy(init_matrix)
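A minimal, hypothetical usage sketch for the method above (not from the original project): the returned matrix can be wrapped in an nn.Embedding layer, with the last row (index max_dic_size) acting as the padding vector.

# Hypothetical usage sketch ('model' is assumed to be an instance of the class above).
import torch
import torch.nn as nn

weights = model.readEmbedding('glove.6B.300d.txt').float()    # float64 -> float32
embedding_layer = nn.Embedding.from_pretrained(weights, freeze=False)
vectors = embedding_layer(torch.tensor([[1, 2, 3]]))          # shape: [1, 3, emb_dim]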
Example #3
# Imports assumed by this snippet (defined elsewhere in the original script);
# 'util' is the project-specific helper module used throughout these examples.
import argparse
import glob
import os
from datetime import datetime

import pandas as pd
from tqdm import tqdm

import util

now = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')  # assumed: timestamp used in the default output filename below

parser = argparse.ArgumentParser()
parser.add_argument('-output_file', action='store', dest='output_file',
                    help='Output csv file', default='output_' + now + '.csv')
args = parser.parse_args()
tqdm.pandas(desc='Progress bar')

in_phrase = input("Enter a phrase for the niche you are interested in (e.g. 'job board software', 'resume builders'): ")
job_board_search = input("Do you want to check if these websites are job boards? (Y/N): ")
traffic_threshold_input = input("Please enter the upper organic traffic threshold to filter results by (default 100000): ")
traffic_threshold = int(traffic_threshold_input) if traffic_threshold_input.strip() else 100000  # fall back to the advertised default
#exclusions = input("Are there any words/niches you don't want? Enter as keywords (e.g. 'music writing'). If no, type N: ")

print("Loading language model...")
language_model = util.language_model # once this is ready to deploy should be using a larger model

print("Loading word embeddings...")
word_embeddings = util.load_word_embeddings('word_embeddings/glove.6B.300d.txt') # use 42B common crawl when ready for prod, possibly just import directly from spaCy
print("Word embeddings and language model loaded!")

competitor_list_files = []
competitor_dataframes = []
path = "competitor_lists/"

# make this part smarter at pulling out a column of domains from any generic csv
for filename in glob.glob(os.path.join(path, '*.csv')):
	competitor_list_files.append(filename)
print ("There are " + str(len(competitor_list_files)) + " competitor list files to extract information from.")
for csv_file in competitor_list_files:
	competitor_dataframes.append(util.filter_competitor_list(csv_file, traffic_threshold))
print ("Filtered out sites based on traffic and competitor relevance!")

df = pd.concat(competitor_dataframes)
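util.load_word_embeddings is project-specific and its signature differs across the examples on this page (here it takes only a path, while the first two examples also pass a dimension and unpack a tuple). As a rough reference only, a minimal GloVe-format loader might look like the following sketch; the function name and return type are assumptions, not the project's implementation.

# Hypothetical sketch of a GloVe-format loader returning a {word: vector} dict.
import numpy as np

def load_glove_embeddings(path):
    embeddings = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            embeddings[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return embeddings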
Example #4
def main():
    params = parse_arguments()

    if params.mode == 0:
        u = util.Utils(params)
        u.run()
    else:
        print("Reading embedding numpy files...")
        use_cuda = False
        if params.mode == 1:
            use_cuda = True

        src = params.src_lang
        tgt = params.tgt_lang

        suffix_str = src + '_' + tgt

        src_data = util.load_subword_embeddings(
            os.path.join(params.data_dir, params.src_file))
        tgt_data = util.load_word_embeddings(
            os.path.join(params.data_dir, params.tgt_file))
        print("Done.")

        if params.center_embeddings > 0:  # centering
            src_data['E'].center()
            tgt_data['E'].center()

        if params.mode == 1:
            # Memorize the original word embeddings
            src_data['vecs'].copy_(src_data['F'](src_data['seqs'],
                                                 src_data['E'],
                                                 transform=False).data)
            t = Trainer(params)
            g = t.train(src_data, tgt_data)

        elif params.mode == 2:
            params = _get_eval_params(params)
            # evaluator = Evaluator(params, src_emb.weight.data, tgt_emb.weight.data)

            model_file_path = os.path.join(params.model_dir,
                                           params.model_file_name)
            g = Generator(input_size=params.g_input_size,
                          hidden_size=params.g_hidden_size,
                          output_size=params.g_output_size,
                          hyperparams=get_hyperparams(params, disc=False))
            g.load_state_dict(torch.load(model_file_path, map_location='cpu'))

            try:
                knn_list = pickle.load(
                    open('full_knn_list_' + suffix_str + '.pkl', 'rb'))
            except FileNotFoundError:
                print("k-nn file not found!")
                raise  # knn_list is required below, so stop instead of continuing with an undefined name
            knn_emb = util.convert_to_embeddings(knn_list, use_cuda=False)

            attn = Attention(atype=params.atype)
            indices = torch.arange(params.top_frequent_words).type(
                torch.LongTensor)

            # NOTE: src_emb (and the evaluator built from it on the commented line above) are defined elsewhere in the original script
            mapped_src_emb = g(src_emb.weight).data

            #             print(mapped_src_emb)
            evaluator.get_all_precisions(mapped_src_emb)
            # print("Unsupervised criterion: ", evaluator.calc_unsupervised_criterion(mapped_src_emb))

            # unsupervised_criterion = []
            #
            # for i in range(40):
            #     model_file_path = os.path.join(params.model_dir, 'generator_weights_en_es_' + str(i+1) + '.t7')
            #     g = Generator(input_size=g_input_size, output_size=g_output_size)
            #     g.load_state_dict(torch.load(model_file_path, map_location='cpu'))
            #     if torch.cuda.is_available():
            #         g = g.cuda()
            #     mapped_src_emb = g(src_emb.weight).data
            #     uc = evaluator.calc_unsupervised_criterion(mapped_src_emb)
            #     print("i: %d, uc: %f" % (i, uc))
            #     unsupervised_criterion.append(uc)
            #
            # np.save("uc.npy", np.array(unsupervised_criterion))

        else:
            raise "Invalid flag!"