def main_build(args):
    """Fit a SIF embedder on a keyword column and write out its parameters.

    Args:
        args: Parsed command-line namespace. The attributes read are
            ``path_model``, ``path_keywords``, ``keywords_column``,
            ``keywords_delimiter``, ``sample`` and
            ``path_embedder_parameters``.
    """
    # --- language model -------------------------------------------------
    ft_model_filename = args.path_model
    print(f"Loading language model from: {ft_model_filename}")
    language_model = ck.load_FT_model(ft_model_filename)
    print("Loaded embeddings!")

    # --- embedder (not fitted yet) --------------------------------------
    sif_embedder = ck.SIFEmbedder(language_model)
    print("Built embedder!")

    # --- keywords -------------------------------------------------------
    keyword_filename = args.path_keywords
    print(f"Loading keywords from: {keyword_filename}")
    keywords = ck.load_csv_column(
        keyword_filename,
        args.keywords_column,
        delimiter=args.keywords_delimiter,
    )
    print(f'Loaded {len(keywords)} keywords.')

    # --- fit on the keywords; sample size comes from the CLI -------------
    sif_embedder.fit(keywords, sample_size=args.sample)

    # --- persist the fitted parameters ----------------------------------
    embedder_params_filename = args.path_embedder_parameters
    print(f"Dumping embedder parameters to: {embedder_params_filename}")
    with open(embedder_params_filename, "w") as params_file:
        serialized = sif_embedder.serialize()
        params_file.write(serialized)
def main_build(args):
    """Fit a SIF embedder on a keyword column and persist the results.

    Loads the language model, fits the embedder on the keywords, writes the
    serialized embedder parameters, and optionally dumps the keyword
    embeddings with ``np.save``.

    Args:
        args: Parsed command-line namespace. The attributes read are
            ``path_model``, ``path_keywords``, ``keywords_column``,
            ``keywords_delimiter``, ``path_embedder_parameters`` and
            ``path_embeddings`` (embeddings are only stored when this is
            not ``None``).
    """
    # load language model
    ft_model_filename = args.path_model
    print(f"Loading language model from: {ft_model_filename}")
    model = ck.load_FT_model(ft_model_filename)
    print("Loaded embeddings!")

    # build new embedder
    es_embedder = ck.SIFEmbedder(model)
    print("Built embedder!")

    # get keywords
    keyword_filename = args.path_keywords
    print(f"Loading keywords from: {keyword_filename}")
    keywords = ck.load_csv_column(
        keyword_filename,
        args.keywords_column,
        delimiter=args.keywords_delimiter,
    )
    print(f'Loaded {len(keywords)} keywords.')

    # run embedder: fit and embed in one pass so the embeddings can be stored
    embeddings = es_embedder.fit_embed(keywords)

    # store parameters
    embedder_params_filename = args.path_embedder_parameters
    print(f"Dumping embedder parameters to: {embedder_params_filename}")
    with open(embedder_params_filename, "w") as outfile:
        outfile.write(es_embedder.serialize())

    # if specified, store embeddings
    if args.path_embeddings is not None:
        embeddings_path = args.path_embeddings
        print(f"Dumping embeddings to: {embeddings_path}")
        # Keep passing a file object (not a path) so np.save does not append
        # a ".npy" suffix, and close the handle deterministically.
        with open(embeddings_path, 'wb') as embeddings_file:
            np.save(embeddings_file, embeddings)
def main_categorise(args):
    """Assign the closest categories to each keyword and write them as CSV.

    Loads the language model and previously stored embedder parameters,
    fits a categorizer on the category list, categorizes the keywords and
    writes one CSV row per keyword.

    Output columns: ``keyword`` followed by ``category<i>`` and
    ``category<i>_distance`` for ``i`` in ``1..n_categories``.

    Args:
        args: Parsed command-line namespace. The attributes read are
            ``path_model``, ``path_embedder_parameters``,
            ``path_categories``, ``categories_column``,
            ``categories_id_column``, ``path_keywords``,
            ``keywords_column``, ``keywords_delimiter``, ``n_categories``
            and ``path_output``.
    """
    # load language model
    ft_model_filename = args.path_model
    print(f"Loading language model from: {ft_model_filename}")
    model = ck.load_FT_model(ft_model_filename)
    print("Loaded embeddings!")

    # build embedder
    embedder_parameters_filename = args.path_embedder_parameters
    print(f"Loading embedder parameters from: {embedder_parameters_filename}")
    # Read through a context manager so the handle is closed right away.
    with open(embedder_parameters_filename) as infile:
        de_embedder_parameters_json = infile.read()
    de_embedder = ck.SIFEmbedder(model)
    de_embedder.load(de_embedder_parameters_json)
    print("Built embedder!")

    # get categories
    categories_filename = args.path_categories
    print(f"Loading categories from: {categories_filename}")
    categories = ck.load_csv_column(
        categories_filename, args.categories_column, delimiter=',')
    category_ids = ck.load_csv_column(
        categories_filename, args.categories_id_column, delimiter=',')
    print(f'Loaded {len(categories)} categories.')

    # build categorizer
    categorizer = ck.Categorizer(de_embedder)
    categorizer.fit(categories, category_ids=category_ids)
    print("Categorizer built!")

    # get keywords
    keyword_filename = args.path_keywords
    print(f"Loading keywords from: {keyword_filename}")
    keywords = ck.load_csv_column(
        keyword_filename,
        args.keywords_column,
        delimiter=args.keywords_delimiter,
    )
    print(f'Loaded {len(keywords)} keywords.')

    # run categorizer
    n_categories = args.n_categories
    keyword_categories = categorizer.categorize(
        keywords, n_categories=n_categories)

    output_filename = args.path_output
    print(f"Writing categories to: {output_filename}")
    # newline="" is required by the csv module; without it the writer's
    # "\r\n" terminator is translated again and yields "\r\r\n" on Windows.
    with open(output_filename, "w", encoding="utf8", newline="") as outfile:
        outwriter = csv.writer(outfile, delimiter=",", quotechar='"')

        # write header
        out_header = ["keyword"]
        for cat_i in range(1, n_categories + 1):
            out_header.extend(
                [f"category{cat_i}", f"category{cat_i}_distance"])
        outwriter.writerow(out_header)

        # write results row by row; a distinct loop name avoids shadowing
        # the ``categories`` list loaded above
        for keyword, assigned in zip(keywords, keyword_categories):
            row = [f"{keyword}"]
            for category, distance in assigned:
                row.extend([f"{category}", f"{distance}"])
            outwriter.writerow(row)
    print("DONE!")
def main_categorise(args):
    """Relate keywords to categories and write the result as CSV.

    Loads the language model and previously stored embedder parameters,
    fits a categorizer on the category list, then runs one of two modes
    selected by ``args.mode``:

    * ``"categorise_keywords"``: one row per keyword with
      ``category<i>_id``, ``category<i>`` and ``category<i>_distance``
      columns for ``i`` in ``1..n_categories``.
    * ``"relevance_to_category"``: uses ``categorizer.closest_keywords``
      and writes one ``keyword, category(id)`` row per match, or a single
      ``keyword, none`` row when a keyword has no match.

    Any other mode writes nothing.

    Args:
        args: Parsed command-line namespace. The attributes read are
            ``path_model``, ``path_embedder_parameters``,
            ``path_categories``, ``categories_column``,
            ``categories_id_column``, ``path_keywords``,
            ``keywords_column``, ``keywords_delimiter``, ``mode``,
            ``path_output`` and, depending on the mode, ``n_categories``
            or ``n_keywords``.
    """
    # load language model
    ft_model_filename = args.path_model
    print(f"Loading language model from: {ft_model_filename}")
    model = ck.load_FT_model(ft_model_filename)
    print("Loaded embeddings!")

    # build embedder
    embedder_parameters_filename = args.path_embedder_parameters
    print(f"Loading embedder parameters from: {embedder_parameters_filename}")
    # Read through a context manager so the handle is closed right away.
    with open(embedder_parameters_filename) as infile:
        de_embedder_parameters_json = infile.read()
    de_embedder = ck.SIFEmbedder(model)
    de_embedder.load(de_embedder_parameters_json)
    print("Built embedder!")

    # get categories
    categories_filename = args.path_categories
    print(f"Loading categories from: {categories_filename}")
    categories = ck.load_csv_column(
        categories_filename, args.categories_column, delimiter=',')
    category_ids = ck.load_csv_column(
        categories_filename, args.categories_id_column, delimiter=',')
    print(f'Loaded {len(categories)} categories.')

    # build categorizer
    categorizer = ck.Categorizer(de_embedder)
    categorizer.fit(categories, category_ids=category_ids)
    print("Categorizer built!")

    # get keywords
    keyword_filename = args.path_keywords
    print(f"Loading keywords from: {keyword_filename}")
    keywords = ck.load_csv_column(
        keyword_filename,
        args.keywords_column,
        delimiter=args.keywords_delimiter,
    )
    print(f'Loaded {len(keywords)} keywords.')

    # assigning closest 'n_categories' to each keyword
    if args.mode == "categorise_keywords":
        n_categories = args.n_categories
        keyword_categories = categorizer.categorize(
            keywords, n_categories=n_categories)

        output_filename = args.path_output
        print(f"Writing categories to: {output_filename}")
        # newline="" is required by the csv module; without it the writer's
        # "\r\n" terminator is translated again ("\r\r\n" on Windows).
        with open(output_filename, "w", encoding="utf8", newline="") as outfile:
            outwriter = csv.writer(outfile, delimiter=",", quotechar='"')

            # write header
            out_header = ["keyword"]
            for cat_i in range(1, n_categories + 1):
                out_header.extend([
                    f"category{cat_i}_id",
                    f"category{cat_i}",
                    f"category{cat_i}_distance",
                ])
            outwriter.writerow(out_header)

            # write results row by row
            for keyword, assigned in zip(keywords, keyword_categories):
                row = [f"{keyword}"]
                for category, category_id, distance in assigned:
                    row.extend(
                        [f"{category_id}", f"{category}", f"{distance}"])
                outwriter.writerow(row)

    # assigning closest 'n_keywords' to each category
    elif args.mode == "relevance_to_category":
        n_keywords = args.n_keywords
        keyword_categories = categorizer.closest_keywords(
            keywords, n_keywords=n_keywords)

        output_filename = args.path_output
        print(f"Writing categories to: {output_filename}")
        with open(output_filename, "w", encoding="utf8", newline="") as outfile:
            outwriter = csv.writer(outfile, delimiter=",", quotechar='"')

            # write header
            out_header = ["keyword", "categories"]
            outwriter.writerow(out_header)

            # write results row by row
            for keyword, assigned in tqdm(zip(keywords, keyword_categories),
                                          desc='Writting to file'):
                row = [f"{keyword}"]
                if len(assigned) == 0:
                    # The "none" row used to be built and then dropped;
                    # emit it so unmatched keywords still appear in the
                    # output.
                    outwriter.writerow(row + ["none"])
                else:
                    # one output row per (keyword, category) pair
                    for category, category_id, _distance in assigned:
                        outwriter.writerow(
                            row + [f"{category}({category_id})"])
    print("DONE!")
'Name of column containing category ids in the categories csv file. (default: \'CategoryID\')' ) argparser.add_argument("-p", "--port", type=int, default=5000) args = argparser.parse_args() # load language model ft_model_filename = args.path_model print(f"Loading language model from: {ft_model_filename}") model = ck.load_FT_model(ft_model_filename) print("Loaded embeddings!") # build embedder embedder_parameters_filename = args.path_embedder_parameters print(f"Loading embedder parameters from: {embedder_parameters_filename}") de_embedder_parameters_json = open(embedder_parameters_filename).read() de_embedder = ck.SIFEmbedder(model) de_embedder.load(de_embedder_parameters_json) print("Built embedder!") # get categories categories_filename = args.path_categories print(f"Loading categories from: {categories_filename}") categories = ck.load_csv_column(categories_filename, args.categories_column, delimiter=',') category_ids = ck.load_csv_column(categories_filename, args.categories_id_column, delimiter=',') print(f'Loaded {len(categories)} categories.') # build categorizer
# Smoke check: compare the functional ck.sif_embedding call against the
# ck.SIFEmbedder class on the same keywords and model.

# Count word frequencies over the keywords and print them for inspection.
word_frequencies = ck.count_word_frequencies(keywords)
print(word_frequencies)

# Compute embeddings through the function API, asking for the principal
# components to be returned as well (return_components=True).
embeddings, principal_components = ck.sif_embedding(keywords, model, word_frequencies, n_principal_components=1, alpha=1e-3, principal_components=None, return_components=True)

# NOTE(review): the labels say "[5,:]" but the slices printed are "[:, :5]"
# (the first five columns of every row) -- confirm which one is intended.
print("embeddings[5,:]:\n", embeddings[:, :5])
print("principal_components[5,:]\n", principal_components[:, :5])

# Calling embed() on a freshly constructed embedder; a RuntimeError here is
# treated as the expected outcome (presumably because it has not been fitted
# yet -- TODO confirm against SIFEmbedder).
embedder = ck.SIFEmbedder(model)
try:
    embedder.embed(keywords)
except RuntimeError:
    print("Embedder exception ok.")

# Fit and embed via the class API, then check that both the embeddings and
# the principal components are exactly equal to the function API results.
embeddings1 = embedder.fit_embed(keywords)
print("Embeddings same:", np.array_equal(embeddings, embeddings1))
print("Principal components same:", np.array_equal(principal_components, embedder.principal_components))

# Serialize the fitted embedder's parameters.
param_json = embedder.serialize()
print("serialization done")