Example #1
def main_build(args):
    # load language model
    ft_model_filename = args.path_model
    print(f"Loading language model from: {ft_model_filename}")
    model = ck.load_FT_model(ft_model_filename)
    print("Loaded embeddings!")

    # build new embedder
    es_embedder = ck.SIFEmbedder(model)
    print("Built embedder!")

    # get keywords
    keyword_filename = args.path_keywords
    print(f"Loading keywords from: {keyword_filename}")
    keywords = ck.load_csv_column(keyword_filename,
                                  args.keywords_column,
                                  delimiter=args.keywords_delimiter)
    print(f'Loaded {len(keywords)} keywords.')

    # run embedder
    es_embedder.fit(keywords, sample_size=args.sample)

    # store parameters
    embedder_params_filename = args.path_embedder_parameters
    print(f"Dumping embedder parameters to: {embedder_params_filename}")
    with open(embedder_params_filename, "w") as outfile:
        outfile.write(es_embedder.serialize())
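
The `args` object above is just an argparse namespace. As a hedged illustration, the sketch below shows one way the attributes that `main_build` reads could be wired up; the flag spellings and defaults are assumptions, not the tool's documented CLI, and `ck` is assumed to be imported as in the examples.

import argparse

def build_argparser():
    # Illustrative only: attribute names match what main_build reads,
    # but the flag names and defaults are assumed.
    argparser = argparse.ArgumentParser(description="Fit and serialize a SIF embedder.")
    argparser.add_argument("--path_model", required=True,
                           help="Path to the fastText language model.")
    argparser.add_argument("--path_keywords", required=True,
                           help="CSV file containing the keywords.")
    argparser.add_argument("--keywords_column", default="Keyword",
                           help="Keyword column name (assumed default).")
    argparser.add_argument("--keywords_delimiter", default=",",
                           help="Delimiter used in the keywords CSV.")
    argparser.add_argument("--sample", type=int, default=None,
                           help="Optional sample size passed to SIFEmbedder.fit().")
    argparser.add_argument("--path_embedder_parameters", required=True,
                           help="Output path for the serialized embedder parameters.")
    return argparser

if __name__ == "__main__":
    main_build(build_argparser().parse_args())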
Example #2
def main_build(args):
    # load language model
    ft_model_filename = args.path_model
    print(f"Loading language model from: {ft_model_filename}")
    model = ck.load_FT_model(ft_model_filename)
    print("Loaded embeddings!")

    # build new embedder
    es_embedder = ck.SIFEmbedder(model)
    print("Built embedder!")

    # get keywords
    keyword_filename = args.path_keywords
    print(f"Loading keywords from: {keyword_filename}")
    keywords = ck.load_csv_column(keyword_filename,
                                  args.keywords_column,
                                  delimiter=args.keywords_delimiter)
    print(f'Loaded {len(keywords)} keywords.')

    # run embedder
    embeddings = es_embedder.fit_embed(keywords)

    # store parameters
    embedder_params_filename = args.path_embedder_parameters
    print(f"Dumping embedder parameters to: {embedder_params_filename}")
    with open(embedder_params_filename, "w") as outfile:
        outfile.write(es_embedder.serialize())

    # if specified, store embeddings
    if args.path_embeddings is not None:
        embeddings_path = args.path_embeddings
        print(f"Dumping embeddings to: {embeddings_path}")
        with open(embeddings_path, "wb") as outfile:
            np.save(outfile, embeddings)
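
For completeness, here is a small, hedged sketch of how the two artifacts written above could be loaded again: the embeddings with numpy, and the embedder parameters via SIFEmbedder.load(), the same pattern the categorisation examples below use. The path and variable names are placeholders, and `ck` is the helper module used throughout these examples.

import numpy as np

def reload_build_outputs(model, params_path, embeddings_path):
    # Rebuild the embedder from its serialized parameters (see the
    # categorisation examples below for the same load() pattern).
    embedder = ck.SIFEmbedder(model)
    with open(params_path) as infile:
        embedder.load(infile.read())

    # Read back the embeddings stored with np.save above.
    with open(embeddings_path, "rb") as infile:
        embeddings = np.load(infile)

    return embedder, embeddings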
Example #3
def main_categorise(args):
    # load language model
    ft_model_filename = args.path_model
    print(f"Loading language model from: {ft_model_filename}")
    model = ck.load_FT_model(ft_model_filename)
    print("Loaded embeddings!")

    # build embedder
    embedder_parameters_filename = args.path_embedder_parameters
    print(f"Loading embedder parameters from: {embedder_parameters_filename}")
    de_embedder_parameters_json = open(embedder_parameters_filename).read()
    de_embedder = ck.SIFEmbedder(model)
    de_embedder.load(de_embedder_parameters_json)
    print("Built embedder!")

    # get categories
    categories_filename = args.path_categories
    print(f"Loading categories from: {categories_filename}")
    categories = ck.load_csv_column(categories_filename,
                                    args.categories_column,
                                    delimiter=',')
    category_ids = ck.load_csv_column(categories_filename,
                                      args.categories_id_column,
                                      delimiter=',')
    print(f'Loaded {len(categories)} categories.')

    # build categorizer
    categorizer = ck.Categorizer(de_embedder)
    categorizer.fit(categories, category_ids=category_ids)
    print("Categorizer built!")

    # get keywords
    keyword_filename = args.path_keywords
    print(f"Loading keywords from: {keyword_filename}")
    keywords = ck.load_csv_column(keyword_filename,
                                  args.keywords_column,
                                  delimiter=args.keywords_delimiter)
    print(f'Loaded {len(keywords)} keywords.')

    # run categorizer
    n_categories = args.n_categories
    keyword_categories = categorizer.categorize(keywords,
                                                n_categories=n_categories)
    output_filename = args.path_output
    print(f"Writing categories to: {output_filename}")
    with open(output_filename, "w", encoding="utf8") as outfile:
        outwriter = csv.writer(outfile, delimiter=",", quotechar='"')
        # write header
        out_header = ["keyword"]
        for cat_i in range(1, n_categories + 1):
            out_header.extend(
                [f"category{cat_i}", f"category{cat_i}_distance"])
        outwriter.writerow(out_header)

        # write results row by row
        for keyword, kw_categories in zip(keywords, keyword_categories):
            row = [f"{keyword}"]
            for category, distance in kw_categories:
                row.extend([f"{category}", f"{distance}"])
            outwriter.writerow(row)

    print("DONE!")
Example #4
def main_categorise(args):
    # load language model
    ft_model_filename = args.path_model
    print(f"Loading language model from: {ft_model_filename}")
    model = ck.load_FT_model(ft_model_filename)
    print("Loaded embeddings!")


    # build embedder
    embedder_parameters_filename = args.path_embedder_parameters
    print(f"Loading embedder parameters from: {embedder_parameters_filename}")
    de_embedder_parameters_json = open(embedder_parameters_filename).read()
    de_embedder = ck.SIFEmbedder(model)
    de_embedder.load(de_embedder_parameters_json)
    print("Built embedder!")


    # get categories
    categories_filename = args.path_categories
    print(f"Loading categories from: {categories_filename}")
    categories = ck.load_csv_column(categories_filename, args.categories_column, delimiter=',')
    category_ids = ck.load_csv_column(categories_filename, args.categories_id_column, delimiter=',')
    print(f'Loaded {len(categories)} categories.')


    # build categorizer
    categorizer = ck.Categorizer(de_embedder)
    categorizer.fit(categories, category_ids=category_ids)
    print("Categorizer built!")


    # get keywords
    keyword_filename = args.path_keywords
    print(f"Loading keywords from: {keyword_filename}")
    keywords = ck.load_csv_column(
        keyword_filename,
        args.keywords_column,
        delimiter = args.keywords_delimiter)
    print(f'Loaded {len(keywords)} keywords.')


    # assign the closest 'n_categories' categories to each keyword
    if args.mode == "categorise_keywords":
        n_categories = args.n_categories
        keyword_categories = categorizer.categorize(keywords,
                                                    n_categories=n_categories)
        output_filename = args.path_output
        print(f"Writing categories to: {output_filename}")
        with open(output_filename, "w", encoding="utf8") as outfile:
            outwriter = csv.writer(outfile, delimiter=",", quotechar='"')
            # write header
            out_header = ["keyword"]
            for cat_i in range(1, n_categories + 1):
                out_header.extend([f"category{cat_i}_id",
                                   f"category{cat_i}",
                                   f"category{cat_i}_distance"])
            outwriter.writerow(out_header)

            # write results row by row
            for keyword, kw_categories in zip(keywords, keyword_categories):
                row = [f"{keyword}"]
                for category, category_id, distance in kw_categories:
                    row.extend([f"{category_id}", f"{category}", f"{distance}"])
                outwriter.writerow(row)
    # assign the closest 'n_keywords' keywords to each category
    elif args.mode == "relevance_to_category":
        n_keywords = args.n_keywords
        keyword_categories = categorizer.closest_keywords(keywords,
                                                          n_keywords=n_keywords)
        output_filename = args.path_output
        print(f"Writing categories to: {output_filename}")
        with open(output_filename, "w", encoding="utf8") as outfile:
            outwriter = csv.writer(outfile, delimiter=",", quotechar='"')
            # write header
            out_header = ["keyword", "categories"]
            outwriter.writerow(out_header)

            # write results: one row per (keyword, category) pair, or a single
            # "none" row when no category is close enough
            for keyword, kw_categories in tqdm(zip(keywords, keyword_categories),
                                               desc='Writing to file'):
                row = [f"{keyword}"]
                if len(kw_categories) == 0:
                    outwriter.writerow(row + ["none"])
                else:
                    for category, category_id, distance in kw_categories:
                        outwriter.writerow(row + [f"{category}({category_id})"])

    print("DONE!")
Example #5
        'Name of column containing category ids in the categories csv file. (default: \'CategoryID\')'
    )
    argparser.add_argument("-p", "--port", type=int, default=5000)
    args = argparser.parse_args()

    # load language model
    ft_model_filename = args.path_model
    print(f"Loading language model from: {ft_model_filename}")
    model = ck.load_FT_model(ft_model_filename)
    print("Loaded embeddings!")

    # build embedder
    embedder_parameters_filename = args.path_embedder_parameters
    print(f"Loading embedder parameters from: {embedder_parameters_filename}")
    de_embedder_parameters_json = open(embedder_parameters_filename).read()
    de_embedder = ck.SIFEmbedder(model)
    de_embedder.load(de_embedder_parameters_json)
    print("Built embedder!")

    # get categories
    categories_filename = args.path_categories
    print(f"Loading categories from: {categories_filename}")
    categories = ck.load_csv_column(categories_filename,
                                    args.categories_column,
                                    delimiter=',')
    category_ids = ck.load_csv_column(categories_filename,
                                      args.categories_id_column,
                                      delimiter=',')
    print(f'Loaded {len(categories)} categories.')

    # build categorizer
Example #6
# `keywords` (a list of keyword strings) and `model` (a fastText model loaded
# with ck.load_FT_model) are assumed to be defined as in the examples above.
word_frequencies = ck.count_word_frequencies(keywords)

print(word_frequencies)

embeddings, principal_components = ck.sif_embedding(keywords,
                                                    model,
                                                    word_frequencies,
                                                    n_principal_components=1,
                                                    alpha=1e-3,
                                                    principal_components=None,
                                                    return_components=True)

print("embeddings[5,:]:\n", embeddings[:, :5])
print("principal_components[5,:]\n", principal_components[:, :5])

embedder = ck.SIFEmbedder(model)

try:
    embedder.embed(keywords)
except RuntimeError:
    print("Embedder exception ok.")

embeddings1 = embedder.fit_embed(keywords)

print("Embeddings same:", np.array_equal(embeddings, embeddings1))
print("Principal components same:",
      np.array_equal(principal_components, embedder.principal_components))

param_json = embedder.serialize()

print("serialization done")