Example #1
def main() -> None:  # pylint: disable=missing-function-docstring
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(prog="ffp-similar",
                                     description="Similarity queries.")
    parser.add_argument("embeddings",
                        type=str,
                        help="Input embeddings",
                        metavar="EMBEDDINGS")
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument("-k",
                        type=int,
                        help="Number of neighbours. Default: 10",
                        default=10,
                        metavar="K")
    parser.add_argument(
        "input",
        help="Optional input file with one word per line. "
        "If unspecified, reads from stdin.",
        nargs='?',
        default=0,  # file descriptor 0: stdin
    )
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.embeddings, args.lossy, args.mmap)
    with open(args.input) as queries:
        for query in queries:
            query = query.strip()
            if not query:
                continue
            res = embeds.word_similarity(query, k=args.k)
            if res is None:
                print(f"Could not compute neighbours for: {query}",
                      file=sys.stderr)
            else:
                print("\n".join(f"{ws.word} {ws.similarity}" for ws in res))

Example #2
def main() -> None:  # pylint: disable=missing-function-docstring
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(prog="ffp-convert",
                                     description="Convert embeddings.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "from", formats, "word2vec")
    add_format_args(parser, "t", "to", formats, "finalfusion")
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(getattr(args, 'from')).load(args.input, args.lossy,
                                                args.mmap)
    Format(args.to).write(args.output, embeds)
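
The script reads the source format with getattr(args, 'from') because from is a Python keyword and cannot be written as args.from. Stripped of argument parsing, the conversion itself is two calls on the Format helper; a minimal sketch, assuming Format is importable from finalfusion's script utilities (the module path below is an assumption) and using placeholder paths:

from finalfusion.scripts.util import Format  # assumed module path

embeds = Format("word2vec").load("vectors.bin", False, False)  # path, lossy, mmap
Format("finalfusion").write("vectors.fifu", embeds)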

Example #3
def main() -> None:  # pylint: disable=missing-function-docstring
    parser = argparse.ArgumentParser(
        prog="ffp-bucket-to-explicit",
        description="Convert bucket embeddings to explicit lookups.")

    add_input_output_args(parser)
    add_format_args(parser, "f", "from", ["finalfusion", "fasttext"],
                    "finalfusion")
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(getattr(args, 'from')).load(args.input, args.lossy,
                                                args.mmap)
    embeds.bucket_to_explicit().write(args.output)
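
Outside the script, the same conversion is a single call on embeddings whose vocabulary uses bucketed subwords; a minimal sketch, assuming the top-level load_finalfusion loader and placeholder paths:

import finalfusion

embeds = finalfusion.load_finalfusion("bucket-subwords.fifu")
# Replaces the hash-bucket subword lookup with an explicit n-gram vocabulary.
embeds.bucket_to_explicit().write("explicit-subwords.fifu")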

Example #4
def main() -> None:  # pylint: disable=missing-function-docstring
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(prog="ffp-analogy",
                                     description="Analogy queries.")
    parser.add_argument("embeddings",
                        help="Input embeddings",
                        type=str,
                        metavar="EMBEDDINGS")
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument(
        "-i",
        "--include",
        choices=["a", "b", "c"],
        nargs="+",
        default=[],
        help=
        "Specify query parts that should be allowed as answers. Valid choices: ['a', 'b', 'c']"
    )
    parser.add_argument("-k",
                        type=int,
                        default=10,
                        help="Number of neighbours. Default: 10",
                        metavar="K")
    parser.add_argument(
        "input",
        help="Optional input file with 3 words per line. "
        "If unspecified, reads from stdin.",
        nargs='?',
        default=0)  # file descriptor 0: stdin
    add_common_args(parser)
    args = parser.parse_args()
    if args.include != [] and len(args.include) > 3:
        print("-i/--include can take up to 3 unique values: a, b and c.",
              file=sys.stderr)
        sys.exit(1)
    embeds = Format(args.format).load(args.embeddings, args.lossy, args.mmap)
    with open(args.input) as queries:
        for query in queries:
            query = query.strip()
            # Skip blank lines instead of crashing on the unpacking below.
            if not query:
                continue
            query_a, query_b, query_c = query.split()
            skips = get_skips(query_a, query_b, query_c, args.include)
            res = embeds.analogy(query_a,
                                 query_b,
                                 query_c,
                                 k=args.k,
                                 skip=skips)
            if res is None:
                print(
                    f"Could not compute analogy for: {query_a} : {query_b} :: {query_c} : ?",
                    file=sys.stderr)
            else:
                print("\n".join(f"{ws.word} {ws.similarity}" for ws in res))

Example #5
def main() -> None:  # pylint: disable=missing-function-docstring
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(
        prog="ffp-select", description="Build embeddings from list of words.")
    add_input_output_args(parser)
    add_format_args(parser, "f", "format", formats, "finalfusion")
    parser.add_argument(
        "words",
        nargs='?',
        default=0,  # file descriptor 0: stdin
        metavar="WORDS",
        help="List of words to include in the embeddings. One word per line. "
        "Spaces permitted. Reads from stdin if unspecified.")
    parser.add_argument("--ignore_unk",
                        "-i",
                        action="store_true",
                        default=False,
                        help="Skip unrepresentable words.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help="Print to stderr which tokens are skipped because they cannot "
        "be represented.")
    add_common_args(parser)
    args = parser.parse_args()
    embeds = Format(args.format).load(args.input, args.lossy, args.mmap)
    with open(args.words, errors='replace' if args.lossy else 'strict') as inp:
        unique_words = set(word.strip() for word in inp)
        matrix = np.zeros((len(unique_words), embeds.storage.shape[1]),
                          dtype=np.float32)
        vocab = SimpleVocab(list(unique_words))
        for i, word in enumerate(vocab):
            try:
                matrix[i] = embeds[word]
            except KeyError:
                if args.verbose or not args.ignore_unk:
                    print(f"Cannot represent '{word}'.", file=sys.stderr)
                if not args.ignore_unk:
                    sys.exit(1)
    metadata = Metadata({"source_embeddings": args.input})
    if embeds.metadata is not None:
        metadata["source_metadata"] = embeds.metadata
    Embeddings(storage=NdArray(matrix), vocab=vocab,
               metadata=metadata).write(args.output)
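
The same construction works outside the script for assembling and persisting a small embedding matrix by hand; a minimal sketch with toy data, assuming the import paths below for finalfusion's storage, vocab, and metadata classes:

import numpy as np
from finalfusion import Embeddings
from finalfusion.metadata import Metadata
from finalfusion.storage import NdArray
from finalfusion.vocab import SimpleVocab

words = ["one", "two", "three"]
# NdArray expects a float32 matrix with one row per vocabulary item.
matrix = np.random.default_rng(0).random((len(words), 50), dtype=np.float32)
Embeddings(storage=NdArray(matrix),
           vocab=SimpleVocab(words),
           metadata=Metadata({"source": "toy example"})).write("toy.fifu")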