Example #1
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Vectors file (text-based)"),
    vocab_file: str = typer.Argument(..., help="Vocabulary file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    min_freq_ratio: float = typer.Option(0.0, "--min-freq-ratio", "-r", help="Frequency ratio threshold for discarding minority senses or casings"),
    min_distance: float = typer.Option(0.0, "--min-distance", "-s", help="Similarity threshold for discarding redundant keys"),
    # fmt: on
):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2Vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        # _get_shape (not shown) infers (n_vectors, vector_size) and returns
        # the handle positioned at the first vector row
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        # rsplit from the right: the last vector_size fields are the vector
        # components, everything before them is the key
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
Example #2
def main(in_file, vocab_file, out_dir):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2Vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab_data = f.readlines()
    data = []
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})",
                     exits=1)
        all_senses.add(sense)
        data.append((key, numpy.asarray(vec, dtype=numpy.float32)))
    s2v = Sense2Vec(shape=(len(data), vector_size), senses=all_senses)
    for key, vector in data:
        s2v.add(key, vector)
    for item in vocab_data:
        item = item.rstrip()
        if item.endswith(" word"):  # for fastText vocabs
            item = item[:-5]
        try:
            key, freq = item.rsplit(" ", 1)
        except ValueError:
            continue
        s2v.set_freq(key, int(freq))
    msg.good("Created the sense2vec model")
    msg.info(f"{len(data)} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
Example #3
def get_blacklisted_sense_keys(freqs):
    """Remove keys with sense that is blacklisted"""
    discarded = []
    msg.info('collecting blacklisted sense keys')
    for key, freq in freqs.items():
        try:
            term, sense = split_key(key)
        except ValueError:
            continue
        if sense and sense not in sense_whitelist:
            discarded.append(key)
    return discarded
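
sense_whitelist is not defined in the snippet; presumably it is a module-level collection of the sense labels to keep, e.g.:

# hypothetical whitelist of POS and entity labels
sense_whitelist = {"NOUN", "VERB", "ADJ", "PERSON", "ORG", "GPE"}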
Example #4
def get_markdown_and_url_keys(freqs):
    """Remove keys that are markdown syntax or full urls"""
    discarded = []
    msg.info('collecting markdown and url keys')
    for key, freq in freqs.items():
        try:
            term, sense = split_key(key)
        except ValueError:
            continue

        # discard keys that contain `http://`, `https://`, or `](`, or that
        # contain `.php`, `.html`, or `.asp` together with `?` or `/`
        if term:
            is_link = re.search(r'(\]\()|(http://)|(https://)', term)
            is_dynamic_page = re.search(
                r'(\.php)|(\.html)|(\.asp)', term
            ) and re.search(r'[?/]', term)
            if is_link or is_dynamic_page:
                discarded.append(key)

    return discarded
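
A quick check of the filter on a few hypothetical keys, assuming the pipe-delimited key format seen in Example #7:

freqs = {
    "](https://example.com|NOUN": 3,   # markdown link residue: discarded
    "index.php?id=1|NOUN": 2,          # .php plus "?": discarded
    "duck.html|NOUN": 5,               # .html but no "?" or "/": kept
    "natural_language|NOUN": 40,       # ordinary key: kept
}
print(get_markdown_and_url_keys(freqs))
# expected: ['](https://example.com|NOUN', 'index.php?id=1|NOUN']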
Example #5
def get_minority_keys(freqs, min_ratio):
    """Remove keys that are too infrequent relative to a main sense."""
    by_word = defaultdict(list)
    for key, freq in freqs.items():
        try:
            term, sense = split_key(key)
        except ValueError:
            continue
        if freq:
            by_word[term.lower()].append((freq, key))
    discarded = []
    for values in by_word.values():
        if len(values) >= 2:
            # highest frequency first: values[0] is the majority sense
            values.sort(reverse=True)
            freq1, key1 = values[0]
            for freq2, key2 in values[1:]:
                ratio = freq2 / freq1
                if ratio < min_ratio:
                    discarded.append(key2)
    return discarded
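
A worked example of the ratio test, again assuming pipe-delimited keys: senses of the same lowercased word are compared against its most frequent sense, so with min_ratio=0.1 anything below 10% of the majority frequency is discarded.

freqs = {"duck|NOUN": 1000, "duck|VERB": 50, "Duck|PROPN": 200}
print(get_minority_keys(freqs, min_ratio=0.1))
# duck|VERB:  50/1000 = 0.05 < 0.1 -> discarded
# Duck|PROPN: 200/1000 = 0.20      -> kept
# expected: ['duck|VERB']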
Example #6
def get_redundant_keys(vocab, vectors, min_distance):
    """Collect keys whose vector is too close (within min_distance of
    cosine similarity 1.0) to a more frequent key for the same word."""
    if min_distance <= 0.0:
        return []
    by_word = defaultdict(list)
    for key, freq in vocab.items():
        try:
            term, sense = split_key(key)
        except ValueError:
            continue
        # group by the last token of underscore-joined multi-word terms
        term = term.split("_")[-1]
        by_word[term.lower()].append((freq, key))
    too_similar = []
    for values in by_word.values():
        if len(values) >= 2:
            values.sort(reverse=True)
            freq1, key1 = values[0]
            vector1 = vectors[key1]
            for freq2, key2 in values[1:]:
                vector2 = vectors[key2]
                sim = cosine_similarity(vector1, vector2)
                if sim >= (1 - min_distance):
                    too_similar.append(key2)
    return too_similar
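
cosine_similarity is referenced but not defined in the snippet; a standard numpy implementation, assuming non-zero vectors, would look like:

import numpy

def cosine_similarity(vec1, vec2):
    # cosine of the angle between the two vectors; assumes non-zero norms
    return numpy.dot(vec1, vec2) / (numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2))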
Example #7
def test_make_split_key(word, sense, expected):
    assert make_key(word, sense) == expected
    assert split_key(expected) == (word, sense)
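
The word/sense/expected arguments suggest this test is driven by pytest's parametrize; a sketch with hypothetical cases, assuming the pipe-delimited key format used throughout:

import pytest

from sense2vec.util import make_key, split_key  # assumed location

@pytest.mark.parametrize(
    "word,sense,expected",
    [
        ("duck", "NOUN", "duck|NOUN"),
        ("cool", "ADJ", "cool|ADJ"),
    ],
)
def test_make_split_key(word, sense, expected):
    assert make_key(word, sense) == expected
    assert split_key(expected) == (word, sense)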