from collections import defaultdict
from pathlib import Path
import re

import numpy
import typer
from wasabi import msg

from sense2vec import Sense2Vec
from sense2vec.util import split_key


def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Vectors file (text-based)"),
    vocab_file: str = typer.Argument(..., help="Vocabulary file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    min_freq_ratio: float = typer.Option(0.0, "--min-freq-ratio", "-r", help="Frequency ratio threshold for discarding minority senses or casings"),
    min_distance: float = typer.Option(0.0, "--min-distance", "-s", help="Similarity threshold for discarding redundant keys"),
    # fmt: on
):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
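# NOTE: `_get_shape` is called above but not defined in this section. A
# minimal sketch of a plausible implementation, assuming word2vec/fastText
# files start with a "<count> <dims>" header while GloVe files do not:
def _get_shape(file_):
    """Return ((number of entries, vector dimensions), file handle)."""
    first_line = next(file_).split()
    if len(first_line) == 2:  # word2vec/fastText header line
        return tuple(int(size) for size in first_line), file_
    count = 1
    for _ in file_:  # GloVe has no header, so count the remaining rows
        count += 1
    file_.seek(0)  # rewind so the caller re-reads every row
    shape = (count, len(first_line) - 1)  # row format: "<key> <dim values>"
    return shape, file_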
# Variant of the export step without the minority-frequency and redundancy
# filters; it reads the vocab inline instead of via read_vocab.
def main(in_file, vocab_file, out_dir):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab_data = f.readlines()
    data = []
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        data.append((key, numpy.asarray(vec, dtype=numpy.float32)))
    s2v = Sense2Vec(shape=(len(data), vector_size), senses=all_senses)
    for key, vector in data:
        s2v.add(key, vector)
    for item in vocab_data:
        item = item.rstrip()
        if item.endswith(" word"):  # for fastText vocabs
            item = item[:-5]
        try:
            key, freq = item.rsplit(" ", 1)
        except ValueError:
            continue
        s2v.set_freq(key, int(freq))
    msg.good("Created the sense2vec model")
    msg.info(f"{len(data)} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
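# NOTE: `read_vocab`, used by the first version of `main` above, is not shown
# in this section. A minimal sketch, assuming it factors out the inline vocab
# parsing from the second version (including the fastText " word" suffix):
def read_vocab(vocab_file):
    vocab = {}
    for line in vocab_file:
        item = line.rstrip()
        if item.endswith(" word"):  # for fastText vocabs
            item = item[:-5]
        try:
            key, freq = item.rsplit(" ", 1)
        except ValueError:
            continue
        vocab[key] = int(freq)
    return vocab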
def get_blacklisted_sense_keys(freqs):
    """Remove keys whose sense is not in the sense whitelist."""
    discarded = []
    msg.info("collecting blacklisted sense keys")
    for key, freq in freqs.items():
        try:
            term, sense = split_key(key)
        except ValueError:
            continue
        if sense and sense not in sense_whitelist:
            discarded.append(key)
    return discarded
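# NOTE: `sense_whitelist` above is not defined in this section. An
# illustrative example, assuming the pipeline whitelists coarse POS tags and
# entity labels as used by sense2vec (the real list may differ):
sense_whitelist = ["NOUN", "VERB", "ADJ", "PERSON", "ORG", "GPE"]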
def get_markdown_and_url_keys(freqs):
    """Remove keys that are markdown syntax or full URLs."""
    discarded = []
    msg.info("collecting markdown and url keys")
    for key, freq in freqs.items():
        try:
            term, sense = split_key(key)
        except ValueError:
            continue
        # Discard terms that contain `](`, `http://` or `https://`, or that
        # contain `.php`, `.html` or `.asp` together with `?` or `/`
        if term:
            if re.search(r"(\]\()|(http:\/\/)|(https:\/\/)", term) or (
                re.search(r"(\.php)|(\.html)|(\.asp)", term)
                and re.search(r"[\?\/]", term)
            ):
                discarded.append(key)
    return discarded
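# Hypothetical illustration of the filter above; the keys are invented
# "term|SENSE" entries, not from a real vocab:
example_freqs = {
    "https://example.com|NOUN": 10,  # full URL -> discarded
    "[link](page)|NOUN": 5,          # markdown link syntax -> discarded
    "index.php?id=1|NOUN": 3,        # ".php" together with "?" -> discarded
    "duck|NOUN": 100,                # plain term -> kept
}
# get_markdown_and_url_keys(example_freqs) returns the first three keys.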
def get_minority_keys(freqs, min_ratio):
    """Remove keys that are too infrequent relative to a main sense."""
    by_word = defaultdict(list)
    for key, freq in freqs.items():
        try:
            term, sense = split_key(key)
        except ValueError:
            continue
        if freq:
            by_word[term.lower()].append((freq, key))
    discarded = []
    for values in by_word.values():
        if len(values) >= 2:
            values.sort(reverse=True)
            freq1, key1 = values[0]
            for freq2, key2 in values[1:]:
                ratio = freq2 / freq1
                if ratio < min_ratio:
                    discarded.append(key2)
    return discarded
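# Worked example for the ratio filter above, using invented frequencies:
# all three keys share the lowercased term "duck", so with min_ratio=0.2,
# "duck|VERB" (30 / 100 = 0.3) survives while "Duck|PROPN" (10 / 100 = 0.1)
# is discarded as a minority sense/casing.
example = {"duck|NOUN": 100, "duck|VERB": 30, "Duck|PROPN": 10}
# get_minority_keys(example, min_ratio=0.2) -> ["Duck|PROPN"]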
def get_redundant_keys(vocab, vectors, min_distance):
    """Remove keys whose vectors are too similar to that of a more
    frequent key for the same word."""
    if min_distance <= 0.0:
        return []
    by_word = defaultdict(list)
    for key, freq in vocab.items():
        try:
            term, sense = split_key(key)
        except ValueError:
            continue
        term = term.split("_")[-1]
        by_word[term.lower()].append((freq, key))
    too_similar = []
    for values in by_word.values():
        if len(values) >= 2:
            values.sort(reverse=True)
            freq1, key1 = values[0]
            vector1 = vectors[key1]
            for freq2, key2 in values[1:]:
                vector2 = vectors[key2]
                sim = cosine_similarity(vector1, vector2)
                if sim >= (1 - min_distance):
                    too_similar.append(key2)
    return too_similar
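# NOTE: `cosine_similarity` is used above but not defined in this section.
# A minimal numpy sketch (assumes both vectors are non-zero):
def cosine_similarity(vec1, vec2):
    norms = numpy.linalg.norm(vec1) * numpy.linalg.norm(vec2)
    return numpy.dot(vec1, vec2) / norms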
def test_make_split_key(word, sense, expected):
    assert make_key(word, sense) == expected
    assert split_key(expected) == (word, sense)
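# `test_make_split_key` above takes parameters, so it presumably carries a
# pytest parametrize decorator that this section omits. A hedged sketch with
# illustrative cases, assuming keys join word and sense with "|":
import pytest

@pytest.mark.parametrize(
    "word,sense,expected",
    [("duck", "NOUN", "duck|NOUN"), ("cat", "VERB", "cat|VERB")],
)
def test_make_split_key_examples(word, sense, expected):
    assert make_key(word, sense) == expected
    assert split_key(expected) == (word, sense)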