def setup_vocab(src_dir, dst_dir):
    """Build and serialize a Vocab in *dst_dir* from the raw model files in *src_dir*.

    Reads Brown clusters from clusters.txt and smoothed log-probabilities from
    words.sgt.prob, then dumps lexemes.bin / strings.txt (and vec.bin when a
    vectors.tgz archive is present).

    src_dir, dst_dir: path-like objects (support `/`, .exists(), .mkdir()).
    """
    if not dst_dir.exists():
        dst_dir.mkdir()
    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    # Clustered words missing from the probability file get a floor log-prob
    # so they still pass the -17 threshold below.
    for word in clusters:
        if word not in probs:
            probs[word] = -17.0
    # Insert words in descending probability order.
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or float(prob) >= -17:
            entry['prob'] = float(prob)
            cluster = clusters.get(word, '0')
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry['cluster'] = int(cluster[::-1], 2)
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
def setup_vocab(src_dir, dst_dir):
    """Build and serialize a Vocab in *dst_dir* from the raw model files in *src_dir*.

    Reads Brown clusters from clusters.txt and smoothed log-probabilities from
    words.sgt.prob, then dumps lexemes.bin / strings.txt (and vec.bin when a
    vectors.tgz archive is present; otherwise a warning is printed).

    src_dir, dst_dir: path-like objects (support `/`, .exists(), .mkdir()).
    """
    if not dst_dir.exists():
        dst_dir.mkdir()
    vectors_src = src_dir / "vectors.tgz"
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / "vec.bin"))
    else:
        print("Warning: Word vectors file not found")
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / "clusters.txt")
    probs = _read_probs(src_dir / "words.sgt.prob")
    # Clustered words missing from the probability file get the lowest observed
    # probability (0.0 when the probability file is empty).
    min_prob = min(probs.values()) if probs else 0.0
    for word in clusters:
        if word not in probs:
            probs[word] = min_prob
    # Insert words in descending probability order.
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or float(prob) >= -17:
            entry["prob"] = float(prob)
            cluster = clusters.get(word, "0")
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry["cluster"] = int(cluster[::-1], 2)
            vocab[word] = entry
    vocab.dump(str(dst_dir / "lexemes.bin"))
    vocab.strings.dump(str(dst_dir / "strings.txt"))
def setup_vocab(src_dir, dst_dir):
    """Build and serialize a Vocab in *dst_dir* from the raw model files in *src_dir*.

    Reads Brown clusters (clusters.txt), WordNet supersenses (supersenses.txt)
    and smoothed log-probabilities (words.sgt.prob), attaches the union of
    supersenses over each word's noun/verb/adjective lemmas, then dumps
    lexemes.bin / strings.txt (and vec.bin when vectors.tgz is present).

    src_dir, dst_dir: path-like objects (support `/`, .exists(), .mkdir()).
    """
    if not dst_dir.exists():
        dst_dir.mkdir()
    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    senses = _read_senses(src_dir / 'supersenses.txt')
    probs = _read_probs(src_dir / 'words.sgt.prob')
    # Words known from clusters or senses but missing from the probability
    # file get a floor log-prob so they still pass the -17 threshold below.
    for word in set(clusters).union(set(senses)):
        if word not in probs:
            probs[word] = -17.0
    lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
    # Insert words in descending probability order.
    for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        if word in clusters or float(prob) >= -17:
            entry['prob'] = float(prob)
            cluster = clusters.get(word, '0')
            # Decode as a little-endian string, so that we can do & 15 to get
            # the first 4 bits. See _parse_features.pyx
            entry['cluster'] = int(cluster[::-1], 2)
            # Union of supersenses across all lemmas of the word, lower-cased,
            # for each of the three open-class POS tags.
            orth_senses = set()
            for pos in [NOUN, VERB, ADJ]:
                for lemma in lemmatizer(word.lower(), pos):
                    orth_senses.update(senses[lemma][pos])
            entry['senses'] = list(sorted(orth_senses))
            vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
def setup_vocab(src_dir, dst_dir):
    """Build and serialize a Vocab in *dst_dir* from the raw model files in *src_dir*.

    Reads Brown clusters from clusters.txt and probabilities from
    words.sgt.prob (falling back to raw counts in freqs.txt), then dumps
    lexemes.bin / strings.txt, an oov_prob file with the out-of-vocabulary
    log-probability, and vec.bin when a vectors.tgz archive is present.

    src_dir, dst_dir: path-like objects (support `/`, .exists(), .mkdir(),
    .open()).
    """
    if not dst_dir.exists():
        dst_dir.mkdir()
    vectors_src = src_dir / 'vectors.tgz'
    if vectors_src.exists():
        write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
    else:
        print("Warning: Word vectors file not found")
    vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
    clusters = _read_clusters(src_dir / 'clusters.txt')
    probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob')
    if not probs:
        probs, oov_prob = _read_freqs(src_dir / 'freqs.txt')
    # NOTE: the oov_prob returned by the readers is deliberately overridden
    # here by the minimum observed probability (0.0 when nothing was read).
    if not probs:
        oov_prob = 0.0
    else:
        oov_prob = min(probs.values())
    # Clustered words missing from the probability data get the OOV probability.
    for word in clusters:
        if word not in probs:
            probs[word] = oov_prob
    # Insert words in descending probability order.
    for word, prob in reversed(
            sorted(probs.items(), key=lambda item: item[1])):
        entry = get_lex_props(word)
        entry['prob'] = float(prob)
        cluster = clusters.get(word, '0')
        # Decode as a little-endian string, so that we can do & 15 to get
        # the first 4 bits. See _parse_features.pyx
        entry['cluster'] = int(cluster[::-1], 2)
        vocab[word] = entry
    vocab.dump(str(dst_dir / 'lexemes.bin'))
    vocab.strings.dump(str(dst_dir / 'strings.txt'))
    with (dst_dir / 'oov_prob').open('w') as file_:
        file_.write('%f' % oov_prob)