def cluster_chunks(
    chunks: List[Span],
    stopwords: bool = False,
    filter_pos: List[str] = None,
    min_score: float = None,
):
    """
    Cluster chunks by using a revisited **Radial Ball Mapper** algorithm

    Parameters
    ----------
    chunks : List[Span]
        Chunks to cluster.
    stopwords : bool, optional
        Flag to exclude stopwords from chunks, by default False.
    filter_pos : List[str], optional
        POS tags to filter chunk words, by default None
    min_score : float, optional
        Threshold for clustering chunks, by default None

    Returns
    -------
    List[List[Span]]
        Clusters of chunks; an empty list when no usable key/vector
        mapping could be built from the chunks.
    """
    key2index, key2vector = _map_key_to_vector(chunks, stopwords, filter_pos)
    # Fix: return [] instead of a bare None so the documented return type
    # (List[List[Span]]) holds and callers can iterate unconditionally.
    if not key2index or not key2vector:
        return []
    # Build an in-memory KeyedVectors model over the chunk vectors.
    model = KeyedVectors(chunks[0].vector.size)
    keys = list(key2vector.keys())
    model.add_vectors(keys, list(key2vector.values()))
    # Cluster keys, then map each key back to its originating chunk.
    clusters = cluster_balls_multi(model, keys, min_score=min_score)
    return [[chunks[key2index[key]] for key in cluster] for cluster in clusters]
def to_gensim_model(self):
    """Convert the ``wv`` attribute into a gensim KeyedVectors model."""
    from gensim.models import KeyedVectors

    converted = KeyedVectors(self.hiddenSize)
    converted.add_vectors(list(self.wv.keys()), list(self.wv.values()))
    return converted
def gensim_wv(self):
    """Convert ``self.result`` to a gensim KeyedVectors, caching the result."""
    if self.gensim_result is None:
        from gensim.models import KeyedVectors

        # First call: build the gensim model and memoize it on the instance.
        converted = KeyedVectors(self.hiddenSize)
        converted.add_vectors(list(self.keys()), list(self.values()))
        self.gensim_result = converted
    return self.gensim_result
def test_cluster_balls(nlp):
    """Smoke-test cluster_balls on a small fruit vocabulary."""
    docs = [nlp(word) for word in ("apple", "pear", "orange", "lemon")]
    ents = tuple(doc.text.lower() for doc in docs)
    wgts = tuple(doc.vector for doc in docs)
    model = KeyedVectors(wgts[0].size)
    model.add_vectors(ents, list(wgts))
    print(cluster_balls(model))  # is not None  # no root
    print(cluster_balls(model, root="orange"))  # with root
def to_gensim_model(self):
    """Convert the ``wv`` attribute into a gensim KeyedVectors model."""
    vocab = list(self.wv.keys())
    vectors = list(self.wv.values())
    from gensim.models import KeyedVectors

    gensim_w2v = KeyedVectors(self.hiddenSize)
    try:
        gensim_w2v.add_vectors(vocab, vectors)
    except AttributeError as err:
        # add_vectors only exists in gensim >= 4; surface a helpful hint.
        raise AttributeError(
            str(err) + "\nPlease ensure gensim >= 4.0.1 is installed!")
    return gensim_w2v
def handle(self, output_file, debug_output_file, **options):
    """Build definition vectors and save them as a KeyedVectors file."""
    logger.info("Building definition vectors")
    definitions = Definition.objects.filter(
        auto_translation_source_id__isnull=True
    ).prefetch_related("wordform__lemma")
    total = definitions.count()
    news_vectors = google_news_vectors()

    cvd_keys = []
    cvd_vectors = []
    unknown_words = set()

    with create_debug_output(debug_output_file) as debug_output:
        for definition in tqdm(definitions.iterator(), total=total):
            keys = extract_keyed_words(
                definition.semantic_definition, news_vectors, unknown_words)
            # Record what was extracted for each definition, for debugging.
            debug_output(
                json.dumps(
                    {
                        "definition": definition.text,
                        "wordform_text": definition.wordform.text,
                        "extracted_keys": keys,
                    },
                    ensure_ascii=False,
                ))
            if not keys:
                continue
            cvd_keys.append(definition_to_cvd_key(definition))
            cvd_vectors.append(vector_for_keys(news_vectors, keys))

    definition_vectors = KeyedVectors(vector_size=news_vectors.vector_size)
    definition_vectors.add_vectors(cvd_keys, cvd_vectors)
    output_file.parent.mkdir(exist_ok=True)
    definition_vectors.save(fspath(output_file))
def as_keyed_vectors(self) -> KeyedVectors:
    """
    Generate a KeyedVectors instance with all the possible edge embeddings.

    :return: Edge embeddings
    """
    keys = getattr(self.kv, self.INDEX_MAPPING_KEY)
    edge_generator = combinations_with_replacement(keys, r=2)

    if not self.quiet:
        vocab_size = len(keys)
        # Number of unordered pairs with replacement is C(n + 1, 2)
        # = n * (n + 1) / 2.  The original computed this via two full
        # factorial reduces, which is O(n) big-int work and raises for
        # an empty vocabulary; simple arithmetic gives the same count.
        total_size = vocab_size * (vocab_size + 1) // 2
        edge_generator = tqdm(edge_generator,
                              desc='Generating edge features',
                              total=total_size)

    # Generate features
    tokens = []
    features = []
    for edge in edge_generator:
        tokens.append(str(tuple(sorted(edge))))
        features.append(self._embed(edge))

    # Build KV instance
    edge_kv = KeyedVectors(vector_size=self.kv.vector_size)
    # Fix: compare parsed versions, not raw strings — lexicographically
    # '10.0.0' < '4.0.0' is True, which would select the removed v3 API.
    gensim_version = pkg_resources.parse_version(
        pkg_resources.get_distribution("gensim").version)
    if gensim_version < pkg_resources.parse_version('4.0.0'):
        edge_kv.add(entities=tokens, weights=features)
    else:
        edge_kv.add_vectors(keys=tokens, weights=features)
    return edge_kv
# Load word vectors; optionally compress them via PCA + product quantization.
print("Loading vectors in gensim...")
model = KeyedVectors.load_word2vec_format(args.filename)

if not args.compress:
    compute_accuracy(model)
    exit(0)

embeddings = model.vectors
original_size = embeddings.nbytes

print("Reduce dimensions using PCA...")
embeddings = reduce_dimensions_pca(embeddings)
reduced_size = embeddings.nbytes
print("Size reduction: {:f}%".format(
    (original_size - reduced_size) * 100 / original_size))

print("Compress embeddings using product quantization...")
embeddings, codes, centroids = product_quantize(embeddings)
quantized_size = codes.nbytes + centroids.nbytes
print("Size reduction: {:f}%".format(
    (original_size - quantized_size) * 100 / original_size))

# Rebuild a KeyedVectors model over the compressed embeddings and evaluate.
words = [model.index_to_key[idx] for idx in range(len(embeddings))]
model = KeyedVectors(vector_size=embeddings.shape[1])
model.add_vectors(words, embeddings, replace=True)
compute_accuracy(model)
save_model('generated', embeddings.shape[1], words, codes, centroids)