def lang():
    vector_data = {k: np.random.normal(0, 1, (2,)) for k in ["red", "blue", "cat", "dog", "green", "purple"]}
    vector_data['cat'] += 10
    vector_data['dog'] += 10
    vocab = Vocab(strings=vector_data.keys())
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab)
    return SpacyLanguage(nlp)
def get_docs():
    vocab = Vocab()
    for t in texts:
        for word in t.split():
            hash_id = vocab.strings.add(word)
            vector = numpy.random.uniform(-1, 1, (7,))
            vocab.set_vector(hash_id, vector)
    docs = [English(vocab)(t) for t in texts]
    return docs
def test_vector_is_oov():
    vocab = Vocab(vectors_name="test_vocab_is_oov")
    data = OPS.xp.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    assert vocab["cat"].is_oov is False
    assert vocab["dog"].is_oov is False
    assert vocab["hamster"].is_oov is True
def test_vocab_add_vector():
    vocab = Vocab()
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    cat = vocab["cat"]
    assert list(cat.vector) == [1.0, 1.0, 1.0]
    dog = vocab["dog"]
    assert list(dog.vector) == [2.0, 2.0, 2.0]
def color_lang():
    vector_data = {
        "red": np.array([1.0, 0.0]),
        "green": np.array([0.5, 0.5]),
        "blue": np.array([0.0, 1.0]),
        "purple": np.array([0.0, 1.0]),
    }
    vocab = Vocab(strings=vector_data.keys())
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab)
    return SpacyLanguage(nlp)
def test_doc_token_api_vectors():
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector("apples", vector=numpy.asarray([0.0, 2.0], dtype="f"))
    vocab.set_vector("oranges", vector=numpy.asarray([0.0, 1.0], dtype="f"))
    doc = Doc(vocab, words=["apples", "oranges", "oov"])
    assert doc.has_vector
    assert doc[0].has_vector
    assert doc[1].has_vector
    assert not doc[2].has_vector
    apples_norm = (0 * 0 + 2 * 2) ** 0.5
    oranges_norm = (0 * 0 + 1 * 1) ** 0.5
    cosine = ((0 * 0) + (2 * 1)) / (apples_norm * oranges_norm)
    assert doc[0].similarity(doc[1]) == cosine
def test_vocab_add_vector():
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = OPS.xp.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    cat = vocab["cat"]
    assert list(cat.vector) == [1.0, 1.0, 1.0]
    dog = vocab["dog"]
    assert list(dog.vector) == [2.0, 2.0, 2.0]
    with pytest.raises(ValueError):
        vocab.vectors.add(vocab["hamster"].orth, row=1000000)
def write_vectors_model(tmp_dir):
    import numpy
    vocab = Vocab()
    vector_data = {
        "dog": numpy.random.uniform(-1, 1, (300,)),
        "cat": numpy.random.uniform(-1, 1, (300,)),
        "orange": numpy.random.uniform(-1, 1, (300,)),
    }
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp_path = tmp_dir / "vectors_model"
    nlp = English(vocab)
    nlp.to_disk(nlp_path)
    return str(nlp_path)
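# Usage sketch (added, not from the original source): load the pipeline written by
# write_vectors_model back from disk and confirm the vectors set via set_vector survive
# serialization. check_vectors_model_roundtrip is a hypothetical helper; tmp_dir can be
# any pathlib.Path, e.g. pytest's tmp_path fixture.
def check_vectors_model_roundtrip(tmp_dir):
    import spacy
    nlp_path = write_vectors_model(tmp_dir)
    nlp = spacy.load(nlp_path)
    assert nlp.vocab.has_vector("dog")
    assert nlp.vocab.get_vector("dog").shape == (300,)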
def test_issue4725():
    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.begin_training()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass
def read_vectors(filename):
    def load_embeddings(filename):
        with open(filename, encoding='utf-8') as infile:
            for i, line in enumerate(infile):
                items = line.rstrip().split(' ')
                if len(items) == 2:
                    # This is a header row giving the shape of the matrix
                    continue
                word = items[0]
                vec = np.array([float(x) for x in items[1:]], 'f')
                yield word, vec / np.linalg.norm(vec)

    vocab = Vocab()
    for word, vector in load_embeddings(filename):
        vocab.set_vector(word, vector)
    return vocab
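# Usage sketch (added, not from the original source): read_vectors builds a Vocab from a
# whitespace-separated embeddings file and L2-normalizes each vector on load, so every
# stored vector should have unit length. "embeddings.txt" and check_read_vectors are
# hypothetical names used only for illustration.
def check_read_vectors(filename="embeddings.txt"):
    vocab = read_vectors(filename)
    for word in list(vocab.strings)[:5]:
        if vocab.has_vector(word):
            assert abs(float(np.linalg.norm(vocab.get_vector(word))) - 1.0) < 1e-4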
def test_pickle_vocab(text1, text2):
    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
    vocab.set_vector("dog", numpy.ones((5,), dtype="f"))
    lex1 = vocab[text1]
    lex2 = vocab[text2]
    assert lex1.norm_ == text1[:-1]
    assert lex2.norm_ == text2[:-1]
    data = srsly.pickle_dumps(vocab)
    unpickled = srsly.pickle_loads(data)
    assert unpickled[text1].orth == lex1.orth
    assert unpickled[text2].orth == lex2.orth
    assert unpickled[text1].norm == lex1.norm
    assert unpickled[text2].norm == lex2.norm
    assert unpickled[text1].norm != unpickled[text2].norm
    assert unpickled.vectors is not None
    assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0]
def test_issue2871():
    """Test that vectors recover the correct key for spaCy reserved words."""
    words = ["dog", "cat", "SUFFIX"]
    vocab = Vocab(vectors_name="test_issue2871")
    vocab.vectors.resize(shape=(3, 10))
    vector_data = numpy.zeros((3, 10), dtype="f")
    for word in words:
        _ = vocab[word]  # noqa: F841
        vocab.set_vector(word, vector_data[0])
    vocab.vectors.name = "dummy_vectors"
    assert vocab["dog"].rank == 0
    assert vocab["cat"].rank == 1
    assert vocab["SUFFIX"].rank == 2
    assert vocab.vectors.find(key="dog") == 0
    assert vocab.vectors.find(key="cat") == 1
    assert vocab.vectors.find(key="SUFFIX") == 2
def test_issue4725_2():
    if isinstance(get_current_ops(), NumpyOps):
        # ensures that this runs correctly and doesn't hang or crash because of the global vectors
        # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
        # or because of issues with pickling the NER (cf test_issue4725_1)
        vocab = Vocab(vectors_name="test_vocab_add_vector")
        data = numpy.ndarray((5, 3), dtype="f")
        data[0] = 1.0
        data[1] = 2.0
        vocab.set_vector("cat", data[0])
        vocab.set_vector("dog", data[1])
        nlp = English(vocab=vocab)
        nlp.add_pipe("ner")
        nlp.initialize()
        docs = ["Kurt is in London."] * 10
        for _ in nlp.pipe(docs, batch_size=2, n_process=2):
            pass
def test_vocab_prune_vectors():
    vocab = Vocab(vectors_name="test_vocab_prune_vectors")
    _ = vocab["cat"]  # noqa: F841
    _ = vocab["dog"]  # noqa: F841
    _ = vocab["kitten"]  # noqa: F841
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = [1.0, 1.2, 1.1]
    data[1] = [0.3, 1.3, 1.0]
    data[2] = [0.9, 1.22, 1.05]
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    vocab.set_vector("kitten", data[2])
    remap = vocab.prune_vectors(2, batch_size=2)
    assert list(remap.keys()) == ["kitten"]
    neighbour, similarity = list(remap.values())[0]
    assert neighbour == "cat", remap
    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
def test_vocab_prune_vectors():
    vocab = Vocab()
    _ = vocab["cat"]  # noqa: F841
    _ = vocab["dog"]  # noqa: F841
    _ = vocab["kitten"]  # noqa: F841
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    data[2] = 1.1
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    vocab.set_vector("kitten", data[2])
    remap = vocab.prune_vectors(2)
    assert list(remap.keys()) == ["kitten"]
    neighbour, similarity = list(remap.values())[0]
    assert neighbour == "cat", remap
    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
class Neighbors:
    def __init__(self, nlp_obj):
        file = 'data/programGeneratedData/768embedding2016.txt'
        df = pd.read_csv(file, sep=" ", encoding='cp1252', header=None)
        df = df.drop(columns=769)
        D = {}  # dictionary of all words and vectors in bert semeval data
        L = df.loc[:, 0].values  # list of all words
        for i, word in enumerate(L):
            D[word] = df.loc[i, 1:].values
        self.vocab = Vocab()
        for word, vector in D.items():
            self.vocab.set_vector(word, vector)
        self.nlp = nlp_obj
        self.nlp.tokenizer = Tokenizer(self.nlp.vocab)
        self.to_check = [self.vocab[w] for w in self.vocab.strings]
        self.n = {}

    def neighbors(self, word):
        word = str(word)  # Python 3: plain str replaces the Python 2 unicode() call
        orig_word = word
        if word not in self.n:
            if word not in self.vocab.strings:
                self.n[word] = []
            else:
                word = self.vocab[str(word)]
                queries = [w for w in self.to_check]
                by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
                self.n[orig_word] = [(self.nlp(by_similarity[0].orth_)[0], word.similarity(by_similarity[0]))]
                self.n[orig_word] += [(self.nlp(w.orth_)[0], word.similarity(w))
                                      for w in by_similarity[100:600]
                                      if self.nlp(word.orth_)[0].text.split('_')[0] != self.nlp(w.orth_)[0].text.split('_')[0]]
        return self.n[orig_word]
"email <TO> my friend <BODY> I'm sick I cannot come to school.", "email <TO> my colleagues <BODY> I am about 15 minutes late for work.", "remind <TO> me <WHEN> 8am tomorrow <BODY> Take out the trash.", "remind <TO> me <WHEN> next week, monday, 5pm <BODY> Dentist appointment.", "remind <TO> my team <WHEN> friday <BODY> Meeting."] # Add custom vectors for tagging stuff keywords = {"<START>": np.random.uniform(-1, 1, (300,)), "<END>": np.random.uniform(-1, 1, (300,)), "<TO>": np.random.uniform(-1, 1, (300,)), "<WHEN>": np.random.uniform(-1, 1, (300,)), "<BODY>": np.random.uniform(-1, 1, (300,)), "<PAD>": np.random.uniform(-1, 1, (300,))} for word, vector in keywords.items(): nlp.vocab.set_vector(word, vector) vocab_subset.set_vector(word, vector) # TODO(alexander): These needs to be stored so we can interpret future uses of this model # Separate dataset outputs (inputs to decoder) and its target dataset_targets = [] for i in range(len(dataset_outputs)): dataset_targets.append(dataset_outputs[i] + " <END>"); dataset_outputs[i] = "<START> " + dataset_outputs[i] # Convert sentences into vectors of words input_vectors = extract_word_embeddings(nlp, keywords, dataset_inputs) output_vectors = extract_word_embeddings(nlp, keywords, dataset_outputs) target_vectors = extract_word_embeddings(nlp, keywords, dataset_targets) # Make sure that each sentence have num_steps vectors, add padding if needed # NOTE: the neural network requires sequences to have the same dimensions, strict requirement.
def test_doc_api_has_vector():
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector("kitten", vector=numpy.asarray([0.0, 2.0], dtype="f"))
    doc = Doc(vocab, words=["kitten"])
    assert doc.has_vector
def test_issue1807():
    """Test vocab.set_vector also adds the word to the vocab."""
    vocab = Vocab(vectors_name="test_issue1807")
    assert "hello" not in vocab
    vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
    assert "hello" in vocab
"man", "woman", "cousin", "neice", "king", "queen", "dude", "guy", "gal", "fire", "dog", "cat", "mouse", "red", "bluee", "green", "yellow", "water", "person", "family", "brother", "sister", ] nlp = spacy.load("en_core_web_md") vec_data = {w: nlp(w).vector for w in words} vocab = Vocab(strings=words) for word, vector in vec_data.items(): vocab.set_vector(word, vector) nlp = Language(vocab=vocab, meta={"lang": "en"}) vocab.to_disk("custom_test_vocab")
class NlPipe:
    """
    Class for creating LDA models using sklearn. Deprecated as gensim is used.
    """

    def __init__(self, list_of_docs, document_ids=None, language_model="en_core_web_lg", tagger=False,
                 parser=False, ner=False, categorization=False, remove_stopwords=True, remove_punctuation=True,
                 set_lower=True, remove_num=True, expand_stopwords=True, language_detection=False,
                 allowed_languages=frozenset({'en'})):
        """
        :param list_of_docs: List of strings where every document is one string.
        :param document_ids: The ids of the documents, matching the order of the list_of_docs
        :param language_model: Spacy language model to be used for text preprocessing
        :param tagger: Use spacy part-of-speech tagger.
        :param parser: Use spacy to annotate syntactic dependencies in documents.
        :param ner: Use spacy for entity recognition and annotation.
        :param categorization: Use spacy to assign document labels
        :param remove_stopwords: Remove stop words during text preprocessing.
        :param remove_punctuation: Remove punctuation during text preprocessing.
        :param set_lower: Convert all strings to lowercase during text preprocessing.
        :param remove_num: Remove numeric characters during text preprocessing.
        :param expand_stopwords: Remove non-alpha-characters in stop words and add them to the stop words.
        :param language_detection: Detect language of docs.
        :param allowed_languages: Allowed languages for the documents.
        """
        self.pipe_disable = []
        if not tagger:
            self.pipe_disable.append("tagger")
        if not parser:
            self.pipe_disable.append("parser")
        if not ner:
            self.pipe_disable.append("ner")
        if not categorization:
            self.pipe_disable.append("textcat")
        self.remove_punctuation = remove_punctuation
        self.remove_stop_words = remove_stopwords
        self.remove_num = remove_num
        self.set_lower = set_lower
        self.input_docs = list_of_docs
        self.document_ids = document_ids
        self.nlp = spacy.load(language_model)
        if expand_stopwords:
            stops = [stop for stop in self.nlp.Defaults.stop_words]
            for stop in stops:
                self.nlp.Defaults.stop_words.add(re.sub(r"[\W]", "", stop))
        self.spacy_docs = None
        self.processed_docs = None
        self.vectorizer = None
        self.bag_of_words = None
        self.tf_idf = None
        self.preprocessing_batch_size = 500
        self.processes = multiprocessing.cpu_count() - 2
        self.lda_model = None
        self.lda_output = None
        self.grid_search = None
        self.evaluation_output = None
        self.result_df = None
        self.word_topic_df = None
        self.word_topic_intersection = None
        self.intersection_score = None
        self.allowed_languages = allowed_languages
        self.language_detection = language_detection
        self.spacy_vocab = None
        self.word_distance_dict = None
        self.word_topic_distance_sum = 0
        self.unigram_dict = None
        self.bigram_dict = None

    def enable_pipe_component(self, component):
        if component in self.pipe_disable:
            self.pipe_disable.remove(component)
        # todo: add info if not in list from beginning or if successfully enabled

    def disable_pipe_component(self, component):
        if component not in self.pipe_disable:
            self.pipe_disable.append(component)
        # todo: add info if not in list from beginning or if successfully disabled

    def preprocess_spacy(self):
        # todo: add language check
        if self.language_detection:
            self.spacy_docs = [doc for doc in tqdm(self.nlp.pipe(self.input_docs, disable=self.pipe_disable,
                                                                 n_process=self.processes,
                                                                 batch_size=self.preprocessing_batch_size),
                                                   desc="Preprocessing text with spacy: ")
                               if detect(doc.text) in self.allowed_languages]
        else:
            self.spacy_docs = [doc for doc in tqdm(self.nlp.pipe(self.input_docs, disable=self.pipe_disable,
                                                                 n_process=self.processes,
                                                                 batch_size=self.preprocessing_batch_size),
                                                   desc="Preprocessing text with spacy: ")]

    def preprocess(self):
        self.processed_docs = []
        if not self.spacy_docs:
            self.preprocess_spacy()
        for spacy_doc in tqdm(self.spacy_docs, desc="Removing stop words/punctuation/numeric chars: "):
            doc = []
            for token in spacy_doc:
                if not self.remove_stop_words and token.is_stop:
                    word = token.text
                elif token.is_stop:
                    continue
                else:
                    word = token.text
                if self.set_lower:
                    word = word.lower()
                if self.remove_num:
                    word = re.sub(r"[\d]", "", word)
                if self.remove_punctuation:
                    word = re.sub(r"[\W]", "", word)
                if len(word) >= 2:
                    doc.append(word)
            self.processed_docs.append(doc)

    def create_bag_of_words(self, n_grams=(1, 1), min_df=0.01, max_df=0.6):
        self.preprocess_spacy()
        self.preprocess()
        joined_docs = []
        for doc in self.processed_docs:
            joined_docs.append(" ".join(doc))
        self.vectorizer = CountVectorizer(lowercase=False, ngram_range=n_grams, min_df=min_df, max_df=max_df)
        self.bag_of_words = self.vectorizer.fit_transform(joined_docs)

    def create_tf_idf(self, n_grams=(1, 1), min_df=0.01, max_df=0.6):
        self.preprocess_spacy()
        self.preprocess()
        joined_docs = []
        for doc in self.processed_docs:
            joined_docs.append(" ".join(doc))
        self.vectorizer = TfidfVectorizer(lowercase=False, ngram_range=n_grams, min_df=min_df, max_df=max_df)
        self.tf_idf = self.vectorizer.fit_transform(joined_docs)

    def create_lda_model(self, no_topics=10, input_type="bag"):
        self.lda_model = LDA(n_jobs=self.processes, n_components=no_topics)
        if input_type == "bag":
            if self.bag_of_words is None:
                self.create_bag_of_words()
            self.lda_output = self.lda_model.fit_transform(self.bag_of_words)
        else:
            self.create_tf_idf()
            self.lda_output = self.lda_model.fit_transform(self.tf_idf)

    def search_best_model(self, n_components=[2, 3, 4, 5, 10, 15, 20, 25], learning_decay=[.5, .7, .9],
                          input_type="bag"):
        lda_model = LDA()
        self.grid_search = GridSearchCV(lda_model, {"n_components": n_components,
                                                    "learning_decay": learning_decay})
        if input_type == "bag":
            if self.bag_of_words is None:
                self.create_bag_of_words()
            self.grid_search.fit(self.bag_of_words)
        else:
            if self.tf_idf is None:
                self.create_tf_idf()
            self.grid_search.fit(self.tf_idf)

    def create_document_topic_df(self, model=None, no_topics=10, input_type="bag", input_matrix=None):
        if model is None:
            self.create_lda_model(no_topics=no_topics, input_type=input_type)
        else:
            self.lda_model = model
        if input_matrix is not None:
            self.evaluation_output = self.lda_model.fit_transform(input_matrix)
        elif input_type == "bag":
            self.evaluation_output = self.lda_model.fit_transform(self.bag_of_words)
        else:
            self.evaluation_output = self.lda_model.fit_transform(self.tf_idf)
        self.result_df = pd.DataFrame(self.evaluation_output)
        if self.document_ids is not None and not self.language_detection:
            self.result_df.index = self.document_ids
        elif self.document_ids is not None and self.language_detection:
            raise Warning("Using document ids and language detection together is not implemented (yet).")
        dominant_topic = np.argmax(self.result_df.values, axis=1)
        self.result_df['dominant_topic'] = dominant_topic

    def plot_document_topic_distribution(self):
        # todo: log normalize
        counter = Counter(self.result_df.dominant_topic)
        topic_dict = OrderedDict(sorted(counter.items(), key=lambda x: x[1], reverse=True))
        sns.barplot(x=list(topic_dict.values()), y=list(topic_dict.keys()), order=list(topic_dict.keys()),
                    orient='h')
        plt.show()

    def evaluate_model(self, no_words=30):
        keywords = np.array(self.vectorizer.get_feature_names())
        topic_keywords = []
        for topic_weights in self.lda_model.components_:
            top_keyword_locations = (-topic_weights).argsort()[:no_words]
            topic_keywords.append(keywords.take(top_keyword_locations))
        self.word_topic_df = pd.DataFrame(topic_keywords, columns=[f"word_{x}" for x in range(no_words)])

    def evaluate_pyldavis(self):
        panel = pyLDAvis.sklearn.prepare(self.lda_model, self.bag_of_words, self.vectorizer)
        pyLDAvis.show(panel)

    def get_word_topic_intersection(self, no_words=30, no_topics=10):
        if not isinstance(self.word_topic_df, pd.DataFrame):
            self.evaluate_model(no_words=no_words)
        elif isinstance(self.word_topic_df, pd.DataFrame) and self.word_topic_df.shape[1] != no_words:
            self.evaluate_model(no_words=no_words)
        intersection_list = []
        intersection_score = 0
        all_combinations = [combo for combo in combinations(range(no_topics), 2)]
        for x in range(no_topics):
            temp_list = []
            for y in range(no_topics):
                if x != y:
                    temp_list.append(len(set(self.word_topic_df[self.word_topic_df.index == x].values[0])
                                         .intersection(self.word_topic_df[self.word_topic_df.index == y].values[0]))
                                     / no_words)
                    if (x, y) in all_combinations:
                        intersection_score += len(set(self.word_topic_df[self.word_topic_df.index == x].values[0])
                                                  .intersection(self.word_topic_df[self.word_topic_df.index == y]
                                                                .values[0])) / no_words
                else:
                    temp_list.append(1)
            intersection_list.append(temp_list)
        self.intersection_score = intersection_score / len(all_combinations)
        self.word_topic_intersection = pd.DataFrame(intersection_list)

    def get_topic_word_distance_sum(self, no_words=30):
        self.word_distance_dict = {}
        if not isinstance(self.word_topic_df, pd.DataFrame):
            self.evaluate_model(no_words=no_words)
        elif isinstance(self.word_topic_df, pd.DataFrame) and self.word_topic_df.shape[1] != no_words:
            self.evaluate_model(no_words=no_words)
        if self.spacy_vocab is None:
            self.load_textgain_embs()
        for index in self.word_topic_df.index:
            topic_distance_sum = 0
            missing_count = 0
            for word_a, word_b in combinations(self.word_topic_df[self.word_topic_df.index == index].values[0], 2):
                if self.spacy_vocab.has_vector(str(word_a)) and self.spacy_vocab.has_vector(str(word_b)):
                    topic_distance_sum += np.linalg.norm(self.spacy_vocab.get_vector(str(word_a))
                                                         - self.spacy_vocab.get_vector(str(word_b)))
                else:
                    missing_count += 1
            self.word_distance_dict[index] = topic_distance_sum / ((factorial(no_words)
                                                                    / (factorial(2) * factorial(no_words - 2)))
                                                                   - missing_count)
        self.word_topic_distance_sum = sum(self.word_distance_dict.values()) / len(self.word_distance_dict.keys())
        # todo: sum of distance between words in topic derived from word embedding
        # todo: sum of sum of distances divided by no topics

    def load_textgain_embs(self, from_txt=False, path="textgain_embeddings/spacy_vocab"):
        self.spacy_vocab = Vocab()
        if from_txt:
            with open(path) as f:
                for line in f:
                    split_line = line.split()
                    self.spacy_vocab.set_vector("".join(split_line[:-150]),
                                                np.array([float(coord) for coord in split_line[-150:]]))
        else:
            self.spacy_vocab.from_disk(path)

    def calculate_coherence(self, type="cosine"):
        pass  # todo: add coherence function here

    def calculate_jaccard(self):
        pass  # todo: calculate jaccard distance here

    def calculate_cosine(self, word_1, word_2):
        return np.dot(word_1, word_2) / (np.linalg.norm(word_1) * np.linalg.norm(word_2))

    def calculate_dice(self):
        pass  # todo: calculate dice coefficient here

    def calculate_centroid_sim(self):
        pass  # todo: calculate centroid similarity here

    def calculate_word_probs(self):
        # todo: calculate unigram and bigram probabilities of words
        self.unigram_dict = defaultdict(int)
        self.bigram_dict = defaultdict(int)
        unigram_count = 0
        bigram_count = 0
        for doc in tqdm(self.processed_docs, desc="Calculating uni- and bigram probabilities: "):
            for i, word in enumerate(doc):
                self.unigram_dict[word] += 1
                unigram_count += 1
                try:
                    self.bigram_dict[" ".join([word, doc[i + 1]])] += 1
                    bigram_count += 1
                except IndexError:
                    pass
        self.unigram_dict = {k: v / unigram_count for k, v in self.unigram_dict.items()}
        self.bigram_dict = {k: v / bigram_count for k, v in self.bigram_dict.items()}

    def calculate_pmi(self, word_1, word_2):
        if self.unigram_dict is None or self.bigram_dict is None:
            self.calculate_word_probs()
        return np.log2(self.bigram_dict[" ".join([word_1, word_2])]
                       / (self.unigram_dict[word_1] * self.unigram_dict[word_2]))

    def calculate_npmi(self, word_1, word_2):
        return self.calculate_pmi(word_1, word_2) / (-np.log(self.bigram_dict[" ".join([word_1, word_2])]))

    def get_weight_vectors(self, weight=2, type="npmi"):
        pass