Example #1
def lang():
    vector_data = {k: np.random.normal(0, 1, (2,)) for k in ["red", "blue", "cat", "dog", "green", "purple"]}
    vector_data['cat'] += 10
    vector_data['dog'] += 10
    vocab = Vocab(strings=vector_data.keys())
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab)
    return SpacyLanguage(nlp)
def get_docs():
    vocab = Vocab()
    for t in texts:
        for word in t.split():
            hash_id = vocab.strings.add(word)
            vector = numpy.random.uniform(-1, 1, (7, ))
            vocab.set_vector(hash_id, vector)
    docs = [English(vocab)(t) for t in texts]
    return docs
def test_vector_is_oov():
    vocab = Vocab(vectors_name="test_vocab_is_oov")
    data = OPS.xp.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    assert vocab["cat"].is_oov is False
    assert vocab["dog"].is_oov is False
    assert vocab["hamster"].is_oov is True
Example #4
def test_vocab_add_vector():
    vocab = Vocab()
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    cat = vocab["cat"]
    assert list(cat.vector) == [1.0, 1.0, 1.0]
    dog = vocab["dog"]
    assert list(dog.vector) == [2.0, 2.0, 2.0]
def color_lang():
    vector_data = {
        "red": np.array([1.0, 0.0]),
        "green": np.array([0.5, 0.5]),
        "blue": np.array([0.0, 1.0]),
        "purple": np.array([0.0, 1.0]),
    }

    vocab = Vocab(strings=vector_data.keys())
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab)
    return SpacyLanguage(nlp)
def test_doc_token_api_vectors():
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector("apples", vector=numpy.asarray([0.0, 2.0], dtype="f"))
    vocab.set_vector("oranges", vector=numpy.asarray([0.0, 1.0], dtype="f"))
    doc = Doc(vocab, words=["apples", "oranges", "oov"])
    assert doc.has_vector
    assert doc[0].has_vector
    assert doc[1].has_vector
    assert not doc[2].has_vector
    apples_norm = (0 * 0 + 2 * 2)**0.5
    oranges_norm = (0 * 0 + 1 * 1)**0.5
    cosine = ((0 * 0) + (2 * 1)) / (apples_norm * oranges_norm)
    assert doc[0].similarity(doc[1]) == cosine
Example #7
def test_doc_token_api_vectors():
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector("apples", vector=numpy.asarray([0.0, 2.0], dtype="f"))
    vocab.set_vector("oranges", vector=numpy.asarray([0.0, 1.0], dtype="f"))
    doc = Doc(vocab, words=["apples", "oranges", "oov"])
    assert doc.has_vector
    assert doc[0].has_vector
    assert doc[1].has_vector
    assert not doc[2].has_vector
    apples_norm = (0 * 0 + 2 * 2) ** 0.5
    oranges_norm = (0 * 0 + 1 * 1) ** 0.5
    cosine = ((0 * 0) + (2 * 1)) / (apples_norm * oranges_norm)
    assert doc[0].similarity(doc[1]) == cosine
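The manual cosine computation in the two tests above can be expressed directly with NumPy; a small illustrative helper (hypothetical, not part of spaCy or of the tests) shows the same formula:

import numpy

def cosine_similarity(a, b):
    # cos(a, b) = (a . b) / (|a| * |b|), the same formula the tests spell out.
    a = numpy.asarray(a, dtype="f")
    b = numpy.asarray(b, dtype="f")
    return float(a.dot(b) / (numpy.linalg.norm(a) * numpy.linalg.norm(b)))

assert cosine_similarity([0.0, 2.0], [0.0, 1.0]) == 1.0  # "apples" vs. "oranges" above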
def test_vocab_add_vector():
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = OPS.xp.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    cat = vocab["cat"]
    assert list(cat.vector) == [1.0, 1.0, 1.0]
    dog = vocab["dog"]
    assert list(dog.vector) == [2.0, 2.0, 2.0]

    with pytest.raises(ValueError):
        vocab.vectors.add(vocab["hamster"].orth, row=1000000)
Example #9
def write_vectors_model(tmp_dir):
    import numpy
    vocab = Vocab()
    vector_data = {
        "dog": numpy.random.uniform(-1, 1, (300, )),
        "cat": numpy.random.uniform(-1, 1, (300, )),
        "orange": numpy.random.uniform(-1, 1, (300, ))
    }
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp_path = tmp_dir / "vectors_model"
    nlp = English(vocab)
    nlp.to_disk(nlp_path)
    return str(nlp_path)
Example #10
def test_issue4725():
    # ensures that this runs correctly and doesn't hang or crash because of the global vectors
    vocab = Vocab(vectors_name="test_vocab_add_vector")
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])

    nlp = English(vocab=vocab)
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    nlp.begin_training()
    docs = ["Kurt is in London."] * 10
    for _ in nlp.pipe(docs, batch_size=2, n_process=2):
        pass
def read_vectors(filename):
    def load_embeddings(filename):
        with open(filename, encoding='utf-8') as infile:
            for i, line in enumerate(infile):
                items = line.rstrip().split(' ')
                if len(items) == 2:
                    # This is a header row giving the shape of the matrix
                    continue
                word = items[0]
                vec = np.array([float(x) for x in items[1:]], 'f')
                yield word, vec / np.linalg.norm(vec)

    vocab = Vocab()
    for word, vector in load_embeddings(filename):
        vocab.set_vector(word, vector)
    return vocab
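read_vectors expects word2vec-style text input: an optional header line giving the matrix shape, then one "word value value ..." line per entry. A hypothetical usage sketch, assuming the numpy and Vocab imports used by the function are in scope and using an invented file name:

from pathlib import Path

Path("toy_vectors.txt").write_text(
    "2 3\n"              # optional header: number of vectors and dimensionality
    "cat 1.0 0.0 0.0\n"
    "dog 0.0 1.0 0.0\n",
    encoding="utf-8",
)
vocab = read_vectors("toy_vectors.txt")  # each vector is L2-normalised on load
assert vocab.has_vector("cat") and vocab.has_vector("dog")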
Example #12
def test_pickle_vocab(text1, text2):
    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
    vocab.set_vector("dog", numpy.ones((5,), dtype="f"))
    lex1 = vocab[text1]
    lex2 = vocab[text2]
    assert lex1.norm_ == text1[:-1]
    assert lex2.norm_ == text2[:-1]
    data = srsly.pickle_dumps(vocab)
    unpickled = srsly.pickle_loads(data)
    assert unpickled[text1].orth == lex1.orth
    assert unpickled[text2].orth == lex2.orth
    assert unpickled[text1].norm == lex1.norm
    assert unpickled[text2].norm == lex2.norm
    assert unpickled[text1].norm != unpickled[text2].norm
    assert unpickled.vectors is not None
    assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0]
Example #13
def test_pickle_vocab(text1, text2):
    vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]})
    vocab.set_vector("dog", numpy.ones((5, ), dtype="f"))
    lex1 = vocab[text1]
    lex2 = vocab[text2]
    assert lex1.norm_ == text1[:-1]
    assert lex2.norm_ == text2[:-1]
    data = srsly.pickle_dumps(vocab)
    unpickled = srsly.pickle_loads(data)
    assert unpickled[text1].orth == lex1.orth
    assert unpickled[text2].orth == lex2.orth
    assert unpickled[text1].norm == lex1.norm
    assert unpickled[text2].norm == lex2.norm
    assert unpickled[text1].norm != unpickled[text2].norm
    assert unpickled.vectors is not None
    assert list(vocab["dog"].vector) == [1.0, 1.0, 1.0, 1.0, 1.0]
def test_issue2871():
    """Test that vectors recover the correct key for spaCy reserved words."""
    words = ["dog", "cat", "SUFFIX"]
    vocab = Vocab(vectors_name="test_issue2871")
    vocab.vectors.resize(shape=(3, 10))
    vector_data = numpy.zeros((3, 10), dtype="f")
    for word in words:
        _ = vocab[word]  # noqa: F841
        vocab.set_vector(word, vector_data[0])
    vocab.vectors.name = "dummy_vectors"
    assert vocab["dog"].rank == 0
    assert vocab["cat"].rank == 1
    assert vocab["SUFFIX"].rank == 2
    assert vocab.vectors.find(key="dog") == 0
    assert vocab.vectors.find(key="cat") == 1
    assert vocab.vectors.find(key="SUFFIX") == 2
def test_issue4725_2():
    if isinstance(get_current_ops(), NumpyOps):
        # ensures that this runs correctly and doesn't hang or crash because of the global vectors
        # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
        # or because of issues with pickling the NER (cf test_issue4725_1)
        vocab = Vocab(vectors_name="test_vocab_add_vector")
        data = numpy.ndarray((5, 3), dtype="f")
        data[0] = 1.0
        data[1] = 2.0
        vocab.set_vector("cat", data[0])
        vocab.set_vector("dog", data[1])
        nlp = English(vocab=vocab)
        nlp.add_pipe("ner")
        nlp.initialize()
        docs = ["Kurt is in London."] * 10
        for _ in nlp.pipe(docs, batch_size=2, n_process=2):
            pass
Example #16
def test_vocab_prune_vectors():
    vocab = Vocab(vectors_name="test_vocab_prune_vectors")
    _ = vocab["cat"]  # noqa: F841
    _ = vocab["dog"]  # noqa: F841
    _ = vocab["kitten"]  # noqa: F841
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = [1.0, 1.2, 1.1]
    data[1] = [0.3, 1.3, 1.0]
    data[2] = [0.9, 1.22, 1.05]
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    vocab.set_vector("kitten", data[2])

    remap = vocab.prune_vectors(2, batch_size=2)
    assert list(remap.keys()) == ["kitten"]
    neighbour, similarity = list(remap.values())[0]
    assert neighbour == "cat", remap
    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
Example #17
def test_vocab_prune_vectors():
    vocab = Vocab()
    _ = vocab["cat"]  # noqa: F841
    _ = vocab["dog"]  # noqa: F841
    _ = vocab["kitten"]  # noqa: F841
    data = numpy.ndarray((5, 3), dtype="f")
    data[0] = 1.0
    data[1] = 2.0
    data[2] = 1.1
    vocab.set_vector("cat", data[0])
    vocab.set_vector("dog", data[1])
    vocab.set_vector("kitten", data[2])

    remap = vocab.prune_vectors(2)
    assert list(remap.keys()) == ["kitten"]
    neighbour, similarity = list(remap.values())[0]
    assert neighbour == "cat", remap
    assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
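As the two tests above show, prune_vectors reduces the vector table to the requested number of rows and returns a dict mapping each removed word to its closest remaining neighbour and the cosine similarity. A short sketch of inspecting that mapping, assuming a vocab populated as in the tests:

remap = vocab.prune_vectors(2)
for removed_word, (kept_word, similarity) in remap.items():
    # prints something like: kitten -> cat (cos=...)
    print(f"{removed_word} -> {kept_word} (cos={similarity:.3f})")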
class Neighbors:
    def __init__(self, nlp_obj):
        file = 'data/programGeneratedData/768embedding2016.txt'
        df = pd.read_csv(file, sep=" ", encoding='cp1252', header=None)
        df = df.drop(columns=769)
        D = {}  # dictionary of all words and vectors in bert semeval data
        L = df.loc[:, 0].values  # list of all words
        for i, word in enumerate(L):
            D[word] = df.loc[i, 1:].values

        self.vocab = Vocab()
        for word, vector in D.items():
            self.vocab.set_vector(word, vector)
        self.nlp = nlp_obj
        self.nlp.tokenizer = Tokenizer(self.nlp.vocab)
        self.to_check = [self.vocab[w] for w in self.vocab.strings]
        self.n = {}

    def neighbors(self, word):
        word = str(word)
        orig_word = word
        if word not in self.n:
            if word not in self.vocab.strings:
                self.n[word] = []
            else:
                word = self.vocab[str(word)]
                queries = [w for w in self.to_check]

                by_similarity = sorted(queries,
                                       key=lambda w: word.similarity(w),
                                       reverse=True)
                self.n[orig_word] = [(self.nlp(by_similarity[0].orth_)[0],
                                      word.similarity(by_similarity[0]))]
                self.n[orig_word] += [
                    (self.nlp(w.orth_)[0], word.similarity(w))
                    for w in by_similarity[100:600]
                    if self.nlp(word.orth_)[0].text.split('_')[0] != self.nlp(
                        w.orth_)[0].text.split('_')[0]
                ]

        return self.n[orig_word]
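A hypothetical usage sketch for the Neighbors class above, assuming the embedding file referenced in __init__ exists and a spaCy pipeline is available; the query word is illustrative only:

import spacy

nlp = spacy.load("en_core_web_sm")
neigh = Neighbors(nlp)
# Each entry is a (token, similarity) pair, most similar first.
for token, score in neigh.neighbors("good")[:5]:
    print(token.text, round(float(score), 3))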
                       "email <TO> my friend <BODY> I'm sick I cannot come to school.",
                       "email <TO> my colleagues <BODY> I am about 15 minutes late for work.",
                       "remind <TO> me <WHEN> 8am tomorrow <BODY> Take out the trash.",
                       "remind <TO> me <WHEN> next week, monday, 5pm <BODY> Dentist appointment.",
                       "remind <TO> my team <WHEN> friday <BODY> Meeting."]

    # Add custom vectors for tagging stuff
    keywords = {"<START>": np.random.uniform(-1, 1, (300,)),
                "<END>": np.random.uniform(-1, 1, (300,)),
                "<TO>": np.random.uniform(-1, 1, (300,)),
                "<WHEN>": np.random.uniform(-1, 1, (300,)),
                "<BODY>": np.random.uniform(-1, 1, (300,)),
                "<PAD>": np.random.uniform(-1, 1, (300,))}
    for word, vector in keywords.items():
        nlp.vocab.set_vector(word, vector)
        vocab_subset.set_vector(word, vector)
    # TODO(alexander): These need to be stored so we can interpret future uses of this model

    # Separate dataset outputs (inputs to decoder) and its target
    dataset_targets = []
    for i in range(len(dataset_outputs)):
        dataset_targets.append(dataset_outputs[i] + " <END>")
        dataset_outputs[i] = "<START> " + dataset_outputs[i]
    
    # Convert sentences into vectors of words
    input_vectors = extract_word_embeddings(nlp, keywords, dataset_inputs)
    output_vectors = extract_word_embeddings(nlp, keywords, dataset_outputs)
    target_vectors = extract_word_embeddings(nlp, keywords, dataset_targets)

    # Make sure that each sentence has num_steps vectors; add padding if needed
    # NOTE: the neural network requires sequences to have the same dimensions, strict requirement.
Example #20
def test_doc_api_has_vector():
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector("kitten", vector=numpy.asarray([0.0, 2.0], dtype="f"))
    doc = Doc(vocab, words=["kitten"])
    assert doc.has_vector
Example #21
def test_doc_api_has_vector():
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    vocab.set_vector("kitten", vector=numpy.asarray([0.0, 2.0], dtype="f"))
    doc = Doc(vocab, words=["kitten"])
    assert doc.has_vector
def test_issue1807():
    """Test vocab.set_vector also adds the word to the vocab."""
    vocab = Vocab(vectors_name="test_issue1807")
    assert "hello" not in vocab
    vocab.set_vector("hello", numpy.ones((50, ), dtype="f"))
    assert "hello" in vocab
Example #23
def test_issue1807():
    """Test vocab.set_vector also adds the word to the vocab."""
    vocab = Vocab()
    assert "hello" not in vocab
    vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
    assert "hello" in vocab
Example #24
        "man",
        "woman",
        "cousin",
        "neice",
        "king",
        "queen",
        "dude",
        "guy",
        "gal",
        "fire",
        "dog",
        "cat",
        "mouse",
        "red",
        "bluee",
        "green",
        "yellow",
        "water",
        "person",
        "family",
        "brother",
        "sister",
    ]
    nlp = spacy.load("en_core_web_md")
    vec_data = {w: nlp(w).vector for w in words}
    vocab = Vocab(strings=words)
    for word, vector in vec_data.items():
        vocab.set_vector(word, vector)
    nlp = Language(vocab=vocab, meta={"lang": "en"})
    vocab.to_disk("custom_test_vocab")
Example #25
class NlPipe:
    """
    Class for creating LDA models using sklearn. Deprecated, as gensim is used instead.
    """
    def __init__(self,
                 list_of_docs,
                 document_ids=None,
                 language_model="en_core_web_lg",
                 tagger=False,
                 parser=False,
                 ner=False,
                 categorization=False,
                 remove_stopwords=True,
                 remove_punctuation=True,
                 set_lower=True,
                 remove_num=True,
                 expand_stopwords=True,
                 language_detection=False,
                 allowed_languages=frozenset({'en'})):
        """
        :param list_of_docs: List of strings where every document is one string.
        :param document_ids: The ids of the documents, matching the order of the list_of_docs
        :param language_model: Spacy language model to be used for text preprocessing
        :param tagger: Use spacy part-of-speech tagger.
        :param parser: Use spacy to annotate syntactic dependencies in documents.
        :param ner: Use spacy for entity recognition and annotation.
        :param categorization: Use spacy to assign document labels
        :param remove_stopwords: Remove stop words during text preprocessing.
        :param remove_punctuation: Remove punctuation during text preprocessing.
        :param set_lower: Convert all strings to lowercase during text preprocessing.
        :param remove_num: Remove numeric characters during text preprocessing.
        :param expand_stopwords: Strip non-alphabetic characters from the stop words and add the stripped forms to the stop-word list.
        :param language_detection: Detect language of docs.
        :param allowed_languages: Allowed language for the documents.
        """

        self.pipe_disable = []
        if not tagger:
            self.pipe_disable.append("tagger")
        if not parser:
            self.pipe_disable.append("parser")
        if not ner:
            self.pipe_disable.append("ner")
        if not categorization:
            self.pipe_disable.append("textcat")
        self.remove_punctuation = remove_punctuation
        self.remove_stop_words = remove_stopwords
        self.remove_num = remove_num
        self.set_lower = set_lower
        self.input_docs = list_of_docs
        self.document_ids = document_ids
        self.nlp = spacy.load(language_model)
        if expand_stopwords:
            stops = list(self.nlp.Defaults.stop_words)
            for stop in stops:
                self.nlp.Defaults.stop_words.add(re.sub(r"[\W]", "", stop))
        self.spacy_docs = None
        self.processed_docs = None
        self.vectorizer = None
        self.bag_of_words = None
        self.tf_idf = None
        self.preprocessing_batch_size = 500
        self.processes = multiprocessing.cpu_count() - 2
        self.lda_model = None
        self.lda_output = None
        self.grid_search = None
        self.evaluation_output = None
        self.result_df = None
        self.word_topic_df = None
        self.word_topic_intersection = None
        self.intersection_score = None
        self.allowed_languages = allowed_languages
        self.language_detection = language_detection
        self.spacy_vocab = None
        self.word_distance_dict = None
        self.word_topic_distance_sum = 0
        self.unigram_dict = None
        self.bigram_dict = None

    def enable_pipe_component(self, component):
        if component in self.pipe_disable:
            self.pipe_disable.remove(component)
            # todo: report whether the component was already enabled or has now been enabled

    def disable_pipe_component(self, component):
        if component not in self.pipe_disable:
            self.pipe_disable.append(component)
            # todo: report whether the component was already disabled or has now been disabled

    def preprocess_spacy(self):
        # todo: add language check
        if self.language_detection:
            self.spacy_docs = [
                doc for doc in tqdm(self.nlp.pipe(
                    self.input_docs,
                    disable=self.pipe_disable,
                    n_process=self.processes,
                    batch_size=self.preprocessing_batch_size),
                                    desc="Preprocessing text with spacy: ")
                if detect(doc.text) in self.allowed_languages
            ]
        else:
            self.spacy_docs = [
                doc for doc in tqdm(self.nlp.pipe(
                    self.input_docs,
                    disable=self.pipe_disable,
                    n_process=self.processes,
                    batch_size=self.preprocessing_batch_size),
                                    desc="Preprocessing text with spacy: ")
            ]

    def preprocess(self):
        self.processed_docs = []
        if not self.spacy_docs:
            self.preprocess_spacy()
        for spacy_doc in tqdm(
                self.spacy_docs,
                desc="Removing stop words/punctuation/numeric chars: "):
            doc = []
            for token in spacy_doc:
                # Skip stop words only when stop-word removal is enabled.
                if token.is_stop and self.remove_stop_words:
                    continue
                word = token.text
                if self.set_lower:
                    word = word.lower()
                if self.remove_num:
                    word = re.sub(r"[\d]", "", word)
                if self.remove_punctuation:
                    word = re.sub(r"[\W]", "", word)
                if len(word) >= 2:
                    doc.append(word)
            self.processed_docs.append(doc)

    def create_bag_of_words(self, n_grams=(1, 1), min_df=0.01, max_df=0.6):
        self.preprocess_spacy()
        self.preprocess()
        joined_docs = []
        for doc in self.processed_docs:
            joined_docs.append(" ".join(doc))
        self.vectorizer = CountVectorizer(lowercase=False,
                                          ngram_range=n_grams,
                                          min_df=min_df,
                                          max_df=max_df)
        self.bag_of_words = self.vectorizer.fit_transform(joined_docs)

    def create_tf_idf(self, n_grams=(1, 1), min_df=0.01, max_df=0.6):
        self.preprocess_spacy()
        self.preprocess()
        joined_docs = []
        for doc in self.processed_docs:
            joined_docs.append(" ".join(doc))
        self.vectorizer = TfidfVectorizer(lowercase=False,
                                          ngram_range=n_grams,
                                          min_df=min_df,
                                          max_df=max_df)
        self.tf_idf = self.vectorizer.fit_transform(joined_docs)

    def create_lda_model(self, no_topics=10, input_type="bag"):
        self.lda_model = LDA(n_jobs=self.processes, n_components=no_topics)
        if input_type == "bag":
            if self.bag_of_words is None:
                self.create_bag_of_words()
            self.lda_output = self.lda_model.fit_transform(self.bag_of_words)
        else:
            self.create_tf_idf()
            self.lda_output = self.lda_model.fit_transform(self.tf_idf)

    def search_best_model(self,
                          n_components=[2, 3, 4, 5, 10, 15, 20, 25],
                          learning_decay=[.5, .7, .9],
                          input_type="bag"):
        lda_model = LDA()
        self.grid_search = GridSearchCV(lda_model, {
            "n_components": n_components,
            "learning_decay": learning_decay
        })
        if input_type == "bag":
            if self.bag_of_words is None:
                self.create_bag_of_words()
            self.grid_search.fit(self.bag_of_words)
        else:
            if self.tf_idf is None:
                self.create_tf_idf()
            self.grid_search.fit(self.tf_idf)

    def create_document_topic_df(self,
                                 model=None,
                                 no_topics=10,
                                 input_type="bag",
                                 input_matrix=None):
        if model is None:
            self.create_lda_model(no_topics=no_topics, input_type=input_type)
        else:
            self.lda_model = model
        if input_matrix is not None:
            self.evaluation_output = self.lda_model.fit_transform(input_matrix)
        elif input_type == "bag":
            self.evaluation_output = self.lda_model.fit_transform(
                self.bag_of_words)
        else:
            self.evaluation_output = self.lda_model.fit_transform(self.tf_idf)
        self.result_df = pd.DataFrame(self.evaluation_output)
        if self.document_ids is not None and not self.language_detection:
            self.result_df.index = self.document_ids
        elif self.document_ids is not None and self.language_detection:
            raise Warning(
                "Using document ids and language detection together is not implemented (yet)."
            )
        dominant_topic = np.argmax(self.result_df.values, axis=1)
        self.result_df['dominant_topic'] = dominant_topic

    def plot_document_topic_distribution(self):
        #todo: log normalize
        counter = Counter(self.result_df.dominant_topic)
        topic_dict = OrderedDict(
            sorted(counter.items(), key=lambda x: x[1], reverse=True))
        sns.barplot(x=list(topic_dict.values()),
                    y=list(topic_dict.keys()),
                    order=list(topic_dict.keys()),
                    orient='h')
        plt.show()

    def evaluate_model(self, no_words=30):
        keywords = np.array(self.vectorizer.get_feature_names())
        topic_keywords = []
        for topic_weights in self.lda_model.components_:
            top_keyword_locations = (-topic_weights).argsort()[:no_words]
            topic_keywords.append(keywords.take(top_keyword_locations))
        self.word_topic_df = pd.DataFrame(
            topic_keywords, columns=[f"word_{x}" for x in range(no_words)])

    def evaluate_pyldavis(self):
        panel = pyLDAvis.sklearn.prepare(self.lda_model, self.bag_of_words,
                                         self.vectorizer)
        pyLDAvis.show(panel)

    def get_word_topic_intersection(self, no_words=30, no_topics=10):
        if not isinstance(self.word_topic_df, pd.DataFrame):
            self.evaluate_model(no_words=no_words)
        elif isinstance(
                self.word_topic_df,
                pd.DataFrame) and self.word_topic_df.shape[1] != no_words:
            self.evaluate_model(no_words=no_words)
        intersection_list = []
        intersection_score = 0
        all_combinations = list(combinations(range(no_topics), 2))
        for x in range(no_topics):
            temp_list = []
            for y in range(no_topics):
                if x != y:
                    temp_list.append(
                        len(
                            set(self.word_topic_df[self.word_topic_df.index ==
                                                   x].values[0]).
                            intersection(self.word_topic_df[
                                self.word_topic_df.index == y].values[0])) /
                        no_words)
                if (x, y) in all_combinations:
                    intersection_score += len(
                        set(self.word_topic_df[self.word_topic_df.index == x].
                            values[0]).intersection(
                                self.word_topic_df[self.word_topic_df.index ==
                                                   y].values[0])) / no_words
                else:
                    temp_list.append(1)
            intersection_list.append(temp_list)
        self.intersection_score = intersection_score / len(all_combinations)
        self.word_topic_intersection = pd.DataFrame(intersection_list)

    def get_topic_word_distance_sum(self, no_words=30):
        self.word_distance_dict = {}
        if not isinstance(self.word_topic_df, pd.DataFrame):
            self.evaluate_model(no_words=no_words)
        elif isinstance(
                self.word_topic_df,
                pd.DataFrame) and self.word_topic_df.shape[1] != no_words:
            self.evaluate_model(no_words=no_words)
        if self.spacy_vocab is None:
            self.load_textgain_embs()
        for index in self.word_topic_df.index:
            topic_distance_sum = 0
            missing_count = 0
            for word_a, word_b in combinations(
                    self.word_topic_df[self.word_topic_df.index ==
                                       index].values[0], 2):
                if self.spacy_vocab.has_vector(
                        str(word_a)) and self.spacy_vocab.has_vector(
                            str(word_b)):
                    topic_distance_sum += np.linalg.norm(
                        self.spacy_vocab.get_vector(str(word_a)) -
                        self.spacy_vocab.get_vector(str(word_b)))
                else:
                    missing_count += 1
            self.word_distance_dict[index] = topic_distance_sum / (
                (factorial(no_words) /
                 (factorial(2) * factorial(no_words - 2))) - missing_count)
        self.word_topic_distance_sum = sum(
            self.word_distance_dict.values()) / len(
                self.word_distance_dict.keys())

        # todo: sum of distance between words in topic derived from word embedding
        # todo: sum of sum of distances divided by no topics

    def load_textgain_embs(self,
                           from_txt=False,
                           path="textgain_embeddings/spacy_vocab"):
        self.spacy_vocab = Vocab()
        if from_txt:
            with open(path) as f:
                for line in f:
                    split_line = line.split()
                    self.spacy_vocab.set_vector(
                        "".join(split_line[:-150]),
                        np.array([float(coord)
                                  for coord in split_line[-150:]]))
        else:
            self.spacy_vocab.from_disk(path)

    def calculate_coherence(self, type="cosine"):
        pass
        #todo: add coherence function here

    def calculate_jaccard(self):
        pass
        #todo: calculate jaccard distance here

    def calculate_cosine(self, word_1, word_2):
        return np.dot(
            word_1, word_2) / (np.linalg.norm(word_1) * np.linalg.norm(word_2))

    def calculate_dice(self):
        pass
        #todo: calculate dice coefficient here

    def calculate_centroid_sim(self):
        pass
        #todo: calculate centroid similarity here

    def calculate_word_probs(self):
        # todo: calculate unigram and bigram probabilities of words
        self.unigram_dict = defaultdict(int)
        self.bigram_dict = defaultdict(int)
        unigram_count = 0
        bigram_count = 0
        for doc in tqdm(self.processed_docs,
                        desc="calculation uni- and bigram probabilities: "):
            for i, word in enumerate(doc):
                self.unigram_dict[word] += 1
                unigram_count += 1
                try:
                    self.bigram_dict[" ".join([word, doc[i + 1]])] += 1
                    bigram_count += 1
                except IndexError:
                    # The last word in a document has no following bigram.
                    pass
        self.unigram_dict = {
            k: v / unigram_count
            for k, v in self.unigram_dict.items()
        }
        self.bigram_dict = {
            k: v / bigram_count
            for k, v in self.bigram_dict.items()
        }

    def calculate_pmi(self, word_1, word_2):
        if self.unigram_dict is None or self.bigram_dict is None:
            self.calculate_word_probs()
        return np.log2(self.bigram_dict[" ".join([word_1, word_2])] /
                       (self.unigram_dict[word_1] * self.unigram_dict[word_2]))

    def calculate_npmi(self, word_1, word_2):
        return self.calculate_pmi(word_1, word_2) / (
            -np.log(self.bigram_dict[" ".join([word_1, word_2])]))

    def get_weight_vectors(self, weight=2, type="npmi"):
        pass
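A hypothetical end-to-end sketch of driving NlPipe, assuming its module-level imports (spacy, scikit-learn, pandas, numpy, tqdm) are available and en_core_web_lg is installed; the documents and parameter values are illustrative only:

documents = [
    "The cat sat on the mat.",
    "Dogs are loyal companions.",
    "Topic models group similar documents together.",
]
pipe = NlPipe(documents, language_model="en_core_web_lg")
pipe.create_bag_of_words(min_df=0.0, max_df=1.0)  # tiny corpus: loosen the df limits
pipe.create_document_topic_df(no_topics=2)        # fits an LDA model and fills result_df
print(pipe.result_df)                             # per-document topic weights plus dominant_topic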