Code Example #1
    def test_cluster_with_one_vector(self):
        """
        Test that the centroid of a cluster with a single vector is equivalent to that vector.
        """

        v = Document("a", ["a", "b", "a", "c"], scheme=TF())
        v.normalize()
        c = Cluster(v)
        self.assertEqual(v.dimensions, c.centroid.dimensions)
Code Example #2
    def test_centroid_normalized_several_vectors(self):
        """
        Test that the centroid remains normalized as vectors are added.
        """

        v = Document("", ["a", "c"], scheme=TF())
        c = Cluster(v)
        self.assertEqual(1, round(vector_math.magnitude(c.centroid), 10))
        c.vectors.append(Document("", ["a", "b", "a", "d"]))
        self.assertEqual(1, round(vector_math.magnitude(c.centroid), 10))
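
These tests rely on Cluster recomputing and normalizing its centroid whenever its vectors change. A minimal sketch of that recomputation, assuming dimensions are stored in a plain dict (the actual Cluster internals may differ):

    import math

    def recompute_centroid(vectors):
        """
        Average the vectors' dimensions, then normalize to unit magnitude.
        """
        centroid = {}
        for vector in vectors:
            for dimension, value in vector.dimensions.items():
                centroid[dimension] = centroid.get(dimension, 0) + value / len(vectors)

        magnitude = math.sqrt(sum(value ** 2 for value in centroid.values()))
        if magnitude:
            centroid = {dimension: value / magnitude
                        for dimension, value in centroid.items()}
        return centroid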
Code Example #3
    def test_size(self):
        """
        Test retrieving the size of a cluster.
        """

        v = [
            Document("", ['a', 'b'], scheme=TF()),
            Document("", ['a', 'a'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual(len(v), c.size())
Code Example #4
    def test_intra_similarity_of_cluster(self):
        """
        Test that the intra-similarity of a cluster with several vectors is equivalent to the average similarity of its vectors with the cluster.
        """

        v = [
            Document("", ['a', 'b'], scheme=TF()),
            Document("", ['a', 'a'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual((c.similarity(v[0]) + c.similarity(v[1])) / 2.,
                         c.get_intra_similarity())
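
The intra-similarity asserted here is simply the mean of each vector's similarity with the cluster. A minimal sketch of that computation, assuming similarity is measured against the centroid:

    def get_intra_similarity(cluster):
        """
        Average the similarity between the cluster and each of its vectors.
        """
        if not cluster.vectors:
            return 0
        return sum(cluster.similarity(vector)
                   for vector in cluster.vectors) / len(cluster.vectors)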
Code Example #5
    def test_get_representative_vectors(self):
        """
        Test ranking the vectors according to their similarity to the cluster.
        """

        v = [
            Document("", ['a', 'b', 'c'], scheme=TF()),
            Document("", ['a', 'a', 'c'], scheme=TF()),
            Document("", ['p'], scheme=TF()),
        ]
        c = Cluster(v)
        self.assertEqual(list, type(c.get_representative_vectors(2)))
        self.assertEqual([v[1], v[0]], c.get_representative_vectors(2))
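
Getting representative vectors amounts to ranking the vectors by their similarity to the cluster and truncating. A sketch consistent with the assertions above (how the real method breaks ties is not pinned down by this test):

    def get_representative_vectors(cluster, n):
        """
        Rank the cluster's vectors by descending similarity to the centroid
        and keep the top n, returned as a plain list.
        """
        return sorted(cluster.vectors, key=cluster.similarity, reverse=True)[:n]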
Code Example #6
    def test_get_centroid(self):
        """
        Test getting the centroid.
        """

        v = Document("", ["a", "c"], scheme=TF())
        v.normalize()
        c = Cluster(v)
        self.assertTrue(
            all(
                round(v.dimensions[dimension], 10) == round(
                    c.centroid.dimensions[dimension], 10)
                for dimension in v.dimensions.keys() | c.centroid.dimensions.keys()))
Code Example #7
    def test_setting_vectors(self):
        """
        Test setting the vectors manually.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster()
        self.assertEqual({}, c.centroid.dimensions)
        c.vectors = v
        self.assertEqual(v, c.vectors)
Code Example #8
    def test_cluster_with_several_vectors(self):
        """
        Test creating a cluster with several vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF()),
        ]
        for vector in v:
            vector.normalize()

        c = Cluster(v)
        self.assertEqual(v, c.vectors)
Code Example #9
    def test_get_term_frequency(self):
        document1 = Document("This document is a document with no duplicates.",
                             preserve_duplicates=False)
        document2 = Document("This document is a document with duplicates.",
                             preserve_duplicates=True)
        document3 = Document(
            "This is just to check that a word is not present.",
            preserve_duplicates=True)
        self.assertEqual(
            TermDocumentMatrix.get_term_frequency("document", document1), 1)
        self.assertEqual(
            TermDocumentMatrix.get_term_frequency("document", document2), 2)
        self.assertEqual(
            TermDocumentMatrix.get_term_frequency("document", document3), 0)
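
The expected counts follow from how the documents are built: with preserve_duplicates=False the repeated word collapses to a single occurrence, and "document" never appears in document3. A sketch of the counting, assuming a document exposes its tokens as a list (the words attribute is hypothetical, not confirmed by these tests):

    def get_term_frequency(term, document):
        # Count occurrences of the term among the document's tokens.
        # document.words is an assumed attribute name.
        return document.words.count(term)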
Code Example #10
    def test_set_vectors_none(self):
        """
        Test that setting vectors to ``None`` overwrites existing vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        self.assertEqual(v, c.vectors)

        c.vectors = None
        self.assertEqual([], c.vectors)
        self.assertEqual({}, c.centroid.dimensions)
Code Example #11
    def test_get_word_vector(self):
        documents = [
            Document("Nice document, document", preserve_duplicates=True),
            Document("Bad document"),
            Document("Alright document"),
            Document("Nice day today"),
            Document("No day is a bad day")
        ]
        term_document_matrix = TermDocumentMatrix(documents)
        word_vector = term_document_matrix.word_vectors[
            term_document_matrix.vocabulary.index("document")]
        expected_word_vector = [
            2 * log(5 / 4), 1 * log(5 / 4), 1 * log(5 / 4), 0.0, 0.0
        ]
        self.assertEqual(word_vector, tuple(expected_word_vector))
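
The expected vector is each document's raw term frequency for "document" scaled by a shared inverse document frequency. The term appears in three of the five documents, so log(5 / 4) is consistent with add-one smoothing in the denominator, log(N / (df + 1)), the same formula implied by the IDF test in Code Example #29. A sketch under that assumption, reusing the get_term_frequency sketch from Code Example #9:

    from math import log

    def tfidf_weight(term, document, documents):
        # Raw term frequency in this document.
        tf = get_term_frequency(term, document)
        # Number of documents containing the term, smoothed by one.
        df = sum(1 for d in documents if get_term_frequency(term, d) > 0)
        return tf * log(len(documents) / (df + 1))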
Code Example #12
    def test_set_one_vectors(self):
        """
        Test that setting vectors to a single vector overwrites existing vectors.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        self.assertEqual(v, c.vectors)

        n = Document("", ['a'], scheme=TF())
        c.vectors = n
        self.assertEqual([n], c.vectors)
        self.assertEqual(n.dimensions, c.centroid.dimensions)
Code Example #13
    def _disambiguate(self, pages):
        """
        Disambiguate a candidate by finding the linked page that is most similar to the domain.
        Only one page is returned: the one with the highest score, together with that score.

        :param pages: A list of page titles.
        :type pages: list of str

        :return: A tuple containing the most similar page and its similarity score.
        :rtype: tuple
        """
        """
        Get the first section of each page.
        Then, convert them into documents.
        """
        pages = text.collect(pages, introduction_only=True)
        for page, introduction in pages.items():
            pages[page] = Document(introduction,
                                   self.tokenizer.tokenize(introduction),
                                   scheme=self.scheme)
            pages[page].normalize()
        """
        Rank the page scores in descending order.
        Then, choose the best page and return it alongside its score.
        """
        scores = {
            page: vector_math.cosine(introduction, self.domain)
            for page, introduction in pages.items()
        }
        article, score = sorted(scores.items(),
                                key=lambda score: score[1],
                                reverse=True)[0]
        return (article, score)
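
Since only the top-scoring page is needed, sorting the whole score dictionary is equivalent to a single max call; a behavior-identical alternative when there are no ties:

    # Equivalent to sorting in descending order and taking the first element.
    article, score = max(scores.items(), key=lambda item: item[1])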
Code Example #14
    def test_extrapolate_returns_related_participants(self):
        """
        Test that when extrapolating, related participants are returned.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=True,
                              stopwords=list(stopwords.words("english")))
        posts = [
            "The LigaPro is the second-highest division of the Portuguese football league system.",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]
        extrapolator = WikipediaExtrapolator(corpus,
                                             tokenizer,
                                             TF(),
                                             first_level_links=15,
                                             second_level_links=15)
        participants = extrapolator.extrapolate([
            'Associação Académica de Coimbra – O.A.F.',
            'Académico de Viseu F.C.', 'S.L. Benfica B', 'FC Porto B'
        ])

        other_participants = [
            'Casa Pia A.C.', 'G.D. Chaves', 'C.D. Cova da Piedade',
            'S.C. Covilhã', 'G.D. Estoril Praia', 'S.C. Farense',
            'C.D. Feirense', 'Leixões S.C.', 'C.D. Mafra', 'C.D. Nacional',
            'U.D. Oliveirense', 'F.C. Penafiel', 'Varzim S.C.',
            'U.D. Vilafranquense'
        ]
        self.assertGreaterEqual(
            len(set(participants).intersection(set(other_participants))), 4)
Code Example #15
    def test_zero_threshold(self):
        """
        Test that when a threshold of zero is given, all candidate participants are retained.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        scorer = TFScorer()
        filter = ThresholdFilter(0)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = filter.filter(scores)
        self.assertTrue('erdogan' in scores)
        self.assertTrue('damascus' in scores)
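
This test, like the ones around it, exercises the same extract, score, filter pipeline. A condensed sketch of that flow (the 0.5 threshold is illustrative):

    # Extract candidate participants, score them by term frequency,
    # then retain only those that meet the threshold.
    candidates = EntityExtractor().extract(corpus)
    scores = TFScorer().score(candidates)
    scores = ThresholdFilter(0.5).filter(scores)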
Code Example #16
File: test_document.py Project: mcassia/nlp
    def test_get_bag_of_words(self):
        bag_of_words = Document.get_bag_of_words(
            "sample parsed text to split on whitespace")
        expected_bag_of_words = [
            "sample", "parsed", "text", "to", "split", "on", "whitespace"
        ]
        self.assertEqual(bag_of_words, expected_bag_of_words)
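
The expectation implies nothing more than a whitespace split that preserves token order. A minimal sketch of what get_bag_of_words has to do to satisfy this test:

    def get_bag_of_words(text):
        # Split pre-parsed text on whitespace, preserving order.
        return text.split()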
Code Example #17
def load_corpus(filename, clean):
    """
    Load the corpus from the given filename.

    :param filename: The path to the corpus from where to detect participants.
    :type filename: str
    :param clean: A boolean indicating whether tweets should be cleaned while loading them.
    :type clean: bool

    :return: A list of :class:`~nlp.document.Document` making up the corpus.
    :rtype: list of :class:`~nlp.document.Document`
    """

    cleaner = TweetCleaner(replace_mentions=True)

    corpus = []
    with open(filename) as f:
        for i, line in enumerate(f):
            tweet = json.loads(line)
            original = tweet
            while "retweeted_status" in tweet:
                tweet = tweet["retweeted_status"]

            if "extended_tweet" in tweet:
                text = tweet["extended_tweet"].get("full_text",
                                                   tweet.get("text", ""))
            else:
                text = tweet.get("text", "")

            text = cleaner.clean(text, original) if clean else text
            document = Document(text)
            corpus.append(document)

    return corpus
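
A hedged usage sketch (the corpus path is illustrative):

    # Load and clean a corpus of tweets, then inspect its size.
    corpus = load_corpus('corpora/CRYCHE-100.json', clean=True)
    print(len(corpus))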
Code Example #18
    def test_threshold_filter(self):
        """
        Test the basic functionality of the threshold filter.
        """

        """
        Create the test data.
        """
        tokenizer = Tokenizer(stem=False)
        posts = [
            "Erdogan with threats to attack regime forces 'everywhere' in Syria",
            "Damascus says Erdogan 'disconnected from reality' after threats",
        ]

        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        extractor = EntityExtractor()
        scorer = TFScorer()
        filter = ThresholdFilter(0.75)

        candidates = extractor.extract(corpus)
        scores = scorer.score(candidates)

        self.assertEqual(1, scores.get('erdogan', 0))
        self.assertEqual(0.5, scores.get('damascus', 0))

        scores = filter.filter(scores)
        self.assertTrue('erdogan' in scores)
        self.assertFalse('damascus' in scores)
Code Example #19
    def test_extract_from_text(self):
        """
        Test that the entity extractor's named entities do appear in the corresponding tweet.
        """
        """
        Load the corpus.
        """
        filename = os.path.join(os.path.dirname(__file__), '..', '..', '..',
                                '..', 'tests', 'corpora', 'understanding',
                                'CRYCHE-100.json')
        corpus = []
        with open(filename) as f:
            for i, line in enumerate(f):
                tweet = json.loads(line)
                original = tweet
                while "retweeted_status" in tweet:
                    tweet = tweet["retweeted_status"]

                if "extended_tweet" in tweet:
                    text = tweet["extended_tweet"].get("full_text",
                                                       tweet.get("text", ""))
                else:
                    text = tweet.get("text", "")

                document = Document(text)
                corpus.append(document)

        extractor = EntityExtractor()
        candidates = extractor.extract(corpus)
        for (document, candidate_set) in zip(corpus, candidates):
            text = document.text.lower().replace('\n', ' ').replace('  ', ' ')
            self.assertTrue(
                all(candidate in text for candidate in candidate_set))
Code Example #20
    def test_sorting(self):
        """
        Test that the resolver sorts the tokens in descending order of score.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
            "Tottenham lose again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]
        """
        Ensure that the more common candidates are ranked towards the beginning.
        """
        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        self.assertTrue(scores)
        resolved, unresolved = Resolver().resolve(scores)
        self.assertEqual(set(scores.keys()), set(resolved))
        self.assertEqual([], unresolved)
        self.assertEqual('tottenham', resolved[0])
        self.assertEqual(set(['manchester', 'united']), set(resolved[1:3]))
Code Example #21
    def test_sorting(self):
        """
        Test that the resolver sorts the tokens in descending order of score.
        """
        """
        Create the test data.
        """
        tokenizer = Tokenizer(min_length=3, stem=False, case_fold=True)
        posts = [
            "Manchester United falter against Tottenham Hotspur",
            "Manchester United unable to avoid defeat to Tottenham",
            "Tottenham lose again",
        ]
        corpus = [Document(post, tokenizer.tokenize(post)) for post in posts]

        candidates = TokenExtractor().extract(corpus)
        scores = TFScorer().score(candidates)
        scores = ThresholdFilter(0).filter(scores)
        resolved, unresolved = TokenResolver(tokenizer, corpus).resolve(scores)
        self.assertEqual('tottenham', resolved[0])
        self.assertEqual(set(['manchester', 'united']), set(resolved[1:3]))
        self.assertEqual(
            set([
                'falter', 'against', 'hotspur', 'unable', 'avoid', 'defeat',
                'lose', 'again'
            ]), set(resolved[3:]))
Code Example #22
    def test_empty_cluster_similarity(self):
        """
        Test that when calculating the similarity between a vector and an empty cluster, the similarity is 0.
        """

        c = Cluster()
        v = Document("", ["a", "c"], scheme=TF())
        self.assertEqual(0, c.similarity(v))
Code Example #23
    def test_centroid_normalized(self):
        """
        Test that the centroid is normalized.
        """

        v = Document("", ["a", "c"], scheme=TF())
        c = Cluster(v)
        self.assertEqual(1, round(vector_math.magnitude(c.centroid), 10))
Code Example #24
    def test_set_several_vectors(self):
        """
        Test that setting vectors to several vectors overwrites existing vectors.
        """

        v = Document("", ['a'], scheme=TF())
        c = Cluster(v)
        self.assertEqual([v], c.vectors)
        self.assertEqual(v.dimensions, c.centroid.dimensions)

        n = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]

        c.vectors = n
        self.assertEqual(n, c.vectors)
Code Example #25
    def test_intra_similarity_of_cluster_with_single_vector(self):
        """
        Test that the intra-similarity of a cluster with a single vector is equivalent to that vector's similarity with the cluster.
        """

        v = Document("", ['a', 'b'], scheme=TF())
        c = Cluster(v)
        self.assertEqual(c.similarity(v), c.get_intra_similarity())
Code Example #26
File: test_document.py Project: mcassia/nlp
    def test_get_ngrams(self):
        ngrams = [2, 3]
        bag_of_words = Document.get_ngrams(
            ngrams, ["hello", "this", "is", "a", "document"])
        expected_bag_of_words = [
            "hello this", "this is", "is a", "a document", "hello this is",
            "this is a", "is a document"
        ]
        self.assertEqual(bag_of_words, expected_bag_of_words)
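
The expected list implies that, for each requested size, a window slides over the tokens and the words are joined with single spaces, with all bigrams listed before the trigrams. A sketch that reproduces exactly that order:

    def get_ngrams(sizes, tokens):
        # For each n-gram size, slide a window over the tokens and join with spaces.
        return [' '.join(tokens[i:i + n])
                for n in sizes
                for i in range(len(tokens) - n + 1)]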
Code Example #27
    def test_remove_vectors(self):
        """
        Test removing vectors from a cluster gradually.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]
        c = Cluster(v)
        c.vectors.remove(v[0])
        self.assertEqual([v[1]], c.vectors)

        c = Cluster(v)
        c.vectors.remove(v[1])
        self.assertEqual([v[0]], c.vectors)
        c.vectors.remove(v[0])
        self.assertEqual([], c.vectors)
Code Example #28
    def test_add_vectors(self):
        """
        Test adding vectors to a cluster gradually.
        """

        c = Cluster()
        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF())
        ]

        self.assertEqual({}, c.centroid.dimensions)

        c.vectors.append(v[0])
        self.assertEqual([v[0]], c.vectors)

        c.vectors.append(v[1])
        self.assertEqual(v, c.vectors)
Code Example #29
    def test_get_inverse_document_frequency(self):
        documents = [
            Document("This document is a document with no duplicates."),
            Document("This document is a document with duplicates."),
            Document("This is just to check that a word is not present.")
        ]
        term_document_matrix = TermDocumentMatrix(documents)
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("document"),
            log(3 / 3))
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("not_present"),
            log(3 / 1))
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("just"),
            log(3 / 2))
        self.assertEqual(
            term_document_matrix.get_inverse_document_frequency("is"),
            log(3 / 4))
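
The expected values are consistent with an inverse document frequency of log(N / (df + 1)): "document" appears in two documents (log(3/3)), "just" in one (log(3/2)), "not_present" in none (log(3/1)), and "is" in all three (log(3/4)). A sketch under that assumption (the bag_of_words attribute is hypothetical):

    from math import log

    def inverse_document_frequency(term, documents):
        # df: the number of documents containing the term at least once.
        df = sum(1 for document in documents if term in document.bag_of_words)
        # Add-one smoothing keeps unseen terms finite and matches the values above.
        return log(len(documents) / (df + 1))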
Code Example #30
    def test_cluster_with_several_vectors_copy(self):
        """
        Test that when creating a cluster with several vectors, a copy of the vector list is made, so the original list remains intact.
        """

        v = [
            Document("", ["a", "b", "a", "c"], scheme=TF()),
            Document("", ["a", "c"], scheme=TF()),
        ]
        for vector in v:
            vector.normalize()

        c = Cluster(v)
        self.assertEqual(v, c.vectors)
        copy = list(v)
        c.vectors.remove(v[0])
        self.assertEqual([v[1]], c.vectors)
        self.assertEqual(copy, v)
        self.assertEqual(2, len(v))