Ejemplo n.º 1
0
def test_gensim_word2vec():
    """Compare file-backed gensim document embeddings with reference vectors.

    Three calls are checked against the ``gensim_test_nl.kv`` test model:
    the default call on ``TEXT_2``, and ``TEXT_5`` with ``idf_weighting``
    set to ``'naive'`` and to ``'log'``.
    """
    model_path = 'tests/models/gensim_test_nl.kv'

    # Each case: (input text, extra keyword arguments, reference embedding).
    cases = (
        (TEXT_2, {}, [
            0.0076740906, -0.051765148, -0.008963874, -0.16817021, -0.12640671,
            -0.28199115, -0.1418166, -0.08547635, -0.1489038, 0.049820565
        ]),
        (TEXT_5, {'idf_weighting': 'naive'}, [
            0.04336167, -0.12551728, 0.121972464, -0.023885678, -0.0892916,
            0.011041589, -0.022286428, 0.06333805, 0.07664292, 0.086685486
        ]),
        (TEXT_5, {'idf_weighting': 'log'}, [
            0.021136083, -0.035798773, 0.032576967, 0.0048801005, -0.028301004,
            -0.0059328717, -0.010782357, 0.025319293, 0.018113682, 0.028851084
        ]),
    )

    for text, extra_kwargs, reference in cases:
        embedding = Doc(text).generate_gensim_document_embedding(
            model_uri=model_path, **extra_kwargs)
        if not np.allclose(embedding, reference):
            raise AssertionError
Ejemplo n.º 2
0
def test_gensim_word2vec_with_redis():
    """Compare Redis-backed gensim document embeddings with reference vectors.

    Steps:
      1. Load the test word2vec model into (fake) Redis with default settings.
      2. Check the default embedding of ``TEXT_2`` and the ``'naive'``
         idf-weighted embedding of ``TEXT_5``.
      3. Check that asking for ``'log'`` weighting against the loaded vectors
         raises ``RedisIDFWeightingMismatchException`` with the expected
         message.
      4. Reload the model with ``idf_weighting='log'`` and check the
         ``'log'`` embedding of ``TEXT_5``.

    The Redis store is flushed in a ``finally`` clause so the vectors loaded
    here are removed even when one of the checks fails.
    """
    redis_uri = 'redis://host:1234/0'
    model_path = 'tests/models/gensim_test_nl.kv'

    # Load word2vec model into fake Redis
    kv = RedisKeyedVectors(redis_uri, 'nl')
    try:
        kv.load_keyed_vectors_into_redis(model_path)

        expected_doc_2 = [
            0.0076740906, -0.051765148, -0.008963874, -0.16817021, -0.12640671,
            -0.28199115, -0.1418166, -0.08547635, -0.1489038, 0.049820565
        ]
        actual_doc_2 = Doc(TEXT_2, gensim_vectors={'nl': kv}) \
            .generate_gensim_document_embedding(model_uri=redis_uri)
        if not np.allclose(actual_doc_2, expected_doc_2):
            raise AssertionError(actual_doc_2)

        expected_doc_5 = [
            0.04336167, -0.12551728, 0.121972464, -0.023885678, -0.0892916,
            0.011041589, -0.022286428, 0.06333805, 0.07664292, 0.086685486
        ]
        actual_doc_5 = Doc(TEXT_5).generate_gensim_document_embedding(
            model_uri=redis_uri, idf_weighting='naive')
        if not np.allclose(actual_doc_5, expected_doc_5):
            raise AssertionError(actual_doc_5)

        # The vectors currently in Redis were loaded without 'log' weighting,
        # so requesting it must be rejected.
        with pytest.raises(RedisIDFWeightingMismatchException) as e:
            Doc(TEXT_5).generate_gensim_document_embedding(
                model_uri=redis_uri, idf_weighting='log')
        assert str(e.value) == 'The specified document embedding idf weighting "log" does not match ' \
                               'weighting in RedisKeyedVector "naive"'

        # Reload the same model, this time with 'log' idf weighting.
        kv = RedisKeyedVectors(redis_uri, 'nl')
        kv.load_keyed_vectors_into_redis(model_path, idf_weighting='log')
        expected_doc_5 = [
            0.02113608, -0.035798773, 0.032576967, 0.0048801005, -0.028301004,
            -0.005932871, -0.010782358, 0.025319293, 0.018113682, 0.028851084
        ]
        actual_doc_5 = Doc(TEXT_5, gensim_vectors={'nl': kv}) \
            .generate_gensim_document_embedding(model_uri=redis_uri, idf_weighting='log')
        if not np.allclose(actual_doc_5, expected_doc_5):
            raise AssertionError(actual_doc_5)
    finally:
        # Always clear the store, including on failure.
        kv._redis.flushall()
Ejemplo n.º 3
0
def test_gensim_word2vec():
    """Compare file-backed gensim document embeddings with reference vectors.

    ``TEXT_2`` and ``TEXT_5`` are each embedded with the ``gensim_test_nl.kv``
    test model and compared to a stored vector using ``np.allclose``.
    """
    model_path = 'tests/models/gensim_test_nl.kv'

    # Each case: (input text, reference embedding).
    cases = (
        (TEXT_2, [
            0.0076740906, -0.051765148, -0.008963874, -0.16817021, -0.12640671,
            -0.28199115, -0.1418166, -0.08547635, -0.1489038, 0.049820565
        ]),
        (TEXT_5, [
            0.04336167, -0.12551728, 0.121972464, -0.023885678, -0.0892916,
            0.011041589, -0.022286428, 0.06333805, 0.07664292, 0.086685486
        ]),
    )

    for text, reference in cases:
        embedding = Doc(text).generate_gensim_document_embedding(
            model_uri=model_path)
        if not np.allclose(embedding, reference):
            raise AssertionError
Ejemplo n.º 4
0
    def __call__(self, raw):
        """
        Run every configured operation on a document built from raw text.

        Args:
        raw: incoming, unedited text

        Returns:
        A dictionary keyed by each operation's class name, holding the value
        that operation returned for the document.
        """
        document = Doc(
            raw,
            language=self.language,
            hint_language=self.hint_language,
            spacy_nlps=self._spacy_nlps,
        )

        outputs = {}
        for operation in self._operations:
            key = operation.__class__.__name__
            outputs[key] = operation(document)
        return outputs
Ejemplo n.º 5
0
def test_gensim_word2vec_with_redis():
    """Compare Redis-backed gensim document embeddings with reference vectors.

    The test word2vec model is loaded into (fake) Redis, after which the
    embeddings of ``TEXT_2`` (using the in-memory ``kv`` handle) and
    ``TEXT_5`` (resolved through the Redis URI only) are compared to stored
    vectors using ``np.allclose``.

    The Redis store is flushed in a ``finally`` clause so the vectors loaded
    here are removed even when one of the checks fails.
    """
    redis_uri = 'redis://host:1234/0'

    # Load word2vec model into fake Redis
    kv = RedisKeyedVectors(redis_uri, 'nl')
    try:
        kv.load_keyed_vectors_into_redis('tests/models/gensim_test_nl.kv')

        expected_doc_2 = [
            0.0076740906, -0.051765148, -0.008963874, -0.16817021, -0.12640671,
            -0.28199115, -0.1418166, -0.08547635, -0.1489038, 0.049820565
        ]
        actual_doc_2 = Doc(TEXT_2, gensim_vectors={
            'nl': kv
        }).generate_gensim_document_embedding(model_uri=redis_uri)
        if not np.allclose(actual_doc_2, expected_doc_2):
            raise AssertionError(actual_doc_2)

        expected_doc_5 = [
            0.04336167, -0.12551728, 0.121972464, -0.023885678, -0.0892916,
            0.011041589, -0.022286428, 0.06333805, 0.07664292, 0.086685486
        ]
        actual_doc_5 = Doc(TEXT_5).generate_gensim_document_embedding(
            model_uri=redis_uri)
        if not np.allclose(actual_doc_5, expected_doc_5):
            raise AssertionError(actual_doc_5)
    finally:
        # Always clear the store, including on failure.
        kv._redis.flushall()
Ejemplo n.º 6
0
    def __call__(self, raw):
        """
        Run the configured steps, in order, on a document built from raw text.

        Each step receives the document, the results gathered so far (as
        ``context``) and its own ``settings``. The same results dictionary is
        shared across steps and is what gets returned.

        Args:
        raw: incoming, unedited text

        Returns:
        A dictionary mapping each step name to the value its operation returned.
        """
        document = Doc(
            raw,
            language=self.language,
            hint_language=self.hint_language,
            spacy_nlps=self._spacy_nlps,
        )

        results = {}
        for step_name, step_settings in self.steps:
            # `results` is passed by reference, so later steps see earlier output.
            results[step_name] = self._operations[step_name](
                document, context=results, settings=step_settings)
        return results
Ejemplo n.º 7
0
    def _custom_op(doc: doc.Doc, context=None, settings=None, **kwargs):
        """
        Default custom textpipe operation.

        Produces a spaCy document built from the lower-cased ``doc.clean``
        text, after:
            - Lemmatization
            - Removal of stop words
            - Removal of punctuation

        Only English is handled. If the detected language is not "en", the raw
        text is scrubbed with a few regular expressions and language detection
        is retried once on a new Doc; if it is still not "en", a message is
        printed and None is returned.

        ``context``, ``settings`` and ``kwargs`` are accepted but not used in
        this body.
        """
        # TODO
        # add multi lang/lm support
        # return None for now
        if doc.detect_language()[1] != "en":
            # do some manual regex preprocessing and re-try
            text = re.sub("</?.*?>", " <> ", doc.raw)  # replace tag-like spans with " <> "
            text = text.strip()  # remove leading and trailing whitespace
            text = re.sub("(\\d|\\W)+", " ",
                          text)  # collapse runs of digits / non-word chars into one space
            text = re.sub(r'[.|,|)|(|\|/|?|!|\'|"|#]', r" ",
                          text)  # replace any remaining listed punctuation with a space
            doc = Doc(raw=text)
            if doc.detect_language()[1] != "en":
                print("Cannot determine input language.")
                return None

        # NOTE(review): this value is overwritten below before being read;
        # presumably the attribute access is kept for a side effect (e.g.
        # populating doc._spacy_nlps) - confirm before removing.
        spacy_doc = doc._spacy_doc
        spacy_nlp = doc._spacy_nlps["en"][None]

        # lower-case the cleaned text (doc.clean presumably strips HTML tags - confirm)
        raw_text = doc.clean.lower()

        # First pass: tokenize so lemma / punctuation / stop-word flags are available.
        # Second pass: re-parse the space-joined lemmas of the tokens that are
        # neither punctuation nor stop words.
        spacy_doc = spacy_nlp(raw_text)
        spacy_doc = spacy_nlp(" ".join([
            token.lemma_ for token in spacy_doc
            if not (token.is_punct or token.is_stop)
        ]))

        return spacy_doc
Ejemplo n.º 8
0
en ontleden, transformeren, vervolgens inbrengen in databanken, en ten slotte evalueren en
interpreteren. Philips is een bedrijf genaamd Philips.</p>
"""

# Empty document text.
TEXT_3 = ''

# The same plain-text line repeated twice.
TEXT_4 = """this is a paragraph
this is a paragraph
"""

# A single Dutch sentence.
TEXT_5 = """Mark Zuckerberg is sinds de oprichting van Facebook de directeur van het bedrijf."""

# Custom spaCy pipelines handed to Doc through `spacy_nlps`:
# language code -> model name -> pipeline object.
ents_model = spacy.blank('nl')
custom_spacy_nlps = {'nl': {'ents': ents_model}}

# Module-level Doc fixtures shared by the tests below; only DOC_5 receives
# the custom spaCy pipelines.
DOC_1 = Doc(TEXT_1)
DOC_2 = Doc(TEXT_2)
DOC_3 = Doc(TEXT_3)
DOC_4 = Doc(TEXT_4)
DOC_5 = Doc(TEXT_5, spacy_nlps=custom_spacy_nlps)


def test_load_custom_model():
    """
    A Doc created with custom spaCy language models should resolve to Dutch
    and report the expected entities.
    """
    # Not referenced by the assertions visible in this test.
    model_mapping = {'nl': 'ents'}

    # Use the detected language only when detection is reliable; otherwise
    # fall back to the hint language.
    if DOC_5.is_reliable_language:
        lang = DOC_5.language
    else:
        lang = DOC_5.hint_language
    assert lang == 'nl'

    found_ents = DOC_5.find_ents()
    wanted_ents = [('Mark Zuckerberg', 'PER'), ('Facebook', 'MISC')]
    assert sorted(found_ents) == sorted(wanted_ents)
Ejemplo n.º 9
0
<a href="/wiki/Database" title="Database">database</a>), deriving patterns within the
<a href="/wiki/Structured_data" class="mw-redirect" title="Structured data">structured data</a>, and
finally evaluation and interpretation of the output. Google is a company named Google.
"""

TEXT_2 = """<p><b>Textmining</b>, ook wel <i>textdatamining</i>, verwijst naar het proces om met
allerhande<a href="/wiki/Informatietechnologie" title="Informatietechnologie">ICT</a>-technieken 
waardevolle informatie te halen uit grote hoeveelheden tekstmateriaal. Met deze technieken wordt 
gepoogd patronen en tendensen te ontwaren. Concreet gaat men teksten softwarematig structureren 
en ontleden, transformeren, vervolgens inbrengen in databanken, en ten slotte evalueren en 
interpreteren. Philips is een bedrijf genaamd Philips.</p>
"""

# Empty document text.
TEXT_3 = ''

# Module-level Doc fixtures shared by the tests below.
DOC_1 = Doc(TEXT_1)
DOC_2 = Doc(TEXT_2)
DOC_3 = Doc(TEXT_3)


def test_nwords_nsents():
    """Check the word and sentence counts reported for the three fixture docs."""
    expected_word_counts = ((DOC_1, 87), (DOC_2, 53), (DOC_3, 0))
    expected_sentence_counts = ((DOC_1, 4), (DOC_2, 4), (DOC_3, 0))

    # Word counts are verified for every document before any sentence count.
    for document, count in expected_word_counts:
        assert document.nwords == count
    for document, count in expected_sentence_counts:
        assert document.nsents == count


def test_entities():
    """Check that the entities reported for ``DOC_1`` are exactly ``['Google']``.

    ``sorted()`` is used rather than ``list.sort()``: the in-place method
    returns ``None``, so comparing two ``.sort()`` results is always true and
    verifies nothing (and it would also reorder ``DOC_1.ents`` in place).
    """
    assert sorted(DOC_1.ents) == sorted(['Google'])
Ejemplo n.º 10
0
def test_gensim_word2vec_with_redis_no_model():
    """Embedding via a Redis URI with no model loaded must raise a clear error."""
    expected_message = (
        'Redis does not contain a model for language nl. The model'
        ' needs to be loaded before use (see load_keyed_vectors_into_redis).'
    )

    with pytest.raises(TextpipeMissingModelException) as excinfo:
        Doc(TEXT_2).generate_gensim_document_embedding(
            model_uri='redis://host:1234/0')

    assert str(excinfo.value) == expected_message