def test_gensim_word2vec():
    """Document embeddings from the on-disk gensim test model match reference vectors.

    Covers the default (unweighted) embedding plus 'naive' and 'log' IDF weighting.
    """
    expected_doc_2 = [
        0.0076740906, -0.051765148, -0.008963874, -0.16817021, -0.12640671,
        -0.28199115, -0.1418166, -0.08547635, -0.1489038, 0.049820565
    ]
    actual_doc_2 = Doc(TEXT_2).generate_gensim_document_embedding(
        model_uri='tests/models/gensim_test_nl.kv')
    if not np.allclose(actual_doc_2, expected_doc_2):
        # Attach the computed embedding so a failure is debuggable
        # (consistent with test_gensim_word2vec_with_redis).
        raise AssertionError(actual_doc_2)

    expected_doc_5 = [
        0.04336167, -0.12551728, 0.121972464, -0.023885678, -0.0892916,
        0.011041589, -0.022286428, 0.06333805, 0.07664292, 0.086685486
    ]
    actual_doc_5 = Doc(TEXT_5).generate_gensim_document_embedding(
        model_uri='tests/models/gensim_test_nl.kv', idf_weighting='naive')
    if not np.allclose(actual_doc_5, expected_doc_5):
        raise AssertionError(actual_doc_5)

    expected_doc_5 = [
        0.021136083, -0.035798773, 0.032576967, 0.0048801005, -0.028301004,
        -0.0059328717, -0.010782357, 0.025319293, 0.018113682, 0.028851084
    ]
    actual_doc_5 = Doc(TEXT_5).generate_gensim_document_embedding(
        model_uri='tests/models/gensim_test_nl.kv', idf_weighting='log')
    if not np.allclose(actual_doc_5, expected_doc_5):
        raise AssertionError(actual_doc_5)
def test_gensim_word2vec_with_redis():
    """Embeddings computed through vectors stored in (fake) Redis match reference values.

    Also checks that requesting a different idf weighting than the one loaded
    raises RedisIDFWeightingMismatchException.
    """
    # Load word2vec model into fake Redis.
    kv = RedisKeyedVectors('redis://host:1234/0', 'nl')
    kv.load_keyed_vectors_into_redis('tests/models/gensim_test_nl.kv')
    try:
        expected_doc_2 = [0.0076740906, -0.051765148, -0.008963874, -0.16817021,
                          -0.12640671, -0.28199115, -0.1418166, -0.08547635,
                          -0.1489038, 0.049820565]
        actual_doc_2 = Doc(TEXT_2, gensim_vectors={'nl': kv}). \
            generate_gensim_document_embedding(model_uri='redis://host:1234/0')
        if not np.allclose(actual_doc_2, expected_doc_2):
            raise AssertionError(actual_doc_2)

        expected_doc_5 = [0.04336167, -0.12551728, 0.121972464, -0.023885678,
                          -0.0892916, 0.011041589, -0.022286428, 0.06333805,
                          0.07664292, 0.086685486]
        actual_doc_5 = Doc(TEXT_5).generate_gensim_document_embedding(
            model_uri='redis://host:1234/0', idf_weighting='naive')
        if not np.allclose(actual_doc_5, expected_doc_5):
            raise AssertionError(actual_doc_5)

        # Asking for 'log' weighting while 'naive' is loaded must raise.
        with pytest.raises(RedisIDFWeightingMismatchException) as e:
            Doc(TEXT_5).generate_gensim_document_embedding(
                model_uri='redis://host:1234/0', idf_weighting='log')
        assert str(e.value) == 'The specified document embedding idf weighting "log" does not match ' \
                               'weighting in RedisKeyedVector "naive"'

        # Reload with 'log' weighting and verify the embedding changes accordingly.
        kv = RedisKeyedVectors('redis://host:1234/0', 'nl')
        kv.load_keyed_vectors_into_redis('tests/models/gensim_test_nl.kv',
                                         idf_weighting='log')
        expected_doc_5 = [0.02113608, -0.035798773, 0.032576967, 0.0048801005,
                          -0.028301004, -0.005932871, -0.010782358, 0.025319293,
                          0.018113682, 0.028851084]
        actual_doc_5 = Doc(TEXT_5, gensim_vectors={'nl': kv}). \
            generate_gensim_document_embedding(model_uri='redis://host:1234/0',
                                               idf_weighting='log')
        if not np.allclose(actual_doc_5, expected_doc_5):
            raise AssertionError(actual_doc_5)
    finally:
        # Always clear the fake Redis: without this, a failing assertion above
        # leaked loaded vectors into every subsequent test.
        kv._redis.flushall()
def test_gensim_word2vec():
    """Document embeddings generated from the local gensim model equal known vectors."""
    cases = [
        (TEXT_2, [0.0076740906, -0.051765148, -0.008963874, -0.16817021,
                  -0.12640671, -0.28199115, -0.1418166, -0.08547635,
                  -0.1489038, 0.049820565]),
        (TEXT_5, [0.04336167, -0.12551728, 0.121972464, -0.023885678,
                  -0.0892916, 0.011041589, -0.022286428, 0.06333805,
                  0.07664292, 0.086685486]),
    ]
    for text, expected in cases:
        actual = Doc(text).generate_gensim_document_embedding(
            model_uri='tests/models/gensim_test_nl.kv')
        if not np.allclose(actual, expected):
            raise AssertionError
def __call__(self, raw):
    """
    Run every configured operation over raw text.

    Args:
        raw: incoming, unedited text

    Returns:
        Dictionary mapping each operation's class name to that operation's output.
    """
    document = Doc(raw, language=self.language, hint_language=self.hint_language,
                   spacy_nlps=self._spacy_nlps)
    output = {}
    for operation in self._operations:
        output[operation.__class__.__name__] = operation(document)
    return output
def test_gensim_word2vec_with_redis():
    """Embeddings computed via vectors loaded into (fake) Redis match reference values."""
    # Load word2vec model into fake Redis.
    kv = RedisKeyedVectors('redis://host:1234/0', 'nl')
    kv.load_keyed_vectors_into_redis('tests/models/gensim_test_nl.kv')
    try:
        expected_doc_2 = [
            0.0076740906, -0.051765148, -0.008963874, -0.16817021, -0.12640671,
            -0.28199115, -0.1418166, -0.08547635, -0.1489038, 0.049820565
        ]
        actual_doc_2 = Doc(TEXT_2, gensim_vectors={
            'nl': kv
        }).generate_gensim_document_embedding(model_uri='redis://host:1234/0')
        if not np.allclose(actual_doc_2, expected_doc_2):
            raise AssertionError(actual_doc_2)

        expected_doc_5 = [
            0.04336167, -0.12551728, 0.121972464, -0.023885678, -0.0892916,
            0.011041589, -0.022286428, 0.06333805, 0.07664292, 0.086685486
        ]
        actual_doc_5 = Doc(TEXT_5).generate_gensim_document_embedding(
            model_uri='redis://host:1234/0')
        if not np.allclose(actual_doc_5, expected_doc_5):
            raise AssertionError(actual_doc_5)
    finally:
        # Always clear the fake Redis: without this, a failing assertion above
        # leaked loaded vectors into every subsequent test.
        kv._redis.flushall()
def __call__(self, raw):
    """
    Apply every configured pipeline step to raw text.

    Args:
        raw: incoming, unedited text

    Returns:
        Dictionary mapping each step name to its operation's output.
    """
    document = Doc(raw, language=self.language, hint_language=self.hint_language,
                   spacy_nlps=self._spacy_nlps)
    results = {}
    for step_name, step_settings in self.steps:
        operation = self._operations[step_name]
        # Results accumulated so far are handed to each step as context,
        # so later operations can reuse earlier outputs.
        results[step_name] = operation(document, context=results,
                                       settings=step_settings)
    return results
def _custom_op(doc: doc.Doc, context=None, settings=None, **kwargs):
    """
    Default custom textpipe operation
    - Strip HTML tags
    - Lemmatization
    - Remove stop words
    - Remove punctuations
    If encounter unsupported input language, return None

    Args:
        doc: the Doc to process
        context: optional shared pipeline context (unused here)
        settings: optional per-step settings (unused here)

    Returns:
        A spaCy Doc of lemmatized, stop-word- and punctuation-free text,
        or None when the input language cannot be determined as English.
    """
    # TODO
    # add multi lang/lm support
    # return None for now
    if doc.detect_language()[1] != "en":
        # do some manual regex preprocessing and re-try
        text = re.sub("</?.*?>", " <> ", doc.raw)  # remove tags
        text = text.strip()  # remove whitespaces in the front and the end
        text = re.sub("(\\d|\\W)+", " ", text)  # remove special chars and digits
        text = re.sub(r'[.|,|)|(|\|/|?|!|\'|"|#]', r" ", text)  # remove any punctuation
        doc = Doc(raw=text)
        if doc.detect_language()[1] != "en":
            print("Cannot determine input language.")
            return None

    # NOTE: the original assigned spacy_doc = doc._spacy_doc here, but that
    # value was never read before being overwritten below — dead store removed.
    spacy_nlp = doc._spacy_nlps["en"][None]

    # default clean_text method, strip HTML tags and lower case
    raw_text = doc.clean.lower()

    # apply lemmatization, then drop punctuation and stop words
    spacy_doc = spacy_nlp(raw_text)
    spacy_doc = spacy_nlp(" ".join([
        token.lemma_ for token in spacy_doc
        if not (token.is_punct or token.is_stop)
    ]))
    return spacy_doc
en ontleden, transformeren, vervolgens inbrengen in databanken, en ten slotte evalueren en interpreteren. Philips is een bedrijf genaamd Philips.</p> """ TEXT_3 = '' TEXT_4 = """this is a paragraph this is a paragraph """ TEXT_5 = """Mark Zuckerberg is sinds de oprichting van Facebook de directeur van het bedrijf.""" ents_model = spacy.blank('nl') custom_spacy_nlps = {'nl': {'ents': ents_model}} DOC_1 = Doc(TEXT_1) DOC_2 = Doc(TEXT_2) DOC_3 = Doc(TEXT_3) DOC_4 = Doc(TEXT_4) DOC_5 = Doc(TEXT_5, spacy_nlps=custom_spacy_nlps) def test_load_custom_model(): """ The custom spacy language modules should be correctly loaded into the doc. """ model_mapping = {'nl': 'ents'} lang = DOC_5.language if DOC_5.is_reliable_language else DOC_5.hint_language assert lang == 'nl' assert sorted(DOC_5.find_ents()) == sorted([('Mark Zuckerberg', 'PER'), ('Facebook', 'MISC')])
<a href="/wiki/Database" title="Database">database</a>), deriving patterns within the <a href="/wiki/Structured_data" class="mw-redirect" title="Structured data">structured data</a>, and finally evaluation and interpretation of the output. Google is a company named Google. """ TEXT_2 = """<p><b>Textmining</b>, ook wel <i>textdatamining</i>, verwijst naar het proces om met allerhande<a href="/wiki/Informatietechnologie" title="Informatietechnologie">ICT</a>-technieken waardevolle informatie te halen uit grote hoeveelheden tekstmateriaal. Met deze technieken wordt gepoogd patronen en tendensen te ontwaren. Concreet gaat men teksten softwarematig structureren en ontleden, transformeren, vervolgens inbrengen in databanken, en ten slotte evalueren en interpreteren. Philips is een bedrijf genaamd Philips.</p> """ TEXT_3 = '' DOC_1 = Doc(TEXT_1) DOC_2 = Doc(TEXT_2) DOC_3 = Doc(TEXT_3) def test_nwords_nsents(): assert DOC_1.nwords == 87 assert DOC_2.nwords == 53 assert DOC_3.nwords == 0 assert DOC_1.nsents == 4 assert DOC_2.nsents == 4 assert DOC_3.nsents == 0 def test_entities(): assert DOC_1.ents.sort() == ['Google'].sort()
def test_gensim_word2vec_with_redis_no_model():
    """Requesting an embedding for a language missing from Redis raises a clear error."""
    with pytest.raises(TextpipeMissingModelException) as excinfo:
        Doc(TEXT_2).generate_gensim_document_embedding(
            model_uri='redis://host:1234/0')
    expected_message = ('Redis does not contain a model for language nl. The model'
                        ' needs to be loaded before use (see load_keyed_vectors_into_redis).')
    assert str(excinfo.value) == expected_message