Exemple #1
0
    def test_set_text(self):
        """set_text returns a new dict with new text and the same metadata."""
        replacement = 'second time with more words'
        original = {'text': 'hello world', 'metadata': {'1': 2}}

        updated = set_text(original, replacement)

        # The result must be a distinct dict, not the input object itself.
        assert isinstance(updated, dict)
        assert updated is not original
        # Text is replaced; metadata still compares equal to the input's.
        assert updated['text'] == replacement
        assert updated['metadata'] == {'1': 2}
Exemple #2
0
def process(content, env, **settings):
    """Yield each document with English stop words removed from its text.

    For every ``doc`` in ``content``, ``doc['text']`` is split with
    ``word_tokenize``, tokens present in ``stopwords.words('english')`` are
    dropped, and the remaining tokens are re-joined with single spaces.
    The result is passed to ``set_text(doc, text)`` and yielded.

    The stop-word comparison is an exact (case-sensitive) string match.
    ``env`` and ``settings`` are accepted but not used by this processor.
    """
    # Membership is tested once per token, so build a set once instead of
    # scanning the stop-word list linearly for every token.
    stops = frozenset(stopwords.words('english'))

    for doc in content:
        kept = [w for w in word_tokenize(doc['text']) if w not in stops]
        yield set_text(doc, " ".join(kept))
Exemple #3
0
def use_phrase_models(content, files, settings):
    """Apply saved phrase models to each document and yield the new text.

    Every path in ``files`` is loaded with ``Phrases.load`` and the models
    are applied in order to ``doc.tokenized_text`` (``text = model[text]``).
    The result is flattened to a string: tokens of a sentence are joined by
    a space and sentences are joined by ``". "``. That string is passed to
    ``set_text(doc, text)`` and yielded.

    NOTE(review): assumes ``doc.tokenized_text`` is an iterable of token
    lists (one per sentence) -- confirm against the caller.

    ``settings`` is accepted but not used here.
    """
    # Models do not depend on the document, so load them once instead of
    # re-reading every file for every document. Loading is deferred to the
    # first document so that empty ``content`` still touches no files.
    models = None

    for doc in content:
        if models is None:
            models = [Phrases.load(fpath) for fpath in files]

        text = doc.tokenized_text
        for phrases in models:
            text = phrases[text]

        text = ". ".join(" ".join(sent) for sent in text)
        yield set_text(doc, text)
def process(content, env, **settings):
    """ Tokenization

    For every ``doc`` in ``content``, run ``tokenizer`` over ``doc.text``,
    join the resulting tokens with single spaces and yield
    ``set_text(doc, text)``.

    If ``set_text`` raises, the error is logged together with the tokenized
    text and the document is skipped. Errors raised by ``tokenizer`` itself
    are not caught and propagate to the caller.

    ``env`` and ``settings`` are accepted but not used by this processor.
    """

    for doc in content:
        text = " ".join(tokenizer(doc.text))

        # Guard only the conversion. The yield stays outside the try so an
        # exception the consumer throws into the generator is not swallowed
        # and misreported as a conversion error.
        try:
            converted = set_text(doc, text)
        except Exception:
            logger.exception("Error in converting to Doc %r", text)
            continue

        yield converted
Exemple #5
0
def process(content, env, **settings):
    """Run ``preprocess_text`` over every document's text.

    ``settings`` is forwarded as keyword arguments to ``preprocess_text``.
    A document whose text lookup or preprocessing raises is logged and
    skipped; otherwise ``set_text(doc, cleaned)`` is yielded.

    ``env`` is accepted but not used by this processor.
    """
    for doc in content:
        try:
            cleaned = preprocess_text(doc['text'], **settings)
        except Exception:
            logger.exception(
                "Textacy Processor: got an error in extracting content: %r",
                doc)
        else:
            # The else branch is not covered by the handler above, so
            # errors from set_text still propagate to the caller.
            yield set_text(doc, cleaned)
Exemple #6
0
def process(content, env, **settings):
    """Strip markup from every document's text using BeautifulSoup.

    For every ``doc`` in ``content``, ``doc['text']`` is parsed with
    ``BeautifulSoup(text, 'html.parser')`` and replaced by the parser's
    ``get_text()`` output via ``set_text(doc, clean)``.

    A document is logged and skipped if parsing/extraction raises or if
    ``set_text`` raises. A failing ``doc['text']`` lookup is not caught.

    ``env`` and ``settings`` are accepted but not used by this processor.
    """

    for doc in content:
        text = doc['text']
        try:
            soup = BeautifulSoup(text, 'html.parser')
            clean = soup.get_text()
        except Exception:
            logger.exception(
                "BS4 Processor: got an error in extracting content: %r", doc)
            continue

        # Guard only the conversion. The yield stays outside the try so an
        # exception the consumer throws into the generator is not swallowed
        # and misreported as a conversion error.
        try:
            converted = set_text(doc, clean)
        except Exception:
            logger.exception(
                "BS4 Processor: got an error converting to Doc: %r", doc)
            continue

        yield converted