Example #1
0
    def add_texts(self, texts, metadatas=None, n_threads=4, batch_size=1000):
        """
        Pipe a stream of texts (plus, optionally, a parallel stream of
        metadata dicts) through spaCy and append the results to the corpus
        as :class:`textacy.Doc <textacy.doc.Doc>` s.

        Args:
            texts (Iterable[str]): Stream of texts to add to corpus as ``Doc`` s
            metadatas (Iterable[dict]): Stream of dictionaries of relevant
                document metadata. **Note:** This stream must align exactly with
                ``texts``, or metadata will be mis-assigned to texts. More
                concretely, the first item in ``metadatas`` will be assigned to
                the first item in ``texts``, and so on from there.
            n_threads (int): Number of threads to use when processing ``texts``
                in parallel, if available.
            batch_size (int): Number of texts to process at a time.

        See Also:
            :func:`fileio.split_record_fields()`
            http://spacy.io/docs/#multi-threaded
        """
        parsed_docs = self.spacy_lang.pipe(
            texts, n_threads=n_threads, batch_size=batch_size)
        # Pair every parsed doc with its metadata dict, or with None when no
        # metadata stream was supplied.
        if metadatas:
            pairs = zip(parsed_docs, metadatas)
        else:
            pairs = ((parsed, None) for parsed in parsed_docs)
        for parsed, meta in pairs:
            self._add_textacy_doc(
                Doc(parsed, lang=self.spacy_lang, metadata=meta))
Example #2
0
 def emotional_valence(text):
     """Parse *text* into a Doc, score its tokens along eight emotion
     categories, and return those scores as a fixed-order numpy array.

     NOTE(review): the call below uses the name ``emotional_valence`` —
     the same name as this function. After this ``def`` executes, that
     name resolves to this function itself, so as written the call
     recurses infinitely. Presumably an imported scorer (e.g. textacy's
     lexicon-based ``emotional_valence``) was meant and is being shadowed
     by this definition — confirm against the module's imports and rename
     one of the two.
     """
     doc = Doc(text, lang='en')
     # 'pretrained_models' is passed as the lexicon/model directory — TODO
     # confirm this path is resolved relative to the working directory.
     scores = emotional_valence(doc.tokens, dm_data_dir='pretrained_models')
     # Fixed ordering of the eight emotion categories in the output vector.
     return np.array([
         scores['AFRAID'], scores['AMUSED'], scores['ANGRY'],
         scores['ANNOYED'], scores['DONT_CARE'], scores['HAPPY'],
         scores['INSPIRED'], scores['SAD']
     ])
Example #3
0
    def add_text(self, text, metadata=None):
        """
        Wrap ``text`` and optional ``metadata`` in a
        :class:`textacy.Doc <textacy.doc.Doc>`, then append it to the corpus.

        Args:
            text (str): Document (text) content to add to corpus as a ``Doc``.
            metadata (dict): Dictionary of relevant document metadata.
        """
        doc = Doc(text, lang=self.spacy_lang, metadata=metadata)
        self._add_textacy_doc(doc)
Example #4
0
    def test_build_phrase_models(self, Phrases):
        """``build_phrase_models`` trains Phrases model(s) up to the requested
        n-gram ``level`` and saves each one as '<base>.<level>'."""
        from eea.corpus.processing.phrases.phrases import build_phrase_models
        from textacy.doc import Doc

        content = [Doc('hello'), Doc('world')]

        phrases = Phrases()
        Phrases.return_value = phrases

        build_phrase_models(content, '/corpus/some.csv.phras', {'level': 2})

        # build_phrase_models constructed Phrases once for level 2; the
        # explicit Phrases() call above accounts for the second invocation
        # on the mock.
        assert Phrases.call_count == 2
        assert phrases.save.call_args[0] == ('/corpus/some.csv.phras.2', )

        build_phrase_models(content, '/corpus/some.csv.phras', {'level': 3})

        # level=3 constructs Phrases twice more (the mock's call count
        # accumulates across calls: 2 -> 4), and the last save targets
        # the '.3' suffix.
        assert Phrases.call_count == 4
        assert phrases.save.call_args[0] == ('/corpus/some.csv.phras.3', )
Example #5
0
 def polarity(text):
     """Compute per-sentence polarity scores for *text* via the module-level
     ``analyzer`` and return their mean as a numpy array of
     (neg, neu, pos, compound)."""
     doc = Doc(text, lang='en')
     per_sentence = []
     for span in doc.sents:
         score = analyzer.polarity_scores(span.text)
         per_sentence.append(np.array([
             score['neg'], score['neu'], score['pos'], score['compound']
         ]))
     # Average component-wise across all sentences.
     return np.mean(np.stack(per_sentence), axis=0)
Example #6
0
    def load(cls, path, name=None, compression=None):
        """
        Load content and metadata from disk, and initialize a ``Corpus``.

        Args:
            path (str): Directory on disk where content + metadata are saved.
            name (str): Identifying/uniquifying name prepended to the default
                filenames 'spacy_docs.bin', 'metadatas.json', and 'info.json',
                used when corpus was saved to disk via :meth:`Corpus.save()`.
            compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
                used to reduce size of 'metadatas.json' file when saved, if any.

        Returns:
            :class:`textacy.Corpus <Corpus>`

        .. warning:: If the ``spacy.Vocab`` object used to save this document is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        # Reconstruct the filenames Corpus.save() would have written, with the
        # optional ``name`` prefix joined by an underscore.
        if name:
            info_fname = os.path.join(path, '_'.join([name, 'info.json']))
            meta_fname = os.path.join(path, '_'.join([name, 'metadatas.json']))
            docs_fname = os.path.join(path, '_'.join([name, 'spacy_docs.bin']))
        else:
            info_fname = os.path.join(path, 'info.json')
            meta_fname = os.path.join(path, 'metadatas.json')
            docs_fname = os.path.join(path, 'spacy_docs.bin')
        # Compressed metadata carries the matching filename extension.
        meta_fname = meta_fname + ('.gz' if compression == 'gzip'
                                   else '.bz2' if compression == 'bz2'
                                   else '.xz' if compression == 'lzma'
                                   else '')
        # NOTE(review): on Python 2 with compression enabled, metadata is read
        # in binary ('rb') mode; text ('rt') mode otherwise — presumably
        # because PY2's compressed-file objects lack text-mode support; confirm.
        meta_mode = 'rt' if PY2 is False or compression is None else 'rb'
        # info file holds a single record describing the saved package.
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        spacy_version = package_info['spacy_version']
        # A spaCy version mismatch may invalidate the serialized docs: warn,
        # but continue loading.
        if spacy_version != spacy.about.__version__:
            msg = """
                the spaCy version used to save this Corpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded Corpus may not be valid!
                """.format(spacy_version, spacy.about.__version__)
            warnings.warn(msg, UserWarning)
        corpus = Corpus(lang)
        # Stream docs and metadata in lockstep; on-disk ordering pairs them.
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            corpus.add_doc(
                Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
        return corpus
Example #7
0
def preprocess_text_string(text):
    """Preprocesses text for feature extraction.

    Preprocessing tasks are as follows:
        - whitespace normalization
        - fixing broken unicode via ftfy
        - converting text to lowercase
        - replacing url strings with 'url'
        - replacing phone number strings with 'phone'
        - replacing currency symbols with their standard 3-letter abbreviations
        - stripping punctuation
        - replacing contractions with their unshortened forms
        - lemmatizing words

    Parameters
    ----------
    text : str
        The input text to be preprocessed.

    Returns
    -------
    preprocessed : str
        The preprocessed output text.
    """
    # Normalization switches for the textacy preprocessing pass.
    cleanup_flags = dict(
        fix_unicode=True,
        lowercase=True,
        no_urls=True,
        no_phone_numbers=True,
        no_currency_symbols=True,
        no_punct=True,
        no_contractions=True,
    )
    cleaned = preprocess_text(text, **cleanup_flags)
    doc = Doc(cleaned, lang='en')
    # Unigram lemmas only; entities are not merged into multi-word terms.
    lemmas = doc.to_terms_list(
        ngrams=1, named_entities=False, as_strings=True, normalize='lemma')
    return ' '.join(lemmas)
Example #8
0
    def _get_quotes(self):
        """For each stance, return the total length of direct quotations in
        its article body divided by the body's length (one value per stance,
        in order)."""
        ratios = []
        for stance in tqdm.tqdm(self._stances):
            raw = self._original_articles.get(stance['Body ID'])
            body = raw.decode('utf-8', 'replace')
            doc = Doc(content=body, lang=u'en')
            # Accumulate the lengths of all quoted spans, then normalize by
            # the body's length.
            total = 0
            for quote in direct_quotations(doc):
                total = total + len(quote[2])
            ratios.append(total / len(body))

        return ratios
Example #9
0
    def add_doc(self, doc, metadata=None):
        """
        Append an existing :class:`textacy.Doc <textacy.doc.Doc>` to the
        corpus, or wrap a ``spacy.Doc`` in a new one and append that.

        Args:
            doc (``textacy.Doc`` or ``spacy.Doc``)
            metadata (dict): Dictionary of relevant document metadata. If ``doc``
                is a ``spacy.Doc``, it will be paired as usual; if ``doc`` is a
                ``textacy.Doc``, it will *overwrite* any existing metadata.

        .. warning:: If ``doc`` was already added to this or another ``Corpus``,
            it will be deep-copied and then added as if a new document. A warning
            message will be logged. This is probably not a thing you should do.
        """
        if isinstance(doc, Doc):
            # A Doc parsed against a different vocab cannot live in this corpus.
            if doc.spacy_vocab is not self.spacy_vocab:
                raise ValueError(
                    'Doc.spacy_vocab {} != Corpus.spacy_vocab {}'.format(
                        doc.spacy_vocab, self.spacy_vocab))
            if hasattr(doc, 'corpus_index'):
                # Already owned by some corpus: detach via a deep copy so the
                # two corpora don't share mutable state.
                doc = copy.deepcopy(doc)
                # TODO: make this into a logging warning
                print(
                    '**WARNING: Doc already associated with a Corpus; adding anyway...'
                )
            if metadata is not None:
                doc.metadata = metadata
            self._add_textacy_doc(doc)
        elif isinstance(doc, SpacyDoc):
            if doc.vocab is not self.spacy_vocab:
                raise ValueError(
                    'SpacyDoc.vocab {} != Corpus.spacy_vocab {}'.format(
                        doc.vocab, self.spacy_vocab))
            self._add_textacy_doc(
                Doc(doc, lang=self.spacy_lang, metadata=metadata))
        else:
            raise ValueError(
                '`doc` must be {}, not "{}"'.format({Doc, SpacyDoc}, type(doc)))
Example #10
0
def doc_creator(text):
    """Normalize *text* (fix unicode, lowercase, drop numbers/punctuation/
    accents, expand contractions) and return it wrapped in a ``Doc`` parsed
    with the 'en_core_web_md' model."""
    normalized = preprocess_text(
        text,
        fix_unicode=True,
        lowercase=True,
        no_numbers=True,
        no_punct=True,
        no_contractions=True,
        no_accents=True,
    )
    return Doc(normalized, lang="en_core_web_md")