Beispiel #1
0
    def __init__(self, content, metadata=None, lang=detect_language):
        """
        Wrap ``content`` (raw text or an already-parsed spacy doc) together
        with its metadata, vocab, string store and language.

        Args:
            content: ``unicode_`` text to be parsed, or a ``SpacyDoc``
            metadata (optional): kept as-is when truthy, otherwise replaced
                by a new empty dict
            lang: a ``SpacyLang`` pipeline, a ``unicode_`` language
                identifier, or a callable that receives the text and returns
                the identifier passed on to ``data.load_spacy``

        Raises:
            ValueError: if ``content`` or ``lang`` has an unsupported type, or
                if ``lang`` disagrees with the vocab / language of a
                ``SpacyDoc`` given as ``content``
        """
        self.metadata = metadata or {}

        is_text = isinstance(content, unicode_)
        # neither text nor a spacy doc: the user has made some sort of mistake
        if not is_text and not isinstance(content, SpacyDoc):
            raise ValueError(
                '`Doc` must be initialized with {} content, not "{}"'.format(
                    {unicode_, SpacyDoc}, type(content)))

        if is_text:
            # raw text has to be parsed by a spacy.Language; the SpacyLang
            # test must come before the generic callable test, because the
            # pipeline object is itself called on the text below
            if isinstance(lang, SpacyLang):
                self.lang = lang.lang
                nlp = lang
            elif isinstance(lang, unicode_):
                self.lang = get_lang_class(lang).lang
                nlp = data.load_spacy(lang)
            elif callable(lang):
                self.lang = lang(content)
                nlp = data.load_spacy(self.lang)
            else:
                raise ValueError(
                    '`lang` must be {}, not "{}"'.format(
                        {unicode_, SpacyLang, types.FunctionType}, type(lang)))
            self.spacy_vocab = nlp.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = nlp(content)
            return

        # content is an already-parsed spacy.Doc: reuse its vocab
        self.spacy_vocab = content.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.spacy_doc = content
        self.lang = self.spacy_vocab.lang
        # sanity checks on `lang`, in case the caller passed something
        # inconsistent with the doc
        if isinstance(lang, SpacyLang):
            if self.spacy_vocab is not lang.vocab:
                raise ValueError(
                    '`spacy.Vocab` used to parse `content` must be the same '
                    'as the one associated with the `lang` param')
        elif isinstance(lang, unicode_):
            if lang != self.lang:
                raise ValueError(
                    'lang of spacy models used to parse `content` must be '
                    'the same as the `lang` param')
        elif not callable(lang):
            raise ValueError(
                '`lang` must be {}, not "{}"'.format(
                    {unicode_, SpacyLang, types.FunctionType}, type(lang)))
Beispiel #2
0
 def setUp(self):
     """
     Build ``self.spacy_doc`` from a fixed English paragraph and overwrite its
     TAG, HEAD and DEP attributes with the hard-coded values below.
     """
     self.maxDiff = None
     spacy_pipeline = data.load_spacy('en')
     text = """
         Two weeks ago, I was in Kuwait participating in an I.M.F. seminar for Arab educators. For 30 minutes, we discussed the impact of technology trends on education in the Middle East. And then an Egyptian education official raised his hand and asked if he could ask me a personal question: "I heard Donald Trump say we need to close mosques in the United States," he said with great sorrow. "Is that what we want our kids to learn?"
         """
     # surrounding whitespace of the literal is stripped before processing
     self.spacy_doc = spacy_pipeline(text.strip())
     # column order of each row in `values`
     cols = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # NOTE(review): presumably one [TAG, HEAD, DEP] row per token of the
     # stripped text, with ids specific to the model that produced them
     # -- confirm before editing the text or the model
     values = np.array(
         [[425, 1, 1500074], [443, 1, 392], [447, 3, 365], [416, 2, 407],
          [445, 1, 393], [455, 0, 53503], [432, -1, 405], [441, -1, 401],
          [456, -3, 364], [432, -1, 405], [426, 2, 379], [441, 1, 9480],
          [440, -3, 401], [432, -1, 405], [433, 1, 367], [443, -2, 401],
          [419, -11, 407], [432, 5, 405], [425, 1, 1500074], [443, -2, 401],
          [416, 2, 407], [445, 1, 393], [455, 0, 53503], [426, 1, 379],
          [440, -2, 380], [432, -1, 405], [440, 1, 9480], [443, -2, 401],
          [432, -1, 405], [440, -1, 401], [432, -1, 405], [426, 2, 379],
          [441, 1, 9480], [441, -3, 401], [419, -12, 407], [424, 6, 372],
          [447, 5, 365], [426, 3, 379], [433, 2, 367], [440, 1, 9480],
          [440, 1, 393], [455, 32, 373], [446, 1, 402], [440, -2, 380],
          [424, -3, 372], [455, -4, 375], [432, 3, 387], [445, 2, 393],
          [437, 1, 370], [454, -4, 373], [445, -1, 93815], [426, 2, 379],
          [433, 1, 367], [440, -4, 380], [420, -1, 407], [465, -2, 407],
          [445, 1, 393], [455, -4, 63716], [441, 1, 9480], [441, 1, 393],
          [458, -3, 373], [445, 1, 393], [458, -2, 373], [452, 1, 370],
          [454, -2, 411], [443, -1, 380], [432, -1, 405], [426, 2, 379],
          [441, 1, 9480], [441, -3, 401], [416, 3, 407], [415, 2, 407],
          [445, 1, 393], [455, 0, 53503], [432, -1, 405], [433, 1, 367],
          [440, -2, 401], [419, -4, 407], [465, 1, 407], [459, 0, 53503],
          [426, -1, 393], [461, 2, 380], [445, 1, 393], [458, -3, 373],
          [446, 1, 402], [443, 2, 393], [452, 1, 370], [454, -4, 373],
          [419, -9, 407], [415, -10, 407]],
         dtype='int32')
     self.spacy_doc.from_array(cols, values)
Beispiel #3
0
    def load(cls, path, fname_prefix=None):
        """
        Read a TextDoc's serialized content and metadata back from disk.

        Args:
            path (str): directory on disk holding the content + metadata files
            fname_prefix (str, optional): extra identifying information that
                is prepended (joined with '_') to the standard filenames
                'spacy_doc.bin' and 'metadata.json'

        Returns:
            :class:`textacy.TextDoc`

        Warns:
            UserWarning: if the spaCy version stored in the metadata differs
                from ``spacy.about.__version__``
        """
        meta_name = 'metadata.json'
        docs_name = 'spacy_doc.bin'
        if fname_prefix:
            meta_name = '_'.join([fname_prefix, meta_name])
            docs_name = '_'.join([fname_prefix, docs_name])
        meta_fname = os.path.join(path, meta_name)
        docs_fname = os.path.join(path, docs_name)

        # only the first record read from the metadata file is used
        metadata = list(fileio.read_json(meta_fname))[0]
        # these two keys are bookkeeping entries and are removed from the
        # metadata handed to the new instance
        lang = metadata.pop('textacy_lang')
        saved_version = metadata.pop('spacy_version')
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            msg = """
                the spaCy version used to save this TextDoc to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextDoc may not be valid!
                """.format(saved_version, installed_version)
            warnings.warn(msg, UserWarning)

        spacy_vocab = data.load_spacy(lang).vocab
        # likewise, only the first doc read from the binary file is used
        spacy_docs = list(fileio.read_spacy_docs(spacy_vocab, docs_fname))
        return cls(spacy_docs[0], lang=lang, metadata=metadata)
Beispiel #4
0
    def __init__(self, lang, texts=None, docs=None, metadatas=None):
        """
        Set up an empty corpus bound to one spacy pipeline, then optionally
        fill it from ``texts`` or from ``docs``.

        Args:
            lang: ``unicode_`` language identifier (the pipeline is loaded
                via ``data.load_spacy``) or an already-loaded ``SpacyLang``
            texts (optional): passed to ``self.add_texts``
            docs (optional): each item is passed to ``self.add_doc``
            metadatas (optional): forwarded to ``add_texts``, or zipped with
                ``docs`` and passed one-by-one to ``add_doc``

        Raises:
            ValueError: if ``lang`` has an unsupported type, or if both
                ``texts`` and ``docs`` are given
        """
        if not isinstance(lang, (unicode_, SpacyLang)):
            msg = '`lang` must be {}, not "{}"'.format({unicode_, SpacyLang},
                                                       type(lang))
            raise ValueError(msg)
        if isinstance(lang, unicode_):
            self.lang = get_lang_class(lang).lang
            self.spacy_lang = data.load_spacy(lang)
        else:
            self.lang = lang.lang
            self.spacy_lang = lang

        self.spacy_vocab = self.spacy_lang.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.docs = []
        self.n_docs = 0
        self.n_tokens = 0
        # sentences are only counted when the pipeline has a parser
        if self.spacy_lang.parser:
            self.n_sents = 0
        else:
            self.n_sents = None

        if texts and docs:
            msg = 'Corpus may be initialized with either `texts` or `docs`, but not both.'
            raise ValueError(msg)
        if texts:
            self.add_texts(texts, metadatas=metadatas)
        elif docs:
            if not metadatas:
                for doc in docs:
                    self.add_doc(doc)
            else:
                for doc, metadata in zip_(docs, metadatas):
                    self.add_doc(doc, metadata=metadata)
Beispiel #5
0
 def setUp(self):
     """
     Build ``self.spacy_doc`` from a fixed English paragraph and overwrite its
     TAG, HEAD and DEP attributes with the hard-coded values below.
     """
     self.maxDiff = None
     spacy_pipeline = data.load_spacy('en')
     text = """
         Two weeks ago, I was in Kuwait participating in an I.M.F. seminar for Arab educators. For 30 minutes, we discussed the impact of technology trends on education in the Middle East. And then an Egyptian education official raised his hand and asked if he could ask me a personal question: "I heard Donald Trump say we need to close mosques in the United States," he said with great sorrow. "Is that what we want our kids to learn?"
         """
     # surrounding whitespace of the literal is stripped before processing
     self.spacy_doc = spacy_pipeline(text.strip())
     # column order of each row in `values`
     cols = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # NOTE(review): presumably one [TAG, HEAD, DEP] row per token of the
     # stripped text, with ids specific to the model that produced them
     # -- confirm before editing the text or the model
     values = np.array(
         [[425, 1, 1500074], [443, 1, 392], [447, 3, 365], [416, 2, 407], [445, 1, 393],
          [455, 0, 53503], [432, -1, 405], [441, -1, 401], [456, -3, 364],
          [432, -1, 405], [426, 2, 379], [441, 1, 9480], [440, -3, 401], [432, -1, 405],
          [433, 1, 367], [443, -2, 401], [419, -11, 407], [432, 5, 405],
          [425, 1, 1500074], [443, -2, 401], [416, 2, 407], [445, 1, 393],
          [455, 0, 53503], [426, 1, 379], [440, -2, 380], [432, -1, 405], [440, 1, 9480],
          [443, -2, 401], [432, -1, 405], [440, -1, 401], [432, -1, 405], [426, 2, 379],
          [441, 1, 9480], [441, -3, 401], [419, -12, 407], [424, 6, 372], [447, 5, 365],
          [426, 3, 379], [433, 2, 367], [440, 1, 9480], [440, 1, 393], [455, 32, 373],
          [446, 1, 402], [440, -2, 380], [424, -3, 372], [455, -4, 375], [432, 3, 387],
          [445, 2, 393], [437, 1, 370], [454, -4, 373], [445, -1, 93815], [426, 2, 379],
          [433, 1, 367], [440, -4, 380], [420, -1, 407], [465, -2, 407], [445, 1, 393],
          [455, -4, 63716], [441, 1, 9480], [441, 1, 393], [458, -3, 373], [445, 1, 393],
          [458, -2, 373], [452, 1, 370], [454, -2, 411], [443, -1, 380], [432, -1, 405],
          [426, 2, 379], [441, 1, 9480], [441, -3, 401], [416, 3, 407], [415, 2, 407],
          [445, 1, 393], [455, 0, 53503], [432, -1, 405], [433, 1, 367], [440, -2, 401],
          [419, -4, 407], [465, 1, 407], [459, 0, 53503], [426, -1, 393], [461, 2, 380],
          [445, 1, 393], [458, -3, 373], [446, 1, 402], [443, 2, 393], [452, 1, 370],
          [454, -4, 373], [419, -9, 407], [415, -10, 407]],
         dtype='int32')
     self.spacy_doc.from_array(cols, values)
Beispiel #6
0
    def __init__(self, lang, texts=None, docs=None, metadatas=None):
        """
        Create an empty corpus tied to a single spacy pipeline and, if given,
        populate it from ``texts`` or from ``docs``.

        Args:
            lang: ``unicode_type`` language identifier (stored as-is and used
                to load the pipeline via ``data.load_spacy``) or an
                already-loaded ``SpacyLang``
            texts (optional): passed to ``self.add_texts``
            docs (optional): each item is passed to ``self.add_doc``
            metadatas (optional): forwarded to ``add_texts``, or zipped with
                ``docs`` and passed one-by-one to ``add_doc``

        Raises:
            ValueError: if ``lang`` has an unsupported type, or if both
                ``texts`` and ``docs`` are given
        """
        if not isinstance(lang, (unicode_type, SpacyLang)):
            msg = '`lang` must be {}, not "{}"'.format(
                {unicode_type, SpacyLang}, type(lang))
            raise ValueError(msg)
        if isinstance(lang, unicode_type):
            self.lang = lang
            self.spacy_lang = data.load_spacy(self.lang)
        else:
            self.lang = lang.lang
            self.spacy_lang = lang

        self.spacy_vocab = self.spacy_lang.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.docs = []
        self.n_docs = 0
        self.n_tokens = 0
        # sentences are only counted when the pipeline has a parser
        if self.spacy_lang.parser:
            self.n_sents = 0
        else:
            self.n_sents = None

        if texts and docs:
            msg = 'Corpus may be initialized with either `texts` or `docs`, but not both.'
            raise ValueError(msg)
        if texts:
            self.add_texts(texts, metadatas=metadatas)
        elif docs:
            if not metadatas:
                for doc in docs:
                    self.add_doc(doc)
            else:
                for doc, metadata in zip(docs, metadatas):
                    self.add_doc(doc, metadata=metadata)
Beispiel #7
0
    def __init__(self, text_or_sdoc, spacy_pipeline=None, lang=None, metadata=None):
        """
        Build a TextDoc from raw text or from an already-processed spacy doc.

        Args:
            text_or_sdoc (str or ``sdoc``): text to be processed by a spacy
                pipeline, or a spacy doc to be wrapped as-is
            spacy_pipeline (optional): pipeline used to process text; if None
                and text is given, one is loaded with ``data.load_spacy``
            lang (str, optional): language of the content; if falsy, it is
                taken from ``spacy_pipeline`` (spacy doc input only) or
                detected from the text
            metadata (dict, optional): stored on the instance; a new empty
                dict is used when None

        Raises:
            ValueError: if text is given together with a ``spacy_pipeline``
                whose ``lang`` differs from the (given or detected) language,
                or if ``text_or_sdoc`` is neither text nor a spacy doc
        """
        self.metadata = {} if metadata is None else metadata
        self._term_counts = Counter()

        if isinstance(text_or_sdoc, str):
            self.lang = lang if lang else text_utils.detect_language(text_or_sdoc)
            if spacy_pipeline is None:
                spacy_pipeline = data.load_spacy(self.lang)
            # check for match between text and passed spacy_pipeline language
            elif spacy_pipeline.lang != self.lang:
                msg = 'TextDoc.lang {} != spacy_pipeline.lang {}'.format(
                    self.lang, spacy_pipeline.lang)
                raise ValueError(msg)
            self.spacy_vocab = spacy_pipeline.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = spacy_pipeline(text_or_sdoc)

        elif isinstance(text_or_sdoc, sdoc):
            # precedence: pipeline language, then the explicit `lang`
            # argument, then detection; previously an explicit `lang` was
            # silently ignored here and detection always ran
            if spacy_pipeline is not None:
                self.lang = spacy_pipeline.lang
            elif lang:
                self.lang = lang
            else:
                self.lang = text_utils.detect_language(text_or_sdoc.text_with_ws)
            self.spacy_vocab = text_or_sdoc.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = text_or_sdoc

        else:
            msg = 'TextDoc must be initialized with {}, not {}'.format(
                {str, sdoc}, type(text_or_sdoc))
            raise ValueError(msg)
Beispiel #8
0
    def __init__(self,
                 text_or_sdoc,
                 spacy_pipeline=None,
                 lang=None,
                 metadata=None):
        """
        Build a TextDoc from raw text or from an already-processed spacy doc.

        Args:
            text_or_sdoc (``string_types`` or ``sdoc``): text to be processed
                by a spacy pipeline, or a spacy doc to be wrapped as-is
            spacy_pipeline (optional): pipeline used to process text; if None
                and text is given, one is loaded with ``data.load_spacy``
            lang (str, optional): language of the content; if falsy, it is
                taken from ``spacy_pipeline`` (spacy doc input only) or
                detected from the text
            metadata (dict, optional): stored on the instance; a new empty
                dict is used when None

        Raises:
            ValueError: if text is given together with a ``spacy_pipeline``
                whose ``lang`` differs from the (given or detected) language,
                or if ``text_or_sdoc`` is neither text nor a spacy doc
        """
        self.metadata = {} if metadata is None else metadata
        self._term_counts = Counter()

        if isinstance(text_or_sdoc, string_types):
            self.lang = lang if lang else text_utils.detect_language(
                text_or_sdoc)
            if spacy_pipeline is None:
                spacy_pipeline = data.load_spacy(self.lang)
            # check for match between text and passed spacy_pipeline language
            elif spacy_pipeline.lang != self.lang:
                msg = 'TextDoc.lang {} != spacy_pipeline.lang {}'.format(
                    self.lang, spacy_pipeline.lang)
                raise ValueError(msg)
            self.spacy_vocab = spacy_pipeline.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = spacy_pipeline(text_or_sdoc)

        elif isinstance(text_or_sdoc, sdoc):
            # precedence: pipeline language, then the explicit `lang`
            # argument, then detection; previously an explicit `lang` was
            # silently ignored here and detection always ran
            if spacy_pipeline is not None:
                self.lang = spacy_pipeline.lang
            elif lang:
                self.lang = lang
            else:
                self.lang = text_utils.detect_language(
                    text_or_sdoc.text_with_ws)
            self.spacy_vocab = text_or_sdoc.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = text_or_sdoc

        else:
            msg = 'TextDoc must be initialized with {}, not {}'.format(
                {str, sdoc}, type(text_or_sdoc))
            raise ValueError(msg)
Beispiel #9
0
 def setUp(self):
     """Process a fixed English paragraph with 'en_core_web_sm' into ``self.spacy_doc``."""
     self.maxDiff = None
     text = """
         Two weeks ago, I was in Kuwait participating in an I.M.F. seminar for Arab educators. For 30 minutes, we discussed the impact of technology trends on education in the Middle East. And then an Egyptian education official raised his hand and asked if he could ask me a personal question: "I heard Donald Trump say we need to close mosques in the United States," he said with great sorrow. "Is that what we want our kids to learn?"
         """
     nlp = data.load_spacy('en_core_web_sm')
     # leading/trailing whitespace of the literal is dropped before processing
     stripped = text.strip()
     self.spacy_doc = nlp(stripped)
Beispiel #10
0
    def __init__(self, content, metadata=None, lang=None):
        """
        Wrap ``content`` (raw text or an already-parsed spacy doc) together
        with its metadata, vocab, string store and language.

        Args:
            content: ``unicode_type`` text to be parsed, or a ``SpacyDoc``
            metadata (optional): kept as-is when truthy, otherwise replaced
                by a new empty dict
            lang (optional): a ``SpacyLang`` pipeline or a ``unicode_type``
                language identifier; if None and ``content`` is text, the
                language is detected with ``text_utils.detect_language``

        Raises:
            ValueError: if ``content`` or ``lang`` has an unsupported type, or
                if ``lang`` disagrees with the vocab / language of a
                ``SpacyDoc`` given as ``content``
        """
        self.metadata = metadata or {}

        is_text = isinstance(content, unicode_type)
        # neither text nor a spacy doc: the user has made some sort of mistake
        if not is_text and not isinstance(content, SpacyDoc):
            msg = '`Doc` must be initialized with {}, not "{}"'.format(
                {unicode_type, SpacyDoc}, type(content))
            raise ValueError(msg)

        if is_text:
            # raw text has to be parsed by a spacy.Language
            if isinstance(lang, SpacyLang):
                self.lang = lang.lang
                nlp = lang
            elif isinstance(lang, unicode_type):
                self.lang = lang
                nlp = data.load_spacy(self.lang)
            elif lang is None:
                self.lang = text_utils.detect_language(content)
                nlp = data.load_spacy(self.lang)
            else:
                msg = '`lang` must be {}, not "{}"'.format(
                    {unicode_type, SpacyLang}, type(lang))
                raise ValueError(msg)
            self.spacy_vocab = nlp.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = nlp(content)
            return

        # content is an already-parsed spacy.Doc: reuse its vocab
        self.spacy_vocab = content.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.spacy_doc = content
        self.lang = self.spacy_vocab.lang
        # sanity checks on `lang`, in case the caller passed something
        # inconsistent with the doc
        if isinstance(lang, SpacyLang):
            if self.spacy_vocab is not lang.vocab:
                msg = '`spacy.Vocab` used to parse `content` must be the same as the one associated with `lang`'
                raise ValueError(msg)
        elif isinstance(lang, unicode_type):
            if lang != self.lang:
                raise ValueError(
                    'lang of spacy models used to parse `content` must be the same as `lang`'
                )
        elif lang is not None:
            msg = '`lang` must be {}, not "{}"'.format(
                {unicode_type, SpacyLang}, type(lang))
            raise ValueError(msg)
Beispiel #11
0
 def setUp(self):
     """Process a short four-line English text with the 'en' pipeline into ``self.spacy_doc``."""
     self.maxDiff = None
     # the literal is passed on unchanged, including the indentation of
     # its continuation lines
     text = """The unit tests aren't going well.
               I love Python, but I don't love some of Guido's decisions.
               No computer programmers were harmed in the making of this package.
               Thank God for Stack Overflow."""
     nlp = data.load_spacy('en')
     self.spacy_doc = nlp(text)
Beispiel #12
0
 def setUp(self):
     """Process a short four-line English text with the 'en' pipeline into ``self.spacy_doc``."""
     text = """
     The unit tests aren't going well.
     I love Python, but I don't love backwards incompatibilities.
     No programmers were permanently damaged for textacy's sake.
     Thank God for Stack Overflow."""
     nlp = data.load_spacy('en')
     # only the outer whitespace is removed; inner line indentation stays
     stripped = text.strip()
     self.spacy_doc = nlp(stripped)
Beispiel #13
0
 def setUp(self):
     """Run the 'en' pipeline over a short four-line English text and keep the result."""
     self.maxDiff = None
     # the literal is handed to the pipeline verbatim, continuation-line
     # indentation included
     text = """The unit tests aren't going well.
               I love Python, but I don't love some of Guido's decisions.
               No computer programmers were harmed in the making of this package.
               Thank God for Stack Overflow."""
     pipeline = data.load_spacy('en')
     self.spacy_doc = pipeline(text)
Beispiel #14
0
    def __init__(self, content, metadata=None, lang=None):
        """
        Initialize from text (parsed here with a spacy pipeline) or from an
        already-parsed spacy doc (wrapped as-is).

        Args:
            content: ``unicode_type`` text or a ``SpacyDoc``
            metadata (optional): kept when truthy, else a new empty dict
            lang (optional): ``SpacyLang`` pipeline or ``unicode_type``
                language identifier; None means "detect from the text" for
                text input and "no check" for spacy doc input

        Raises:
            ValueError: on an unsupported ``content`` or ``lang`` type, or
                when ``lang`` does not match the vocab / language of a
                ``SpacyDoc`` passed as ``content``
        """
        self.metadata = metadata or {}

        content_is_text = isinstance(content, unicode_type)
        # anything other than text or a spacy doc is a caller mistake
        if not content_is_text and not isinstance(content, SpacyDoc):
            msg = '`Doc` must be initialized with {}, not "{}"'.format(
                {unicode_type, SpacyDoc}, type(content))
            raise ValueError(msg)

        if content_is_text:
            # pick the pipeline that will parse the text
            if isinstance(lang, SpacyLang):
                self.lang = lang.lang
                pipeline = lang
            elif isinstance(lang, unicode_type):
                self.lang = lang
                pipeline = data.load_spacy(self.lang)
            elif lang is None:
                self.lang = text_utils.detect_language(content)
                pipeline = data.load_spacy(self.lang)
            else:
                msg = '`lang` must be {}, not "{}"'.format(
                    {unicode_type, SpacyLang}, type(lang))
                raise ValueError(msg)
            self.spacy_vocab = pipeline.vocab
            self.spacy_stringstore = self.spacy_vocab.strings
            self.spacy_doc = pipeline(content)
            return

        # already-parsed spacy.Doc: take vocab and language from the doc
        self.spacy_vocab = content.vocab
        self.spacy_stringstore = self.spacy_vocab.strings
        self.spacy_doc = content
        self.lang = self.spacy_vocab.lang
        # complain if `lang` contradicts what the doc was parsed with
        if isinstance(lang, SpacyLang):
            if self.spacy_vocab is not lang.vocab:
                msg = '`spacy.Vocab` used to parse `content` must be the same as the one associated with `lang`'
                raise ValueError(msg)
        elif isinstance(lang, unicode_type):
            if lang != self.lang:
                raise ValueError('lang of spacy models used to parse `content` must be the same as `lang`')
        elif lang is not None:
            msg = '`lang` must be {}, not "{}"'.format(
                {unicode_type, SpacyLang}, type(lang))
            raise ValueError(msg)
Beispiel #15
0
 def __init__(self, lang):
     """
     Load the spacy pipeline for ``lang`` and start with no docs and all
     counters at zero.
     """
     self.lang = lang
     pipeline = data.load_spacy(self.lang)
     self.spacy_pipeline = pipeline
     self.spacy_vocab = pipeline.vocab
     self.spacy_stringstore = self.spacy_vocab.strings
     # empty collection: nothing added yet
     self.docs = []
     self.n_docs = self.n_sents = self.n_tokens = 0
Beispiel #16
0
 def __init__(self, lang):
     """
     Bind the instance to the spacy pipeline loaded for ``lang``; the doc
     list starts empty and the doc/sentence/token counters start at 0.
     """
     self.lang = lang
     nlp = data.load_spacy(self.lang)
     self.spacy_pipeline = nlp
     vocab = nlp.vocab
     self.spacy_vocab = vocab
     self.spacy_stringstore = vocab.strings
     self.docs = []
     self.n_tokens = 0
     self.n_sents = 0
     self.n_docs = 0
Beispiel #17
0
 def setUp(self):
     """
     Build ``self.spacy_doc`` from a two-sentence text and overwrite its
     TAG, HEAD and DEP attributes with the hard-coded values below.
     """
     text = "I would have lived in peace. But my enemies brought me war."
     spacy_lang = data.load_spacy('en_core_web_sm')
     self.spacy_doc = spacy_lang(text)
     # column order of each row in `values`
     cols = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # NOTE(review): presumably one [TAG, HEAD, DEP] row per token, with ids
     # specific to the model that produced them -- confirm before changing
     # the text or the model name
     values = np.array(
         [[479, 3, 425], [471, 2, 401], [488, 1, 401],
          [491, 0, 512817], [466, -1, 439], [474, -1, 435],
          [453, -3, 441], [458, 3, 403], [480, 1, 436],
          [477, 1, 425], [489, 0, 512817], [479, -1, 412],
          [474, -2, 412], [453, -3, 441]], dtype='int32')
     self.spacy_doc.from_array(cols, values)
Beispiel #18
0
 def setUp(self):
     """
     Build ``self.spacy_doc`` from a two-sentence text and overwrite its
     TAG, HEAD and DEP attributes with the hard-coded values below.
     """
     text = "I would have lived in peace. But my enemies brought me war."
     # we're not loading all models for speed; instead, we're updating the doc
     # with pre-computed part-of-speech tagging and parsing values
     spacy_pipeline = data.load_spacy('en')
     self.spacy_doc = spacy_pipeline(text)
     # column order of each row in `values`
     cols = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # NOTE(review): presumably one [TAG, HEAD, DEP] row per token
     # -- confirm before changing the text
     values = np.array(
         [[445, 3, 393], [437, 2, 370], [454, 1, 370], [457, 0, 53503],
          [432, -1, 405], [440, -1, 401], [419, -3, 407], [424, 3, 372],
          [446, 1, 402], [443, 1, 393], [455, 0, 53503], [445, -1, 93815],
          [440, -2, 380], [419, -3, 407]], dtype='int32')
     self.spacy_doc.from_array(cols, values)
Beispiel #19
0
 def __init__(self, lang_or_pipeline):
     """
     Accept either a language string (the pipeline is then loaded with
     ``data.load_spacy``) or anything else, which is used directly as the
     pipeline; the collection starts empty with all counters at zero.
     """
     if not isinstance(lang_or_pipeline, str):
         # treated as an already-loaded pipeline that exposes `.lang`
         self.spacy_pipeline = lang_or_pipeline
         self.lang = self.spacy_pipeline.lang
     else:
         self.lang = lang_or_pipeline
         self.spacy_pipeline = data.load_spacy(self.lang)
     self.spacy_vocab = self.spacy_pipeline.vocab
     self.spacy_stringstore = self.spacy_vocab.strings
     self.docs = []
     self.n_docs = self.n_sents = self.n_tokens = 0
Beispiel #20
0
 def __init__(self, lang_or_pipeline):
     """
     Accept either a ``string_types`` language value (the pipeline is then
     loaded with ``data.load_spacy``) or anything else, which is used
     directly as the pipeline; the collection starts empty with all
     counters at zero.
     """
     if not isinstance(lang_or_pipeline, string_types):
         # treated as an already-loaded pipeline that exposes `.lang`
         self.spacy_pipeline = lang_or_pipeline
         self.lang = self.spacy_pipeline.lang
     else:
         self.lang = lang_or_pipeline
         self.spacy_pipeline = data.load_spacy(self.lang)
     self.spacy_vocab = self.spacy_pipeline.vocab
     self.spacy_stringstore = self.spacy_vocab.strings
     self.docs = []
     self.n_docs = self.n_sents = self.n_tokens = 0
Beispiel #21
0
 def setUp(self):
     """
     Build ``self.spacy_doc`` from a two-sentence text and overwrite its
     TAG, HEAD and DEP attributes with the hard-coded values below.
     """
     text = "I would have lived in peace. But my enemies brought me war."
     # we're not loading all models for speed; instead, we're updating the doc
     # with pre-computed part-of-speech tagging and parsing values
     spacy_pipeline = data.load_spacy('en')
     self.spacy_doc = spacy_pipeline(text)
     # column order of each row in `values`
     cols = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # NOTE(review): presumably one [TAG, HEAD, DEP] row per token
     # -- confirm before changing the text
     values = np.array(
         [[445, 3, 393], [437, 2, 370], [454, 1, 370], [457, 0, 53503],
          [432, -1, 405], [440, -1, 401], [419, -3, 407], [424, 3, 372],
          [446, 1, 402], [443, 1, 393], [455, 0, 53503], [445, -1, 93815],
          [440, -2, 380], [419, -3, 407]],
         dtype='int32')
     self.spacy_doc.from_array(cols, values)
Beispiel #22
0
 def setUp(self):
     """
     Build ``self.spacy_doc`` from a three-sentence text and overwrite its
     TAG, HEAD and DEP attributes with the hard-coded values below.
     """
     text = "The year was 2081, and everybody was finally equal. They weren't only equal before God and the law. They were equal every which way."
     # we're not loading all models for speed; instead, we're updating the doc
     # with pre-computed part-of-speech tagging and parsing values
     spacy_pipeline = data.load_spacy('en')
     self.spacy_doc = spacy_pipeline(text)
     # column order of each row in `values`
     cols = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # NOTE(review): presumably one [TAG, HEAD, DEP] row per token
     # -- confirm before changing the text
     values = np.array(
         [[426, 1, 379], [440, 1, 393], [455, 0, 53503], [425, -1, 369], [416, -2, 407],
         [424, -3, 372], [440, 1, 393], [455, -5, 375], [447, -1, 365], [433, -2, 363],
         [419, -3, 407], [445, 1, 393], [455, 0, 53503], [447, 2, 404], [447, -1, 365],
         [433, -3, 363], [432, -1, 405], [441, -1, 401], [424, -1, 372], [426, 1, 379],
         [440, -3, 375], [419, -9, 407], [445, 1, 393], [455, 0, 53503], [433, -1, 363],
         [426, 2, 379], [460, 1, 379], [440, -4, 392], [419, -5, 407]], dtype='int32')
     self.spacy_doc.from_array(cols, values)
Beispiel #23
0
    def setUp(self):
        """
        Preprocess a multi-paragraph biography excerpt and run it through
        the 'en_core_web_sm' pipeline (called with ``parse=False``), storing
        the result as ``self.spacy_doc``.
        """
        nlp = data.load_spacy('en_core_web_sm')
        text = """
        Friedman joined the London bureau of United Press International after completing his master's degree. He was dispatched a year later to Beirut, where he lived from June 1979 to May 1981 while covering the Lebanon Civil War. He was hired by The New York Times as a reporter in 1981 and re-dispatched to Beirut at the start of the 1982 Israeli invasion of Lebanon. His coverage of the war, particularly the Sabra and Shatila massacre, won him the Pulitzer Prize for International Reporting (shared with Loren Jenkins of The Washington Post). Alongside David K. Shipler he also won the George Polk Award for foreign reporting.

        In June 1984, Friedman was transferred to Jerusalem, where he served as the New York Times Jerusalem Bureau Chief until February 1988. That year he received a second Pulitzer Prize for International Reporting, which cited his coverage of the First Palestinian Intifada. He wrote a book, From Beirut to Jerusalem, describing his experiences in the Middle East, which won the 1989 U.S. National Book Award for Nonfiction.

        Friedman covered Secretary of State James Baker during the administration of President George H. W. Bush. Following the election of Bill Clinton in 1992, Friedman became the White House correspondent for the New York Times. In 1994, he began to write more about foreign policy and economics, and moved to the op-ed page of The New York Times the following year as a foreign affairs columnist. In 2002, Friedman won the Pulitzer Prize for Commentary for his "clarity of vision, based on extensive reporting, in commenting on the worldwide impact of the terrorist threat."

        In February 2002, Friedman met Saudi Crown Prince Abdullah and encouraged him to make a comprehensive attempt to end the Arab-Israeli conflict by normalizing Arab relations with Israel in exchange for the return of refugees alongside an end to the Israel territorial occupations. Abdullah proposed the Arab Peace Initiative at the Beirut Summit that March, which Friedman has since strongly supported.

        Friedman received the 2004 Overseas Press Club Award for lifetime achievement and was named to the Order of the British Empire by Queen Elizabeth II.

        In May 2011, The New York Times reported that President Barack Obama "has sounded out" Friedman concerning Middle East issues.
        """
        # the raw literal goes through preprocess_text before the pipeline
        cleaned = preprocess_text(text)
        self.spacy_doc = nlp(cleaned, parse=False)
Beispiel #24
0
 def setUp(self):
     """
     Build ``self.spacy_doc`` from a three-sentence text and overwrite its
     TAG, HEAD and DEP attributes with the hard-coded values below.
     """
     text = "The year was 2081, and everybody was finally equal. They weren't only equal before God and the law. They were equal every which way."
     # we're not loading all models for speed; instead, we're updating the doc
     # with pre-computed part-of-speech tagging and parsing values
     spacy_pipeline = data.load_spacy('en')
     self.spacy_doc = spacy_pipeline(text)
     # column order of each row in `values`
     cols = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # NOTE(review): presumably one [TAG, HEAD, DEP] row per token
     # -- confirm before changing the text
     values = np.array(
         [[426, 1, 379], [440, 1, 393], [455, 0, 53503], [425, -1, 369],
          [416, -2, 407], [424, -3, 372], [440, 1, 393], [455, -5, 375],
          [447, -1, 365], [433, -2, 363], [419, -3, 407], [445, 1, 393],
          [455, 0, 53503], [447, 2, 404], [447, -1, 365], [433, -3, 363],
          [432, -1, 405], [441, -1, 401], [424, -1, 372], [426, 1, 379],
          [440, -3, 375], [419, -9, 407], [445, 1, 393], [455, 0, 53503],
          [433, -1, 363], [426, 2, 379], [460, 1, 379], [440, -4, 392],
          [419, -5, 407]],
         dtype='int32')
     self.spacy_doc.from_array(cols, values)
Beispiel #25
0
 def setUp(self):
     """
     Prepare the fixtures: the sample text, the 'en' pipeline, a spacy doc
     whose TAG, HEAD and DEP attributes are overwritten with the hard-coded
     values below, and a fresh temporary directory.
     """
     self.text = "The year was 2081, and everybody was finally equal. They weren't only equal before God and the law. They were equal every which way."
     self.spacy_pipeline = data.load_spacy('en')
     self.spacy_doc = self.spacy_pipeline(self.text)
     # column order of each row in `values`
     cols = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # NOTE(review): presumably one [TAG, HEAD, DEP] row per token
     # -- confirm before changing the text
     values = np.array(
         [[426, 1, 379], [440, 1, 393], [455, 0, 53503], [425, -1, 369],
          [416, -2, 407], [424, -3, 372], [440, 1, 393], [455, -5, 375],
          [447, -1, 365], [433, -2, 363], [419, -3, 407], [445, 1, 393],
          [455, 0, 53503], [447, 2, 389], [447, 1, 365], [433, -3, 363],
          [432, -1, 405], [441, -1, 401], [424, -1, 372], [426, 1, 379],
          [440, -3, 375], [419, -9, 407], [445, 1, 393], [455, 0, 53503],
          [433, -1, 363], [426, 2, 379], [460, 1, 379], [440, -4, 392],
          [419, -5, 407]],
         dtype='int32')
     self.spacy_doc.from_array(cols, values)
     # temp dir is created inside the directory that contains this file;
     # NOTE(review): nothing here removes it -- presumably cleaned up in
     # tearDown, confirm
     self.tempdir = tempfile.mkdtemp(
         prefix='test_fileio', dir=os.path.dirname(os.path.abspath(__file__)))
     # directory part of this file's path as given in __file__
     self.tests_dir = os.path.split(__file__)[0]
     self.maxDiff = None
Beispiel #26
0
 def setUp(self):
     """
     Build ``self.spacy_doc`` from a two-sentence text and overwrite its
     TAG, HEAD and DEP attributes with the hard-coded uint64 values below.
     """
     text = "I would have lived in peace. But my enemies brought me war."
     spacy_lang = data.load_spacy('en')
     self.spacy_doc = spacy_lang(text)
     # column order of each row in `values`
     cols = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # NOTE(review): presumably one [TAG, HEAD, DEP] row per token. The HEAD
     # entries 18446744073709551615 / ...614 / ...613 equal 2**64 - 1, - 2
     # and - 3, i.e. -1, -2 and -3 when read as signed 64-bit integers --
     # presumably negative head offsets stored in unsigned form; confirm
     values = np.array(
         [[13656873538139661788, 3, 426], [16235386156175103506, 2, 402],
          [14200088355797579614, 1, 402],
          [3822385049556375858, 0, 8206900633647566924],
          [1292078113972184607, 18446744073709551615, 440],
          [15308085513773655218, 18446744073709551615, 436],
          [12646065887601541794, 18446744073709551613, 442],
          [17571114184892886314, 3, 404], [4062917326063685704, 1, 437],
          [783433942507015291, 1, 426],
          [17109001835818727656, 0, 8206900633647566924],
          [13656873538139661788, 18446744073709551615, 3965108062993911700],
          [15308085513773655218, 18446744073709551614, 413],
          [12646065887601541794, 18446744073709551613, 442]],
         dtype='uint64')
     self.spacy_doc.from_array(cols, values)
Beispiel #27
0
 def setUp(self):
     """
     Prepare the fixtures: the sample text, the 'en' pipeline, a spacy doc
     whose TAG, HEAD and DEP attributes are overwritten with the hard-coded
     values below, and a fresh temporary directory.
     """
     self.text = "The year was 2081, and everybody was finally equal. They weren't only equal before God and the law. They were equal every which way."
     self.spacy_pipeline = data.load_spacy('en')
     self.spacy_doc = self.spacy_pipeline(self.text)
     # column order of each row in `values`
     cols = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # NOTE(review): presumably one [TAG, HEAD, DEP] row per token
     # -- confirm before changing the text
     values = np.array(
         [[426, 1, 379], [440, 1, 393], [455, 0, 53503], [425, -1, 369],
          [416, -2, 407], [424, -3, 372], [440, 1, 393], [455, -5, 375],
          [447, -1, 365], [433, -2, 363], [419, -3, 407], [445, 1, 393],
          [455, 0, 53503], [447, 2, 389], [447, 1, 365], [433, -3, 363],
          [432, -1, 405], [441, -1, 401], [424, -1, 372], [426, 1, 379],
          [440, -3, 375], [419, -9, 407], [445, 1, 393], [455, 0, 53503],
          [433, -1, 363], [426, 2, 379], [460, 1, 379], [440, -4, 392],
          [419, -5, 407]],
         dtype='int32')
     self.spacy_doc.from_array(cols, values)
     # temp dir is created inside the directory that contains this file;
     # NOTE(review): nothing here removes it -- presumably cleaned up in
     # tearDown, confirm
     self.tempdir = tempfile.mkdtemp(
         prefix='test_fileio', dir=os.path.dirname(os.path.abspath(__file__)))
     # directory part of this file's path as given in __file__
     self.tests_dir = os.path.split(__file__)[0]
     self.maxDiff = None
Beispiel #28
0
 def setUp(self):
     """
     Build the fixtures shared by the tests: the raw text, the spaCy
     pipeline, a parsed doc whose TAG / HEAD / DEP attributes are replaced
     by hard-coded values, and a fresh temporary directory created next to
     this test file.
     """
     self.text = (
         "The year was 2081, and everybody was finally equal. "
         "They weren't only equal before God and the law. "
         "They were equal every which way.")
     self.spacy_lang = data.load_spacy('en')
     self.spacy_doc = self.spacy_lang(self.text)
     # column order of every row in `token_rows`
     annotated_attrs = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # NOTE(review): HEAD entries such as 18446744073709551615 (2**64 - 1)
     # look like negative offsets wrapped into uint64 -- confirm against spaCy.
     # Rows are grouped at each occurrence of tag 12646065887601541794, which
     # presumably marks the sentence-final period -- cosmetic only.
     token_rows = [
         # rows 1-11
         [15267657372422890137, 1, 412],
         [15308085513773655218, 1, 426],
         [17109001835818727656, 0, 8206900633647566924],
         [8427216679587749980, 18446744073709551615, 401],
         [2593208677638477497, 18446744073709551614, 442],
         [17571114184892886314, 18446744073709551613, 404],
         [15308085513773655218, 1, 426],
         [17109001835818727656, 18446744073709551611, 407],
         [164681854541413346, 18446744073709551615, 397],
         [10554686591937588953, 18446744073709551614, 395],
         [12646065887601541794, 18446744073709551613, 442],
         # rows 12-22
         [13656873538139661788, 1, 426],
         [17109001835818727656, 0, 8206900633647566924],
         [164681854541413346, 18446744073709551615, 422],
         [164681854541413346, 1, 397],
         [10554686591937588953, 18446744073709551613, 395],
         [1292078113972184607, 18446744073709551615, 440],
         [15794550382381185553, 18446744073709551615, 436],
         [17571114184892886314, 18446744073709551615, 404],
         [15267657372422890137, 1, 412],
         [15308085513773655218, 18446744073709551613, 407],
         [12646065887601541794, 18446744073709551607, 442],
         # rows 23-29
         [13656873538139661788, 1, 426],
         [17109001835818727656, 0, 8206900633647566924],
         [10554686591937588953, 18446744073709551615, 395],
         [15267657372422890137, 2, 13323405159917154080],
         [17202369883303991778, 1, 412],
         [15308085513773655218, 18446744073709551612, 425],
         [12646065887601541794, 18446744073709551611, 442],
     ]
     self.spacy_doc.from_array(
         annotated_attrs, np.array(token_rows, dtype='uint64'))
     # the temporary directory is created inside the directory of this file
     this_dir = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_fileio', dir=this_dir)
     self.tests_dir = os.path.split(__file__)[0]
     # None lifts unittest's length limit on assertion failure diffs
     self.maxDiff = None
Beispiel #29
0
 def setUp(self):
     """
     Build the spaCy doc fixture used by the tests.

     A multi-sentence English paragraph is parsed with 'en_core_web_sm',
     and the TAG, HEAD, DEP and ENT_TYPE attributes of the parsed doc are
     then overwritten from a hard-coded array via ``from_array``.
     """
     # None lifts unittest's length limit on assertion failure diffs
     self.maxDiff = None
     nlp = data.load_spacy('en_core_web_sm')
     # surrounding whitespace of the literal is stripped before parsing
     raw_text = """
         Two weeks ago, I was in Kuwait participating in an I.M.F. seminar for Arab educators. For 30 minutes, we discussed the impact of technology trends on education in the Middle East. And then an Egyptian education official raised his hand and asked if he could ask me a personal question: "I heard Donald Trump say we need to close mosques in the United States," he said with great sorrow. "Is that what we want our kids to learn?"
         """
     self.spacy_doc = nlp(raw_text.strip())
     # column order of every row in `token_rows`
     annotated_attrs = [attrs.TAG, attrs.HEAD, attrs.DEP, attrs.ENT_TYPE]
     token_rows = [
         [459, 1, 758136, 387], [477, 1, 424, 387], [481, 3, 396, 387],
         [450, 2, 441, 0], [479, 1, 425, 0], [489, 0, 512817, 0],
         [466, -1, 439, 0], [475, -1, 435, 381], [490, -3, 395, 0],
         [466, -1, 439, 0], [460, 2, 411, 0], [475, 1, 74185, 0],
         [474, -3, 435, 0], [466, -1, 439, 0], [467, 1, 398, 378],
         [477, -2, 435, 0], [453, -11, 441, 0], [466, 5, 439, 388],
         [459, 1, 758136, 388], [477, -2, 435, 388], [450, 2, 441, 0],
         [479, 1, 425, 0], [489, 0, 512817, 0], [460, 1, 411, 0],
         [474, -2, 412, 0], [466, -1, 439, 0], [474, 1, 74185, 0],
         [477, -2, 435, 0], [466, -1, 439, 0], [474, -1, 435, 0],
         [466, -1, 439, 0], [460, 2, 411, 382], [475, 1, 74185, 382],
         [475, -3, 435, 382], [453, -12, 441, 0], [458, 6, 403, 0],
         [481, 5, 396, 0], [460, 3, 411, 0], [467, 2, 398, 378],
         [474, 1, 74185, 0], [474, 1, 425, 0], [489, 32, 404, 0],
         [480, 1, 436, 0], [474, -2, 412, 0], [458, -3, 403, 0],
         [489, -4, 406, 0], [466, 3, 419, 0], [479, 2, 425, 0],
         [471, 1, 401, 0], [488, -4, 395, 0], [479, -1, 758134, 0],
         [460, 2, 411, 0], [467, 1, 398, 0], [474, -4, 412, 0],
         [454, -1, 441, 0], [499, 2, 441, 0], [479, 1, 425, 0],
         [489, -8, 404, 0], [475, 1, 74185, 377], [475, 1, 425, 377],
         [492, -3, 404, 0], [479, 1, 425, 0], [492, -2, 404, 0],
         [486, 1, 401, 0], [488, -2, 445, 0], [477, -1, 412, 0],
         [466, -1, 439, 0], [460, 2, 411, 381], [475, 1, 74185, 381],
         [475, -3, 435, 381], [450, 3, 441, 0], [449, 2, 441, 0],
         [479, 1, 425, 0], [489, 0, 512817, 0], [466, -1, 439, 0],
         [467, 1, 398, 0], [474, -2, 435, 0], [453, -4, 441, 0],
         [499, 1, 441, 0], [493, 0, 512817, 0], [460, -1, 425, 0],
         [495, 2, 412, 0], [479, 1, 425, 0], [492, -4, 404, 0],
         [480, 1, 436, 0], [477, -6, 412, 0], [486, 1, 401, 0],
         [488, -8, 445, 0], [453, -9, 441, 0], [449, -10, 441, 0],
     ]
     self.spacy_doc.from_array(
         annotated_attrs, np.array(token_rows, dtype='int32'))
Beispiel #30
0
 def setUp(self):
     """
     Build the fixtures shared by the tests: the raw text, the
     'en_core_web_sm' pipeline, a parsed doc whose TAG / HEAD / DEP
     attributes are replaced by hard-coded values, and a fresh temporary
     directory created next to this test file.
     """
     self.text = (
         "The year was 2081, and everybody was finally equal. "
         "They weren't only equal before God and the law. "
         "They were equal every which way.")
     self.spacy_lang = data.load_spacy('en_core_web_sm')
     self.spacy_doc = self.spacy_lang(self.text)
     # column order of every row in `token_rows`
     annotated_attrs = [attrs.TAG, attrs.HEAD, attrs.DEP]
     # Rows are grouped at each occurrence of tag 453, which presumably
     # marks the sentence-final period -- the grouping is cosmetic only.
     token_rows = [
         # rows 1-11
         [460, 1, 411], [474, 1, 425], [489, 0, 512817],
         [459, -1, 399], [450, -1, 441], [458, -2, 403],
         [474, 1, 425], [489, -4, 406], [481, 1, 396],
         [467, -2, 394], [453, -8, 441],
         # rows 12-22
         [479, 1, 425], [489, 0, 512817], [481, 2, 438],
         [481, 1, 396], [467, -3, 394], [466, -1, 439],
         [475, -1, 435], [458, -1, 403], [460, 1, 411],
         [474, -5, 412], [453, -9, 441],
         # rows 23-29
         [479, 1, 425], [489, 0, 512817], [467, -1, 394],
         [460, 2, 411], [494, 1, 411], [474, -3, 758141],
         [453, -5, 441],
     ]
     self.spacy_doc.from_array(
         annotated_attrs, np.array(token_rows, dtype='int32'))
     # the temporary directory is created inside the directory of this file
     this_dir = os.path.dirname(os.path.abspath(__file__))
     self.tempdir = tempfile.mkdtemp(prefix='test_fileio', dir=this_dir)
     self.tests_dir = os.path.split(__file__)[0]
     # None lifts unittest's length limit on assertion failure diffs
     self.maxDiff = None
Beispiel #31
0
    def load(cls, path, fname_prefix=None):
        """
        Read a serialized document and its metadata back from disk, and build
        a new instance of ``cls`` from them.

        Args:
            path (str): directory on disk containing the saved content and
                metadata files
            fname_prefix (str, optional): if given, it is joined with an
                underscore in front of the standard filenames 'spacy_doc.bin'
                and 'metadata.json' to form the names of the files to read

        Returns:
            :class:`textacy.TextDoc`

        .. warning:: If the `spacy.Vocab` object used to save this document is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage. A ``UserWarning`` is issued when the spaCy
            version recorded in the metadata differs from the installed one.
        """
        def _saved_file(basename):
            # full path of one of the two files, honoring the optional prefix
            if fname_prefix:
                basename = '_'.join([fname_prefix, basename])
            return os.path.join(path, basename)

        meta_fname = _saved_file('metadata.json')
        docs_fname = _saved_file('spacy_doc.bin')
        # only the first record of the metadata file is used
        metadata = list(fileio.read_json(meta_fname))[0]
        # these two bookkeeping entries are removed from the metadata before
        # it is handed to the constructor
        lang = metadata.pop('textacy_lang')
        saved_version = metadata.pop('spacy_version')
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            warnings.warn(
                """
                the spaCy version used to save this TextDoc to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextDoc may not be valid!
                """.format(saved_version, installed_version),
                UserWarning)
        vocab = data.load_spacy(lang).vocab
        # only the first doc stored in the binary file is used
        spacy_docs = list(fileio.read_spacy_docs(vocab, docs_fname))
        return cls(spacy_docs[0], lang=lang, metadata=metadata)
Beispiel #32
0
    def load(cls, path, name=None):
        """
        Read saved content and metadata back from disk, and build a new
        instance of ``cls`` from them.

        Args:
            path (str): Directory on disk where content and metadata are saved.
            name (str): Identifying/uniquifying name that was prepended (with
                an underscore) to the default filenames 'spacy_doc.bin' and
                'metadata.json' when the doc was saved to disk via
                :meth:`Doc.save()`. If falsy, the default filenames are used.

        Returns:
            :class:`textacy.Doc <Doc>`

        .. warning:: If the ``spacy.Vocab`` object used to save this document is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage. A ``UserWarning`` is issued when the spaCy
            version recorded in the metadata differs from the installed one.
        """
        basenames = ['metadata.json', 'spacy_doc.bin']
        if name:
            basenames = ['_'.join([name, basename]) for basename in basenames]
        meta_fname, docs_fname = [
            os.path.join(path, basename) for basename in basenames]
        # only the first record of the metadata file is used
        metadata = list(fileio.read_json(meta_fname))[0]
        # these two bookkeeping entries are removed from the metadata before
        # it is handed to the constructor
        lang = metadata.pop('textacy_lang')
        saved_version = metadata.pop('spacy_version')
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            warnings.warn(
                """
                the spaCy version used to save this Doc to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded Doc may not be valid!
                """.format(saved_version, installed_version),
                UserWarning)
        vocab = data.load_spacy(lang).vocab
        # only the first doc stored in the binary file is used
        spacy_docs = list(fileio.read_spacy_docs(vocab, docs_fname))
        return cls(spacy_docs[0], lang=lang, metadata=metadata)
Beispiel #33
0
    def load(cls, path, name=None):
        """
        Read saved content and metadata back from disk, and build a new
        instance of ``cls`` from them.

        Args:
            path (str): Directory on disk where content and metadata are saved.
            name (str): Identifying/uniquifying name that was prepended (with
                an underscore) to the default filenames 'spacy_doc.bin' and
                'metadata.json' when the doc was saved to disk via
                :meth:`Doc.save()`. If falsy, the default filenames are used.

        Returns:
            :class:`textacy.Doc <Doc>`

        .. warning:: If the ``spacy.Vocab`` object used to save this document is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage. A ``UserWarning`` is issued when the spaCy
            version recorded in the metadata differs from the installed one.
        """
        def _saved_file(basename):
            # full path of one of the two files, honoring the optional name
            if name:
                basename = '_'.join([name, basename])
            return os.path.join(path, basename)

        meta_fname = _saved_file('metadata.json')
        docs_fname = _saved_file('spacy_doc.bin')
        # only the first record of the metadata file is used
        stored_records = list(fileio.read_json(meta_fname))
        metadata = stored_records[0]
        # these two bookkeeping entries are removed from the metadata before
        # it is handed to the constructor
        lang = metadata.pop('textacy_lang')
        saved_version = metadata.pop('spacy_version')
        installed_version = spacy.about.__version__
        if saved_version != installed_version:
            warnings.warn(
                """
                the spaCy version used to save this Doc to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded Doc may not be valid!
                """.format(saved_version, installed_version),
                UserWarning)
        vocab = data.load_spacy(lang).vocab
        # only the first doc stored in the binary file is used
        first_doc = list(fileio.read_spacy_docs(vocab, docs_fname))[0]
        return cls(first_doc, lang=lang, metadata=metadata)
Beispiel #34
0
 def test_invalid_content_lang_combo(self):
     # Content parsed by the 'en_core_web_sm' pipeline is passed to Doc
     # together with lang='es'; that combination must raise ValueError.
     nlp = data.load_spacy('en_core_web_sm')

     def build_mismatched_doc():
         # parsing happens inside the callable so that it stays within the
         # scope checked by assertRaises
         return Doc(nlp('Hola, cómo estás mi amigo?'), lang='es')

     self.assertRaises(ValueError, build_mismatched_doc)
Beispiel #35
0
 def test_lang_spacylang(self):
     # Doc must accept an already-loaded spaCy pipeline object as `lang`
     # when the content is plain text.
     nlp = data.load_spacy('en_core_web_sm')
     doc = Doc('This is an English sentence.', lang=nlp)
     self.assertIsInstance(doc, Doc)
Beispiel #36
0
 def test_spacydoc_content(self):
     # Doc must accept an already-parsed spaCy doc as its content, with
     # `lang` left at its default.
     nlp = data.load_spacy('en_core_web_sm')
     parsed = nlp('This is an English sentence.')
     doc = Doc(parsed)
     self.assertIsInstance(doc, Doc)