Example #1
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, defaults to 1,000,000 characters (1mb).
            spacy_model (model): an already loaded spacy model.
        """

        spacy_model = kwargs.get('spacy_model', None)
        if self.language == 'th':
            sentences = []
            text = preprocess(text)
            tokens = tokenize(text, engine='deepcut', remove_whitespace=True)
            tokens = [token for token in tokens if not token.startswith('WS')]
            pos = pos_tag(tokens, corpus='orchid_ud')
            sentences.append({
                "words": tokens,
                "lemmas": tokens,
                "POS": [_pos[1] for _pos in pos]
            })
            doc = Document.from_sentences(sentences,
                                        input_file=kwargs.get('input_file', None),
                                        **kwargs)
            return doc
        else: 
            if spacy_model is not None:
                spacy_model = fix_spacy_for_french(spacy_model)
                spacy_doc = spacy_model(text)
            else:
                max_length = kwargs.get('max_length', 10**6)
                nlp = spacy.load(self.language,
                                max_length=max_length,
                                disable=['ner', 'textcat', 'parser'])
                nlp.add_pipe(nlp.create_pipe('sentencizer'))
                nlp = fix_spacy_for_french(nlp)
                spacy_doc = nlp(text)

            sentences = []
            for sentence_id, sentence in enumerate(spacy_doc.sents):
                sentences.append({
                    "words": [token.text for token in sentence],
                    "lemmas": [token.lemma_ for token in sentence],
                    # FIX : This is a fallback if `fix_spacy_for_french` does not work
                    "POS": [token.pos_ or token.tag_ for token in sentence],
                    "char_offsets": [(token.idx, token.idx + len(token.text))
                                        for token in sentence]
                })

            doc = Document.from_sentences(sentences,
                                        input_file=kwargs.get('input_file', None),
                                        **kwargs)

            return doc
Example #2
    def read(self, path, **kwargs):
        sentences = []
        tree = etree.parse(path, self.parser)
        for sentence in tree.iterfind('./document/sentences/sentence'):
            # get the character offsets
            starts = [
                int(u.text)
                for u in sentence.iterfind("tokens/token/CharacterOffsetBegin")
            ]
            ends = [
                int(u.text)
                for u in sentence.iterfind("tokens/token/CharacterOffsetEnd")
            ]
            sentences.append({
                "words":
                [u.text for u in sentence.iterfind("tokens/token/word")],
                "lemmas":
                [u.text for u in sentence.iterfind("tokens/token/lemma")],
                "POS": [u.text for u in sentence.iterfind("tokens/token/POS")],
                "char_offsets": [(starts[k], ends[k])
                                 for k in range(len(starts))]
            })
            sentences[-1].update(sentence.attrib)

        doc = Document.from_sentences(sentences, input_file=path, **kwargs)

        return doc
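
For context, a minimal usage sketch of this CoreNLP reader. The XML layout is reconstructed from the XPath queries above (document/sentences/sentence, tokens/token/word|lemma|POS|CharacterOffsetBegin|CharacterOffsetEnd); the no-argument `MinimalCoreNLPReader()` constructor and the `pke.readers` import path are assumptions based on how the reader is used in the `load_document` examples further down.

    from pke.readers import MinimalCoreNLPReader  # import path assumed

    # hypothetical CoreNLP-style XML, shaped to match the XPath queries above
    corenlp_xml = """<?xml version="1.0" encoding="UTF-8"?>
    <root>
      <document>
        <sentences>
          <sentence id="1">
            <tokens>
              <token id="1">
                <word>Hello</word>
                <lemma>hello</lemma>
                <POS>UH</POS>
                <CharacterOffsetBegin>0</CharacterOffsetBegin>
                <CharacterOffsetEnd>5</CharacterOffsetEnd>
              </token>
            </tokens>
          </sentence>
        </sentences>
      </document>
    </root>"""

    with open('example.xml', 'w', encoding='utf-8') as f:
        f.write(corenlp_xml)

    reader = MinimalCoreNLPReader()  # assumed no-argument constructor
    doc = reader.read(path='example.xml')
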
Example #3
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, defaults to 1,000,000 characters (1mb).
        """

        max_length = kwargs.get('max_length', 10**6)

        if self.language in RawTextReader.nlps:
            nlp = RawTextReader.nlps[self.language]
        else:
            nlp = spacy.load(self.language,
                            max_length=max_length)

        spacy_doc = nlp(text)

        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                "POS": [token.pos_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                     for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get('input_file', None),
                                      **kwargs)

        return doc
Example #4
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
        """

        nlp = spacy.load(self.language)
        spacy_doc = nlp(text)

        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                "POS": [token.pos_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                     for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get('input_file', None),
                                      **kwargs)

        return doc
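
A short usage sketch for a raw-text reader like the two above; the `RawTextReader(language=...)` constructor and the `read(text=...)` call mirror how the reader is invoked in the `load_document` examples below, and the `pke.readers` import path is an assumption.

    from pke.readers import RawTextReader  # import path assumed

    reader = RawTextReader(language='en')
    doc = reader.read(text='Keyphrase extraction works on raw text. '
                           'spaCy handles the sentence splitting.')

    # each sentence carries the fields built above (words, lemmas, POS, char_offsets)
    for sentence in doc.sentences:
        print(sentence.words)
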
Example #5
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Spacy model selection: By default this function will load the spacy
        model that is closest to the `language` parameter ('fr' language will
        load the spacy model linked to 'fr' or any 'fr_core_web_*' available
        model). In order to select the model that will be used please provide a
        preloaded model via the `spacy_model` parameter, or link the model you
        wish to use to the corresponding language code
        `python3 -m spacy link spacy_model lang_code`.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, defaults to 1,000,000 characters (1mb).
            spacy_model (model): an already loaded spacy model.
        """

        spacy_model = kwargs.get('spacy_model', None)

        if spacy_model is None:
            max_length = kwargs.get('max_length', 10**6)
            try:
                spacy_model = spacy.load(str2spacy(self.language),
                                         max_length=max_length,
                                         disable=['ner', 'textcat', 'parser'])
            except OSError:
                logging.warning('No spacy model for \'{}\' language.'.format(
                    self.language))
                logging.warning(
                    'Falling back to using the English model. There might '
                    'be tokenization and POS tagging errors. A list of available '
                    'spacy models is available at https://spacy.io/models.')
                spacy_model = spacy.load(str2spacy('en_core_web_sm'),
                                         max_length=max_length,
                                         disable=['ner', 'textcat', 'parser'])
            spacy_model.add_pipe(spacy_model.create_pipe('sentencizer'))

        spacy_model = fix_spacy_for_french(spacy_model)
        spacy_doc = spacy_model(text)

        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                # FIX : This is a fallback if `fix_spacy_for_french` does not work
                "POS": [token.pos_ or token.tag_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                 for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get(
                                          'input_file', None),
                                      **kwargs)

        return doc
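
A sketch of the `spacy_model` shortcut described in the docstring: preload one pipeline and pass it in, so `spacy.load` is not called for every document. The spaCy 2.x API, the `en_core_web_sm` model name and the `RawTextReader` reader class are assumptions; the preloaded model must still be able to produce `doc.sents` (via its parser or a sentencizer).

    import spacy

    # load once, reuse for every call to read() (spaCy 2.x style, as in the code above)
    nlp = spacy.load('en_core_web_sm', disable=['ner', 'textcat'])

    reader = RawTextReader(language='en')  # assumed reader class, as elsewhere on this page
    doc = reader.read(text='Some raw text to pre-process.', spacy_model=nlp)
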
Example #6
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, defaults to 1,000,000 characters (1mb).
        """
        if self.language != 'id':
            max_length = kwargs.get('max_length', 10**6)
            nlp = spacy.load(self.language, max_length=max_length)
            spacy_doc = nlp(text)
            sentences = []
            for sentence_id, sentence in enumerate(spacy_doc.sents):
                sentences.append({
                    "words": [token.text for token in sentence],
                    "lemmas": [token.lemma_ for token in sentence],
                    "POS": [token.pos_ for token in sentence],
                    "char_offsets": [(token.idx, token.idx + len(token.text))
                                     for token in sentence]
                })

        else:
            text = text.lower()
            token_words = [
                tokenizer_words.tokenize(t) for t in sent_tokenize(text)
            ]
            token_lemmas = []
            token_pos = ct.tag_sents(token_words)

            for token in token_words:
                temp = []
                for word in token:
                    temp.append(stemmer.stem(word))
                token_lemmas.append(temp)

            sentences = []
            for idx, _ in enumerate(token_words):
                sentences.append({
                    "words": token_words[idx],
                    "lemmas": token_lemmas[idx],
                    "POS": token_pos[idx],
                })
        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get(
                                          'input_file', None),
                                      **kwargs)

        return doc
Example #7
    def read(self, sdoc, **kwargs):
        sentences = []
        for sentence_id, sentence in enumerate(sdoc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                "POS": [token.pos_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                 for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get(
                                          'input_file', None),
                                      **kwargs)
        return doc
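
This reader consumes an already parsed spaCy `Doc`. Example #11 below builds it as `SpacyDocReader(language=...)` and calls `read(sdoc=...)`, so a usage sketch looks like this (the model name and the `pke.readers` import path are assumptions):

    import spacy
    from pke.readers import SpacyDocReader  # import path assumed

    nlp = spacy.load('en_core_web_sm')  # any pipeline that sets sentence boundaries
    sdoc = nlp('The text is parsed up front. The reader only converts it.')

    reader = SpacyDocReader(language='en')
    doc = reader.read(sdoc=sdoc)
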
Example #8
    def read(self, text, **kwargs):
        obj = json.loads(text)

        sentences = []
        for sentence_id, s in enumerate(obj['sents']):
            sentences.append({
                "words": [u['t'] for u in s['tok']],
                "lemmas": [u.get('l', '') for u in s['tok']],
                "POS": [u['p'] for u in s['tok']],
                "char_offsets": [(u['o'], u['o'] + len(u['t'])) for u in s['tok']]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get('input_file', None),
                                      **kwargs)

        return doc
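
The JSON layout this reader expects can be read off the key accesses above: a top-level `sents` list, each sentence holding a `tok` list whose entries carry `t` (text), `l` (lemma), `p` (POS) and `o` (character offset). A minimal hand-built payload:

    import json

    payload = {
        "sents": [
            {"tok": [
                {"t": "Hello", "l": "hello", "p": "INTJ", "o": 0},
                {"t": "world", "l": "world", "p": "NOUN", "o": 6}
            ]}
        ]
    }

    # `reader` is an instance of the class defining read() above
    doc = reader.read(json.dumps(payload))
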
Example #9
    def load_str(self, input, **kwargs):
        # get the language parameter
        language = kwargs.get('language', 'en')

        # test whether the language is known, otherwise fall back to english
        if language not in ISO_to_language:
            logging.warning(
                "ISO 639 code {} is not supported, switching to 'en'.".format(
                    language))
            language = 'en'

        # initialize document
        doc = Document()
        parser = RawTextReader(language=language)
        doc = parser.read(text=input, **kwargs)

        # set the input file
        self.input_file = doc.input_file

        # set the language of the document
        self.language = language

        # set the sentences
        self.sentences = doc.sentences

        # initialize the stoplist
        self.stoplist = stopwords.words(ISO_to_language[self.language])

        # word normalization
        self.normalization = kwargs.get('normalization', 'stemming')
        if self.normalization == 'stemming':
            self.apply_stemming()
        elif self.normalization is None:
            for i, sentence in enumerate(self.sentences):
                self.sentences[i].stems = sentence.words

        # lowercase the normalized words
        for i, sentence in enumerate(self.sentences):
            self.sentences[i].stems = [w.lower() for w in sentence.stems]

        # POS normalization
        if getattr(doc, 'is_corenlp_file', False):
            self.normalize_pos_tags()
            self.unescape_punctuation_marks()
Example #10
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, defaults to 1,000,000 characters (1mb).
            spacy_model (model): an already loaded spacy model.
        """

        spacy_model = kwargs.get('spacy_model', None)

        if spacy_model is not None:
            spacy_model = fix_spacy_for_french(spacy_model)
            spacy_doc = spacy_model(text)
        else:
            max_length = kwargs.get('max_length', 10**6)
            nlp = spacy.load("en_core_web_sm",
                             max_length=max_length,
                             disable=['ner', 'textcat', 'parser'])
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
            nlp = fix_spacy_for_french(nlp)
            spacy_doc = nlp(text)

        sentences = []
        for sentence_id, sentence in enumerate(spacy_doc.sents):
            sentences.append({
                "words": [token.text for token in sentence],
                "lemmas": [token.lemma_ for token in sentence],
                # FIX : This is a fallback if `fix_spacy_for_french` does not work
                "POS": [token.pos_ or token.tag_ for token in sentence],
                "char_offsets": [(token.idx, token.idx + len(token.text))
                                 for token in sentence]
            })

        doc = Document.from_sentences(sentences,
                                      input_file=kwargs.get(
                                          'input_file', None),
                                      **kwargs)

        return doc
Example #11
    def load_document(self, input, **kwargs):
        """Loads the content of a document/string/stream in a given language.

        Args:
            input (str): input.
            language (str): language of the input, defaults to 'en'.
            encoding (str): encoding of the raw file.
            normalization (str): word normalization method, defaults to
                'stemming'. Other possible values are 'lemmatization' or 'None'
                for using word surface forms instead of stems/lemmas.
        """

        # get the language parameter
        language = kwargs.get('language', 'en')

        # test whether the language is known, otherwise fall back to english
        if language not in ISO_to_language:
            logging.warning(
                "ISO 639 code {} is not supported, switching to 'en'.".format(
                    language))
            language = 'en'

        # initialize document
        doc = Document()

        if isinstance(input, spacy_doc_type):
            parser = SpacyDocReader(language=language)
            doc = parser.read(sdoc=input)

        elif isinstance(input, string_types):

            # if input is an input file
            if os.path.isfile(input):

                # an xml file is considered as a CoreNLP document
                if input.endswith('xml'):
                    parser = MinimalCoreNLPReader()
                    doc = parser.read(path=input, **kwargs)
                    doc.is_corenlp_file = True

                # other extensions are considered as raw text
                else:
                    parser = RawTextReader(language=language)
                    encoding = kwargs.get('encoding', 'utf-8')
                    with codecs.open(input, 'r', encoding=encoding) as file:
                        text = file.read()
                    doc = parser.read(text=text, path=input, **kwargs)

            # if input is a string
            else:
                parser = RawTextReader(language=language)
                doc = parser.read(text=input, **kwargs)

        elif getattr(input, 'read', None):
            # check whether it is a compressed CoreNLP document
            name = getattr(input, 'name', None)
            if name and name.endswith('xml'):
                parser = MinimalCoreNLPReader()
                doc = parser.read(path=input, **kwargs)
                doc.is_corenlp_file = True
            else:
                parser = RawTextReader(language=language)
                doc = parser.read(text=input.read(), **kwargs)

        else:
            logging.error('Cannot process {}'.format(type(input)))

        # set the input file
        self.input_file = doc.input_file

        # set the language of the document
        self.language = language

        # set the sentences
        self.sentences = doc.sentences

        # initialize the stoplist
        self.stoplist = stopwords.words(ISO_to_language[self.language])

        # word normalization
        self.normalization = kwargs.get('normalization', 'stemming')
        if self.normalization == 'stemming':
            self.apply_stemming()
        elif self.normalization is None:
            for i, sentence in enumerate(self.sentences):
                self.sentences[i].stems = sentence.words

        # lowercase the normalized words
        for i, sentence in enumerate(self.sentences):
            self.sentences[i].stems = [w.lower() for w in sentence.stems]

        # POS normalization
        if getattr(doc, 'is_corenlp_file', False):
            self.normalize_pos_tags()
            self.unescape_punctuation_marks()
Example #12
    def load_document(self, input, **kwargs):
        """Loads the content of a document/string/stream in a given language.

        Args:
            input (str): input.
            language (str): language of the input, defaults to 'en'.
            encoding (str): encoding of the raw file.
            normalization (str): word normalization method, defaults to
                'stemming'. Other possible values are 'lemmatization' or 'None'
                for using word surface forms instead of stems/lemmas.
        """

        # get the language parameter
        language = kwargs.get('language', 'en')

        # initialize document
        doc = Document()

        if is_corenlp(input):
            path = input
            parser = MinimalCoreNLPReader()
            doc = parser.read(path=input, **kwargs)
            doc.is_corenlp_file = True
        elif is_file_path(input):
            path = input
            with open(path, encoding=kwargs.get('encoding', 'utf-8')) as f:
                input = f.read()
            parser = RawTextReader(language=language)
            doc = parser.read(text=input, path=path, **kwargs)
        elif isinstance(input, str):
            parser = RawTextReader(language=language)
            doc = parser.read(text=input, **kwargs)
        else:
            logging.error('Cannot process input. It is neither a file path '
                          'nor a string: {}'.format(type(input)))
            return

        # set the input file
        self.input_file = doc.input_file

        # set the language of the document
        self.language = language

        # set the sentences
        self.sentences = doc.sentences

        # initialize the stoplist
        self.stoplist = get_stopwords(self.language)

        # word normalization
        self.normalization = kwargs.get('normalization', 'stemming')

        if self.normalization == 'stemming':
            stem = get_stemmer_func(self.language)
            get_stem = lambda s: [stem(w).lower() for w in s.words]
        else:
            get_stem = lambda s: [w.lower() for w in s.words]

        # Populate Sentence.stems according to normalization
        for i, sentence in enumerate(self.sentences):
            self.sentences[i].stems = get_stem(sentence)

        # POS normalization
        if getattr(doc, 'is_corenlp_file', False):
            self.normalize_pos_tags()
            self.unescape_punctuation_marks()
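
In pke, `load_document` methods like the ones on this page are exposed on the extractor base class, so a typical call site looks roughly like the sketch below; the `TopicRank` extractor is only one possible choice and is an assumption here.

    import pke

    # hypothetical call site; any extractor inheriting load_document would do
    extractor = pke.unsupervised.TopicRank()
    extractor.load_document(input='document.txt',
                            language='en',
                            normalization='stemming')
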
Example #13
    def read(self, text, **kwargs):
        """Read the input file and use spacy to pre-process.

        Spacy model selection: By default this function will load the spacy
        model that is closest to the `language` parameter ('fr' language will
        load the spacy model linked to 'fr' or any 'fr_core_web_*' available
        model). In order to select the model that will be used please provide a
        preloaded model via the `spacy_model` parameter, or link the model you
        wish to use to the corresponding language code
        `python3 -m spacy link spacy_model lang_code`.

        Args:
            text (str): raw text to pre-process.
            max_length (int): maximum number of characters in a single text for
                spacy, defaults to 1,000,000 characters (1mb).
            spacy_model (model): an already loaded spacy model.
        """

        sentenceList = []
        for line in StringIO(text):
            line = line.strip()
            tmp = line.split('<phrase>')
            entityMentions = []
            if len(tmp) <= 2:
                #no phrase
                other_parts = tmp[0].split(' ')
                if(other_parts is not None):
                    while('' in other_parts):
                        other_parts.remove('')
                    entityMentions += other_parts
            for seg in tmp:
                temp2 = seg.split('</phrase>')
                if (len(temp2) > 1):
                    entityMentions.append((' ').join(temp2[0].split(' ')))
                    if (temp2[1] != ''):
                        other_parts = temp2[1].split(' ')
                        if(other_parts is not None):
                            while('' in other_parts):
                                other_parts.remove('')
                            entityMentions += other_parts
                elif temp2[0] != ' ' and temp2[0] != '':
                    other_parts = temp2[0].split(' ')
                    if(other_parts is not None):
                        while('' in other_parts):
                            other_parts.remove('')
                        entityMentions += other_parts
            sentenceList.append(entityMentions)

        nlp = spacy.load('en')
        # feed the pre-split token lists straight through the pipeline
        nlp.tokenizer = nlp.tokenizer.tokens_from_list
        for spacy_doc in nlp.pipe(sentenceList):
            sentences = []
            for sentence_id, sentence in enumerate(spacy_doc.sents):
                sentences.append({
                    "words": [token.text for token in sentence],
                    "lemmas": [token.lemma_ for token in sentence],
                    # FIX : This is a fallback if `fix_spacy_for_french` does not work
                    "POS": [token.pos_ or token.tag_ for token in sentence],
                    "char_offsets": [(token.idx, token.idx + len(token.text))
                                        for token in sentence]
                })

            doc = Document.from_sentences(sentences,
                                        input_file=kwargs.get('input_file', None),
                                        **kwargs)

        return doc
Example #14
    def load_document(self, input, **kwargs):
        """Loads the content of a document/string/stream in a given language.

        Args:
            input (str): input.
            language (str): language of the input, defaults to 'en'.
            encoding (str): encoding of the raw file.
            normalization (str): word normalization method, defaults to
                'stemming'. Other possible values are 'lemmatization' or 'None'
                for using word surface forms instead of stems/lemmas.
        """

        # get the language parameter
        language = kwargs.get('language', 'en')

        # initialize document
        doc = Document()

        if isinstance(input, string_types):

            # if input is an input file
            if os.path.isfile(input):

                # an xml file is considered as a CoreNLP document
                if input.endswith('xml'):
                    parser = MinimalCoreNLPReader()
                    doc = parser.read(path=input, **kwargs)
                    doc.is_corenlp_file = True

                # other extensions are considered as raw text
                else:
                    parser = RawTextReader(language=language)
                    encoding = kwargs.get('encoding', 'utf-8')
                    with codecs.open(input, 'r', encoding=encoding) as file:
                        text = file.read()
                    doc = parser.read(text=text, path=input, **kwargs)

            # if input is a string
            else:
                parser = RawTextReader(language=language)
                doc = parser.read(text=input, **kwargs)

        elif getattr(input, 'read', None):
            # check whether it is a compressed CoreNLP document
            name = getattr(input, 'name', None)
            if name and name.endswith('xml'):
                parser = MinimalCoreNLPReader()
                doc = parser.read(path=input, **kwargs)
                doc.is_corenlp_file = True
            else:
                parser = RawTextReader(language=language)
                doc = parser.read(text=input.read(), **kwargs)

        else:
            logging.error('Cannot process {}'.format(type(input)))

        # set the input file
        self.input_file = doc.input_file

        # set the language of the document
        self.language = language

        # set the sentences
        self.sentences = doc.sentences

        # initialize the stoplist
        self.stoplist = get_stopwords(self.language)

        # word normalization
        self.normalization = kwargs.get('normalization', 'stemming')

        if self.normalization == 'stemming':
            stem = get_stemmer_func(self.language)
            get_stem = lambda s: [stem(w).lower() for w in s.words]
        else:
            get_stem = lambda s: [w.lower() for w in s.words]

        # Populate Sentence.stems according to normalization
        for i, sentence in enumerate(self.sentences):
            self.sentences[i].stems = get_stem(sentence)

        # POS normalization
        if getattr(doc, 'is_corenlp_file', False):
            self.normalize_pos_tags()
            self.unescape_punctuation_marks()