def load_str(self, input, **kwargs):
    """Loads the content of a raw-text string in a given language.

    Args:
        input (str): raw text to load.
        **kwargs: optional parameters:
            language (str): ISO 639 code of the input, defaults to 'en'.
            normalization (str): word normalization method, defaults to
                'stemming'; use None to keep word surface forms as stems.
    """
    # get the language parameter; fall back to English when unsupported
    language = kwargs.get('language', 'en')
    if language not in ISO_to_language:
        logging.warning(
            "ISO 639 code {} is not supported, switching to 'en'.".format(
                language))
        language = 'en'

    # parse the raw text into a document
    # (the original dead `doc = Document()` pre-assignment was removed:
    # it was unconditionally overwritten by parser.read)
    parser = RawTextReader(language=language)
    doc = parser.read(text=input, **kwargs)

    # set the input file
    self.input_file = doc.input_file

    # set the language of the document
    self.language = language

    # set the sentences
    self.sentences = doc.sentences

    # initialize the stoplist
    self.stoplist = stopwords.words(ISO_to_language[self.language])

    # word normalization
    self.normalization = kwargs.get('normalization', 'stemming')
    if self.normalization == 'stemming':
        self.apply_stemming()
    elif self.normalization is None:
        # use surface forms as stems
        # NOTE(review): 'lemmatization' leaves stems untouched here —
        # presumably Sentence provides a default; confirm upstream
        for i, sentence in enumerate(self.sentences):
            self.sentences[i].stems = sentence.words

    # lowercase the normalized words
    for i, sentence in enumerate(self.sentences):
        self.sentences[i].stems = [w.lower() for w in sentence.stems]

    # POS normalization for CoreNLP documents
    if getattr(doc, 'is_corenlp_file', False):
        self.normalize_pos_tags()
        self.unescape_punctuation_marks()
def load_document(self, input, **kwargs):
    """Loads the content of a document/string/stream in a given language.

    Args:
        input (str): input.
        language (str): language of the input, defaults to 'en'.
        encoding (str): encoding of the raw file.
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible values are 'lemmatization' or 'None'
            for using word surface forms instead of stems/lemmas.
    """
    # get the language parameter; fall back to English when unsupported
    language = kwargs.get('language', 'en')
    if language not in ISO_to_language:
        logging.warning(
            "ISO 639 code {} is not supported, switching to 'en'.".format(
                language))
        language = 'en'

    if isinstance(input, spacy_doc_type):
        # input is an already-parsed spacy document
        parser = SpacyDocReader(language=language)
        doc = parser.read(sdoc=input)
    elif isinstance(input, string_types):
        # if input is an input file
        if os.path.isfile(input):
            # an xml file is considered as a CoreNLP document
            if input.endswith('xml'):
                parser = MinimalCoreNLPReader()
                doc = parser.read(path=input, **kwargs)
                doc.is_corenlp_file = True
            # other extensions are considered as raw text
            else:
                parser = RawTextReader(language=language)
                encoding = kwargs.get('encoding', 'utf-8')
                with codecs.open(input, 'r', encoding=encoding) as file:
                    text = file.read()
                doc = parser.read(text=text, path=input, **kwargs)
        # if input is a string
        else:
            parser = RawTextReader(language=language)
            doc = parser.read(text=input, **kwargs)
    elif getattr(input, 'read', None):
        # input is a file-like stream; an xml-named stream is CoreNLP
        name = getattr(input, 'name', None)
        if name and name.endswith('xml'):
            parser = MinimalCoreNLPReader()
            doc = parser.read(path=input, **kwargs)
            doc.is_corenlp_file = True
        else:
            parser = RawTextReader(language=language)
            doc = parser.read(text=input.read(), **kwargs)
    else:
        # unsupported input type: report and bail out instead of
        # continuing with an empty Document (which would have been used
        # for self.input_file / self.sentences below)
        logging.error('Cannot process {}'.format(type(input)))
        return

    # set the input file
    self.input_file = doc.input_file

    # set the language of the document
    self.language = language

    # set the sentences
    self.sentences = doc.sentences

    # initialize the stoplist
    self.stoplist = stopwords.words(ISO_to_language[self.language])

    # word normalization
    self.normalization = kwargs.get('normalization', 'stemming')
    if self.normalization == 'stemming':
        self.apply_stemming()
    elif self.normalization is None:
        # use surface forms as stems
        for i, sentence in enumerate(self.sentences):
            self.sentences[i].stems = sentence.words

    # lowercase the normalized words
    for i, sentence in enumerate(self.sentences):
        self.sentences[i].stems = [w.lower() for w in sentence.stems]

    # POS normalization for CoreNLP documents
    if getattr(doc, 'is_corenlp_file', False):
        self.normalize_pos_tags()
        self.unescape_punctuation_marks()
def load_document(self, input, **kwargs):
    """Loads the content of a document/string/stream in a given language.

    Args:
        input (str): input.
        language (str): language of the input, defaults to 'en'.
        encoding (str): encoding of the raw file.
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible values are 'lemmatization' or 'None'
            for using word surface forms instead of stems/lemmas.
    """
    # get the language parameter
    language = kwargs.get('language', 'en')

    # (the original dead `doc = Document()` pre-assignment was removed:
    # every branch below either assigns doc or returns)
    if is_corenlp(input):
        # input is a CoreNLP XML file
        parser = MinimalCoreNLPReader()
        doc = parser.read(path=input, **kwargs)
        doc.is_corenlp_file = True
    elif is_file_path(input):
        # input is a path to a raw-text file; read it then parse the text
        path = input
        with open(path, encoding=kwargs.get('encoding', 'utf-8')) as f:
            input = f.read()
        parser = RawTextReader(language=language)
        doc = parser.read(text=input, path=path, **kwargs)
    elif isinstance(input, str):
        # input is a raw-text string
        parser = RawTextReader(language=language)
        doc = parser.read(text=input, **kwargs)
    else:
        # fixed grammar in the error message ('neither ... nor')
        logging.error('Cannot process input. It is neither a file path '
                      'nor a string: {}'.format(type(input)))
        return

    # set the input file
    self.input_file = doc.input_file

    # set the language of the document
    self.language = language

    # set the sentences
    self.sentences = doc.sentences

    # initialize the stoplist
    self.stoplist = get_stopwords(self.language)

    # word normalization: stem + lowercase, or just lowercase the
    # surface forms (local defs instead of lambda assignments, E731)
    self.normalization = kwargs.get('normalization', 'stemming')
    if self.normalization == 'stemming':
        stem = get_stemmer_func(self.language)

        def get_stem(s):
            return [stem(w).lower() for w in s.words]
    else:
        def get_stem(s):
            return [w.lower() for w in s.words]

    # Populate Sentence.stems according to normalization
    for i, sentence in enumerate(self.sentences):
        self.sentences[i].stems = get_stem(sentence)

    # POS normalization for CoreNLP documents
    if getattr(doc, 'is_corenlp_file', False):
        self.normalize_pos_tags()
        self.unescape_punctuation_marks()
def load_document(self, input, **kwargs):
    """Loads the content of a document/string/stream in a given language.

    Args:
        input (str): input.
        language (str): language of the input, defaults to 'en'.
        encoding (str): encoding of the raw file.
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible values are 'lemmatization' or 'None'
            for using word surface forms instead of stems/lemmas.
    """
    # language of the input, defaulting to English
    language = kwargs.get('language', 'en')

    # start from an empty document; filled by whichever reader applies
    doc = Document()

    if isinstance(input, string_types):
        if os.path.isfile(input):
            if input.endswith('xml'):
                # an xml file on disk is treated as a CoreNLP document
                reader = MinimalCoreNLPReader()
                doc = reader.read(path=input, **kwargs)
                doc.is_corenlp_file = True
            else:
                # any other file on disk is read as raw text
                reader = RawTextReader(language=language)
                enc = kwargs.get('encoding', 'utf-8')
                with codecs.open(input, 'r', encoding=enc) as fh:
                    raw = fh.read()
                doc = reader.read(text=raw, path=input, **kwargs)
        else:
            # a plain string is parsed directly as raw text
            reader = RawTextReader(language=language)
            doc = reader.read(text=input, **kwargs)
    elif getattr(input, 'read', None):
        # stream input: an xml-named stream is a CoreNLP document
        stream_name = getattr(input, 'name', None)
        if stream_name and stream_name.endswith('xml'):
            reader = MinimalCoreNLPReader()
            doc = reader.read(path=input, **kwargs)
            doc.is_corenlp_file = True
        else:
            reader = RawTextReader(language=language)
            doc = reader.read(text=input.read(), **kwargs)
    else:
        logging.error('Cannot process {}'.format(type(input)))

    # transfer document content onto this extractor
    self.input_file = doc.input_file
    self.language = language
    self.sentences = doc.sentences

    # stoplist for the selected language
    self.stoplist = get_stopwords(self.language)

    # choose the normalization function: stem + lowercase when stemming,
    # otherwise just lowercase the surface forms
    self.normalization = kwargs.get('normalization', 'stemming')
    if self.normalization == 'stemming':
        stem = get_stemmer_func(self.language)
        get_stem = lambda s: [stem(w).lower() for w in s.words]
    else:
        get_stem = lambda s: [w.lower() for w in s.words]

    # populate Sentence.stems according to the chosen normalization
    for i, sentence in enumerate(self.sentences):
        self.sentences[i].stems = get_stem(sentence)

    # POS normalization for CoreNLP documents
    if getattr(doc, 'is_corenlp_file', False):
        self.normalize_pos_tags()
        self.unescape_punctuation_marks()