def extract(self, core_nlp_folder, n_term, grammar, considered_tags=None,
            lang="en", beta=0.55, alias_threshold=0.7, output_file=None):
    """Extracts the top n_term terms from every CoreNLP XML file in
    core_nlp_folder, ranking candidates with MMR (beta trades off
    informativeness against diversity), and returns a mapping from
    document id to its extracted terms."""

    # collect the CoreNLP parses to process
    xml_files = [
        filename for filename in os.listdir(core_nlp_folder)
        if filename.endswith(".xml")
    ]

    all_terms = {}
    for xml_file in tqdm(xml_files):
        # read the pre-parsed document
        core_nlp_reader = MinimalCoreNLPReader()
        core_nlp_doc = core_nlp_reader.read(
            path=os.path.join(core_nlp_folder, xml_file))

        # rebuild (word, POS) pairs for each sentence
        tagged_text = [
            list(zip(sentence.words, sentence.pos))
            for sentence in core_nlp_doc.sentences
        ]
        text_obj = InputTextObj(tagged_text, lang)
        if considered_tags:
            text_obj.considered_tags = considered_tags

        # select candidate phrases and embed them
        candidates, candidate_embs = self.candidate_selection(
            grammar, self.embedding_distrib, text_obj)

        # rank candidates with MMR; skip documents with no candidates
        if len(candidates) > 0:
            result = _MMR(self.embedding_distrib, text_obj, candidates,
                          candidate_embs, N=n_term, beta=beta,
                          use_filtered=True, alias_threshold=alias_threshold)
        else:
            result = (None, None, None)

        # index results by file name without its extension
        document_id = xml_file.split(".")[0]
        all_terms[document_id] = result[0]

    if output_file:
        TermsExtractor.write_terms_to(all_terms, output_file)
    return all_terms
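# ---------------------------------------------------------------------------
# Usage sketch for extract() above (EmbedRank-style pipeline). This is a
# hypothetical example: the TermsExtractor constructor, the embedding
# distributor, the folder layout and the grammar string are all assumptions,
# not the library's confirmed API.
# ---------------------------------------------------------------------------

# noun-phrase chunking grammar of the kind candidate_selection() typically
# consumes (optional adjectives followed by at least one noun)
GRAMMAR = "NP: {<JJ.*>*<NN.*>+}"

extractor = TermsExtractor(embedding_distrib=my_embedding_distributor)  # assumed ctor
terms_per_doc = extractor.extract(
    core_nlp_folder="data/corenlp_xml",  # folder of CoreNLP *.xml parses
    n_term=10,                           # keep the top-10 terms per document
    grammar=GRAMMAR,
    beta=0.55,                           # MMR trade-off: relevance vs. diversity
    alias_threshold=0.7,                 # similarity threshold for merging aliases
    output_file="terms.csv")             # optional on-disk dump
# terms_per_doc maps each document id (file name minus extension) to its terms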
def load_document(self, input, **kwargs):
    """Loads the content of a document/string/stream in a given language.

    Args:
        input (str): input.
        language (str): language of the input, defaults to 'en'.
        encoding (str): encoding of the raw file.
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible values are 'lemmatization' or 'None'
            for using word surface forms instead of stems/lemmas.
    """

    # get the language parameter
    language = kwargs.get('language', 'en')

    # test whether the language is known, otherwise fall back to english
    if language not in ISO_to_language:
        logging.warning(
            "ISO 639 code {} is not supported, switching to 'en'.".format(
                language))
        language = 'en'

    # initialize document
    doc = Document()

    if isinstance(input, spacy_doc_type):
        parser = SpacyDocReader(language=language)
        doc = parser.read(sdoc=input)
    elif isinstance(input, string_types):
        # if input is an input file
        if os.path.isfile(input):
            # an xml file is considered as a CoreNLP document
            if input.endswith('xml'):
                parser = MinimalCoreNLPReader()
                doc = parser.read(path=input, **kwargs)
                doc.is_corenlp_file = True
            # other extensions are considered as raw text
            else:
                parser = RawTextReader(language=language)
                encoding = kwargs.get('encoding', 'utf-8')
                with codecs.open(input, 'r', encoding=encoding) as file:
                    text = file.read()
                doc = parser.read(text=text, path=input, **kwargs)
        # if input is a string
        else:
            parser = RawTextReader(language=language)
            doc = parser.read(text=input, **kwargs)
    elif getattr(input, 'read', None):
        # check whether it is a compressed CoreNLP document
        name = getattr(input, 'name', None)
        if name and name.endswith('xml'):
            parser = MinimalCoreNLPReader()
            doc = parser.read(path=input, **kwargs)
            doc.is_corenlp_file = True
        else:
            parser = RawTextReader(language=language)
            doc = parser.read(text=input.read(), **kwargs)
    else:
        logging.error('Cannot process {}'.format(type(input)))

    # set the input file
    self.input_file = doc.input_file

    # set the language of the document
    self.language = language

    # set the sentences
    self.sentences = doc.sentences

    # initialize the stoplist
    self.stoplist = stopwords.words(ISO_to_language[self.language])

    # word normalization
    self.normalization = kwargs.get('normalization', 'stemming')
    if self.normalization == 'stemming':
        self.apply_stemming()
    elif self.normalization is None:
        for i, sentence in enumerate(self.sentences):
            self.sentences[i].stems = sentence.words

    # lowercase the normalized words
    for i, sentence in enumerate(self.sentences):
        self.sentences[i].stems = [w.lower() for w in sentence.stems]

    # POS normalization
    if getattr(doc, 'is_corenlp_file', False):
        self.normalize_pos_tags()
        self.unescape_punctuation_marks()
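# ---------------------------------------------------------------------------
# Usage sketch for the load_document() variant above (the spaCy-aware one).
# TopicRank stands in for any pke-style extractor that inherits this method;
# whether it inherits this exact variant, and the spaCy model name, are
# assumptions.
# ---------------------------------------------------------------------------
import spacy

extractor = TopicRank()  # illustrative extractor inheriting load_document

# 1) from a raw-text file, with explicit encoding and stemming
extractor.load_document(input='paper.txt', language='en',
                        encoding='utf-8', normalization='stemming')

# 2) from an already parsed spaCy Doc (dispatched to SpacyDocReader)
nlp = spacy.load('en_core_web_sm')
extractor.load_document(input=nlp('Keyphrase extraction finds salient phrases.'))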
def load_document(self, input, **kwargs):
    """Loads the content of a document/string/stream in a given language.

    Args:
        input (str): input.
        language (str): language of the input, defaults to 'en'.
        encoding (str): encoding of the raw file.
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible values are 'lemmatization' or 'None'
            for using word surface forms instead of stems/lemmas.
    """

    # get the language parameter
    language = kwargs.get('language', 'en')

    # initialize document
    doc = Document()

    if is_corenlp(input):
        path = input
        parser = MinimalCoreNLPReader()
        doc = parser.read(path=input, **kwargs)
        doc.is_corenlp_file = True
    elif is_file_path(input):
        path = input
        with open(path, encoding=kwargs.get('encoding', 'utf-8')) as f:
            input = f.read()
        parser = RawTextReader(language=language)
        doc = parser.read(text=input, path=path, **kwargs)
    elif isinstance(input, str):
        parser = RawTextReader(language=language)
        doc = parser.read(text=input, **kwargs)
    else:
        logging.error('Cannot process input. It is neither a file path '
                      'nor a string: {}'.format(type(input)))
        return

    # set the input file
    self.input_file = doc.input_file

    # set the language of the document
    self.language = language

    # set the sentences
    self.sentences = doc.sentences

    # initialize the stoplist
    self.stoplist = get_stopwords(self.language)

    # word normalization
    self.normalization = kwargs.get('normalization', 'stemming')
    if self.normalization == 'stemming':
        stem = get_stemmer_func(self.language)
        get_stem = lambda s: [stem(w).lower() for w in s.words]
    else:
        get_stem = lambda s: [w.lower() for w in s.words]

    # populate Sentence.stems according to the chosen normalization
    for i, sentence in enumerate(self.sentences):
        self.sentences[i].stems = get_stem(sentence)

    # POS normalization
    if getattr(doc, 'is_corenlp_file', False):
        self.normalize_pos_tags()
        self.unescape_punctuation_marks()
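# ---------------------------------------------------------------------------
# Hedged sketch of the two dispatch predicates the variant above relies on.
# The real helpers may be more elaborate (e.g. sniffing the XML content);
# this only captures the contract the branches assume.
# ---------------------------------------------------------------------------
import os

def is_file_path(input):
    # a string that names an existing file on disk
    return isinstance(input, str) and os.path.isfile(input)

def is_corenlp(input):
    # a file path whose .xml extension marks it as a CoreNLP parse
    return is_file_path(input) and input.endswith('.xml')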
def load_document(self, input, **kwargs):
    """Loads the content of a document/string/stream in a given language.

    Args:
        input (str): input.
        language (str): language of the input, defaults to 'en'.
        encoding (str): encoding of the raw file.
        normalization (str): word normalization method, defaults to
            'stemming'. Other possible values are 'lemmatization' or 'None'
            for using word surface forms instead of stems/lemmas.
    """

    # get the language parameter
    language = kwargs.get('language', 'en')

    # initialize document
    doc = Document()

    if isinstance(input, string_types):
        # if input is an input file
        if os.path.isfile(input):
            # an xml file is considered as a CoreNLP document
            if input.endswith('xml'):
                parser = MinimalCoreNLPReader()
                doc = parser.read(path=input, **kwargs)
                doc.is_corenlp_file = True
            # other extensions are considered as raw text
            else:
                parser = RawTextReader(language=language)
                encoding = kwargs.get('encoding', 'utf-8')
                with codecs.open(input, 'r', encoding=encoding) as file:
                    text = file.read()
                doc = parser.read(text=text, path=input, **kwargs)
        # if input is a string
        else:
            parser = RawTextReader(language=language)
            doc = parser.read(text=input, **kwargs)
    elif getattr(input, 'read', None):
        # check whether it is a compressed CoreNLP document
        name = getattr(input, 'name', None)
        if name and name.endswith('xml'):
            parser = MinimalCoreNLPReader()
            doc = parser.read(path=input, **kwargs)
            doc.is_corenlp_file = True
        else:
            parser = RawTextReader(language=language)
            doc = parser.read(text=input.read(), **kwargs)
    else:
        logging.error('Cannot process {}'.format(type(input)))

    # set the input file
    self.input_file = doc.input_file

    # set the language of the document
    self.language = language

    # set the sentences
    self.sentences = doc.sentences

    # initialize the stoplist
    self.stoplist = get_stopwords(self.language)

    # word normalization
    self.normalization = kwargs.get('normalization', 'stemming')
    if self.normalization == 'stemming':
        stem = get_stemmer_func(self.language)
        get_stem = lambda s: [stem(w).lower() for w in s.words]
    else:
        get_stem = lambda s: [w.lower() for w in s.words]

    # populate Sentence.stems according to the chosen normalization
    for i, sentence in enumerate(self.sentences):
        self.sentences[i].stems = get_stem(sentence)

    # POS normalization
    if getattr(doc, 'is_corenlp_file', False):
        self.normalize_pos_tags()
        self.unescape_punctuation_marks()
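# ---------------------------------------------------------------------------
# Minimal illustration of the normalization closures built above, assuming
# get_stemmer_func returns an NLTK SnowballStemmer-style callable. Plain word
# lists stand in for Sentence objects (which expose .words) to keep the
# example self-contained.
# ---------------------------------------------------------------------------
from nltk.stem.snowball import SnowballStemmer

stem = SnowballStemmer('english').stem

get_stem = lambda words: [stem(w).lower() for w in words]  # normalization == 'stemming'
get_surface = lambda words: [w.lower() for w in words]     # any other setting

print(get_stem(['Running', 'Dogs']))     # ['run', 'dog']
print(get_surface(['Running', 'Dogs']))  # ['running', 'dogs']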