Exemple #1
0
 def extract(self,
             core_nlp_folder,
             n_term,
             grammar,
             considered_tags=None,
             lang="en",
             beta=0.55,
             alias_threshold=0.7,
             output_file=None):
     """Run term extraction over every ``.xml`` file found in a folder.

     Each file is read with ``MinimalCoreNLPReader``, converted to an
     ``InputTextObj`` built from per-sentence (word, POS) pairs, and passed
     to ``self.candidate_selection``. When at least one candidate comes
     back, the candidates are handed to ``_MMR``.

     Args:
         core_nlp_folder: directory whose entries ending in ".xml" are
             processed.
         n_term: forwarded to ``_MMR`` as ``N``.
         grammar: forwarded to ``self.candidate_selection``.
         considered_tags: when truthy, assigned to the ``considered_tags``
             attribute of every text object.
         lang: second argument given to ``InputTextObj``.
         beta: forwarded to ``_MMR``.
         alias_threshold: forwarded to ``_MMR``.
         output_file: when truthy, the collected terms are also passed to
             ``TermsExtractor.write_terms_to`` together with this value.

     Returns:
         dict: maps each file name, cut at its first ".", to the first
         element of the ``_MMR`` result, or to None when the document
         yielded no candidate.
     """
     corenlp_names = [
         entry for entry in os.listdir(core_nlp_folder)
         if entry.endswith(".xml")
     ]

     terms_by_doc = {}
     for entry in tqdm(corenlp_names):
         reader = MinimalCoreNLPReader()
         parsed = reader.read(path=os.path.join(core_nlp_folder, entry))

         # one list of (word, pos) pairs per sentence
         word_pos_pairs = [
             list(zip(sent.words, sent.pos)) for sent in parsed.sentences
         ]
         text_obj = InputTextObj(word_pos_pairs, lang)
         if considered_tags:
             text_obj.considered_tags = considered_tags

         candidates, candidate_embs = self.candidate_selection(
             grammar, self.embedding_distrib, text_obj)

         # documents without any candidate are recorded with None
         doc_terms = None
         if len(candidates) > 0:
             doc_terms = _MMR(self.embedding_distrib,
                              text_obj,
                              candidates,
                              candidate_embs,
                              N=n_term,
                              beta=beta,
                              use_filtered=True,
                              alias_threshold=alias_threshold)[0]

         # the document id is the file name up to its first dot
         terms_by_doc[entry.split(".")[0]] = doc_terms

     if output_file:
         TermsExtractor.write_terms_to(terms_by_doc, output_file)
     return terms_by_doc
Exemple #2
0
    def load_document(self, input, **kwargs):
        """Reads a spaCy doc, a file, a raw string or a stream into the model.

        The sentences, language, input file and stoplist are stored on the
        instance, then the words are normalized and lowercased.

        Args:
            input: an instance of ``spacy_doc_type``, a string (an existing
                file path or the text itself), or an object exposing a
                ``read`` attribute.
            language (str): language of the input, defaults to 'en'. A code
                missing from ``ISO_to_language`` is replaced by 'en' after a
                warning is logged.
            encoding (str): encoding used to open a raw text file, defaults
                to 'utf-8'.
            normalization (str): defaults to 'stemming', which calls
                ``self.apply_stemming()``. With None the words themselves
                are used as stems. Any other value (e.g. 'lemmatization')
                keeps the stems already present on the sentences.
        """

        # resolve the language, falling back to english when it is unknown
        language = kwargs.get('language', 'en')
        if language not in ISO_to_language:
            template = "ISO 639 code {} is not supported, switching to 'en'."
            logging.warning(template.format(language))
            language = 'en'

        # empty document, kept as is when the input cannot be processed
        doc = Document()

        if isinstance(input, spacy_doc_type):
            reader = SpacyDocReader(language=language)
            doc = reader.read(sdoc=input)

        elif isinstance(input, string_types):

            if not os.path.isfile(input):
                # not an existing file: the string is the text itself
                reader = RawTextReader(language=language)
                doc = reader.read(text=input, **kwargs)

            elif input.endswith('xml'):
                # an xml file is considered as a CoreNLP document
                reader = MinimalCoreNLPReader()
                doc = reader.read(path=input, **kwargs)
                doc.is_corenlp_file = True

            else:
                # any other file is read as raw text
                reader = RawTextReader(language=language)
                file_encoding = kwargs.get('encoding', 'utf-8')
                with codecs.open(input, 'r',
                                 encoding=file_encoding) as handle:
                    content = handle.read()
                doc = reader.read(text=content, path=input, **kwargs)

        elif getattr(input, 'read', None):
            # a stream whose name ends with 'xml' is handed to the CoreNLP
            # reader, otherwise its content is read as raw text
            stream_name = getattr(input, 'name', None)
            if stream_name and stream_name.endswith('xml'):
                reader = MinimalCoreNLPReader()
                doc = reader.read(path=input, **kwargs)
                doc.is_corenlp_file = True
            else:
                reader = RawTextReader(language=language)
                doc = reader.read(text=input.read(), **kwargs)

        else:
            logging.error('Cannot process {}'.format(type(input)))

        # copy the parsed document onto the instance
        self.input_file = doc.input_file
        self.language = language
        self.sentences = doc.sentences

        # stoplist of the selected language
        self.stoplist = stopwords.words(ISO_to_language[self.language])

        # word normalization
        self.normalization = kwargs.get('normalization', 'stemming')
        if self.normalization == 'stemming':
            self.apply_stemming()
        elif self.normalization is None:
            for sentence in self.sentences:
                sentence.stems = sentence.words

        # lowercase the normalized words
        for sentence in self.sentences:
            sentence.stems = [stem.lower() for stem in sentence.stems]

        # POS normalization only applies to CoreNLP documents
        if getattr(doc, 'is_corenlp_file', False):
            self.normalize_pos_tags()
            self.unescape_punctuation_marks()
Exemple #3
0
    def load_document(self, input, **kwargs):
        """Loads a CoreNLP file, a raw text file or a raw string.

        The sentences, language, input file and stoplist are stored on the
        instance and ``stems`` is populated for every sentence. When the
        input is of an unsupported type, an error is logged and the method
        returns without touching the instance.

        Args:
            input (str): input accepted by ``is_corenlp`` (passed as a path
                to the CoreNLP reader), input accepted by ``is_file_path``
                (opened and read as raw text), or any other string (used as
                the text itself).
            language (str): language of the input, defaults to 'en'.
            encoding (str): encoding of the raw text file, defaults to
                'utf-8'.
            normalization (str): word normalization method, defaults to
                'stemming'. Any other value uses the word surface forms.
                Stems are lowercased in both cases.
        """

        # get the language parameter
        language = kwargs.get('language', 'en')

        # initialize document
        doc = Document()

        if is_corenlp(input):
            parser = MinimalCoreNLPReader()
            doc = parser.read(path=input, **kwargs)
            doc.is_corenlp_file = True
        elif is_file_path(input):
            path = input
            with open(path, encoding=kwargs.get('encoding', 'utf-8')) as f:
                text = f.read()
            parser = RawTextReader(language=language)
            doc = parser.read(text=text, path=path, **kwargs)
        elif isinstance(input, str):
            parser = RawTextReader(language=language)
            doc = parser.read(text=input, **kwargs)
        else:
            logging.error('Cannot process input. It is neither a file path '
                          'or a string: {}'.format(type(input)))
            return

        # set the input file
        self.input_file = doc.input_file

        # set the language of the document
        self.language = language

        # set the sentences
        self.sentences = doc.sentences

        # initialize the stoplist
        self.stoplist = get_stopwords(self.language)

        # word normalization
        self.normalization = kwargs.get('normalization', 'stemming')

        if self.normalization == 'stemming':
            stem = get_stemmer_func(self.language)

            def get_stems(sentence):
                return [stem(w).lower() for w in sentence.words]
        else:

            def get_stems(sentence):
                return [w.lower() for w in sentence.words]

        # populate Sentence.stems according to the normalization
        for sentence in self.sentences:
            sentence.stems = get_stems(sentence)

        # POS normalization only applies to CoreNLP documents
        if getattr(doc, 'is_corenlp_file', False):
            self.normalize_pos_tags()
            self.unescape_punctuation_marks()
Exemple #4
0
    def load_document(self, input, **kwargs):
        """Reads a file, a raw string or a stream into the model.

        The sentences, language, input file and stoplist are stored on the
        instance and ``stems`` is populated for every sentence.

        Args:
            input: a string (an existing file path or the text itself) or
                an object exposing a ``read`` attribute.
            language (str): language of the input, defaults to 'en'.
            encoding (str): encoding used to open a raw text file, defaults
                to 'utf-8'.
            normalization (str): defaults to 'stemming', which applies the
                function returned by ``get_stemmer_func``. Any other value
                uses the word surface forms. Stems are lowercased in both
                cases.
        """

        language = kwargs.get('language', 'en')

        # empty document, kept as is when the input cannot be processed
        doc = Document()

        if isinstance(input, string_types):

            if not os.path.isfile(input):
                # not an existing file: the string is the text itself
                reader = RawTextReader(language=language)
                doc = reader.read(text=input, **kwargs)

            elif input.endswith('xml'):
                # an xml file is considered as a CoreNLP document
                reader = MinimalCoreNLPReader()
                doc = reader.read(path=input, **kwargs)
                doc.is_corenlp_file = True

            else:
                # any other file is read as raw text
                reader = RawTextReader(language=language)
                file_encoding = kwargs.get('encoding', 'utf-8')
                with codecs.open(input, 'r',
                                 encoding=file_encoding) as handle:
                    content = handle.read()
                doc = reader.read(text=content, path=input, **kwargs)

        elif getattr(input, 'read', None):
            # a stream whose name ends with 'xml' is handed to the CoreNLP
            # reader, otherwise its content is read as raw text
            stream_name = getattr(input, 'name', None)
            if stream_name and stream_name.endswith('xml'):
                reader = MinimalCoreNLPReader()
                doc = reader.read(path=input, **kwargs)
                doc.is_corenlp_file = True
            else:
                reader = RawTextReader(language=language)
                doc = reader.read(text=input.read(), **kwargs)

        else:
            logging.error('Cannot process {}'.format(type(input)))

        # copy the parsed document onto the instance
        self.input_file = doc.input_file
        self.language = language
        self.sentences = doc.sentences

        # stoplist of the selected language
        self.stoplist = get_stopwords(self.language)

        # choose how a single word is turned into its stem
        self.normalization = kwargs.get('normalization', 'stemming')

        if self.normalization == 'stemming':
            stem = get_stemmer_func(self.language)

            def normalize(word):
                return stem(word).lower()
        else:

            def normalize(word):
                return word.lower()

        # populate Sentence.stems according to the normalization
        for sentence in self.sentences:
            sentence.stems = [normalize(word) for word in sentence.words]

        # POS normalization only applies to CoreNLP documents
        if getattr(doc, 'is_corenlp_file', False):
            self.normalize_pos_tags()
            self.unescape_punctuation_marks()