Code Example #1
File: lemma.py  Project: redreamality/broca
    def tokenize(self, docs):
        """ Tokenizes a document, using a lemmatizer.

        Args:
            | doc (str)                 -- the text document to process.

        Returns:
            | list                      -- the list of tokens.
        """
        tokens = []

        for doc in docs:
            toks = []

            for t in spacy(doc, tag=True, parse=False, entity=False):
                token = t.lower_.strip()
                tag = t.tag_

                # Ignore stopwords
                if token in self.stops:
                    continue

                # Lemmatize
                wn_tag = penn_to_wordnet(tag)
                if wn_tag is not None:
                    lemma = self.lemmr.lemmatize(token, wn_tag)
                    toks.append(lemma)
                else:
                    toks.append(token)

            tokens.append(toks)

        return tokens
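
Every example in this listing calls a penn_to_wordnet helper before lemmatizing. Its source is not shown here; the sketch below is a minimal, commonly used mapping from Penn Treebank tags to NLTK's WordNet POS constants, offered as an assumption about what the project's helper does rather than a copy of it.

from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    """Map a Penn Treebank tag to a WordNet POS constant, or None.

    Minimal sketch; the project's actual helper may differ in detail.
    """
    if tag.startswith('NN'):      # nouns: NN, NNS, NNP, NNPS
        return wordnet.NOUN
    if tag.startswith('VB'):      # verbs: VB, VBD, VBG, VBN, VBP, VBZ
        return wordnet.VERB
    if tag.startswith('JJ'):      # adjectives: JJ, JJR, JJS
        return wordnet.ADJ
    if tag.startswith('RB'):      # adverbs: RB, RBR, RBS
        return wordnet.ADV
    return None                   # no WordNet counterpart; callers keep the raw token
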
Code Example #2
File: overkill.py  Project: redreamality/broca
    def tokenize(self, docs):
        if self.lemmatize:
            lem = WordNetLemmatizer()

        pre_tdocs = RAKE().tokenize(docs)

        tdocs = []
        for i, tdoc in enumerate(pre_tdocs):
            # Split phrase keywords into 1gram keywords,
            # to check tokens against
            kws_1g = [t.split(' ') for t in tdoc]
            kws_1g = [kw for grp in kws_1g for kw in grp]

            toks = spacy(docs[i], tag=True, parse=False, entity=False)
            tagged = [(t.lower_.strip(), t.tag_) for t in toks]

            toks = []
            for tok, tag in tagged:
                if tok in kws_1g:
                    wn_tag = penn_to_wordnet(tag)
                    if wn_tag is not None:
                        # Keep the raw token when lemmatization is disabled
                        # (otherwise `lem` is unbound at this point).
                        toks.append(lem.lemmatize(tok, wn_tag) if self.lemmatize else tok)
            tdocs.append(toks)

        tdocs = extract_phrases(tdocs, docs)
        # `prune` is the helper function called just below, so a bare
        # `if prune:` is always true; a tokenizer flag (assumed here as
        # self.prune) is presumably what was intended.
        if self.prune:
            return prune(tdocs)
        return tdocs
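
The tag mapping matters because NLTK's WordNetLemmatizer is part-of-speech sensitive; the same surface form lemmatizes differently depending on the POS it is given. A small standalone illustration using the standard NLTK API (independent of broca):

from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()
print(lem.lemmatize('running', wordnet.VERB))   # -> run
print(lem.lemmatize('running', wordnet.NOUN))   # -> running
print(lem.lemmatize('mice', wordnet.NOUN))      # -> mouse
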
Code Example #3
File: pos.py  Project: colinsongf/doc_similarity
    def tokenize(self, docs):
        tags = ['NN', 'NNS', 'NNP', 'NNPS']

        keywords = []
        for doc in docs:
            toks = spacy(doc, tag=True, parse=False, entity=False)
            tagged = [(t.lower_.strip(), t.tag_) for t in toks]
            kws = [t for t, tag in tagged if tag in tags]
            kws += extract_noun_phrases(tagged)
            keywords.append(kws)
        return prune(keywords)
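
For orientation, a hedged usage sketch of the noun-keyword tokenizer above; the wrapper class name POSTokenizer is assumed for illustration and is not confirmed by this listing.

# Hypothetical usage; POSTokenizer is an assumed class exposing the
# tokenize() method shown above.
docs = [
    "Topic models extract latent topics from a document collection.",
    "The document collection is preprocessed before topic modeling.",
]
keyword_lists = POSTokenizer().tokenize(docs)
# keyword_lists holds, per document, the lowercased nouns
# (NN/NNS/NNP/NNPS) plus extracted noun phrases, after pruning.
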
Code Example #4
File: dcs.py  Project: MaxwellRebo/broca
    def _process_doc(self, doc):
        """
        Applies DCS to a document to extract its core concepts and their weights.
        """
        # Prep
        doc = doc.lower()
        tagged_tokens = [
            (t, penn_to_wordnet(t.tag_))
            for t in spacy(doc, tag=True, parse=False, entity=False)
        ]
        tokens = [t for t, tag in tagged_tokens]
        term_concept_map = self._disambiguate_doc(tagged_tokens)
        concept_weights = self._weight_concepts(tokens, term_concept_map)

        # Compute core semantics
        lexical_chains = self._lexical_chains(doc, term_concept_map)
        core_semantics = self._core_semantics(lexical_chains, concept_weights)
        core_concepts = [c for chain in core_semantics for c in chain]

        return [(con, concept_weights[con]) for con in core_concepts]
Code Example #5
File: overkill.py  Project: MaxwellRebo/broca
def pre_tokenize(doc, tdoc, lem):
    # Split phrase keywords into 1gram keywords,
    # to check tokens against
    # We learn keyphrases later on.
    kws_1g = [t.split(' ') for t in tdoc]
    kws_1g = [kw for grp in kws_1g for kw in grp]

    toks = spacy(doc, tag=True, parse=False, entity=False)
    tagged = [(t.lower_.strip(), t.tag_) for t in toks]

    toks = []
    for tok, tag in tagged:
        if tok in kws_1g:
            wn_tag = penn_to_wordnet(tag)
            if wn_tag is not None:
                toks.append(lem.lemmatize(tok, wn_tag))

    return toks
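
Because pre_tokenize is a module-level function that takes the lemmatizer as an argument, it can be mapped over documents outside the class (for example, in a multiprocessing pool). A hedged driver sketch; the wiring here is an assumption for illustration, not taken from the project:

from nltk.stem.wordnet import WordNetLemmatizer

# RAKE() and pre_tokenize are the objects from the snippets above.
docs = ["First document text ...", "Second document text ..."]
lem = WordNetLemmatizer()
pre_tdocs = RAKE().tokenize(docs)   # keyphrase candidates, one list per doc
tdocs = [pre_tokenize(doc, tdoc, lem) for doc, tdoc in zip(docs, pre_tdocs)]
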
Code Example #6
    def _tokenize(self, doc):
        toks = []

        for t in spacy(doc, tag=True, parse=False, entity=False):
            token = t.lower_.strip()
            tag = t.tag_

            # Ignore stopwords
            if token in self.stops:
                continue

            # Lemmatize
            wn_tag = penn_to_wordnet(tag)
            if wn_tag is not None:
                lemma = self.lemmr.lemmatize(token, wn_tag)
                toks.append(lemma)
            else:
                toks.append(token)
        return toks
Code Example #7
File: dcs.py  Project: colinsongf/doc_similarity
    def _process_doc(self, doc):
        """
        Applies DCS to a document to extract its core concepts and their weights.
        """
        # Prep
        doc = doc.lower()
        tagged_tokens = [
            (t, penn_to_wordnet(t.tag_))
            for t in spacy(doc, tag=True, parse=False, entity=False)
        ]
        tokens = [t for t, tag in tagged_tokens]
        term_concept_map = self._disambiguate_doc(tagged_tokens)
        concept_weights = self._weight_concepts(tokens, term_concept_map)

        # Compute core semantics
        lexical_chains = self._lexical_chains(doc, term_concept_map)
        core_semantics = self._core_semantics(lexical_chains, concept_weights)
        core_concepts = [c for chain in core_semantics for c in chain]

        return [(con, concept_weights[con]) for con in core_concepts]
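
The (concept, weight) pairs returned by _process_doc amount to a sparse concept vector for the document. As one illustration of how that output could be consumed (not the project's actual similarity measure), a cosine over shared concepts might look like this:

from math import sqrt

def concept_cosine(core_a, core_b):
    """Cosine similarity over two lists of (concept, weight) pairs.

    Illustrative sketch only; the DCS projects above may combine
    core concepts differently.
    """
    va, vb = dict(core_a), dict(core_b)
    dot = sum(w * vb[c] for c, w in va.items() if c in vb)
    norm_a = sqrt(sum(w * w for w in va.values()))
    norm_b = sqrt(sum(w * w for w in vb.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0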