def tokenize(self, docs): """ Tokenizes a document, using a lemmatizer. Args: | doc (str) -- the text document to process. Returns: | list -- the list of tokens. """ tokens = [] for doc in docs: toks = [] for t in spacy(doc, tag=True, parse=False, entity=False): token = t.lower_.strip() tag = t.tag_ # Ignore stopwords if token in self.stops: continue # Lemmatize wn_tag = penn_to_wordnet(tag) if wn_tag is not None: lemma = self.lemmr.lemmatize(token, wn_tag) toks.append(lemma) else: toks.append(token) tokens.append(toks) return tokens
def tokenize(self, docs):
    if self.lemmatize:
        lem = WordNetLemmatizer()

    # First pass: candidate keywords per document, via RAKE.
    pre_tdocs = RAKE().tokenize(docs)

    tdocs = []
    for i, tdoc in enumerate(pre_tdocs):
        # Split phrase keywords into 1gram keywords,
        # to check tokens against
        kws_1g = [t.split(' ') for t in tdoc]
        kws_1g = [kw for grp in kws_1g for kw in grp]

        toks = spacy(docs[i], tag=True, parse=False, entity=False)
        tagged = [(t.lower_.strip(), t.tag_) for t in toks]

        # Keep only tokens RAKE flagged as keywords and that have a
        # WordNet-lemmatizable POS tag.
        toks = []
        for tok, tag in tagged:
            if tok in kws_1g:
                wn_tag = penn_to_wordnet(tag)
                if wn_tag is not None:
                    toks.append(lem.lemmatize(tok, wn_tag) if self.lemmatize else tok)
        tdocs.append(toks)

    tdocs = extract_phrases(tdocs, docs)

    if self.prune:
        return prune(tdocs)
    return tdocs

def tokenize(self, docs):
    tags = ['NN', 'NNS', 'NNP', 'NNPS']

    keywords = []
    for doc in docs:
        toks = spacy(doc, tag=True, parse=False, entity=False)
        tagged = [(t.lower_.strip(), t.tag_) for t in toks]

        # Keep noun tokens, then add multi-word noun phrases.
        kws = [t for t, tag in tagged if tag in tags]
        kws += extract_noun_phrases(tagged)
        keywords.append(kws)

    return prune(keywords)

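# `extract_noun_phrases` is referenced above but not defined in this section.
# A minimal sketch of one plausible implementation, assuming the same
# (token, Penn tag) input: join contiguous runs of noun-tagged tokens of
# length two or more into space-delimited keyphrases.
def extract_noun_phrases(tagged, noun_tags=('NN', 'NNS', 'NNP', 'NNPS')):
    phrases = []
    current = []
    for tok, tag in tagged:
        if tag in noun_tags:
            current.append(tok)
        else:
            if len(current) > 1:
                phrases.append(' '.join(current))
            current = []
    if len(current) > 1:
        phrases.append(' '.join(current))
    return phrases
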
def _process_doc(self, doc): """ Applies DCS to a document to extract its core concepts and their weights. """ # Prep doc = doc.lower() tagged_tokens = [(t, penn_to_wordnet(t.tag_)) for t in spacy(doc, tag=True, parse=False, entity=False)] tokens = [t for t, tag in tagged_tokens] term_concept_map = self._disambiguate_doc(tagged_tokens) concept_weights = self._weight_concepts(tokens, term_concept_map) # Compute core semantics lexical_chains = self._lexical_chains(doc, term_concept_map) core_semantics = self._core_semantics(lexical_chains, concept_weights) core_concepts = [c for chain in core_semantics for c in chain] return [(con, concept_weights[con]) for con in core_concepts]
def pre_tokenize(doc, tdoc, lem):
    # Split phrase keywords into 1gram keywords,
    # to check tokens against.
    # We learn keyphrases later on.
    kws_1g = [t.split(' ') for t in tdoc]
    kws_1g = [kw for grp in kws_1g for kw in grp]

    toks = spacy(doc, tag=True, parse=False, entity=False)
    tagged = [(t.lower_.strip(), t.tag_) for t in toks]

    toks = []
    for tok, tag in tagged:
        if tok in kws_1g:
            wn_tag = penn_to_wordnet(tag)
            if wn_tag is not None:
                toks.append(lem.lemmatize(tok, wn_tag))
    return toks

def _tokenize(self, doc):
    toks = []
    for t in spacy(doc, tag=True, parse=False, entity=False):
        token = t.lower_.strip()
        tag = t.tag_

        # Ignore stopwords
        if token in self.stops:
            continue

        # Lemmatize
        wn_tag = penn_to_wordnet(tag)
        if wn_tag is not None:
            lemma = self.lemmr.lemmatize(token, wn_tag)
            toks.append(lemma)
        else:
            toks.append(token)
    return toks
def _process_doc(self, doc): """ Applies DCS to a document to extract its core concepts and their weights. """ # Prep doc = doc.lower() tagged_tokens = [ (t, penn_to_wordnet(t.tag_)) for t in spacy(doc, tag=True, parse=False, entity=False) ] tokens = [t for t, tag in tagged_tokens] term_concept_map = self._disambiguate_doc(tagged_tokens) concept_weights = self._weight_concepts(tokens, term_concept_map) # Compute core semantics lexical_chains = self._lexical_chains(doc, term_concept_map) core_semantics = self._core_semantics(lexical_chains, concept_weights) core_concepts = [c for chain in core_semantics for c in chain] return [(con, concept_weights[con]) for con in core_concepts]