def prune(tdocs):
    """
    Prune terms which are totally subsumed by a phrase
    """
    all_terms = set([t for toks in tdocs for t in toks])
    redundant = {t for t in all_terms if gram_size(t) == 1}

    # This could be more efficient
    for doc in tdocs:
        cleared = set()
        for t in redundant:
            if t not in doc:
                continue

            # If this term occurs outside of a phrase,
            # it is no longer a candidate
            n = doc.count(t)
            d = sum(1 for t_ in doc if t != t_ and t in t_)
            if n > d:
                cleared.add(t)
        redundant = redundant.difference(cleared)

    pruned_tdocs = []
    for doc in tdocs:
        pruned_tdocs.append([t for t in doc if t not in redundant])
    return pruned_tdocs
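# These snippets call a `gram_size()` helper that isn't shown here. A minimal
# sketch of what it's assumed to do: count the whitespace-separated tokens in
# a term, so 'fda' -> 1 and 'health care' -> 2.
def gram_size(term):
    return len(term.split())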
import re
from collections import defaultdict

def extract_phrases(tdocs, docs):
    """
    Learn novel phrases by looking at co-occurrence of candidate term pairings.

    Docs should be input in tokenized (`tdocs`) and untokenized (`docs`) form.
    """
    # Gather existing keyphrases
    keyphrases = set()
    for doc in tdocs:
        for t in doc:
            if gram_size(t) > 1:
                keyphrases.add(t)

    # Count document co-occurrences
    t_counts = defaultdict(int)
    pair_docs = defaultdict(list)
    for i, terms in enumerate(tdocs):
        # We don't convert the doc to a set b/c we want to preserve order
        # Iterate over terms as pairs
        for pair in zip(terms, terms[1:]):
            t_counts[pair] += 1
            pair_docs[pair].append(i)

    # There are a lot of co-occurrences, filter down to those which could
    # potentially be phrases.
    t_counts = {kw: count for kw, count in t_counts.items() if count >= 2}

    # Identify novel phrases by looking at
    # keywords which co-occur some percentage of the time.
    # This could probably be more efficient/cleaned up
    for (kw, kw_), count in t_counts.items():
        # Look for phrases that are space-delimited or joined by 'and' or '-'
        ph_reg = re.compile(r'({0}|{1})(\s|-)(and\s)?({0}|{1})'.format(kw, kw_))

        # Extract candidate phrases and keep track of their counts
        phrases = defaultdict(int)
        phrase_docs = defaultdict(set)
        for i in pair_docs[(kw, kw_)]:
            for m in ph_reg.findall(docs[i].lower()):
                phrases[''.join(m)] += 1
                phrase_docs[''.join(m)].add(i)

        if not phrases:
            continue

        # Get the phrase encountered the most
        top_phrase = max(phrases.keys(), key=lambda k: phrases[k])
        top_count = phrases[top_phrase]

        if top_count/count >= 0.8:
            # Check if this new phrase is contained by an existing keyphrase.
            if any(top_phrase in ph for ph in keyphrases):
                continue
            keyphrases.add(top_phrase)

            # Add the new phrase to each doc it's found in
            for i in phrase_docs[top_phrase]:
                tdocs[i].append(top_phrase)

    return tdocs
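# A small, hypothetical usage sketch for extract_phrases(), assuming the
# gram_size() helper above and the imports shown: the pair ('health', 'care')
# co-occurs in every doc and always appears as the contiguous string
# 'health care', so it gets learned as a keyphrase.
docs = [
    'The health care law is working.',
    'Health care costs keep rising.',
    'I support health care reform.',
]
tdocs = [
    ['health', 'care', 'law'],
    ['health', 'care', 'costs'],
    ['health', 'care', 'reform'],
]
tdocs = extract_phrases(tdocs, docs)
# 'health care' is now appended to each tokenized doc.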
def __call__(self, token_docs):
    filtered_token_docs = []
    for doc in token_docs:
        # Remove keyphrases with more than 3 words to reduce runtime
        filtered_token_docs.append([t for t in doc if gram_size(t) <= 3])
    token_docs = self._preprocess(filtered_token_docs)
    dist_mat = self._distance_matrix(token_docs)
    return dist_mat
def _score(k):
    support = len(aspect_map[k])

    # Require some minimum support.
    if support < min_sup:
        return 0

    scores = []
    for k_ in k.split(', '):
        # Mean IDF was ~15.2, so slightly bias unencountered terms.
        scores.append(idf.get(k_, 15.5)**2 * support * gram_size(k_))
    return sum(scores) / len(scores)
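# A worked example of the arithmetic in _score(), with hypothetical values:
# the grouped key 'rules, regulations' is supported by 4 sentences and both
# keywords are 1-grams with known IDF scores.
idf = {'rules': 10.0, 'regulations': 12.0}
support = 4
scores = [idf[k]**2 * support * 1 for k in ('rules', 'regulations')]  # gram_size == 1
print(sum(scores) / len(scores))  # (400 + 576) / 2 = 488.0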
import re

def markup_highlights(term, doc):
    """
    Highlights each instance of the given term in the document.
    All forms of the term will be highlighted.
    """
    for term in term.split(','):
        term = term.strip()

        # Determine which forms are present for the term in the document
        if gram_size(term) == 1:
            # Replace longer forms first so we don't replace their substrings.
            forms = sorted(lemma_forms(term, doc), key=lambda f: len(f), reverse=True)
        else:
            forms = [term]

        for t in forms:
            # This captures 'F.D.A' if given 'FDA'
            # yeah, it's kind of overkill
            reg_ = '[.]?'.join(list(t))

            # Spaces might be spaces, or they might be hyphens
            reg_ = reg_.replace(' ', r'[\s-]')

            # Only match the term if it is not contiguous with other characters.
            # Otherwise it might be a substring of another word, which we want
            # to ignore.
            reg = '(^|{0})({1})($|{0})'.format('[^A-Za-z]', reg_)
            if re.findall(reg, doc):
                doc = re.sub(reg, r'\g<1><span class="highlight">\g<2></span>\g<3>', doc, flags=re.IGNORECASE)
            else:
                # If no form of the term was found, try with extra alpha characters.
                # This helps if a phrase was newly learned and only assembled in
                # its lemma form, so we may be missing the actual form it appears in.
                reg = '(^|{0})({1}[A-Za-z]?)()'.format('[^A-Za-z]', reg_)
                doc = re.sub(reg, r'\g<1><span class="highlight">\g<2></span>\g<3>', doc, flags=re.IGNORECASE)
    return doc
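# A hypothetical usage sketch for markup_highlights(). A multi-word term skips
# lemma_forms() (which isn't shown here), and the hyphenated form still matches
# because spaces in the term are treated as [\s-]:
doc = 'Rising health-care costs worry voters.'
print(markup_highlights('health care', doc))
# Rising <span class="highlight">health-care</span> costs worry voters.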
def extract_highlights(self, token_docs):
    print('{0} docs...'.format(len(token_docs)))

    # Tokenize sentences,
    # group sentences by their aspects.
    # Keep track of keywords and keyphrases
    keywords = set()
    keyphrases = set()
    aspect_map = defaultdict(set)
    for id, tokens in enumerate(token_docs):
        tokens = set(tokens)
        for t in tokens:
            aspect_map[t].add(id)
            if gram_size(t) > 1:
                keyphrases.add(t)
            else:
                keywords.add(t)

    # Prune aspects
    # If a keyword is encountered as part of a keyphrase, remove overlapping
    # sentences with the keyphrase from the keyword's sentences.
    for kw, kp in ((kw, kp) for kw, kp in product(keywords, keyphrases) if kw in kp):
        aspect_map[kw] = aspect_map[kw].difference(aspect_map[kp])

    # Group terms with common stems
    stem_map = defaultdict(list)
    for kw in keywords:
        stem = stemmer.stem(kw)
        stem_map[stem].append(kw)

    # Group sentences with common aspect stems.
    for stem, kws in stem_map.items():
        if len(kws) == 1:
            continue
        key = ', '.join(kws)
        aspect_map[key] = set()
        for kw in kws:
            aspect_map[key] = aspect_map[key].union(aspect_map[kw])

            # Remove the old keys
            aspect_map.pop(kw, None)

    return aspect_map
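# The `stemmer` above isn't shown; it's assumed to behave like NLTK's
# PorterStemmer. A minimal sketch of the stem-grouping step in isolation,
# with hypothetical keywords:
from collections import defaultdict
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
keywords = {'regulation', 'regulations', 'fda'}
stem_map = defaultdict(list)
for kw in keywords:
    stem_map[stemmer.stem(kw)].append(kw)
# 'regulation' and 'regulations' share a stem, so they end up grouped under a
# single 'regulation, regulations' key; 'fda' stays on its own.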
def prune(tdocs):
    """
    Prune terms which are totally subsumed by a phrase

    This could be better if it just removes the individual keywords
    that occur in a phrase for each time that phrase occurs.
    """
    all_terms = set([t for toks in tdocs for t in toks])

    terms = set()
    phrases = set()
    for t in all_terms:
        if gram_size(t) > 1:
            phrases.add(t)
        else:
            terms.add(t)

    # Identify candidates for redundant terms (1-gram terms found in a phrase)
    redundant = set()
    for t in terms:
        if any(t in ph for ph in phrases):
            redundant.add(t)

    # Search all documents to check that these terms occur
    # only in a phrase. If not, remove it as a candidate.
    # This could be more efficient
    cleared = set()
    for t in redundant:
        if any(check_term(d, term=t) for d in tdocs):
            cleared.add(t)
    redundant = redundant.difference(cleared)

    pruned_tdocs = []
    for doc in tdocs:
        pruned_tdocs.append([t for t in doc if t not in redundant])
    return pruned_tdocs
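# check_term() isn't shown here. An assumed sketch of its behavior, mirroring
# the counting logic in the first version of prune() above: return True if
# `term` occurs in `doc` more often on its own than inside a longer phrase.
def check_term(doc, term):
    if term not in doc:
        return False
    n = doc.count(term)
    d = sum(1 for t in doc if t != term and term in t)
    return n > d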
def _vec_reps(self):
    """
    Creates salience-weighted vector representations for documents
    """
    # Keep track of which term pairs collapse
    self.collapse_map = {}

    # Identify which terms to collapse
    tsimmat = self.w2v_sim_mat.copy()
    tsimmat[np.where(tsimmat == 1.)] = -1
    for term in self.all_terms:
        idx = self.w2v_term_map[term]
        top = np.nanargmax(tsimmat[idx])
        sim = np.nanmax(tsimmat[idx])
        if sim >= 0.8:  # cutoff
            # bleh, find matching term by index
            for k, v in self.w2v_term_map.items():
                if v == top:
                    match = k
                    break

            # Only collapse terms of the same gram size.
            # Phrases which share a word tend to have a higher similarity
            # simply because of that shared word.
            # TODO could collapse terms of diff gram sizes but require a
            # higher sim threshold
            if gram_size(term.term) == gram_size(match.term):
                # If either term is already in the collapse map
                if term in self.collapse_map:
                    self.collapse_map[match] = self.collapse_map[term]
                elif match in self.collapse_map:
                    self.collapse_map[term] = self.collapse_map[match]
                else:
                    self.collapse_map[term] = term
                    self.collapse_map[match] = term

    # Build the reduced term set
    self.collapsed_terms = set()
    for term in self.all_terms:
        self.collapsed_terms.add(self.collapse_map.get(term, term))
    print(len(self.all_terms))
    print(len(self.collapsed_terms))

    terms = list(self.collapsed_terms)

    # Now we can build the vectors
    # TODO make this not ridiculous
    vecs = []
    for d in self.docs:
        vec = []
        for t in terms:
            if t in d:
                vec.append(t.salience)
            else:
                vec.append(0)
        vecs.append(vec)
    vecs = np.array(vecs)
    print(vecs.shape)
    print(vecs)
    return vecs
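# A sketch of one way to compact the vector-building loop at the end of the
# method above, assuming `terms` is the collapsed term list, `docs` stands in
# for self.docs, and each term carries a `salience` attribute. This is just an
# illustration of the TODO, not the project's actual implementation.
import numpy as np

saliences = np.array([t.salience for t in terms])
membership = np.array([[t in d for t in terms] for d in docs], dtype=float)
vecs = membership * saliences  # salience where a term appears in the doc, 0 otherwise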