Example 1
def prune(tdocs):
    """
    Prune terms which are totally subsumed by a phrase
    """
    all_terms = set([t for toks in tdocs for t in toks])
    redundant = {t for t in all_terms if gram_size(t) == 1}

    # This could be more efficient
    for doc in tdocs:
        cleared = set()
        for t in redundant:
            if t not in doc:
                continue

            # Count standalone occurrences (n) and occurrences inside
            # longer phrases (d). If the term occurs outside of a phrase,
            # it is no longer a candidate for pruning.
            n = doc.count(t)
            d = sum(1 for t_ in doc if t != t_ and t in t_)
            if n > d:
                cleared.add(t)

        redundant = redundant.difference(cleared)

    pruned_tdocs = []
    for doc in tdocs:
        pruned_tdocs.append([t for t in doc if t not in redundant])

    return pruned_tdocs
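
A minimal usage sketch for the function above. It assumes a stand-in gram_size helper that counts the words in a term; in the original project gram_size is defined elsewhere.

def gram_size(term):
    # Hypothetical stand-in for the project's gram_size helper
    return len(term.split(' '))

tdocs = [
    ['screen', 'touch screen'],
    ['touch screen', 'battery'],
]

print(prune(tdocs))
# [['touch screen'], ['touch screen', 'battery']]
# 'screen' is pruned: wherever it appears as a standalone token, a phrase
# containing it ('touch screen') appears alongside it.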
Example 2
def extract_phrases(tdocs, docs):
    """
    Learn novel phrases by looking at co-occurrence of candidate term pairings.
    Docs should be input in tokenized (`tdocs`) and untokenized (`docs`) form.
    """
    # Gather existing keyphrases
    keyphrases = set()
    for doc in tdocs:
        for t in doc:
            if gram_size(t) > 1:
                keyphrases.add(t)

    # Count document co-occurrences
    t_counts = defaultdict(int)
    pair_docs = defaultdict(list)
    for i, terms in enumerate(tdocs):
        # We don't convert the doc to a set because we want to preserve order.
        # Iterate over adjacent terms as pairs.
        for pair in zip(terms, terms[1:]):
            t_counts[pair] += 1
            pair_docs[pair].append(i)

    # There are a lot of co-occurrences, filter down to those which could
    # potentially be phrases.
    t_counts = {kw: count for kw, count in t_counts.items() if count >= 2}

    # Identify novel phrases by looking at keyword pairs which, when they
    # co-occur, almost always surface as the same phrase in the raw text.
    # This could probably be more efficient/cleaner.
    for (kw, kw_), count in t_counts.items():
        # Look for phrases that are space-delimited or joined by 'and' or '-'
        # (assumes the terms themselves contain no regex metacharacters)
        ph_reg = re.compile(r'({0}|{1})(\s|-)(and\s)?({0}|{1})'.format(kw, kw_))

        # Extract candidate phrases and keep track of their counts
        phrases = defaultdict(int)
        phrase_docs = defaultdict(set)
        for i in pair_docs[(kw, kw_)]:
            for m in ph_reg.findall(docs[i].lower()):
                phrases[''.join(m)] += 1
                phrase_docs[''.join(m)].add(i)

        if not phrases:
            continue

        # Get the phrase encountered the most
        top_phrase = max(phrases.keys(), key=lambda k: phrases[k])
        top_count = phrases[top_phrase]

        if top_count/count >= 0.8:
            # Check if this new phrase is contained by an existing keyphrase.
            if any(top_phrase in ph for ph in keyphrases):
                continue
            keyphrases.add(top_phrase)

            # Add the new phrase to each doc it's found in
            for i in phrase_docs[top_phrase]:
                tdocs[i].append(top_phrase)

    return tdocs
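
A minimal usage sketch, assuming `re`, `collections.defaultdict`, and a `gram_size` helper (word count of a term) are available as in the original module.

tdocs = [['battery', 'life'], ['battery', 'life'], ['screen']]
docs = ['The battery life is great',
        'Battery life could be better',
        'The screen is acceptable']

print(extract_phrases(tdocs, docs))
# [['battery', 'life', 'battery life'],
#  ['battery', 'life', 'battery life'],
#  ['screen']]
# The pair ('battery', 'life') co-occurs in two docs and always surfaces as
# 'battery life', so it is learned as a new keyphrase and appended to the
# tokenized docs it appears in.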
Example 3
    def __call__(self, token_docs):
        filtered_token_docs = []
        for doc in token_docs:
            # Remove keyphrases with more than 3 words to reduce runtime
            filtered_token_docs.append([t for t in doc if gram_size(t) <= 3])
        token_docs = self._preprocess(filtered_token_docs)
        dist_mat = self._distance_matrix(token_docs)
        return dist_mat
Example 4
        def _score(k):
            support = len(aspect_map[k])

            # Require some minimum support.
            if support < min_sup:
                return 0

            scores = []
            for k_ in k.split(', '):
                # Mean IDF was ~15.2, so give unencountered terms a slightly
                # higher default (15.5).
                scores.append(idf.get(k_, 15.5)**2 * support * gram_size(k_))
            return sum(scores) / len(scores)
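
A short worked example with hypothetical values for the closed-over aspect_map, idf, and min_sup (all of which come from the surrounding method); gram_size('battery') is taken to be 1.

idf = {'battery': 12.0, 'batteries': 14.0}     # hypothetical IDF values
aspect_map = {'battery, batteries': {0, 3, 7}} # support = 3 sentences
min_sup = 2

# battery:   12.0**2 * 3 * 1 = 432.0
# batteries: 14.0**2 * 3 * 1 = 588.0
# _score('battery, batteries') -> (432.0 + 588.0) / 2 == 510.0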
Example 5
def markup_highlights(term, doc):
    """
    Highlights each instance of the given term
    in the document. All forms of the term will be highlighted.
    """
    for term in term.split(','):
        term = term.strip()

        # Determine which forms are present for the term in the document
        if gram_size(term) == 1:
            # Replace longer forms first so we don't replace their substrings.
            forms = sorted(lemma_forms(term, doc),
                           key=lambda f: len(f),
                           reverse=True)
        else:
            forms = [term]

        for t in forms:
            # Also match dotted forms, e.g. 'F.D.A' when given 'FDA'.
            reg_ = '[.]?'.join(list(t))

            # Spaces might be literal spaces, or they might be hyphens
            reg_ = reg_.replace(' ', r'[\s-]')

            # Only match the term when it is not contiguous with other letters;
            # otherwise it might be a substring of another word, which we
            # want to ignore.
            reg = '(^|{0})({1})($|{0})'.format('[^A-Za-z]', reg_)

            if re.findall(reg, doc):
                doc = re.sub(reg,
                             r'\g<1><span class="highlight">\g<2></span>\g<3>',
                             doc,
                             flags=re.IGNORECASE)
            else:
                # If the term wasn't found, allow an extra trailing letter.
                # This helps when a phrase was newly learned and assembled only
                # in its lemma form, so the surface form in the text may differ.
                reg = '(^|{0})({1}[A-Za-z]?)()'.format('[^A-Za-z]', reg_)
                doc = re.sub(reg,
                             r'\g<1><span class="highlight">\g<2></span>\g<3>',
                             doc,
                             flags=re.IGNORECASE)

    return doc
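
A minimal usage sketch. Only the multi-word branch is exercised, so the lemma_forms helper is never called; gram_size is assumed to count the words in a term, as elsewhere in the module.

doc = 'The battery life is great, but battery-life varies.'
print(markup_highlights('battery life', doc))
# The <span class="highlight">battery life</span> is great, but
# <span class="highlight">battery-life</span> varies.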
Example 6
    def extract_highlights(self, token_docs):
        print('{0} docs...'.format(len(token_docs)))

        # Group sentences by their aspects,
        # keeping track of keywords (1-grams) and keyphrases (n-grams).
        keywords = set()
        keyphrases = set()
        aspect_map = defaultdict(set)
        for id, tokens in enumerate(token_docs):
            tokens = set(tokens)
            for t in tokens:
                aspect_map[t].add(id)
                if gram_size(t) > 1:
                    keyphrases.add(t)
                else:
                    keywords.add(t)

        # Prune aspects:
        # if a keyword also appears as part of a keyphrase, remove the
        # keyphrase's sentences from the keyword's sentence set.
        for kw, kp in ((kw, kp) for kw, kp in product(keywords, keyphrases)
                       if kw in kp):
            aspect_map[kw] = aspect_map[kw].difference(aspect_map[kp])

        # Group terms with common stems
        stem_map = defaultdict(list)
        for kw in keywords:
            stem = stemmer.stem(kw)
            stem_map[stem].append(kw)

        # Group sentences with common aspect stems.
        for stem, kws in stem_map.items():
            if len(kws) == 1:
                continue

            key = ', '.join(kws)
            aspect_map[key] = set()
            for kw in kws:
                aspect_map[key] = aspect_map[key].union(aspect_map[kw])

                # Remove the old keys
                aspect_map.pop(kw, None)

        return aspect_map
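
A small sketch of the stem-grouping step above, assuming an NLTK PorterStemmer as the module-level `stemmer` (a stand-in here; the original module defines its own stemmer).

from collections import defaultdict
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

keywords = {'battery', 'batteries', 'screen'}
stem_map = defaultdict(list)
for kw in keywords:
    stem_map[stemmer.stem(kw)].append(kw)

# 'battery' and 'batteries' typically share a stem ('batteri'), so their
# sentence sets are merged under a combined key like 'battery, batteries',
# while 'screen' keeps its own entry.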
Example 7
def prune(tdocs):
    """
    Prune terms which are totally subsumed by a phrase

    This could be improved by removing one standalone keyword occurrence
    for each time a phrase containing that keyword occurs.
    """
    all_terms = set([t for toks in tdocs for t in toks])
    terms = set()
    phrases = set()
    for t in all_terms:
        if gram_size(t) > 1:
            phrases.add(t)
        else:
            terms.add(t)

    # Identify candidates for redundant terms (1-gram terms found in a phrase)
    redundant = set()
    for t in terms:
        if any(t in ph for ph in phrases):
            redundant.add(t)

    # Search all documents to check that these terms only ever occur
    # inside a phrase. If a term also occurs on its own, remove it
    # as a candidate. This could be more efficient.
    cleared = set()
    for t in redundant:
        if any(check_term(d, term=t) for d in tdocs):
            cleared.add(t)

    redundant = redundant.difference(cleared)

    pruned_tdocs = []
    for doc in tdocs:
        pruned_tdocs.append([t for t in doc if t not in redundant])

    return pruned_tdocs
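
The check_term helper is not shown in this snippet. A plausible stand-in, mirroring the counting logic from Example 1, would be:

def check_term(doc, term):
    # True if the term occurs on its own more often than it occurs
    # inside longer phrases, i.e. it still carries standalone meaning.
    standalone = doc.count(term)
    in_phrases = sum(1 for t in doc if t != term and term in t)
    return standalone > in_phrases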
Example 8
    def _vec_reps(self):
        """
        Creates salience-weighted vector representations for documents
        """

        # Keep track of which term pairs collapse
        self.collapse_map = {}

        # Identify which terms to collapse
        tsimmat = self.w2v_sim_mat.copy()
        tsimmat[np.where(tsimmat == 1.)] = -1
        for term in self.all_terms:
            idx = self.w2v_term_map[term]
            top = np.nanargmax(tsimmat[idx])
            sim = np.nanmax(tsimmat[idx])

            if sim >= 0.8: # similarity cutoff
                # Find the matching term by reverse lookup in the term map
                for k, v in self.w2v_term_map.items():
                    if v == top:
                        match = k
                        break

                # Only collapse terms of the same gram size: phrases that share
                # a word tend to have inflated similarity simply because of the
                # shared word.
                # TODO: terms of different gram sizes could be collapsed with a
                # higher similarity threshold.
                if gram_size(term.term) == gram_size(match.term):
                    # If either term is already in the collapse map,
                    # map the other onto the same representative
                    if term in self.collapse_map:
                        self.collapse_map[match] = self.collapse_map[term]
                    elif match in self.collapse_map:
                        self.collapse_map[term] = self.collapse_map[match]
                    else:
                        self.collapse_map[term] = term
                        self.collapse_map[match] = term

        # Build the reduced term set
        self.collapsed_terms = set()
        for term in self.all_terms:
            self.collapsed_terms.add(self.collapse_map.get(term, term))

        print(len(self.all_terms))
        print(len(self.collapsed_terms))

        terms = list(self.collapsed_terms)

        # Build the salience-weighted document vectors
        # TODO: vectorize this instead of looping
        vecs = []
        for d in self.docs:
            vec = []
            for t in terms:
                if t in d:
                    vec.append(t.salience)
                else:
                    vec.append(0)
            vecs.append(vec)

        vecs = np.array(vecs)
        print(vecs.shape)
        print(vecs)
        return vecs
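
A minimal sketch of the final vector-building step, using hypothetical Term objects that carry a precomputed salience score (the actual terms and their salience values come from elsewhere in the class).

import numpy as np
from collections import namedtuple

Term = namedtuple('Term', ['term', 'salience'])
battery = Term('battery', 2.5)
screen = Term('screen', 1.0)

terms = [battery, screen]
docs = [{battery}, {battery, screen}, {screen}]

vecs = np.array([[t.salience if t in d else 0 for t in terms] for d in docs])
# [[2.5, 0.0],
#  [2.5, 1.0],
#  [0.0, 1.0]]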