import re


def lemma_components(sense, etyma):
    etyma = [etymon.lemma for etymon in etyma
             if etymon.language == 'English' and
             not etymon.lemma_manager().is_affix()]

    components = []
    if ' ' not in sense.lemma and '-' not in sense.lemma:
        components.append(sense.lemma)

    if not sense.is_derivative():
        components = [re.sub(r'(.)\'s$', r'\1', w).strip().lower()
                      for w in sense.lemma_manager().decompose(base=sense.lemma)]
    components.extend(etyma)

    # Add in the entry headword. This will be overkill in most cases -
    #  except for the case of derivatives, the headword should already
    #  be included by virtue of decompose() above - but will cover the
    #  occasional cases where a compound has not been decomposed
    #  correctly
    if not sense.headword_manager().is_affix():
        components.append(sense.headword_manager().lemma)

    # Remove junk
    components = [w.lower() for w in components if len(w) >= 3
                  and w.lower() not in ('the', 'and')]

    # Porter-stem, so as to align with other definition keywords
    return [porter_stem(w) for w in components]
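porter_stem and the sense/etymon objects used above belong to the surrounding project and are not defined in this snippet. As a rough sketch, assuming porter_stem is just a thin wrapper around NLTK's Porter stemmer, it might look like this:

# Hypothetical sketch: porter_stem as a thin wrapper around NLTK's
# PorterStemmer; the real project presumably defines its own helper.
from nltk.stem.porter import PorterStemmer

_STEMMER = PorterStemmer()

def porter_stem(word):
    """Return the Porter stem of a single lower-cased word."""
    return _STEMMER.stem(word)

# e.g. porter_stem('running') -> 'run'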
    def title_words(self, title):
        if not title:
            return set()

        title = re.sub('(\u2013|-|\'s )', ' ', title.lower())
        title = re.sub(r"[,:;()']", '', title)
        words = [w.strip() for w in title.split() if w.strip()]

        wordset = set()
        for i, w in enumerate(words):
            if w.endswith('.'):
                if i == 0 and w in TitleWords.expansions['first']:
                    w = TitleWords.expansions['first'][w]
                elif i == len(words)-1 and w in TitleWords.expansions['last']:
                    w = TitleWords.expansions['last'][w]
                elif w in TitleWords.expansions['all']:
                    w = TitleWords.expansions['all'][w]
                w = finish_expansion(w)
            if re.search(r'^[a-z]+$', w) and len(w) >= 4:
                wordset.add(porter_stem(w))

        return wordset
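The normalization at the top of title_words can be exercised on its own; a small sketch with a made-up title:

import re

# Made-up title, passed through the same normalization steps as above
title = "Chaucer's Canterbury Tales \u2013 a reader's guide, vol. 2"
title = re.sub('(\u2013|-|\'s )', ' ', title.lower())
title = re.sub(r"[,:;()']", '', title)
print(title.split())
# ['chaucer', 'canterbury', 'tales', 'a', 'reader', 'guide', 'vol.', '2']

Only 'vol.' would then be a candidate for the abbreviation-expansion step, and only purely alphabetic words of four or more letters make it into the stemmed word set.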
Example #3
def _normalize(word):
    """
    Return a stemmed/modernized version of the token
    """
    word = stringtools.porter_stem(word.lower().strip())
    word = word.replace(' ', '').replace('-', '').replace('.', '')
    return MODERNIZER.edit(word)
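MODERNIZER is project-specific; a rough stand-in, assuming it simply maps archaic spellings to modern equivalents, could be:

class SpellingModernizer:
    """Hypothetical stand-in for the project's MODERNIZER: maps archaic
    spellings to modern equivalents and leaves other words untouched."""

    def __init__(self, mapping):
        self.mapping = mapping

    def edit(self, word):
        return self.mapping.get(word, word)

MODERNIZER = SpellingModernizer({'shew': 'show', 'musick': 'music'})
print(MODERNIZER.edit('shew'))   # -> 'show'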
def _load_stoptitlewords(self):
    f = os.path.join(self.dir, "stoptitlewords.txt")
    with open(f, "r") as fh:
        lines = fh.readlines()
        for l in lines:
            l = l.lower().strip().strip(" .")
            if l:
                KeywordsFilter.stoptitlewords.add(porter_stem(l))

def filter_titlewords(self, keywords, lemma=None):
    if lemma is not None:
        lemma = porter_stem(lemma)[0:8]
    # Filter out stopwords
    keywords = self._filter_stoptitlewords(keywords)
    keywords2 = set()
    for k in keywords:
        k = k.replace("-", "")
        # cut down to just the first 8 characters
        k = k[0:8]
        if lemma is None or lemma != k:
            keywords2.add(k)
    # Filter again, to be on the safe side
    return self._filter_stoptitlewords(keywords2)
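A standalone illustration of the truncation logic in filter_titlewords (hypothetical names; the real method also consults the class-level stop-word set): cutting stems to eight characters makes long derivatives collide with the lemma's own stem, so title words that merely repeat the lemma are dropped.

def drop_lemma_matches(keyword_stems, lemma_stem):
    """Hypothetical sketch: cut every stem down to 8 characters, then
    drop any keyword that matches the (equally truncated) lemma stem."""
    lemma_key = lemma_stem[:8]
    kept = set()
    for k in keyword_stems:
        k = k.replace('-', '')[:8]
        if k != lemma_key:
            kept.add(k)
    return kept

# 'dictionari' (the stem of 'dictionaries') collides with 'dictionary'
# once both are truncated to 8 characters, so it is filtered out.
print(drop_lemma_matches({'dictionari', 'histor'}, 'dictionary'))   # {'histor'}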
Example #6
def tokens(self):
    if self.label is None:
        return []
    else:
        label = self.label.lower()
        for replacement in '(),;.-':
            label = label.replace(replacement, ' ')
        for replacement in ('by way of', 'by means of', 'as regards'):
            label = label.replace(replacement, ' ')
        label = re.sub(r'  +', ' ', label)
        tokens = [stringtools.porter_stem(t)
                  for t in label.strip().split(' ')
                  if t not in thesaurusdbconfig.STOPWORDS]
        tokens.sort(key=len, reverse=True)
        return tokens
Example #7
    def ranked_collocates(self, lemma):
        """
        Score each word-like token in the quotation text, apart from
        the keyword(s) itself.

        Score is determined by distance from the keyword
        (minimum distance, if the token occurs more than once),
        up to a maximum of 10.

        Returns a list of 2-tuples (ranked by score). Each 2-tuple consists of:
         -- the lemma stem (as returned by the Porter stemmer);
         -- the score (1-10).
        """
        kw_index = self.keyword_index(lemma=lemma)
        if kw_index:
            keyword_start, keyword_end = kw_index
        else:
            keyword_start, keyword_end = (None, None)

        collocates = defaultdict(list)
        for i, token in enumerate(self.tokens):
            # How far is this token from the keyword (assuming we've
            #  located the keyword)?
            if keyword_start is None:
                distance = 10
            elif i < keyword_start:
                distance = keyword_start - i
            elif i > keyword_end:
                distance = i - keyword_end
            else:
                distance = 10

            if re.search(r'^([a-zA-Z]+|[a-zA-Z]+-[a-zA-Z]+)$', token):
                token = token.lower()
                if self.year < 1800:
                    token = MODERNIZER.edit(token)
                stem = stringtools.porter_stem(token)
                collocates[stem].append(distance)

        collrank = [(token, min(distances))
                    for token, distances in collocates.items()]
        collrank.sort(key=lambda token: token[1], reverse=True)
        return collrank
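As a self-contained sketch of the proximity scoring above (hypothetical function name; the real method also filters non-word tokens, modernizes pre-1800 spellings, and Porter-stems each token):

from collections import defaultdict

def collocate_distances(tokens, keyword_start, keyword_end):
    """Toy version of the distance scoring: for every token outside the
    keyword span, keep its minimum distance from that span."""
    distances = defaultdict(list)
    for i, token in enumerate(tokens):
        if keyword_start <= i <= keyword_end:
            continue  # skip the keyword itself
        if i < keyword_start:
            d = keyword_start - i
        else:
            d = i - keyword_end
        distances[token].append(d)
    return {tok: min(ds) for tok, ds in distances.items()}

# Keyword 'sat' occupies index 2 in this toy quotation
print(collocate_distances(['the', 'cat', 'sat', 'on', 'the', 'mat'], 2, 2))
# {'the': 2, 'cat': 1, 'on': 1, 'mat': 3}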
Example #8
    def tokens(self):
        """
        Return a list of tokens from the definition (words only;
        no punctuation, numbers, etc.).

        The list is lower-cased, Porter-stemmed, and alpha-sorted.
        """
        try:
            return self._tokens
        except AttributeError:
            self._tokens = set()
            serialized = etree.tounicode(self.node_stripped())
            serialized = ELEMENT_REMOVER.sub('', serialized)
            serialized = re.sub(r'<[^<>]*>', ' ', serialized)
            tokens = stringtools.word_tokens(serialized)
            for text in [t for t in tokens if re.search(r'[a-zA-Z]{3}', t)
                         and t.lower() not in STOPWORDS]:
                text = text.lower().strip('.,;: -()')
                self._tokens.add(stringtools.porter_stem(text))
            self._tokens = sorted(list(self._tokens))
            return self._tokens
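ELEMENT_REMOVER, stringtools.word_tokens, and STOPWORDS come from the surrounding project; a self-contained approximation of the same pipeline, using a plain regex tokenizer, NLTK's Porter stemmer, and a made-up stop-word list, could look like:

import re
from nltk.stem.porter import PorterStemmer

STEMMER = PorterStemmer()
STOPWORDS = {'the', 'and', 'with'}   # hypothetical; the real list is project-specific

def definition_tokens(serialized_xml):
    """Strip tags, tokenize, drop stopwords and short tokens, stem, sort."""
    text = re.sub(r'<[^<>]*>', ' ', serialized_xml)
    tokens = set()
    for word in re.findall(r'[a-zA-Z]+', text):
        word = word.lower()
        if len(word) >= 3 and word not in STOPWORDS:
            tokens.add(STEMMER.stem(word))
    return sorted(tokens)

print(definition_tokens('<def>A <hi>running</hi> account of events</def>'))
# ['account', 'event', 'run']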