def lemma_components(sense, etyma):
    etyma = [etymon.lemma for etymon in etyma
             if etymon.language == 'English'
             and not etymon.lemma_manager().is_affix()]
    components = []
    if ' ' not in sense.lemma and '-' not in sense.lemma:
        components.append(sense.lemma)
    if not sense.is_derivative():
        components = [re.sub(r"(.)'s$", r'\1', w).strip().lower()
                      for w in sense.lemma_manager().decompose(base=sense.lemma)]
    components.extend(etyma)

    # Add in the entry headword. This will be overkill in most cases -
    # except for the case of derivatives, the headword should already
    # be included by virtue of decompose() above - but will cover the
    # occasional cases where a compound has not been decomposed
    # correctly.
    if not sense.headword_manager().is_affix():
        components.append(sense.headword_manager().lemma)

    # Remove junk
    components = [w.lower() for w in components
                  if len(w) >= 3 and w.lower() not in ('the', 'and')]

    # Porter-stem, so as to align with other definition keywords
    return [porter_stem(w) for w in components]
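# Illustrative sketch (not part of the original module): roughly how the
# cleanup above treats a decomposed lemma, using NLTK's PorterStemmer as a
# stand-in for the project's porter_stem() helper. clean_components() and
# the sample input are invented for the example.
import re

from nltk.stem.porter import PorterStemmer

_STEMMER = PorterStemmer()


def clean_components(raw_components):
    # Strip trailing possessives ("cat's" -> "cat"), lower-case, drop very
    # short words and common function words, then Porter-stem.
    words = [re.sub(r"(.)'s$", r'\1', w).strip().lower() for w in raw_components]
    words = [w for w in words if len(w) >= 3 and w not in ('the', 'and')]
    return [_STEMMER.stem(w) for w in words]


print(clean_components(["cat's", 'cradle', 'the']))  # -> ['cat', 'cradl']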
def title_words(self, title):
    if not title:
        return set()
    title = re.sub("(\u2013|-|'s )", ' ', title.lower())
    title = re.sub(r"[,:;()']", '', title)
    words = [w.strip() for w in title.split() if w.strip()]

    wordset = set()
    for i, w in enumerate(words):
        # Expand abbreviations; some expansions apply only in first or
        # last position in the title.
        if w.endswith('.'):
            if i == 0 and w in TitleWords.expansions['first']:
                w = TitleWords.expansions['first'][w]
            elif i == len(words) - 1 and w in TitleWords.expansions['last']:
                w = TitleWords.expansions['last'][w]
            elif w in TitleWords.expansions['all']:
                w = TitleWords.expansions['all'][w]
            w = finish_expansion(w)
        if re.search(r'^[a-z]+$', w) and len(w) >= 4:
            wordset.add(porter_stem(w))
    return wordset
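# Toy illustration (not from the original code) of the position-sensitive
# abbreviation expansion above. The expansion tables here are invented
# placeholders; the real ones live in TitleWords.expansions, keyed by
# 'first', 'last' and 'all'.
expansions = {
    'first': {'trans.': 'transactions'},
    'last': {'mag.': 'magazine'},
    'all': {'jrnl.': 'journal'},
}

words = ['trans.', 'of', 'the', 'geological', 'society']
expanded = []
for i, w in enumerate(words):
    if w.endswith('.'):
        if i == 0 and w in expansions['first']:
            w = expansions['first'][w]
        elif i == len(words) - 1 and w in expansions['last']:
            w = expansions['last'][w]
        elif w in expansions['all']:
            w = expansions['all'][w]
    expanded.append(w)

print(expanded)  # -> ['transactions', 'of', 'the', 'geological', 'society']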
def _normalize(word):
    """
    Return a stemmed/modernized version of the token.
    """
    word = stringtools.porter_stem(word.lower().strip())
    word = word.replace(' ', '').replace('-', '').replace('.', '')
    return MODERNIZER.edit(word)
def _load_stoptitlewords(self):
    f = os.path.join(self.dir, "stoptitlewords.txt")
    with open(f, "r") as fh:
        lines = fh.readlines()
    for line in lines:
        line = line.lower().strip().strip(" .")
        if line:
            KeywordsFilter.stoptitlewords.add(porter_stem(line))
def filter_titlewords(self, keywords, lemma=None):
    if lemma is not None:
        lemma = porter_stem(lemma)[0:8]

    # Filter out stopwords
    keywords = self._filter_stoptitlewords(keywords)

    keywords2 = set()
    for k in keywords:
        k = k.replace("-", "")
        # Cut down to just the first 8 characters
        k = k[0:8]
        if lemma is None or lemma != k:
            keywords2.add(k)

    # Filter again, to be on the safe side
    return self._filter_stoptitlewords(keywords2)
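# Rough sketch (invented example) of the truncation logic above: both the
# lemma stem and each keyword are cut to their first eight characters before
# comparison, so keywords that share a long prefix with the lemma are
# discarded along with exact matches. The stems below are hard-coded
# stand-ins for porter_stem() output.
lemma_stem = 'photographi'[0:8]                   # -> 'photogra'
keywords = {'photographi', 'camera', 'portrait'}  # already-stemmed keywords
kept = {k.replace('-', '')[0:8] for k in keywords
        if k.replace('-', '')[0:8] != lemma_stem}
print(sorted(kept))  # -> ['camera', 'portrait']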
def tokens(self):
    if self.label is None:
        return []
    label = self.label.lower()
    for char in '(),;.-':
        label = label.replace(char, ' ')
    for phrase in ('by way of', 'by means of', 'as regards'):
        label = label.replace(phrase, ' ')
    label = re.sub(r' +', ' ', label)
    tokens = [stringtools.porter_stem(t) for t in label.strip().split(' ')
              if t not in thesaurusdbconfig.STOPWORDS]
    tokens.sort(key=len, reverse=True)
    return tokens
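# Minimal standalone illustration of the label-token pipeline above, with a
# made-up stopword list standing in for thesaurusdbconfig.STOPWORDS and the
# Porter-stemming step omitted for brevity.
import re

STOPWORDS = {'of', 'the', 'to', 'a', 'in'}

label = 'Movement by way of water (transport)'.lower()
for char in '(),;.-':
    label = label.replace(char, ' ')
for phrase in ('by way of', 'by means of', 'as regards'):
    label = label.replace(phrase, ' ')
label = re.sub(r' +', ' ', label)

tokens = [t for t in label.strip().split(' ') if t not in STOPWORDS]
tokens.sort(key=len, reverse=True)
print(tokens)  # -> ['transport', 'movement', 'water']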
def ranked_collocates(self, lemma):
    """
    Score each word-like token in the quotation text, apart from the
    keyword(s) itself. Score is determined by distance from the keyword
    (minimum distance, if the token occurs more than once), up to a
    maximum of 10.

    Returns a list of 2-ples (ranked by score). Each 2-ple consists of:
     -- the lemma stem (as returned by the Porter stemmer);
     -- the score (1-10).
    """
    kw_index = self.keyword_index(lemma=lemma)
    if kw_index:
        keyword_start, keyword_end = kw_index
    else:
        keyword_start, keyword_end = (None, None)

    collocates = defaultdict(list)
    for i, token in enumerate(self.tokens):
        # How far is this token from the keyword (assuming we've
        # located the keyword)?
        if keyword_start is None:
            distance = 10
        elif i < keyword_start:
            distance = keyword_start - i
        elif i > keyword_end:
            distance = i - keyword_end
        else:
            distance = 10
        if re.search(r'^([a-zA-Z]+|[a-zA-Z]+-[a-zA-Z]+)$', token):
            token = token.lower()
            if self.year < 1800:
                token = MODERNIZER.edit(token)
            stem = stringtools.porter_stem(token)
            collocates[stem].append(distance)

    collrank = [(token, min(distances))
                for token, distances in collocates.items()]
    collrank.sort(key=lambda token: token[1], reverse=True)
    return collrank
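# Worked toy example (not from the original code) of the distance scoring
# above. The keyword span and token list are invented; the real method also
# lower-cases, modernizes and Porter-stems each token before recording it.
from collections import defaultdict

tokens = ['the', 'dog', 'barked', 'at', 'the', 'dog']
keyword_start, keyword_end = 2, 2        # suppose 'barked' is the keyword

collocates = defaultdict(list)
for i, token in enumerate(tokens):
    if i < keyword_start:
        distance = keyword_start - i
    elif i > keyword_end:
        distance = i - keyword_end
    else:
        distance = 10                    # the keyword itself scores 10
    collocates[token].append(distance)

ranked = sorted(((t, min(d)) for t, d in collocates.items()),
                key=lambda pair: pair[1], reverse=True)
print(ranked)  # -> [('barked', 10), ('the', 2), ('dog', 1), ('at', 1)]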
def tokens(self):
    """
    Return a list of tokens from the definition (words only; no
    punctuation, numbers, etc.). The list is lower-cased,
    Porter-stemmed, and alpha-sorted.
    """
    try:
        return self._tokens
    except AttributeError:
        self._tokens = set()
        serialized = etree.tounicode(self.node_stripped())
        serialized = ELEMENT_REMOVER.sub('', serialized)
        serialized = re.sub(r'<[^<>]*>', ' ', serialized)
        tokens = stringtools.word_tokens(serialized)
        for text in [t for t in tokens if re.search(r'[a-zA-Z]{3}', t)
                     and t.lower() not in STOPWORDS]:
            text = text.lower().strip('.,;: -()')
            self._tokens.add(stringtools.porter_stem(text))
        self._tokens = sorted(self._tokens)
        return self._tokens
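# Standalone sketch of the definition-token pipeline above, using plain
# regexes in place of the project's ELEMENT_REMOVER pattern and
# word_tokens() helper, and omitting the Porter-stemming step. The XML
# snippet, element names and stopword list are invented for the example.
import re

STOPWORDS = {'the', 'and', 'for', 'a'}

serialized = '<def>A <xr>small</xr> instrument for measuring time.</def>'
serialized = re.sub(r'<xr>.*?</xr>', '', serialized)  # drop unwanted elements
serialized = re.sub(r'<[^<>]*>', ' ', serialized)     # strip remaining tags
tokens = re.findall(r'[a-zA-Z]+', serialized)

keepers = sorted({t.lower() for t in tokens
                  if re.search(r'[a-zA-Z]{3}', t) and t.lower() not in STOPWORDS})
print(keepers)  # -> ['instrument', 'measuring', 'time']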