def get_url_tokens(self):
    """Return a space-separated string of "url:"-prefixed tokens
    taken from this item's link.

    The link is split on "/"; the first two fields (scheme and the
    empty authority separator) and the final path component are
    dropped.  A trailing slash yields an empty last field, in which
    case one extra field is dropped so both URL forms produce the
    same tokens.  Only tokens accepted by chew.acceptable_term()
    are kept.
    """
    tokens = self.get_link().split("/")
    end = -1
    # A URL like http://site/foo/bar/ splits with a trailing empty
    # string; drop it along with the preceding component.
    if not tokens[-1]:
        end = -2
    tokens = tokens[2:end]
    # " ".join(...) is behavior-identical to the removed Python-2-only
    # string.join(...) (default separator is a single space) and works
    # on both Python 2 and Python 3.
    return " ".join(["url:" + t for t in tokens if chew.acceptable_term(t)])
def get_url_tokens(self):
    """Return a space-separated string of "url:"-prefixed tokens
    taken from this item's link.

    The link is split on "/"; the first two fields (scheme and the
    empty authority separator) and the final path component are
    dropped.  A trailing slash yields an empty last field, in which
    case one extra field is dropped so both URL forms produce the
    same tokens.  Only tokens accepted by chew.acceptable_term()
    are kept.
    """
    tokens = self.get_link().split("/")
    end = -1
    # A URL like http://site/foo/bar/ splits with a trailing empty
    # string; drop it along with the preceding component.
    if not tokens[-1]:
        end = -2
    tokens = tokens[2:end]
    # " ".join(...) is behavior-identical to the removed Python-2-only
    # string.join(...) (default separator is a single space) and works
    # on both Python 2 and Python 3.
    return " ".join(["url:" + t for t in tokens if chew.acceptable_term(t)])
def text_to_vector(text, blacklist=None, tracker=None, stemming=0):
    """Convert raw text into a term Vector.

    Each term produced by chew.extract_terms() is lower-cased and
    kept only if chew.acceptable_term() accepts it, it is not a stop
    word for the language detected by langmodules, and it is not a
    key of *blacklist*.  When *stemming* is true the language
    module's stem is stored instead of the raw term.  When *tracker*
    is given, every stored stem is also reported through
    tracker.add_occurrence().

    Returns the populated Vector.
    """
    # None sentinel avoids the shared-mutable-default pitfall; the
    # effective default is still an empty dict, as before.
    if blacklist is None:
        blacklist = {}
    termlist = chew.extract_terms(text)
    lang = langmodules.get_language_module(termlist)
    vector = Vector()
    for term in termlist:
        # str.lower() replaces string.lower(), which was removed in
        # Python 3; identical behavior on Python 2.
        term = term.lower()
        # `not in` replaces dict.has_key(), also removed in Python 3.
        if (chew.acceptable_term(term)
                and not lang.is_stop_word(term)
                and term not in blacklist):
            stem = lang.get_stem(term) if stemming else term
            if tracker:
                tracker.add_occurrence(stem)
            vector.add_term(stem)
    return vector
def text_to_vector(text, blacklist=None, tracker=None, stemming=0):
    """Convert raw text into a term Vector.

    Each term produced by chew.extract_terms() is lower-cased and
    kept only if chew.acceptable_term() accepts it, it is not a stop
    word for the language detected by langmodules, and it is not a
    key of *blacklist*.  When *stemming* is true the language
    module's stem is stored instead of the raw term.  When *tracker*
    is given, every stored stem is also reported through
    tracker.add_occurrence().

    Returns the populated Vector.
    """
    # None sentinel avoids the shared-mutable-default pitfall; the
    # effective default is still an empty dict, as before.
    if blacklist is None:
        blacklist = {}
    termlist = chew.extract_terms(text)
    lang = langmodules.get_language_module(termlist)
    vector = Vector()
    for term in termlist:
        # str.lower() replaces string.lower(), which was removed in
        # Python 3; identical behavior on Python 2.
        term = term.lower()
        # `not in` replaces dict.has_key(), also removed in Python 3.
        if (chew.acceptable_term(term)
                and not lang.is_stop_word(term)
                and term not in blacklist):
            stem = lang.get_stem(term) if stemming else term
            if tracker:
                tracker.add_occurrence(stem)
            vector.add_term(stem)
    return vector