Example 1
def get_url_tokens(self):
    tokens = self.get_link().split("/")
    end = -1
    if not tokens[-1]:  # if url of form http://site/foo/bar/
        end = -2
    tokens = tokens[2:end]
    return string.join(["url:" + t for t in tokens if chew.acceptable_term(t)])
Example 2
def get_url_tokens(self):
    tokens = self.get_link().split("/")
    end = -1
    if not tokens[-1]:  # if url of form http://site/foo/bar/
        end = -2
    tokens = tokens[2:end]
    return string.join(
        ["url:" + t for t in tokens if chew.acceptable_term(t)])
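Examples 1 and 2 list the same get_url_tokens method: it splits the page URL on "/", adjusts the end index so a trailing "/" does not leave an empty token, slices away the scheme, the empty token after "//" and the final path segment, and space-joins the remaining segments that pass chew.acceptable_term, each prefixed with "url:". Below is a rough, self-contained sketch of the same logic in modern Python; the URL is passed in directly instead of coming from self.get_link(), is_acceptable_term is a hypothetical stand-in for chew.acceptable_term, and " ".join replaces the Python 2 string.join (whose default separator is also a single space).

def get_url_tokens(url, is_acceptable_term):
    # "http://site/foo/bar/" splits into ["http:", "", "site", "foo", "bar", ""]
    tokens = url.split("/")
    end = -1
    if not tokens[-1]:  # a trailing "/" leaves an empty last token
        end = -2
    # keep the host and intermediate segments; drop the scheme, the empty
    # token after "//" and the final segment
    tokens = tokens[2:end]
    return " ".join("url:" + t for t in tokens if is_acceptable_term(t))

print(get_url_tokens("http://site/foo/bar/", lambda t: len(t) > 2))
# -> url:site url:foo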
Example 3
def text_to_vector(text, blacklist={}, tracker=None, stemming=0):
    termlist = chew.extract_terms(text)
    lang = langmodules.get_language_module(termlist)

    vector = Vector()
    for term in termlist:
        term = string.lower(term)
        if chew.acceptable_term(term) and \
           not lang.is_stop_word(term) and \
           not blacklist.has_key(term):
            if stemming:
                stem = lang.get_stem(term)
            else:
                stem = term
            if tracker:
                tracker.add_occurrence(stem)
            vector.add_term(stem)

    return vector
Example 4
def text_to_vector(text, blacklist={}, tracker=None, stemming=0):
    termlist = chew.extract_terms(text)
    lang = langmodules.get_language_module(termlist)

    vector = Vector()
    for term in termlist:
        term = string.lower(term)
        if chew.acceptable_term(term) and \
           not lang.is_stop_word(term) and \
           not blacklist.has_key(term):
            if stemming:
                stem = lang.get_stem(term)
            else:
                stem = term
            if tracker:
                tracker.add_occurrence(stem)
            vector.add_term(stem)

    return vector
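Examples 3 and 4 list the same text_to_vector helper: it extracts terms with chew.extract_terms, picks a language module for stop-word checks, lowercases each term, drops terms that fail chew.acceptable_term, are stop words or appear in the blacklist, optionally stems the survivors, reports each occurrence to an optional tracker, and counts them in a Vector. A rough, self-contained sketch of that flow in modern Python follows; the regex tokenizer, the minimum-length check, the tiny stop-word set and collections.Counter are hypothetical stand-ins for chew.extract_terms, chew.acceptable_term, langmodules and the project's Vector class, and stemming/tracking are omitted.

import re
from collections import Counter

STOP_WORDS = {"the", "a", "an", "and", "of", "to"}  # stand-in for lang.is_stop_word

def text_to_vector(text, blacklist=frozenset()):
    termlist = re.findall(r"[a-z0-9]+", text.lower())  # stand-in for chew.extract_terms
    vector = Counter()  # stand-in for the project's Vector class
    for term in termlist:
        # len(term) > 2 is a placeholder for chew.acceptable_term
        if len(term) > 2 and term not in STOP_WORDS and term not in blacklist:
            vector[term] += 1
    return vector

print(text_to_vector("The cats and the dogs ran to the park", blacklist={"dogs"}))
# -> Counter({'cats': 1, 'ran': 1, 'park': 1})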