Example #1
0
def jaccard_distance(item1, item2):
    """
    Jaccard distance between two items (curse of dimensionality applies).

    distance(A, B) = 1 - n(A intersection B) / n(A union B)

    Features are the first 100 regex word tokens of each item's
    lowercased ``title`` + ``body`` after stopword removal.

    Returns a float in [0, 1]; 1.0 (max distance) when both feature
    sets are empty (Jaccard is undefined in that case).
    """
    # NOTE: r'\w+' — raw string so the \w escape is not interpreted by the
    # Python string parser (non-raw '\w' is deprecated escape syntax).
    feature1 = set(re.findall(r'\w+', strip_stopwords(
        "%s %s" % (item1.title.lower(), item1.body.lower())))[:100])
    feature2 = set(re.findall(r'\w+', strip_stopwords(
        "%s %s" % (item2.title.lower(), item2.body.lower())))[:100])

    if len(feature1) == 0 and len(feature2) == 0:
        return 1  # max distance
    similarity = 1.0 * len(feature1.intersection(feature2)) / len(feature1.union(feature2))
    return 1 - similarity
Example #2
0
def normalize(text):
  """Normalize *text*: treat None/empty as '', strip stopwords, then
  remove everything matching NOT_WORD_NUM_RE."""
  cleaned = strip_stopwords(text or '')
  return NOT_WORD_NUM_RE.sub('', cleaned)
Example #3
0
def _tokenize(search_str):
    """Strip stopwords and tokenize *search_str* if not already a list.

    A string input is sanitized (punctuation removed), stopword-stripped,
    and whitespace-split; any other input is coerced to a list of its
    elements and returned unchanged.
    """
    if isinstance(search_str, basestring):
        # TODO determine appropriate characters that should be retained
        # Raw string so \w and \s are passed through to the regex engine
        # rather than parsed as (deprecated) string escapes.
        sanitized_str = re.sub(r'[^\w\s]+', '', search_str)
        cleaned_str = stopwords.strip_stopwords(sanitized_str)
        toks = cleaned_str.split()
    else:
        toks = list(search_str)
    return toks
Example #4
0
def jaccard_distance(item1, item2):
    """
    Jaccard distance between two items (curse of dimensionality applies).

    distance(A, B) = 1 - n(A intersection B) / n(A union B)

    Features are the first 100 regex word tokens of each item's
    lowercased ``title`` + ``body`` after stopword removal.

    Returns a float in [0, 1]; 1 (max distance) when both feature sets
    are empty, since Jaccard similarity is undefined there.
    """

    def _features(item):
        """First 100 word tokens of the item's lowercased title+body,
        stopwords removed, as a set."""
        # r'\w+' — raw string so \w is not treated as a (deprecated)
        # string escape.
        text = strip_stopwords(
            "%s %s" % (item.title.lower(), item.body.lower()))
        return set(re.findall(r'\w+', text)[:100])

    feature1 = _features(item1)
    feature2 = _features(item2)

    if len(feature1) == 0 and len(feature2) == 0:
        return 1  # max distance
    similarity = 1.0 * len(feature1.intersection(feature2)) / len(
        feature1.union(feature2))
    return 1 - similarity
def build_slug(slug):
    """ Builds a slug and strips all stopwords from a slug and builds it again.

    Stopwords are removed, the result is slugified, and any hyphen-separated
    token that parses as a number is dropped before re-joining.
    """

    def _is_number(token):
        # NOTE(review): float() also accepts tokens like 'nan' and 'inf',
        # so those words are dropped too — confirm this is intended.
        try:
            float(token)
        except ValueError:
            return False
        return True

    slug = slugify(strip_stopwords(slug))
    kept = [word for word in slug.split('-') if not _is_number(word)]
    return u'-'.join(kept)