def jaccard_distance(item1, item2):
    """
    Return the Jaccard distance between the word sets of two items.

    A simple distance function (the curse of dimensionality applies):

        distance(A, B) = 1 - n(A intersection B) / n(A union B)

    or, equivalently,

        distance(A, B) = 1 - n(A intersection B)
                             / (n(A) + n(B) - n(A intersection B))

    The feature set of an item is built from its lowercased ``title`` and
    ``body`` joined by a space, passed through ``strip_stopwords``, split
    into ``\\w+`` tokens and limited to the first 100 tokens before
    de-duplication.

    :param item1: object with string ``title`` and ``body`` attributes
    :param item2: object with string ``title`` and ``body`` attributes
    :returns: ``1`` when neither item yields any token, otherwise a float
        equal to one minus the Jaccard similarity of the two feature sets
    """
    features = []
    for item in (item1, item2):
        text = "%s %s" % (item.title.lower(), item.body.lower())
        # Raw string so the regex escape is not read as a (invalid) string
        # escape; only the first 100 tokens are used as features.
        tokens = re.findall(r'\w+', strip_stopwords(text))[:100]
        features.append(set(tokens))
    feature1, feature2 = features

    if not feature1 and not feature2:
        # Nothing to compare: treat as max distance (also avoids dividing
        # by the size of an empty union).
        return 1

    # 1.0 * ... forces true division on Python 2 as well.
    similarity = 1.0 * len(feature1 & feature2) / len(feature1 | feature2)
    return 1 - similarity
def normalize(text):
    """
    Return ``text`` with stopwords stripped and all matches of
    ``NOT_WORD_NUM_RE`` removed.

    Any falsy input (``None``, ``''``, ...) is treated as the empty string
    before processing.
    """
    # Substitute '' for falsy values so strip_stopwords always gets a string.
    without_stopwords = strip_stopwords(text or '')
    # NOTE(review): judging by its name, NOT_WORD_NUM_RE presumably matches
    # characters that are neither word nor digit characters -- confirm at
    # its definition.
    return NOT_WORD_NUM_RE.sub('', without_stopwords)
def _tokenize(search_str): "Strips stopwords and tokenizes search string if not already a list." if isinstance(search_str, basestring): # TODO determine appropriate characters that should be retained sanitized_str = re.sub('[^\w\s]+', '', search_str) cleaned_str = stopwords.strip_stopwords(sanitized_str) toks = cleaned_str.split() else: toks = list(search_str) return toks
def jaccard_distance(item1, item2):
    """
    Return the Jaccard distance between the word sets of two items.

    A simple distance function (the curse of dimensionality applies):

        distance(A, B) = 1 - n(A intersection B) / n(A union B)

    or, equivalently,

        distance(A, B) = 1 - n(A intersection B)
                             / (n(A) + n(B) - n(A intersection B))

    Each item's features are the distinct ``\\w+`` tokens among the first
    100 tokens of its lowercased ``title`` and ``body`` (joined by a space
    and passed through ``strip_stopwords``).

    :param item1: object with string ``title`` and ``body`` attributes
    :param item2: object with string ``title`` and ``body`` attributes
    :returns: ``1`` when neither item yields any token, otherwise a float
        equal to one minus the Jaccard similarity of the two feature sets
    """
    feature_sets = []
    for item in (item1, item2):
        combined = "%s %s" % (item.title.lower(), item.body.lower())
        # Raw-string pattern: ``\w`` must reach the regex engine untouched.
        words = re.findall(r'\w+', strip_stopwords(combined))
        # Cap at 100 tokens before de-duplicating.
        feature_sets.append(set(words[:100]))
    feature1, feature2 = feature_sets

    if not feature1 and not feature2:
        # No features on either side: report max distance and avoid a
        # division by zero on the empty union.
        return 1

    shared = len(feature1.intersection(feature2))
    total = len(feature1.union(feature2))
    # 1.0 * ... forces true division on Python 2 as well.
    similarity = 1.0 * shared / total
    return 1 - similarity
def build_slug(slug):
    """
    Build a slug from ``slug`` with stopwords removed.

    The input is passed through ``strip_stopwords`` and ``slugify``, split
    on ``'-'``, and every piece that ``float()`` can parse is discarded.
    The remaining pieces are joined back together with ``'-'``.
    """
    def _parses_as_float(piece):
        # True when float() accepts the piece, i.e. it would be dropped.
        try:
            float(piece)
        except ValueError:
            return False
        return True

    slugged = slugify(strip_stopwords(slug))
    # NOTE(review): float() also accepts tokens such as "nan", "inf" and
    # "1e5", so those pieces are dropped too -- confirm that is intended.
    kept = [piece for piece in slugged.split('-')
            if not _parses_as_float(piece)]
    return u'-'.join(kept)