def build_relation_db(min_score=0.1):
    """Rebuild the Relation collection from scratch.

    Drops the existing Relation collection, then scores every ordered pair
    of non-PDF posts with different URLs and saves a Relation for each pair
    whose score is strictly greater than ``min_score``.

    The score of a pair is the sum of three terms:
      * get_cosine() over the lower-cased post contents,
      * get_cosine() over the lower-cased post titles,
      * get_category_point() for the two posts.

    Both (p1, p2) and (p2, p1) are scored and stored independently.

    This compares all posts against all posts, so it takes a lot of time.
    Run it periodically (e.g. once a week or every night) and use
    insert_new_relation() for new posts.

    Args:
        min_score: Threshold a pair's combined score must exceed for the
            relation to be stored. Defaults to 0.1.

    Prints the comparison counter before the scan (0) and after it (the
    number of ordered pairs that were scored).
    """
    # Vectorise every eligible post exactly once. Doing this inside the
    # pair loop would redo the same tokenisation for each post once per
    # partner post. PDFs are filtered here for the same reason.
    candidates = [
        (
            post,
            text_to_vector(post.content.lower()),
            text_to_vector(post.title.lower()),
        )
        for post in Post.objects()
        if post.post_type != "pdf"
    ]

    # Drop only after the posts were read and vectorised successfully, so a
    # failure above leaves the previous relations in place.
    Relation.drop_collection()

    counter = 0
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(counter)

    for p1, content_vec1, title_vec1 in candidates:
        for p2, content_vec2, title_vec2 in candidates:
            # Skip a post paired with itself (or a duplicate of the same URL).
            if p1.url == p2.url:
                continue

            counter += 1

            content_cosine = get_cosine(content_vec1, content_vec2)
            title_cosine = get_cosine(title_vec1, title_vec2)
            category_point = get_category_point(p1, p2)
            score = content_cosine + title_cosine + category_point

            if score > min_score:
                Relation(p1, p2, score).save()

    print(counter)