Exemple #1
0
def insert_new_relation(post1):
    """
    Store similarity relations between a newly added post and the
    existing posts.

    The score of a pair is the sum of the content cosine, the title
    cosine and the category point.  When the score is greater than 0.1
    two relations are saved: Relation(post1, post2, score) and
    Relation(post2, post1, score).

    PDF posts are never related: nothing is stored when `post1` is a
    PDF, and PDF candidates are skipped.

    Arguments:
    - `post1`: newly added post

    Returns None.
    """
    if post1.post_type == "pdf":
        return None

    # The new post's vectors are identical for every comparison, so build
    # them once here instead of once per candidate post.
    vector1 = text_to_vector(post1.content.lower())
    tvector1 = text_to_vector(post1.title.lower())

    for post2 in Post.objects():
        # Skip PDFs and the new post itself (posts are matched by URL).
        if post2.post_type == "pdf" or post2.url == post1.url:
            continue

        # text similarity
        vector2 = text_to_vector(post2.content.lower())
        content_cosine = get_cosine(vector1, vector2)

        # title similarity
        tvector2 = text_to_vector(post2.title.lower())
        title_cosine = get_cosine(tvector1, tvector2)

        category_point = get_category_point(post1, post2)
        cosine = content_cosine + title_cosine + category_point

        if cosine > 0.1:
            # Save the relation in both directions.
            Relation(post1, post2, cosine).save()
            Relation(post2, post1, cosine).save()
Exemple #2
0
def insert_new_relation(post1):
    """
    Store similarity relations between a newly added post and the
    existing posts.

    The score of a pair is the sum of the content cosine, the title
    cosine and the category point.  When the score is greater than 0.1
    two relations are saved: Relation(post1, post2, score) and
    Relation(post2, post1, score).

    PDF posts are never related: nothing is stored when `post1` is a
    PDF, and PDF candidates are skipped.

    Arguments:
    - `post1`: newly added post

    Returns None.
    """
    if post1.post_type == "pdf":
        return None

    # The new post's vectors are identical for every comparison, so build
    # them once here instead of once per candidate post.
    vector1 = text_to_vector(post1.content.lower())
    tvector1 = text_to_vector(post1.title.lower())

    for post2 in Post.objects():
        # Skip PDFs and the new post itself (posts are matched by URL).
        if post2.post_type == "pdf" or post2.url == post1.url:
            continue

        # text similarity
        vector2 = text_to_vector(post2.content.lower())
        content_cosine = get_cosine(vector1, vector2)

        # title similarity
        tvector2 = text_to_vector(post2.title.lower())
        title_cosine = get_cosine(tvector1, tvector2)

        category_point = get_category_point(post1, post2)
        cosine = content_cosine + title_cosine + category_point

        if cosine > 0.1:
            # Save the relation in both directions.
            Relation(post1, post2, cosine).save()
            Relation(post2, post1, cosine).save()
Exemple #3
0
def build_relation_db():
    """
    Build a relation collection that includes
    every similarity between posts.

    The existing Relation collection is dropped first.  Every ordered
    pair of non-PDF posts with different URLs is then scored as the sum
    of the content cosine, the title cosine and the category point.

    Only includes relation when similarity > 0.1 (the same threshold
    insert_new_relation() uses).

    This takes a lot of time, run this periodically.
    Eg. once a week or every night.

    Use insert_new_relation() for new posts

    Prints the counter before the run (always 0) and the number of
    compared pairs after it.
    """
    posts = Post.objects()
    posts2 = Post.objects()
    Relation.drop_collection()
    counter = 0
    # Parenthesised single-argument form prints the same on Python 2 and
    # also parses on Python 3.
    print(counter)
    for p1 in posts:
        if p1.post_type == "pdf":
            continue

        # p1's vectors do not depend on p2, so build them once per outer
        # iteration rather than once per pair.
        vector1 = text_to_vector(p1.content.lower())
        tvector1 = text_to_vector(p1.title.lower())

        for p2 in posts2:
            # Skip PDFs and the post itself (posts are matched by URL).
            if p2.post_type == "pdf" or p2.url == p1.url:
                continue
            counter += 1

            # text similarity
            vector2 = text_to_vector(p2.content.lower())
            content_cosine = get_cosine(vector1, vector2)

            # title similarity
            tvector2 = text_to_vector(p2.title.lower())
            title_cosine = get_cosine(tvector1, tvector2)

            category_point = get_category_point(p1, p2)
            cosine = content_cosine + title_cosine + category_point

            if cosine > 0.1:
                # The reverse direction is stored when the loops reach
                # the (p2, p1) pair.
                Relation(p1, p2, cosine).save()
    print(counter)
Exemple #4
0
def build_relation_db():
    """
    Build a relation collection that includes
    every similarity between posts.

    The existing Relation collection is dropped first.  Every ordered
    pair of non-PDF posts with different URLs is then scored as the sum
    of the content cosine, the title cosine and the category point.

    Only includes relation when similarity > 0.1 (the same threshold
    insert_new_relation() uses).

    This takes a lot of time, run this periodically.
    Eg. once a week or every night.

    Use insert_new_relation() for new posts

    Prints the counter before the run (always 0) and the number of
    compared pairs after it.
    """
    posts = Post.objects()
    posts2 = Post.objects()
    Relation.drop_collection()
    counter = 0
    # Parenthesised single-argument form prints the same on Python 2 and
    # also parses on Python 3.
    print(counter)
    for p1 in posts:
        if p1.post_type == "pdf":
            continue

        # p1's vectors do not depend on p2, so build them once per outer
        # iteration rather than once per pair.
        vector1 = text_to_vector(p1.content.lower())
        tvector1 = text_to_vector(p1.title.lower())

        for p2 in posts2:
            # Skip PDFs and the post itself (posts are matched by URL).
            if p2.post_type == "pdf" or p2.url == p1.url:
                continue
            counter += 1

            # text similarity
            vector2 = text_to_vector(p2.content.lower())
            content_cosine = get_cosine(vector1, vector2)

            # title similarity
            tvector2 = text_to_vector(p2.title.lower())
            title_cosine = get_cosine(tvector1, tvector2)

            category_point = get_category_point(p1, p2)
            cosine = content_cosine + title_cosine + category_point

            if cosine > 0.1:
                # The reverse direction is stored when the loops reach
                # the (p2, p1) pair.
                Relation(p1, p2, cosine).save()
    print(counter)