Python AnalizedDocument Examples

Programming Language: Python

Namespace/Package Name: similarity.text.document

Class/Type: AnalizedDocument

Examples at hotexamples.com: 4

Python AnalizedDocument - 4 examples found. These are the top-rated real-world Python examples of similarity.text.document.AnalizedDocument, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

calculate_membership_to_categories(1)

calculate_terms_membership(1)

compare(1)

from_file(1)

Example #1

Show file

File: webpage.py Project: Nozdi/webpage-similarity

class WebPage(object):
    """
        Represents an analyzed web page.

        Constructing an instance extracts the article found at ``url``,
        wraps its cleaned text in an ``AnalizedDocument`` and downloads
        every article image into a per-page folder below ``./data/``.

        :field url: url to a webpage
        :type url: string
        :field article: result of ``goose.extract`` for ``url``
        :field content: analyzed text of the article
        :type content: AnalizedDocument
        :field directory: folder the pictures of this page are saved in
        :type directory: string
        :field pictures: ``(path, image)`` pairs of the downloaded images
        :type pictures: list
    """
    # One extractor instance shared by every WebPage.
    goose = ExtendedGoose()

    def __init__(self, url="http://google.com"):
        """
            Extract the page and eagerly fetch its text and pictures.

            :param url: url to a webpage
            :type url: string
        """
        self.url = url
        self.article = self.goose.extract(url=self.url)
        self.get_text()
        # The folder must exist before any picture is written into it.
        self.create_folder()
        self.get_all_pictures()

    def create_folder(self):
        """
            Create (if needed) the folder that stores this page's pictures.

            The folder name is the final url of the article with the scheme
            stripped and every ``/`` replaced by ``_``; the resulting path
            is stored in ``self.directory``.

            :raises OSError: if the folder cannot be created for any reason
                other than it already existing as a directory
        """
        import errno

        folder = re.sub("https?://", "", self.article.final_url).replace("/", "_")
        self.directory = "./data/{}/".format(folder)
        # Attempt the creation and tolerate "already exists" instead of
        # checking first: a separate existence check can race with another
        # process creating the same folder.
        try:
            os.makedirs(self.directory)
        except OSError as error:
            if error.errno != errno.EEXIST or not os.path.isdir(self.directory):
                raise

    def get_text(self):
        """
            Store the cleaned article text in ``self.content`` as an
            ``AnalizedDocument`` (the text is passed through ``unidecode``).
        """
        text = self.article.cleaned_text
        self.content = AnalizedDocument(unidecode(text))

    def get_top_picture(self):
        """
            Download the article's top image into ``self.directory`` as
            ``top`` plus the extension found in the image url.
        """
        image = self.article.top_image.src
        extension = os.path.splitext(image)[1]
        urllib.urlretrieve(image, self.directory + "top" + extension)

    def get_all_pictures(self):
        """
            Download every article image into ``self.directory``.

            ``self.pictures`` is rebuilt as a list of
            ``(local path, opened image)`` tuples, one per downloaded image.
        """
        self.pictures = []
        for image in self.article.images:
            image_src = image.src
            image_path = self.directory + os.path.basename(image_src)
            urllib.urlretrieve(image_src, image_path)
            self.pictures.append((image_path, Image.open(image_path)))

    def get_text_similarity(self, web_page):
        """
            Compare the text of this page with the text of ``web_page``.

            :param web_page: page to compare against
            :type web_page: WebPage
            :return: result of ``self.content.compare(web_page.content)``
        """
        return self.content.compare(web_page.content)

    def get_image_similarity(self, web_page):
        """
            Compare the pictures of this page with those of ``web_page``.

            :param web_page: page to compare against
            :type web_page: WebPage
            :return: result of ``compare_many`` for both picture lists
        """
        return compare_many(self.pictures, web_page.pictures)

    # Backward-compatible alias: the method was first published under this
    # misspelled name, so existing callers keep working.
    get_image_similatiry = get_image_similarity

Example #2

Show file

File: file_example.py Project: Nozdi/webpage-similarity

    :synopsis: This module shows example usage
"""
import fix_path
from similarity.text.document import AnalizedDocument
from similarity.img.compare import (
    compare,
    compare_many
)
from utils import print_sorted_dict
from PIL import Image
from glob import glob
from os import path


if __name__ == '__main__':
    # Example usage: compare two documents loaded from text files, then
    # open two sample images.
    # Build one AnalizedDocument per sample file.
    ad1 = AnalizedDocument.from_file("./data/test_doc")
    ad2 = AnalizedDocument.from_file("./data/test_doc2")

    print "Text Similarity for ad1 and ad2"
    # The comparison result exposes `similarity_result` and
    # `important_categories`, both printed below.
    comparsion = ad1.compare(ad2)
    print "Result: ", comparsion.similarity_result
    print "Important categories:"
    for cat in comparsion.important_categories:
        print cat

    # Print each document's category membership, labelled by its file name.
    print_sorted_dict(ad1.categories_membership, "test_doc")
    print_sorted_dict(ad2.categories_membership, "test_doc2")

    print "Two images:"
    # NOTE(review): im1 and im2 are not used in the visible lines; the
    # excerpt appears to end before the image comparison - confirm.
    im1 = Image.open("./data/red.jpg")
    im2 = Image.open("./data/pom.png")

Example #3

Show file

File: show_workflow.py Project: Nozdi/webpage-similarity

from similarity.text.document import AnalizedDocument

if __name__ == '__main__':
    # Walk through the analysis steps of AnalizedDocument on a short sample
    # text, printing the intermediate results of each step.
    text = """
    A hedgehog is any of the spiny mammals of the subfamily Erinaceinae,
    which is in order Erinaceomorpha. There are seventeen species of
    hedgehog in five genera, found through parts of Europe, Asia, Africa and New Zealand.
    """

    ad = AnalizedDocument(text)
    print ad.terms_quantity
    # For every term, list the categories in which it has a non-zero
    # relevance, highest relevance first.
    for term in ad.terms_quantity:
        print "Term:", term
        # NOTE(review): the sort key uses .get(term) without a default, so a
        # category lacking the term presumably sorts on None - confirm that
        # terms_relevance maps category -> {term: relevance}.
        for cat, items in sorted(
            ad.terms_relevance.items(),
            key=lambda x: x[1].get(term), reverse=True
        ):
            relevance = items.get(term, 0)
            # Skip categories where the term is missing or has zero relevance.
            if relevance:
                print "\t", cat, ":", relevance

    print '-'*100

    # terms_membership is printed right after this call; presumably the
    # call is what populates it - confirm.
    ad.calculate_terms_membership()
    print ad.terms_membership

    print '-'*100
    ad.calculate_membership_to_categories()

    for cat, value in sorted(
        ad.categories_membership.items(),

Example #4

Show file

File: webpage.py Project: Nozdi/webpage-similarity

 def get_text(self):
     """
         Wrap the cleaned article text, passed through ``unidecode``, in an
         ``AnalizedDocument`` and store it in ``self.content``.
     """
     self.content = AnalizedDocument(unidecode(self.article.cleaned_text))