Example #1
0
class WebPage(object):
    """
        Represents an analyzed webpage.

        Creating an instance extracts the article behind ``url``, builds the
        analyzed text document from it, creates a folder for the page under
        ``./data/`` and downloads every picture of the article into it.

        :field url: url to a webpage
        :type url: string
        :field article: value returned by ``goose.extract`` for ``url``
        :field content: analyzed text of the article
        :type content: AnalizedDocument
        :field directory: folder in which the pictures of the page are stored
        :type directory: string
        :field pictures: ``(path, image)`` pairs, one per downloaded picture
        :type pictures: list
    """
    # Class attribute: a single extractor instance shared by all pages.
    goose = ExtendedGoose()

    def __init__(self, url="http://google.com"):
        """
            Extract the page and download its pictures.

            :param url: url to a webpage
            :type url: string
        """

        self.url = url
        self.article = self.goose.extract(url=self.url)
        self.get_text()
        # The folder has to exist before any picture is saved into it.
        self.create_folder()
        self.get_all_pictures()

    def create_folder(self):
        """
            Create the folder for this page and store its path in
            ``self.directory``.

            The folder name is the article's final url with the
            ``http://`` / ``https://`` text removed and every ``/`` replaced
            by ``_``. An already existing folder is reused.

            :raises OSError: when the folder can not be created
        """
        folder = re.sub(r"https?://", "", self.article.final_url)
        folder = folder.replace("/", "_")
        self.directory = "./data/{}/".format(folder)
        # Create first and inspect the failure afterwards: checking for
        # existence before creating leaves a window in which another process
        # can create the folder and make ``makedirs`` fail.
        try:
            os.makedirs(self.directory)
        except OSError:
            if not os.path.isdir(self.directory):
                raise

    def get_text(self):
        """
            Build ``self.content`` from the cleaned text of the article.

            The text is passed through ``unidecode`` before it is handed to
            ``AnalizedDocument``.
        """
        text = self.article.cleaned_text
        self.content = AnalizedDocument(unidecode(text))

    def get_top_picture(self):
        """
            Download the top image of the article into ``self.directory``.

            The file is named ``top`` followed by the extension taken from
            the image url.
        """
        image = self.article.top_image.src
        extension = os.path.splitext(image)[1]
        urllib.urlretrieve(
            image, os.path.join(self.directory, "top" + extension)
        )

    def get_all_pictures(self):
        """
            Download every image of the article into ``self.directory``.

            ``self.pictures`` is reset and then filled with one
            ``(path, image)`` pair per picture, where ``image`` is the
            downloaded file opened with ``Image.open``.
        """
        self.pictures = []
        for image in self.article.images:
            image_src = image.src
            # Files are named after the last path segment of the image url.
            image_path = os.path.join(
                self.directory, os.path.basename(image_src)
            )
            urllib.urlretrieve(image_src, image_path)
            self.pictures.append((image_path, Image.open(image_path)))

    def get_text_similarity(self, web_page):
        """
            Compare the text of this page with the text of another page.

            :param web_page: page to compare with
            :type web_page: WebPage
            :returns: result of ``self.content.compare(web_page.content)``
        """
        return self.content.compare(web_page.content)

    def get_image_similarity(self, web_page):
        """
            Compare the pictures of this page with those of another page.

            :param web_page: page to compare with
            :type web_page: WebPage
            :returns: result of ``compare_many`` for both picture lists
        """
        return compare_many(self.pictures, web_page.pictures)

    def get_image_similatiry(self, web_page):
        """
            Misspelled name of :meth:`get_image_similarity`, kept so that
            existing callers continue to work.

            :param web_page: page to compare with
            :type web_page: WebPage
            :returns: result of ``get_image_similarity(web_page)``
        """
        return self.get_image_similarity(web_page)
Example #2
0
    :synopsis: This module shows example usage
"""
import fix_path
from similarity.text.document import AnalizedDocument
from similarity.img.compare import (
    compare,
    compare_many
)
from utils import print_sorted_dict
from PIL import Image
from glob import glob
from os import path


if __name__ == '__main__':
    ad1 = AnalizedDocument.from_file("./data/test_doc")
    ad2 = AnalizedDocument.from_file("./data/test_doc2")

    print "Text Similarity for ad1 and ad2"
    comparsion = ad1.compare(ad2)
    print "Result: ", comparsion.similarity_result
    print "Important categories:"
    for cat in comparsion.important_categories:
        print cat

    print_sorted_dict(ad1.categories_membership, "test_doc")
    print_sorted_dict(ad2.categories_membership, "test_doc2")

    print "Two images:"
    im1 = Image.open("./data/red.jpg")
    im2 = Image.open("./data/pom.png")
from similarity.text.document import AnalizedDocument

if __name__ == '__main__':
    # Demo: analyze a short inline text and print the intermediate results
    # exposed by AnalizedDocument.
    text = """
    A hedgehog is any of the spiny mammals of the subfamily Erinaceinae,
    which is in order Erinaceomorpha. There are seventeen species of
    hedgehog in five genera, found through parts of Europe, Asia, Africa and New Zealand.
    """

    ad = AnalizedDocument(text)
    print ad.terms_quantity
    # For every term, list the categories ordered by the relevance stored
    # for that term, highest first.
    for term in ad.terms_quantity:
        print "Term:", term
        for cat, items in sorted(
            ad.terms_relevance.items(),
            # x is a (category, per-term values) pair; .get(term) gives None
            # when the category has no entry for the term.
            # NOTE(review): presumably the values are dicts of
            # term -> relevance -- confirm against AnalizedDocument.
            key=lambda x: x[1].get(term), reverse=True
        ):
            relevance = items.get(term, 0)
            # Categories with a missing or zero relevance are not printed.
            if relevance:
                print "\t", cat, ":", relevance

    # Visual separator between the sections of the output.
    print '-'*100

    # terms_membership is printed only after this call.
    ad.calculate_terms_membership()
    print ad.terms_membership

    print '-'*100
    # categories_membership is read only after this call.
    ad.calculate_membership_to_categories()

    # NOTE(review): the statement below is cut off in this excerpt (the
    # sorted( call is never closed); the rest of the loop is not visible.
    for cat, value in sorted(
        ad.categories_membership.items(),
Example #4
0
 def get_text(self):
     """
         Store the analyzed article text in ``self.content``.

         The cleaned text of ``self.article`` is passed through
         ``unidecode`` and wrapped in an ``AnalizedDocument``.
     """
     self.content = AnalizedDocument(unidecode(self.article.cleaned_text))