class WebPage(object):
    """
    Represents an analysed web page.

    :field url: url the page was requested with
    :type url: string
    :field article: result of ``goose.extract`` for ``url``
    :field content: analysed text of the page
    :type content: AnalizedDocument
    :field directory: folder the page's pictures are downloaded into
    :type directory: string
    :field pictures: ``(path, image)`` pairs, one per downloaded picture
    :type pictures: list
    """

    # Single extractor instance shared by every WebPage.
    goose = ExtendedGoose()

    def __init__(self, url="http://google.com"):
        """
        Extract the page at ``url``, analyse its text and download its
        pictures.

        :param url: url to a webpage
        :type url: string
        """
        self.url = url
        self.article = self.goose.extract(url=self.url)
        self.get_text()
        # The folder has to exist before any picture is saved into it.
        self.create_folder()
        self.get_all_pictures()

    def create_folder(self):
        """
        Create the download folder for this page if it is missing.

        The folder name is the article's final url with every
        ``http://`` / ``https://`` occurrence removed and every ``/``
        replaced by ``_``. The resulting path is stored in
        ``self.directory``.

        :raises OSError: when the folder cannot be created
        """
        folder = re.sub(
            "https?://", "", self.article.final_url
        ).replace("/", "_")
        self.directory = "./data/{}/".format(folder)
        try:
            os.makedirs(self.directory)
        except OSError:
            # An already existing directory is fine (it may have been
            # created concurrently); any other failure is a real error.
            if not os.path.isdir(self.directory):
                raise

    def get_text(self):
        """
        Store the article's cleaned text in ``self.content`` as an
        ``AnalizedDocument``, after passing it through ``unidecode``.
        """
        text = self.article.cleaned_text
        self.content = AnalizedDocument(unidecode(text))

    def get_top_picture(self):
        """
        Download the article's top image into ``self.directory`` as
        ``top<extension>``, keeping the extension of the image url.

        Does nothing when the article has no top image.
        """
        top_image = self.article.top_image
        # NOTE(review): assumes the extractor reports a missing top image
        # as None; guard so such pages do not fail with AttributeError.
        if top_image is None:
            return
        image = top_image.src
        extension = os.path.splitext(image)[1]
        urllib.urlretrieve(image, self.directory + "top" + extension)

    def get_all_pictures(self):
        """
        Download every image of the article into ``self.directory`` and
        collect ``(path, opened image)`` pairs in ``self.pictures``.

        Each file is named after the basename of its source url.
        """
        self.pictures = []
        # Resolve all sources up front, before the first download starts.
        sources = [image.src for image in self.article.images]
        for image_src in sources:
            image_path = self.directory + os.path.basename(image_src)
            urllib.urlretrieve(image_src, image_path)
            self.pictures.append((image_path, Image.open(image_path)))

    def get_text_similarity(self, web_page):
        """
        Compare this page's text with the text of ``web_page``.

        :param web_page: page to compare against
        :type web_page: WebPage
        :return: result of ``self.content.compare``
        """
        return self.content.compare(web_page.content)

    def get_image_similatiry(self, web_page):
        """
        Compare this page's pictures with the pictures of ``web_page``.

        :param web_page: page to compare against
        :type web_page: WebPage
        :return: result of ``compare_many`` for both picture lists
        """
        return compare_many(self.pictures, web_page.pictures)

    # Correctly spelled alias; the misspelled name stays for existing callers.
    get_image_similarity = get_image_similatiry
:synopsis: This module shows example usage """ import fix_path from similarity.text.document import AnalizedDocument from similarity.img.compare import ( compare, compare_many ) from utils import print_sorted_dict from PIL import Image from glob import glob from os import path if __name__ == '__main__': ad1 = AnalizedDocument.from_file("./data/test_doc") ad2 = AnalizedDocument.from_file("./data/test_doc2") print "Text Similarity for ad1 and ad2" comparsion = ad1.compare(ad2) print "Result: ", comparsion.similarity_result print "Important categories:" for cat in comparsion.important_categories: print cat print_sorted_dict(ad1.categories_membership, "test_doc") print_sorted_dict(ad2.categories_membership, "test_doc2") print "Two images:" im1 = Image.open("./data/red.jpg") im2 = Image.open("./data/pom.png")
from similarity.text.document import AnalizedDocument if __name__ == '__main__': text = """ A hedgehog is any of the spiny mammals of the subfamily Erinaceinae, which is in order Erinaceomorpha. There are seventeen species of hedgehog in five genera, found through parts of Europe, Asia, Africa and New Zealand. """ ad = AnalizedDocument(text) print ad.terms_quantity for term in ad.terms_quantity: print "Term:", term for cat, items in sorted( ad.terms_relevance.items(), key=lambda x: x[1].get(term), reverse=True ): relevance = items.get(term, 0) if relevance: print "\t", cat, ":", relevance print '-'*100 ad.calculate_terms_membership() print ad.terms_membership print '-'*100 ad.calculate_membership_to_categories() for cat, value in sorted( ad.categories_membership.items(),
def get_text(self):
    """
    Build the analysed document for this page.

    Reads ``self.article.cleaned_text``, passes it through ``unidecode``
    and stores the resulting ``AnalizedDocument`` in ``self.content``.
    """
    self.content = AnalizedDocument(unidecode(self.article.cleaned_text))