Example #1
0
class Manager:
    """Aggregates data gathered by temporary Painter objects into a single
    main "agregator" Painter and persists the result through a Saver."""

    def __init__(self, name_query, surname_query):
        self.name_query = name_query
        self.surname_query = surname_query

        # Aggregate painter that every queued temporary painter is merged into.
        self.main_painter = Painter("agregator")
        self.main_painter.set_queries(name_query, surname_query)

        self.saver = Saver()
        self.temp_painters_list = []

    def run(self, path):
        """Interpret the raw texts of all queued painters, merge everything
        into the main painter, then write the full text dump to *path*."""
        raw_texts = [p.temp_raw_text for p in self.temp_painters_list]
        self.add_temp_painter(Interpreter.interpret(raw_texts))

        self.merge_painters()
        self.main_painter.sort_dictionaries()
        self.saver.save_final_file(self.main_painter.text_dump(), path)

    def run_list(self, path):
        """Merge the queued painters and write only the name dump to *path*."""
        self.merge_painters()
        self.main_painter.sort_dictionaries()
        self.saver.save_final_file(self.main_painter.dump_names(), path)

    def add_temp_painter(self, painter):
        """Queue *painter* for a later merge into the main painter."""
        self.temp_painters_list.append(painter)

    def merge_painters(self):
        """Fold each queued temporary painter into the main painter."""
        for queued in self.temp_painters_list:
            self.main_painter.add_data_from_temp_painter(queued)
Example #2
0
    def interpret(raw_text_list):
        """Scan each raw text for category keywords and collect the mapped
        category labels into a fresh Painter.

        raw_text_list: iterable of strings scraped from web pages.
        Returns a Painter("interpreted") whose "kategoria" crawler data
        contains one mapped label per distinct keyword-word found.

        NOTE(review): called as Interpreter.interpret(...) elsewhere in this
        project, so this is presumably a @staticmethod — the decorator and the
        enclosing class header are outside this view; confirm.
        """
        new_data_painter = Painter("interpreted")
        acquired_list = []

        for raw_text in raw_text_list:
            # categories_keywords presumably maps keyword -> category label —
            # TODO confirm against the Interpreter class definition.
            for key, value in Interpreter.categories_keywords.items():
                found = True
                # The appended trailing space guarantees find(" ", start_index)
                # below always succeeds (end_index is never -1).
                lowered_text = raw_text.lower().strip() + " "
                while found:
                    start_index = lowered_text.find(key)
                    if start_index == -1:
                        found = False
                    else:
                        found = True
                        # The matched "word" runs from the keyword start to the
                        # next space.
                        end_index = lowered_text.find(" ", start_index)
                        # Drop a trailing "." or "," from the matched word.
                        if lowered_text[end_index -
                                        1] == "." or lowered_text[end_index -
                                                                  1] == ",":
                            end_index -= 1
                        substring = lowered_text[start_index:end_index]
                        acquired_list.append(value)
                        # Remove the matched word so the while-loop terminates.
                        # NOTE(review): str.replace removes ALL occurrences, so
                        # several identical words yield only one label.
                        lowered_text = lowered_text.replace(substring, "")

        for phrase in acquired_list:
            print("phrase: [" + phrase + "]\n")

        new_data_painter.new_crawler_data_list(acquired_list, "kategoria")
        return new_data_painter
Example #3
0
File: wiki1.py Project: KPiasta/ZPI
def get_list_kategory(manager, category):
    """Crawl a Polish-Wikipedia category page and register the painter
    names found there with *manager*.

    Raises KeyError when *category* is not present in category_dictionary
    (the original crashed with an opaque TypeError from `str + None`).
    """
    painter = Painter("wikipedia")
    # BUG FIX: dict.get() returns None for unknown categories, which made
    # the string concatenation below fail with "can only concatenate str".
    # Fail fast with a clear error instead.
    category_slug = category_dictionary.get(category)
    if category_slug is None:
        raise KeyError("unknown category: " + str(category))
    url = "https://pl.wikipedia.org/wiki/Kategoria:" + category_slug
    get_list_by_hc_category_helper(url)

    # result_names is presumably a module-level list filled by the helper
    # above — TODO confirm against the rest of wiki1.py.
    for name in result_names:
        print(name)
    painter.new_crawler_data_list(result_names, "imie")
    manager.add_temp_painter(painter)
Example #4
0
    def __init__(self, name_query, surname_query):
        """Remember the queries, build the aggregating painter, and set up
        the saver plus the queue of temporary painters."""
        self.name_query, self.surname_query = name_query, surname_query

        # Aggregate painter that the temporary painters get merged into.
        aggregator = Painter("agregator")
        aggregator.set_queries(name_query, surname_query)
        self.main_painter = aggregator

        self.saver = Saver()
        self.temp_painters_list = []
Example #5
0
def get_raw(manager, *names):
    """Fetch the result pages for *names*, scrape their paragraph/span text,
    and hand the interpreted texts to *manager* as a temporary painter.

    Pages whose certificate verification fails are skipped with a marker
    printed to stdout.
    """
    urls = get_urls(*names)
    print(urls)

    texts = []
    for url in urls:
        try:
            source_code = requests.get(url).text
            soup = BeautifulSoup(source_code, features="html.parser")
            # BUG FIX: the original passed ('p' or 'span'), which evaluates
            # to just 'p', so <span> content was never scraped. find_all
            # accepts a list of tag names.
            text = "".join(tag.getText() for tag in soup.find_all(['p', 'span']))
            texts.append(text)
        except requests.exceptions.SSLError:
            # BUG FIX: requests.ssl.SSLCertVerificationError does not exist
            # (would raise AttributeError while handling the exception);
            # requests wraps certificate failures in requests.exceptions.SSLError.
            print(
                "=============================CERT.ERR============================================================="
            )

    # NOTE: the original also created an unused Painter("generic") here;
    # the interpreter builds the painter that is actually registered.
    manager.add_temp_painter(Interpreter.interpret(texts))
Example #6
0
File: wiki1.py Project: KPiasta/ZPI
def get_list(manager, names):
    """Search Polish Wikipedia for *names* and collect accepted result URLs.

    Registers the collected entries on a "wikipedia" Painter with *manager*
    and returns the list of accepted results. Returns an empty list
    immediately for an empty query string.
    """
    array = []
    if names == '':
        return array

    painter = Painter("wikipedia")
    # BUG FIX: the original query string ran "limit=20offset=0" together,
    # producing a malformed limit parameter; the two must be separated by "&".
    url = "https://pl.wikipedia.org/w/index.php?title=Specjalna:Szukaj&limit=20&offset=0&profile=default&search="
    url += names
    url += "&title=Specjalna%3ASzukaj&profile=advanced&fulltext=1&advancedSearch-current=%7B%7D&ns0=1"
    print(url)

    source_code = requests.get(url).text
    soup = BeautifulSoup(source_code, features="html.parser")
    wiki_prefix = "https://pl.wikipedia.org"
    # Each search hit is an <li class="mw-search-result"> wrapping a link;
    # check_if_painter filters hits and appends accepted ones into `array`.
    for result in soup.find_all('li', class_="mw-search-result"):
        check_if_painter(wiki_prefix + result.find('a')['href'], array)
    print(array)
    painter.new_crawler_data_list(array, "imie")
    manager.add_temp_painter(painter)
    return array
Example #7
0
def run(manager, *names):
    """Scrape touchofart for *names* and register the raw text with *manager*."""
    toa_painter = Painter("touchofart")
    raw_text = get_raw_text(*names)
    toa_painter.new_temp_text(raw_text)
    manager.add_temp_painter(toa_painter)
Example #8
0
File: wiki1.py Project: KPiasta/ZPI
def run(manager, name):
    """Scrape a Wikipedia artist page for *name* and register a populated
    "wikipedia" Painter with *manager*.

    Extracts first name, birth/death date and place, works of art, epoch
    (as a category), museum and education fields from the page.
    """
    soup = set_up_url(name)

    painter = Painter("wikipedia")
    # PERF FIX: the original called get_raw_text(soup) twice — once for the
    # painter and once for the print — doing the extraction work twice.
    raw_text = get_raw_text(soup)
    painter.new_temp_text(raw_text)
    print(raw_text)

    painter.new_crawler_data_list({find_by_key_word(soup, 'Imię')}, "imie")

    painter.new_crawler_data_list(
        {extract_date(find_by_key_word(soup, 'urodzenia'))}, "data_ur")
    painter.new_crawler_data_list(
        {extract_place(find_by_key_word(soup, 'urodzenia'))}, "miejsce_ur")
    painter.new_crawler_data_list(
        {extract_date(find_by_key_word(soup, 'śmierci'))}, "data_sm")
    painter.new_crawler_data_list(
        {extract_place(find_by_key_word(soup, 'śmierci'))}, "miejsce_sm")

    painter.new_crawler_data_list(find_work_of_arts(soup), "dzielo")

    # Only register a lookup when the page actually had a value for it.
    epoka = find_by_key_word(soup, 'Epoka')
    painter.new_crawler_data_list([epoka] if epoka != "" else [], "kategoria")

    muzeum = find_by_key_word(soup, 'Muzeum artysty')
    painter.new_crawler_data_list([muzeum] if muzeum != "" else [], "muzeum")

    # Education can come from either infobox field; keep non-empty ones,
    # 'Alma Mater' first (matches the original append order).
    edukacja = [edu for edu in (find_by_key_word(soup, 'Alma Mater'),
                                find_by_key_word(soup, "Uczelnia"))
                if edu != ""]
    painter.new_crawler_data_list(edukacja, "studia")

    print(painter.crawler_text_dump())
    manager.add_temp_painter(painter)

# print(get_list_kategory(""))
#
# run(manager, url)
# get_list_kategory('Abstrakcjoniści')
# get_list_kategory('Impresjoniści')
# get_list_kategory('Abstrakcjoniści')
# get_list_kategory('Malarze baroku')
# get_list_kategory('Malarze gotyccy')
# get_list_kategory('Malarze klasycystyczni')
# get_list_kategory('Malarze renesansu')
# get_list_kategory('Malarze rokoko')
# get_list_kategory('Malarze romantyczni')
# get_list_kategory('Malarze secesyjni')
# get_list_kategory('Malarze wspólcześni')
# get_list_kategory('Postimpresjoniści')
# get_list_kategory('Prymitywiści')
# get_list_kategory('Realiści')
# get_list_kategory('Surrealiści')
# arr = ['Leonard da Vinci', 'Zdzisław Beksiński', 'Adolf Hitler', 'Witold Wojtkiewicz',
# 'Wiktor Borisow-Musatow',
# 'Kuźma Pietrow-Wodkin']
#
# path_read = "C:\\Users\\kpiasta\\Desktop\\ZPI\\files_stuff\\result\\result.txt"
# path_write = "C:\\Users\\kpiasta\\Desktop\\ZPI\\files_stuff\\result\\images.txt"
# get_images_with_index(path_read, path_write, 1, 20)
#get_images(arr)
Example #9
0
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import re
from unidecode import unidecode
from files_stuff.Saver import Saver
from manager.Painter import Painter

import urllib.parse
from urllib.parse import quote

# NOTE(review): rebinding the name `Saver` to an instance shadows the
# imported Saver class — no further Saver() objects can be constructed in
# this module after this line. Consider renaming the instance (e.g. `saver`).
Saver = Saver()
# Module-level painter shared by the crawler functions below.
painter = Painter("magazyn_sztuki")


# start method
def run_individual(manager, phrase):
    """Search magazynsztuki.pl for *phrase* and crawl the matching pages."""
    # Search term uses '+' separators; the page slug uses '-' and is
    # ASCII-folded and lowercased.
    search_term = convert_phrase(phrase.replace(" ", "+"))
    slug = unidecode(phrase.replace(" ", "-")).lower()
    visited = set()

    start_url = 'http://www.magazynsztuki.pl/page/1/?s=' + search_term
    check_pages(manager, start_url, slug, visited)


def run_list_artists(manager, phrase):
    to_find = phrase.replace(" ", "+")
    to_find = convert_phrase(to_find)
Example #10
0
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.request import urlopen
from unidecode import unidecode
from manager.Painter import Painter

painter = Painter("zyciorysy")


def run_individual(manager, phrase):
    """Look up *phrase* on zyciorysy.info and crawl the matching pages."""
    # '+'-separated search slug; the comparison phrase is ASCII-folded
    # and lowercased.
    search_slug = convert_phrase(phrase.replace(" ", "+"))
    normalized = unidecode(phrase).lower()
    visited = set()

    start_url = 'https://zyciorysy.info/' + search_slug
    check_pages(manager, start_url, normalized, visited)


def check_pages(manager, pageUrl, phrase, pages):
    r = requests.get(pageUrl)
    soup = BeautifulSoup(r.text, "lxml")
    body = soup.body
    div = body.find('div', {'class': 'entry-content'})
    links = div('p')

    for link in links:
        print(link.text)
        get_image(link['href'])