class Manager:
    """Aggregates data collected by temporary Painter objects into one
    main painter and writes the merged result to disk via a Saver."""

    def __init__(self, name_query, surname_query):
        self.name_query = name_query
        self.surname_query = surname_query
        # The "agregator" painter accumulates everything merged in later.
        self.main_painter = Painter("agregator")
        self.main_painter.set_queries(name_query, surname_query)
        self.saver = Saver()
        self.temp_painters_list = []

    def run(self, path):
        """Interpret all gathered raw texts, merge, sort, and save to *path*."""
        raw_texts = [p.temp_raw_text for p in self.temp_painters_list]
        self.add_temp_painter(Interpreter.interpret(raw_texts))
        self.merge_painters()
        self.main_painter.sort_dictionaries()
        self.saver.save_final_file(self.main_painter.text_dump(), path)

    def run_list(self, path):
        """Merge and sort the collected data, then save only the names to *path*."""
        self.merge_painters()
        self.main_painter.sort_dictionaries()
        self.saver.save_final_file(self.main_painter.dump_names(), path)

    def add_temp_painter(self, painter):
        """Queue *painter* for a later merge into the main painter."""
        self.temp_painters_list.append(painter)

    def merge_painters(self):
        """Fold every queued temporary painter into the main painter."""
        for queued in self.temp_painters_list:
            self.main_painter.add_data_from_temp_painter(queued)
def interpret(raw_text_list):
    """Scan every raw text for known category keywords and return a new
    Painter holding the matched category values under "kategoria"."""
    new_data_painter = Painter("interpreted")
    acquired_list = []
    for raw_text in raw_text_list:
        for key, value in Interpreter.categories_keywords.items():
            # Trailing space guarantees find(" ", ...) locates a word end.
            lowered_text = raw_text.lower().strip() + " "
            while True:
                start_index = lowered_text.find(key)
                if start_index == -1:
                    break
                end_index = lowered_text.find(" ", start_index)
                # Drop a trailing dot or comma from the matched token.
                if lowered_text[end_index - 1] in (".", ","):
                    end_index -= 1
                substring = lowered_text[start_index:end_index]
                acquired_list.append(value)
                # Erase the match so the next find() moves on.
                lowered_text = lowered_text.replace(substring, "")
    for phrase in acquired_list:
        print("phrase: [" + phrase + "]\n")
    new_data_painter.new_crawler_data_list(acquired_list, "kategoria")
    return new_data_painter
def get_list_kategory(manager, category):
    """Crawl a Polish Wikipedia category page and register the painter
    names it lists with *manager*."""
    wiki_painter = Painter("wikipedia")
    category_url = "https://pl.wikipedia.org/wiki/Kategoria:" + category_dictionary.get(category)
    # NOTE(review): presumably fills the module-level result_names as a
    # side effect — confirm against the helper's definition.
    get_list_by_hc_category_helper(category_url)
    for found_name in result_names:
        print(found_name)
    wiki_painter.new_crawler_data_list(result_names, "imie")
    manager.add_temp_painter(wiki_painter)
def __init__(self, name_query, surname_query):
    """Remember the queries and prepare the aggregator painter, the saver,
    and the list of temporary painters awaiting a merge."""
    self.name_query = name_query
    self.surname_query = surname_query
    self.saver = Saver()
    self.temp_painters_list = []
    # The main painter aggregates data merged from temporary painters.
    self.main_painter = Painter("agregator")
    self.main_painter.set_queries(name_query, surname_query)
def get_raw(manager, *names):
    """Fetch the pages found for *names*, collect their visible text, run the
    Interpreter over it, and hand the resulting painter to *manager*.

    Pages whose TLS certificate fails verification are skipped with a
    console marker.
    """
    urls = get_urls(*names)
    print(urls)
    texts = []
    for u in urls:
        try:
            source_code = requests.get(u).text
            soup = BeautifulSoup(source_code, features="html.parser")
            # BUG FIX: the original passed ('p' or 'span'), which evaluates
            # to just 'p'; a list matches both tag names.
            text = "".join(p.getText() for p in soup.find_all(['p', 'span']))
            texts.append(text)
        except requests.exceptions.SSLError:
            # BUG FIX: requests.ssl.SSLCertVerificationError does not exist;
            # requests wraps certificate failures in requests.exceptions.SSLError.
            print(
                "=============================CERT.ERR============================================================="
            )
    temp_painter = Interpreter.interpret(texts)
    manager.add_temp_painter(temp_painter)
def get_list(manager, names):
    """Search Polish Wikipedia for *names*, collect result links that pass
    check_if_painter, register them with *manager*, and return the list.

    Returns an empty list immediately when *names* is empty.
    """
    array = []
    if names == '':
        return array
    painter = Painter("wikipedia")
    # BUG FIX: the original ran "limit=20offset=0" together, dropping the
    # '&' separator between the two query parameters.
    url = "https://pl.wikipedia.org/w/index.php?title=Specjalna:Szukaj&limit=20&offset=0&profile=default&search="
    url += names
    url += "&title=Specjalna%3ASzukaj&profile=advanced&fulltext=1&advancedSearch-current=%7B%7D&ns0=1"
    print(url)
    source_code = requests.get(url).text
    soup = BeautifulSoup(source_code, features="html.parser")
    component = soup.find_all('li', class_="mw-search-result")
    wiki_prefix = "https://pl.wikipedia.org"
    for c in component:
        x = wiki_prefix + c.find('a')['href']
        # Appends to `array` when the page looks like a painter's page.
        check_if_painter(x, array)
    print(array)
    painter.new_crawler_data_list(array, "imie")
    manager.add_temp_painter(painter)
    return array
def run(manager, *names):
    """Build a "touchofart" painter holding the raw text fetched for
    *names* and register it with *manager*."""
    temp_painter = Painter("touchofart")
    raw_text = get_raw_text(*names)
    temp_painter.new_temp_text(raw_text)
    manager.add_temp_painter(temp_painter)
def run(manager, name):
    """Scrape a painter's Wikipedia page and register the extracted data.

    Loads the page for *name*, pulls out the first name, birth/death date
    and place, works of art, epoch, museum, and education, stores each
    under its category key in a Painter, and hands it to *manager*.
    """
    soup = set_up_url(name)
    painter = Painter("wikipedia")
    # Keep the full raw page text in addition to the structured fields.
    painter.new_temp_text(get_raw_text(soup))
    print(get_raw_text(soup))
    # Rebinds the parameter: from here on `name` is the extracted first name.
    name = find_by_key_word(soup, 'Imię')
    painter.new_crawler_data_list({name}, "imie")
    data_ur = extract_date(find_by_key_word(soup, 'urodzenia'))
    painter.new_crawler_data_list({data_ur}, "data_ur")
    miejsce_ur = extract_place(find_by_key_word(soup, 'urodzenia'))
    painter.new_crawler_data_list({miejsce_ur}, "miejsce_ur")
    data_sm = extract_date(find_by_key_word(soup, 'śmierci'))
    painter.new_crawler_data_list({data_sm}, "data_sm")
    miejsce_sm = extract_place(find_by_key_word(soup, 'śmierci'))
    painter.new_crawler_data_list({miejsce_sm}, "miejsce_sm")
    dziela = find_work_of_arts(soup)
    painter.new_crawler_data_list(dziela, "dzielo")
    # Optional fields below: only appended when the keyword search found text.
    kategorie = []
    epoka = find_by_key_word(soup, 'Epoka')
    if epoka != "":
        kategorie.append(epoka)
    painter.new_crawler_data_list(kategorie, "kategoria")
    muzea = []
    muzeum = find_by_key_word(soup, 'Muzeum artysty')
    if muzeum != "":
        muzea.append(muzeum)
    painter.new_crawler_data_list(muzea, "muzeum")
    edukacja = []
    edu = find_by_key_word(soup, 'Alma Mater')
    edu_1 = find_by_key_word(soup, "Uczelnia")
    #obrazki = []
    """ obrazki = get_images_individual(name) print(obrazki) painter.new_crawler_data_list(obrazki,"link") """
    if edu != "":
        edukacja.append(edu)
    if edu_1 != '':
        edukacja.append(edu_1)
    painter.new_crawler_data_list(edukacja, "studia")
    print(painter.crawler_text_dump())
    manager.add_temp_painter(painter)

# Commented-out example invocations kept from the original author:
# print(get_list_kategory(""))
#
# run(manager, url)
# get_list_kategory('Abstrakcjoniści')
# get_list_kategory('Impresjoniści')
# get_list_kategory('Abstrakcjoniści')
# get_list_kategory('Malarze baroku')
# get_list_kategory('Malarze gotyccy')
# get_list_kategory('Malarze klasycystyczni')
# get_list_kategory('Malarze renesansu')
# get_list_kategory('Malarze rokoko')
# get_list_kategory('Malarze romantyczni')
# get_list_kategory('Malarze secesyjni')
# get_list_kategory('Malarze wspólcześni')
#
get_list_kategory('Postimpresjoniści') # get_list_kategory('Prymitywiści') # get_list_kategory('Realiści') # get_list_kategory('Surrealiści') # arr = ['Leonard da Vinci', 'Zdzisław Beksiński', 'Adolf Hitler', 'Witold Wojtkiewicz', # 'Wiktor Borisow-Musatow', # 'Kuźma Pietrow-Wodkin'] # # path_read = "C:\\Users\\kpiasta\\Desktop\\ZPI\\files_stuff\\result\\result.txt" # path_write = "C:\\Users\\kpiasta\\Desktop\\ZPI\\files_stuff\\result\\images.txt" # get_images_with_index(path_read, path_write, 1, 20) #get_images(arr)
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import re
from unidecode import unidecode
from files_stuff.Saver import Saver
from manager.Painter import Painter
import urllib.parse
from urllib.parse import quote

# NOTE(review): rebinding the class name to an instance shadows the Saver
# import — consider a lowercase name such as `saver`.
Saver = Saver()
painter = Painter("magazyn_sztuki")


# start method
def run_individual(manager, phrase):
    """Search magazynsztuki.pl for *phrase* and crawl the result pages.

    The query form of the phrase uses '+' separators; the slug form used
    for page matching uses '-', is transliterated to ASCII, and lowercased.
    """
    to_find = phrase.replace(" ", "+")
    phrase = phrase.replace(" ", "-")
    to_find = convert_phrase(to_find)
    phrase = unidecode(phrase)
    phrase = phrase.lower()
    pages = set()
    url = 'http://www.magazynsztuki.pl/page/1/?s=' + to_find
    check_pages(manager, url, phrase, pages)


def run_list_artists(manager, phrase):
    # NOTE(review): this definition looks truncated — it builds the search
    # term but never uses it; confirm against the full file.
    to_find = phrase.replace(" ", "+")
    to_find = convert_phrase(to_find)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.request import urlopen
from unidecode import unidecode
from manager.Painter import Painter

painter = Painter("zyciorysy")


def run_individual(manager, phrase):
    """Look up *phrase* on zyciorysy.info and process the matching page.

    The query term keeps '+' separators; the phrase itself is transliterated
    to ASCII and lowercased before page matching.
    """
    to_find = phrase.replace(" ", "+")
    to_find = convert_phrase(to_find)
    phrase = unidecode(phrase)
    phrase = phrase.lower()
    pages = set()
    url = 'https://zyciorysy.info/' + to_find
    check_pages(manager, url, phrase, pages)


def check_pages(manager, pageUrl, phrase, pages):
    """Fetch *pageUrl* and process the paragraphs of its entry content.

    NOTE(review): *manager*, *phrase*, and *pages* are unused in the code
    visible here — confirm whether this is a truncated implementation.
    """
    r = requests.get(pageUrl)
    soup = BeautifulSoup(r.text, "lxml")
    body = soup.body
    div = body.find('div', {'class': 'entry-content'})
    # div('p') is shorthand for div.find_all('p').
    links = div('p')
    for link in links:
        print(link.text)
        # NOTE(review): <p> tags normally carry no 'href' attribute, so this
        # likely raises KeyError — confirm whether div('a') or
        # link.find('a')['href'] was intended.
        get_image(link['href'])