def get_save_links():
    url = 'https://printcopy.info/?mod=erc'
    proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
    useragent = {'User-Agent': get_proxy.get_useregent_list()}
    brand_list = parser.get_brand_model_links(
        parser.get_html(url, useragent, proxy), 'brandList')
    for brand in brand_list:
        print(brand['name'])
        proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
        useragent = {'User-Agent': get_proxy.get_useregent_list()}
        # assign the parsed HTML page to the soup variable
        soup = parser.get_html(brand['href'], useragent, proxy)
        page_count = parser.get_pagination_index_models(soup)
        print(page_count)
        model_link = parser.get_brand_model_links(soup, 'modelList')
        file_utils.save_model_links_csv(model_link, brand['name'], brand['name'])
        # fetch the remaining pagination pages 2..page_count (empty when there is a single page)
        for index in range(2, page_count + 1):
            model_link = parser.get_brand_model_links(
                parser.get_html(brand['href'] + f'&page={index}', useragent, proxy),
                'modelList')
            file_utils.save_model_links_csv(model_link, brand['name'], brand['name'])
def parser_models():
    file_index = 0
    model_links = file_utils.load_links_brand()
    for brand in model_links:
        for model in brand:
            file_index += 1
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}
            # assign the parsed HTML page to the soup variable
            soup = parser.get_html(model['href'], useragent, proxy)
            page_count = parser.get_pagination_index_models(soup)
            model_name = model['name']
            # the brand is the first whitespace-delimited token of the model name
            brands = re.findall(r'^\S+', model_name)
            brand_name = brands[0]
            print(f'{file_index}: {model_name}, page count - {page_count}')
            erc_csv = parser.parser_errors(soup, brand_name, model_name)
            file_utils.save_error_code(erc_csv, brand_name, model_name)
            # fetch the remaining pagination pages 2..page_count (empty when there is a single page)
            for index in range(2, page_count + 1):
                soup = parser.get_html(model['href'] + f'&page={index}', useragent, proxy)
                erc_csv = parser.parser_errors(soup, brand_name, model_name)
                file_utils.save_error_code(erc_csv, brand_name, model_name)
def parser_site():
    # data = pandas.read_csv('models', sep=';')
    data = fu.load_file('models')
    # models = data.values.tolist()
    for model in data:
        if model:
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}
            soup = hu.get_html(model[1], useragent, proxy)
            if soup == 404:
                # the page could not be fetched, skip this model
                continue
            brand_name, model_name, device_spec, device_data = hu.model_parser(
                soup, model[0])
            # slashes are not allowed in file names
            model_name = re.sub('/', ' ', model_name)
            # store the output next to this script: parse/<brand>/<model>_*.csv
            base_dir = os.path.dirname(__file__)
            base_dir = os.path.join(base_dir, 'parse', brand_name)
            if not os.path.exists(base_dir):
                os.makedirs(base_dir)
            df = pandas.DataFrame(device_spec)
            df.to_csv(os.path.join(base_dir, f'{model_name}_spec.csv'),
                      index=False, header=False, sep=';')
            df = pandas.DataFrame(device_data)
            df.to_csv(os.path.join(base_dir, f'{model_name}_parts.csv'),
                      index=False, header=False, sep=';')
def parser_models():
    file_index = 0
    model_links = file_utils.load_links_brand()
    for brand in model_links:
        # the first element of each brand group exposes the brand name via its 'brand' key
        brand_name = brand[0]
        for model in brand:
            file_index += 1
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}
            # assign the parsed HTML page to the soup variable
            soup = parser.get_html(model['href'], useragent, proxy)
            model_name = model['name']
            print(f'{file_index}. {model_name}')
            modules = parser.get_modules(soup, 'pcToc')
            for module in modules:
                module_name = module['name']
                soup = parser.get_html(module['href'], useragent, proxy)
                file_utils.save_partcode(
                    parser.get_partcodes(soup, brand_name['brand'], model_name, module_name),
                    brand_name['brand'], model_name)
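# A minimal driver sketch, not part of the original sources. The functions above
# appear to live in separate scraper scripts (note the two parser_models
# definitions), so calling them from a single entry point assumes each one is
# imported from its own module; the ordering and the __main__ guard below are
# illustrative assumptions, not the project's actual run configuration.
if __name__ == '__main__':
    get_save_links()   # 1. collect model links per brand and save them to CSV
    parser_models()    # 2. parse error codes (or part codes) for every saved model link
    parser_site()      # 3. parse device specs and parts lists into per-brand CSV files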