Example 1
def get_save_links():
    url = 'https://printcopy.info/?mod=erc'
    # Rotate the proxy and the User-Agent header for the request.
    proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
    useragent = {'User-Agent': get_proxy.get_useregent_list()}
    # Collect the links to every brand listed on the start page.
    brand_list = parser.get_brand_model_links(
        parser.get_html(url, useragent, proxy), 'brandList')

    for brand in brand_list:
        print(brand['name'])
        proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
        useragent = {'User-Agent': get_proxy.get_useregent_list()}

        # assign the parsed HTML page to the soup variable
        soup = parser.get_html(brand['href'], useragent, proxy)
        page_count = parser.get_pagination_index_models(soup)
        print(page_count)
        model_link = parser.get_brand_model_links(soup, 'modelList')
        file_utils.save_model_links_csv(model_link, brand['name'],
                                        brand['name'])
        # Fetch the remaining pagination pages (2 .. page_count) and save them too.
        for index in range(2, page_count + 1):
            model_link = parser.get_brand_model_links(
                parser.get_html(brand['href'] + f'&page={index}',
                                useragent, proxy), 'modelList')
            file_utils.save_model_links_csv(model_link, brand['name'],
                                            brand['name'])
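The helper modules parser, get_proxy and file_utils used above belong to the surrounding project and are not shown in these examples. As a rough orientation only, here is a minimal sketch of what a get_html(url, useragent, proxy) helper with this call signature could look like when built on requests and BeautifulSoup; the body is an assumption, only the signature is taken from the example:

import requests
from bs4 import BeautifulSoup

def get_html(url, useragent, proxy):
    # Fetch the page through the supplied proxy with the supplied User-Agent header.
    response = requests.get(url, headers=useragent, proxies=proxy, timeout=30)
    response.raise_for_status()
    # Return the parsed document so callers can run BeautifulSoup queries on it.
    return BeautifulSoup(response.text, 'html.parser')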
Example 2
def parser_models():
    file_index = 0
    model_links = file_utils.load_links_brand()
    for brand in model_links:
        for model in brand:
            file_index += 1
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}
            # assign the parsed HTML page to the soup variable
            soup = parser.get_html(model['href'], useragent, proxy)
            page_count = parser.get_pagination_index_models(soup)
            model_name = model['name']
            # The brand name is the first whitespace-delimited word of the model name.
            brands = re.findall(r'^[^\s]+', model_name)
            brand_name = brands[0]
            print(f'{file_index}: {model_name}, page count - {page_count}')
            erc_csv = parser.parser_errors(soup, brand_name, model_name)
            file_utils.save_error_code(erc_csv, brand_name, model_name)
            # Fetch the remaining pagination pages (2 .. page_count) and save them too.
            for index in range(2, page_count + 1):
                soup = parser.get_html(
                    model['href'] + f'&page={index}', useragent, proxy)
                erc_csv = parser.parser_errors(soup, brand_name, model_name)
                file_utils.save_error_code(erc_csv, brand_name, model_name)
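file_utils.save_error_code is another project helper that is not shown. Assuming erc_csv is a list of rows, a plausible stand-in could append them to one semicolon-separated CSV file per model; the file name pattern and output directory below are purely assumptions:

import csv
import os

def save_error_code(rows, brand_name, model_name, out_dir='erc'):
    # Hypothetical stand-in: append the scraped error-code rows to a per-model CSV file.
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, f'{brand_name}_{model_name}.csv')
    with open(path, 'a', newline='', encoding='utf-8') as csv_file:
        csv.writer(csv_file, delimiter=';').writerows(rows)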
Example 3
def parser_site():
    # data = pandas.read_csv('models', sep=';')
    data = fu.load_file('models')
    # models = data.values.tolist()
    for model in data:
        if model:
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}

            soup = hu.get_html(model[1], useragent, proxy)

            # hu.get_html apparently returns 404 when the page could not be fetched.
            if soup == 404:
                continue

            brand_name, model_name, device_spec, device_data = hu.model_parser(
                soup, model[0])

            # Replace slashes so the model name can be used in a file name.
            model_name = re.sub('/', ' ', model_name)
            base_dir = os.path.dirname(__file__)
            base_dir = f'{base_dir}\\parse\\{brand_name}'
            if not os.path.exists(base_dir):
                os.mkdir(base_dir)

            df = pandas.DataFrame(device_spec)
            df.to_csv(f'{base_dir}\\{model_name}_spec.csv',
                      index=False,
                      header=False,
                      sep=";")
            df = pandas.DataFrame(device_data)
            df.to_csv(f'{base_dir}\\{model_name}_parts.csv',
                      index=False,
                      header=False,
                      sep=";")

def parser_models():
    file_index = 0
    model_links = file_utils.load_links_brand()
    for brand in model_links:
        # The first record of each brand's list carries the brand name under the 'brand' key.
        brand_name = brand[0]
        for model in brand:
            file_index += 1
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}
            # assign the parsed HTML page to the soup variable
            soup = parser.get_html(model['href'], useragent, proxy)
            model_name = model['name']
            print(f'{file_index}. {model_name}')
            modules = parser.get_modules(soup, 'pcToc')

            for module in modules:
                module_name = module['name']
                soup = parser.get_html(module['href'], useragent, proxy)
                # Parse the part codes of this module and save them for the model.
                partcodes = parser.get_partcodes(soup, brand_name['brand'],
                                                 model_name, module_name)
                file_utils.save_partcode(partcodes, brand_name['brand'],
                                         model_name)
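All three examples draw a fresh proxy and User-Agent from get_proxy before every request. Judging by how the return values are used (concatenation after 'http://' and a single User-Agent header value), each call appears to return one random entry. A hypothetical stand-in with that behavior, using placeholder documentation addresses:

import random

PROXIES = ['203.0.113.10:8080', '198.51.100.7:3128']  # placeholder host:port entries
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (X11; Linux x86_64)',
]

def get_proxies_list():
    # Despite the plural name, callers prepend 'http://' to the result,
    # so the helper presumably returns a single 'host:port' string per call.
    return random.choice(PROXIES)

def get_useregent_list():
    # Same pattern for the User-Agent header value.
    return random.choice(USER_AGENTS)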