Example no. 1
    def parsing(self, html, obj, win_size_w, win_size_h):
        soup = BS(html, features="html.parser")
        links = soup.find('div', id='dle-content').find_all(
            'a', class_='screen-link')
        for a in links:
            soup = BS(get_html(a.get('href'), self.E_SITE),
                      features="html.parser")
            for aa in soup.find('div', class_='llink').find_all('a'):
                # The URL's penultimate path segment encodes the resolution,
                # e.g. .../1920x1080/...
                size_w, size_h = map(
                    int, aa.get('href').split('/')[-2].split('x'))
                if win_size_w == size_w and win_size_h == size_h:
                    self.downl_list.append(aa.get('href'))
        random_el = random.choice(self.downl_list)
        soup2 = BS(get_html(random_el, self.E_SITE), features="html.parser")
        image_url = soup2.find(id='img').get('src')
        name = f'img/{obj}.jpg'
        save_file(image_url, name)
        print(f'|{name.split("/")[-1]:^33}|{"Загружен":^10}|')  # 'Downloaded'
        self.i_count += 1
        return self.i_count
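Every example in this collection leans on small helpers (get_html, save_file, write_csv, and friends) whose definitions are not shown. A minimal sketch of the first two, assuming get_html wraps requests and its optional second argument is the page encoding, as the cp1251 and E_SITE calls suggest; signatures evidently vary across the snippets (the tests below call get_html(uri=...)), so this is only one plausible shape:

import requests


def get_html(url, encoding=None):
    # Hypothetical helper: fetch a page and return its decoded text.
    response = requests.get(url)
    if encoding:
        # Force the declared encoding (e.g. 'cp1251') before decoding.
        response.encoding = encoding
    return response.text


def save_file(url, path):
    # Hypothetical helper: download a binary resource to disk.
    with open(path, 'wb') as f:
        f.write(requests.get(url).content)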
Example no. 2
def ParsPagesUrl(MainUrl):
    html = get_html(MainUrl)
    soup = BeautifulSoup(html, 'lxml')
    ListCharPageUrl = []
    ListCharPageUrlWithPage = []
    ListCharPageTag = soup.find('div', class_='first-level').find_all('a')
    for CharPageTag in ListCharPageTag:
        CharUrl = MainUrl + CharPageTag.get('href')
        ListCharPageUrl.append(CharUrl)
    for CharPageUrl in ListCharPageUrl:
        html = get_html(CharPageUrl)
        soup = BeautifulSoup(html, 'lxml')
        try:
            Pagination = int(
                soup.find('ul', class_='pagination').find_all('a')[-1].text)
        except (AttributeError, IndexError, ValueError):
            # No pagination block on the page: treat it as a single page.
            Pagination = 1
        for num in range(1, Pagination + 1):
            # Replace the trailing page number in the URL.
            ListCharPageUrlWithPage.append(CharPageUrl[:-1] + str(num))
    return ListCharPageUrlWithPage
Example no. 3
def main(ob):
    obj, os_name = ob  # don't shadow the os module
    url = f'http://soft.sibnet.ru/search/?text={obj.strip()}&os={os_name.strip()}&&pg=1'
    pages = get_pages(get_html(url, 'cp1251'))
    for i in range(1, pages + 1):  # walk every result page
        base_url = f'http://soft.sibnet.ru/search/?text={obj.strip()}&os={os_name.strip()}&&pg={i}'
        parsing(get_html(base_url, 'cp1251'), obj)
Example no. 4
def get_image(price):
    price_argo = read_csv(price)
    list_image_dir = []
    suffix = 1
    for stuff in price_argo:
        html = get_html(stuff['url_stuff'])
        soup = BeautifulSoup(html, 'lxml')
        try:
            image_list = soup.find('div', id='product-gallery').find_all('a')
        except AttributeError:
            # Fallback layout: the gallery lives in a different container.
            image_list = soup.find('div', class_='image-border').find_all('a')
        image_list_url = []
        for image in image_list:
            url_image = image.get('href')
            if url_image in image_list_url:
                # The gallery repeats itself; stop at the first duplicate.
                break
            image_list_url.append(url_image)
        dir_for_image = os.path.join(
            r'D:\tmp\python\python_parsing\parsing_ARGO', 'image',
            stuff['category_name'],
            stuff['name_stuff']).replace(' ', '_').replace('"', '')
        while dir_for_image in list_image_dir:
            # Disambiguate products that map to the same directory name.
            dir_for_image = dir_for_image + '_' + str(suffix)
            suffix += 1
        os.makedirs(dir_for_image)
        list_image_dir.append(dir_for_image)
        for i, image in enumerate(image_list_url):
            if image == '':
                continue
            with open(os.path.join(dir_for_image, '{}.jpg'.format(i)),
                      'wb') as image_file:
                image_file.write(requests.get(image).content)
        write_csv(stuff, os.path.join(dir_for_image, 'info.csv'))
Example no. 5
def GetFilmCatalog(ListData, num):
    NameFile = r'D:\tmp\my_prod\Python\python\ParsingMobi711\MobiParsBk_mp.csv'
    for Data in ListData:
        html = get_html('https://mobi711.ru' + Data['Url'])
        soup = BeautifulSoup(html, 'lxml')
        DivUrlFilmCatalog = soup.find_all('div',
                                          class_='category no-description')
        UrlFilmCatalog = ''
        for Div in DivUrlFilmCatalog:
            Link = Div.find('div', class_='text').find('a')
            # Keep the link only for 'Защитные пленки и стекла'
            # ('Protective films and glasses').
            if Link.text == 'Защитные пленки и стекла':
                UrlFilmCatalog = 'https://mobi711.ru' + Link.get('href')
                break
        data = {
            'Brand': Data['Brand'],
            'Model': Data['Model'],
            'UrlModel': Data['Url'],
            'UrlFilmCatalog': UrlFilmCatalog
        }

        write_csv(data, NameFile)
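write_csv is called throughout with a row dict and a file path; a minimal sketch, assuming it appends one row per call via csv.DictWriter (the header handling is an assumption):

import csv
import os


def write_csv(data, path):
    # Hypothetical helper: append one dict as a CSV row.
    new_file = not os.path.exists(path)
    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=data.keys())
        if new_file:
            writer.writeheader()
        writer.writerow(data)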
Example no. 6
def get_stuff_info(list_stuff):
    for stuff in list_stuff:
        url_stuff = stuff['url_stuff']
        html_stuff = get_html(url_stuff)
        soup = BeautifulSoup(html_stuff, 'lxml')
        text_info = soup.find('div', id='test').find_all('li')
        size, color, cloth = '', '', ''
        try:
            price = soup.find('div', class_='retail-price price-prod').text
        except AttributeError:
            price = ''
        for text_ in text_info:
            cur_line_name = text_.find('span', class_='opts-lab').text
            cur_line_val = text_.find('span', class_='opts-val').text
            if cur_line_name == 'Размер:':    # 'Size:'
                size = cur_line_val
            elif cur_line_name == 'Цвет:':    # 'Color:'
                color = cur_line_val
            elif cur_line_name == 'Ткань:':   # 'Fabric:'
                cloth = cur_line_val
        data_stuff = {
            'category_name': stuff['category_name'],
            'name_stuff': stuff['name_stuff'],
            'url_stuff': stuff['url_stuff'],
            'size': size,
            'color': color,
            'cloth': cloth,
            'price': price
        }
        print(data_stuff)
        write_csv(data_stuff,
                  r'D:\tmp\my_prod\Python\python\parsing_ARGO\argo.csv')
Example no. 7
def get_list_catalog_url(url_in):
    html = get_html(url_in)
    soup = BeautifulSoup(html, 'lxml')
    catalog_divs = soup.find_all(
        'div', class_='col-md-4 col-sm-6 col-xs-12 redisign-category item')
    # Turn each category tile into an absolute URL.
    return [url_in + div.find('a').get('href') for div in catalog_divs]
Esempio n. 8
0
    def tests_get_html(self):
        """
        Why do we need this? To check the connection and whether the site
        is online.
        """
        expected_value = '<form name="login"'
        actual_value = get_html()
        # Test 1: get html
        self.assertIn(expected_value, actual_value)
Example no. 9
def get_list_stuff_on_pages(list_catalog_url):
    list_stuff = []
    for catalog_url in list_catalog_url:
        html_catalog = get_html(catalog_url)
        soup = BeautifulSoup(html_catalog, 'lxml')
        text_about_page = soup.find(
            'div',
            class_='col-lg-6 col-xs-12 text-right results').text.split(' ')
        max_page = 1  # default when the page-count marker is missing
        for i, text in enumerate(text_about_page):
            if text == '(всего':  # '(total'
                max_page = int(text_about_page[i + 1])
                break
        category_name = soup.find('h1').text

        def collect(page_soup):
            # Pull every product card off one listing page.
            for stuff in page_soup.find_all(
                    'div',
                    class_='product-list-item xs-100 sm-100 md-100 lg-100 xl-100'):
                link = stuff.find('h4').find('a')
                list_stuff.append({
                    'category_name': category_name,
                    'name_stuff': link.text,
                    'url_stuff': link.get('href')
                })

        collect(soup)
        for page in range(1, max_page):
            url_page = catalog_url + '?page={}'.format(page + 1)
            collect(BeautifulSoup(get_html(url_page), 'lxml'))
    return list_stuff
Example no. 10
    def tests_get_token_from_html(self):
        """
        You could test this by checking that the token value is not None,
        e.g. assertIsNotNone(token_value).
        """
        # Test 2: get token from html
        html = get_html(uri=self.uri)
        actual_value = get_token(html)
        self.assertIsNotNone(actual_value)
Example no. 11
def get_catalog_url(url_in):
    html = get_html(url_in)
    soup = BeautifulSoup(html, 'lxml')
    url_list_ul = soup.find('ul', class_='nav navbar-nav').find_all('li')
    catalog = []
    for li in url_list_ul:
        link = li.find('a')
        catalog.append({'catalog_name': link.text,
                        'catalog_url': link.get('href')})
    return catalog
Example no. 12
def get_max_page(url_page='', html=''):
    if url_page:
        html_new = get_html(url_page)
    elif html:
        html_new = html
    else:
        raise ValueError('either url_page or html must be given')
    soup = BeautifulSoup(html_new, 'lxml')
    pagination = soup.find('div', class_='pagination_wrap row').text.split(' ')
    max_page = 1  # default when the page-count marker is missing
    for i, page in enumerate(pagination):
        if page == '(всего':  # '(total'
            max_page = int(pagination[i + 1])
            break
    return max_page
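get_max_page accepts either a page URL to fetch or HTML that has already been downloaded; a short usage sketch (the catalog URL is hypothetical):

catalog_url = 'https://example.com/catalog/'            # hypothetical
max_page = get_max_page(url_page=catalog_url)           # fetch inside
max_page = get_max_page(html=get_html(catalog_url))     # reuse fetched HTML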
Example no. 13
def ParsWordFromPage(url_in):
    html = get_html(url_in)
    soup = BeautifulSoup(html, 'lxml')
    ListWordTag = soup.find('table',
                            class_='table').find_all('td', class_='text-left')
    # Open the dictionary file once instead of once per word.
    with open('python\\FindWord\\DictWord', 'a', encoding='utf-8') as Dict_:
        for WordTag in ListWordTag:
            Dict_.write(WordTag.find('a').text + '; ')
Example no. 14
def get_catalog(url_in):
    catalog = []
    html = get_html(url_in)
    soup = BeautifulSoup(html, 'lxml')
    catalog_div = soup.find('div', class_='column-aside', id='aside').find(
        'ul', class_='menu').find_all('li', class_='menu__item')
    for catalog_tmp in catalog_div:
        catalog_url = catalog_tmp.find('a').get('href')
        catalog_name = catalog_tmp.find('a').text
        data = {'catalog_name': catalog_name, 'catalog_url': catalog_url}
        catalog.append(data)
    return catalog
Example no. 15
    def tests_file_type(self):
        """
        Example of a test
        """
        html = get_html(uri=self.uri)
        self.login_data['tcurl'] = get_token(html)
        opener = my_opener(login_data=self.login_data)
        response = opener.open(self.test_image_uri)

        expected_value = self.file_type
        actual_value = response.info().get('Content-Type')

        self.assertIn(expected_value, actual_value)
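The test fixtures above also rely on get_token and my_opener, which are likewise not shown. A plausible sketch, assuming the token is scraped out of the login form and my_opener performs the login POST and returns a cookie-aware urllib opener (the field name, login endpoint, and regex are all assumptions):

import http.cookiejar
import re
import urllib.parse
import urllib.request


def get_token(html):
    # Hypothetical: pull the token out of the login form markup.
    match = re.search(r'name="tcurl" value="([^"]+)"', html)
    return match.group(1) if match else None


def my_opener(login_data):
    # Hypothetical: log in and return an opener carrying the session cookie.
    jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(jar))
    body = urllib.parse.urlencode(login_data).encode()
    opener.open('https://example.com/login', data=body)  # assumed endpoint
    return opener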
Example no. 16
    def main(self, obj):
        start_time = datetime.now()
        win_width = 1920
        win_height = 1080
        url = f'https://www.nastol.com.ua/tags/{quote(obj.strip(), encoding=self.E_SITE)}/page/1/'
        pages, img_count = self.get_pages(get_html(url, self.E_SITE))
        page = random.randint(1, pages)
        base_url = f'https://www.nastol.com.ua/tags/{quote(obj.strip(), encoding=self.E_SITE)}/page/{page}/'
        print(f'{img_count}\nСтраница:{page}-{pages}')  # 'Page:'
        print('-' * 51)
        # Table header: 'Category - file name' / 'Status'.
        print(f'|{"Категория - имя файла":33}|{"Статус":10}|')
        print('-' * 51)
        inc = self.parsing(get_html(base_url, self.E_SITE), obj, win_width,
                           win_height)
        print('-' * 51)
        print(f'Скачано:{inc}')  # 'Downloaded:'
        end_time = datetime.now()
        # 'Time spent:'
        print(
            f'Затрачено времени:{str(end_time - start_time).split(".")[0]:^50}'
        )
        print('-' * 51)
Example no. 17
def get_stuff_info(list_stuff_on_site):
    for stuff in list_stuff_on_site:
        html = get_html(stuff['stuff_url'])
        with open('0.html', 'w', encoding='utf-8') as f:
            # Debug dump of the last fetched page.
            f.write(html)
        soup = BeautifulSoup(html, 'lxml')
        try:
            descr = soup.find('div', id='tab-description').text
        except AttributeError:
            descr = ''
        try:
            atr = soup.find(
                'div', id='tab-specification').find('div', class_='attribute').text
        except AttributeError:
            atr = ''
        try:
            size_span = soup.find('div', class_='option row').find(
                'tbody').find_all('span', class_='size-title')
        except AttributeError:
            size_span = []
        size = ','.join(size_tmp.text for size_tmp in size_span)
        data = {'catalog_name': stuff['catalog_name'],
                'stuff_name': stuff['stuff_name'],
                'stuff_url': stuff['stuff_url'],
                'descr': descr.replace('\n', ' ').replace('\r', ' '),
                'size': size,
                'atr': atr.replace('\n', ' ').replace('\r', ' ')}
        main_dir = r'D:\tmp\python\python_parsing\parsing_ck_textil'
        write_csv(data, os.path.join(main_dir, 'ck_textil.csv'))
        try:
            image_tag_a_list = soup.find(
                'div', class_='MagicToolboxSelectorsContainer').find_all('a')
        except AttributeError:
            # Fallback gallery layout.
            image_tag_a_list = soup.find(
                'div', class_='MagicToolboxContainer selectorsBottom minWidth').find_all('a')
        image_dir = os.path.join(
            main_dir, stuff['catalog_name'], stuff['stuff_name']).replace(' ', '_')
        os.makedirs(image_dir)
        for i, image_tmp in enumerate(image_tag_a_list):
            # The href is protocol-relative ('//host/...'), so prepend https.
            image_url = 'https://' + image_tmp.get('href')[2:]
            with open(os.path.join(image_dir, '{}.jpg'.format(i)), 'wb') as file:
                file.write(requests.get(image_url).content)
        write_csv(data, os.path.join(image_dir, 'info.csv'))
Example no. 18
def get_stuff_on_page(url_catalog):
    for item in url_catalog:
        for page in range(item['max_page']):
            if page == 0:
                html = get_html(item['url'])
            else:
                html = get_html(item['url'] + '/page-{}'.format(page + 1))
            soup = BeautifulSoup(html, 'lxml')
            with open('0.html', 'w', encoding='utf-8') as f:
                # Debug dump of the last fetched page.
                f.write(html)

            stuff_div = soup.find(
                'div', class_='catalog-collection cleared').find_all('h3')
            for stuff in stuff_div:
                stuff_url = 'http://formateks.ru' + stuff.find('a').get('href')
                stuff_name = stuff.find('a').text
                data = {
                    'catalog_name': item['catalog_name'],
                    'stuff_name': stuff_name,
                    'stuff_url': stuff_url
                }
                write_csv(data, 'formarket.csv')
Example no. 19
def ListModel(mainUrl, urlBrand):
    html = get_html(urlBrand)
    soup = BeautifulSoup(html, 'lxml')
    modelSpan = soup.find_all('span', class_='brandphonename')
    listModelTablet = []
    # Characters that are unsafe in file names.
    char = ['\"', '\\', '/', '\'', ':', '|']
    for span in modelSpan:
        nameModel = span.find('a').text
        for c in char:
            nameModel = nameModel.replace(c, '')
        urlModel = span.find('a').get('href')
        modelDict = {'nameModel': nameModel,
                     'urlModel': urlModel}
        listModelTablet.append(modelDict)
    return listModelTablet
Example no. 20
def get_stuff_on_page(page_catalog_url):
    list_stuff_on_page = []
    html = get_html(page_catalog_url)
    soup = BeautifulSoup(html, 'lxml')
    list_stuff = soup.find_all('div', class_='product-thumb transition')
    catalog_name = soup.find('h1').text
    for stuff in list_stuff:
        link = stuff.find('div', class_='caption').find('a')
        data = {'catalog_name': catalog_name,
                'stuff_name': link.text.replace('"', ''),
                'stuff_url': link.get('href')}
        list_stuff_on_page.append(data)
        write_csv(data, 'ck.csv')
    return list_stuff_on_page
Example no. 21
def GetImg(urlModel):
    html = get_html(urlModel)
    soup = BeautifulSoup(html, 'lxml')
    try:
        img = soup.find('img', class_='b-devPic__picNew')
        urlImg = img.get('src')
        nameImg = img.get('alt')
        # Strip characters that are unsafe in file names.
        char = ['\"', '\\', '/', '\'', ':', '|']
        for c in char:
            nameImg = nameImg.replace(c, '')
    except AttributeError:
        urlImg = None
        nameImg = None

    imgDict = {'urlImg': urlImg,
               'nameImg': nameImg}
    return imgDict
Example no. 22
def ListBrand(urlIn):
    html = get_html(urlIn)
    soup = BeautifulSoup(html, 'lxml')
    brandDiv = soup.find_all(
        'div', class_='b-listli b-listli_big b-listli_u p-listul__listli')
    listBrandTablet = []
    # Characters that are unsafe in file names.
    char = ['\"', '\\', '/', '\'', ':', '|']
    for brand in brandDiv:
        nameBrand = brand.find('a').text
        for c in char:
            nameBrand = nameBrand.replace(c, '')
        urlBrand = brand.find('a').get('href')
        brandDict = {'nameBrand': nameBrand,
                     'urlBrand': urlBrand}
        listBrandTablet.append(brandDict)
    return listBrandTablet
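Examples 19, 21, and 22 read like stages of one brand → model → image pipeline; a hedged sketch of how they might chain together (the start URL and path are hypothetical):

main_url = 'https://example.com'  # hypothetical start URL
for brand in ListBrand(main_url + '/brands/'):
    for model in ListModel(main_url, brand['urlBrand']):
        img = GetImg(model['urlModel'])
        if img['urlImg']:
            print(model['nameModel'], img['urlImg'])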
Example no. 23
    def tests_access_picture_max_size(self):
        """
        If the status code is 200 and the file type matches a certain type,
        it's a success, e.g. the tuple (200, 'image/png').
        Don't use print statements unless you are testing something.
        """
        # Test 3: do we have access to a picture of max size?
        html = get_html(uri=self.uri)
        self.login_data['tcurl'] = get_token(html)
        opener = my_opener(login_data=self.login_data)
        response = opener.open(self.test_image_uri)

        expected_value = self.file_type
        actual_value = response.info().get('Content-Type')

        self.assertIn(expected_value, actual_value)
Example no. 24
def GetListBrandAndModels(mainUrl):
    html = get_html(mainUrl)
    soup = BeautifulSoup(html, 'lxml')
    ListBrandDiv = soup.find_all('div', class_='category-wrap')
    for Div in ListBrandDiv:
        BrandName = Div.find('div', class_='text').find('a').text
        Sub = Div.find('div', class_='sub')
        ListModelsA = Sub.find_all('a') if Sub else []
        for A in ListModelsA:
            if A.text == 'Показать еще':  # skip the 'Show more' link
                continue
            data = {'Brand': BrandName, 'Model': A.text,
                    'Url': A.get('href')}
            write_csv(data, 'Mobi711.csv')
Example no. 25
def get_all_stuff(list_stuff):
    list_dir = []
    for stuff in list_stuff:
        html = get_html(stuff['stuff_url'])
        soup = BeautifulSoup(html, 'lxml')
        with open('0.html', 'w', encoding='utf-8') as file:
            # Debug dump of the last fetched page.
            file.write(html)
        try:
            descr = soup.find(
                'div',
                class_='block-text block-type-catalogitem-text textcontent'
            ).find('p').text
        except AttributeError:
            descr = ''
        try:
            url_image = soup.find('div',
                                  class_='block-picture').find('a').get('href')
        except AttributeError:
            # No picture block: skip this product entirely.
            continue
        catalog_name = stuff['catalog_name']
        stuff_name = stuff['stuff_name'].replace('"', '')
        stuff_url = stuff['stuff_url']
        data = {
            'catalog_name': catalog_name,
            'stuff_name': stuff_name,
            'stuff_url': stuff_url,
            'stuff_descr': descr
        }
        main_dir = r'D:\tmp\python\python_parsing\parsing_formateks'
        new_dir = os.path.join(main_dir, catalog_name,
                               stuff_name).replace(' ', '_').lower()

        if new_dir in list_dir:
            # Disambiguate duplicate product names, then track the new name
            # too so a third duplicate doesn't collide with it.
            new_dir = new_dir + '_1'
        list_dir.append(new_dir)
        os.makedirs(new_dir)
        write_csv(data, os.path.join(new_dir, 'info.csv'))
        write_csv(data, os.path.join(main_dir, 'formarket.csv'))
        with open(os.path.join(new_dir, '0.jpg'), 'wb') as file:
            file.write(requests.get(url_image).content)