Example #1
    def __POST_login(self):
        data = self.info.copy()
        data['email'] = self.__config.get('credential', 'credential.email')
        data['password'] = self.__config.get('credential', 'credential.password')
        data['op'] = 'Login'
        # print '[-] data: {0}'.format(urllib.urlencode(data))

        url = self.__url_base
        response = None
        if self.__dev:
            url += self.__config.get('url', 'url.loginPost')
            response = self.__session.get(url, headers=self.__headers, data=data)
            self.__log_response(response)
        else:
            url += self.__config.get('url', 'url.login')
            response = self.__session.post(url, headers=self.__headers, data=data)
            self.__log_response(response, 'POST', True)

        soup = make_soup(response)
        div_target = soup.find('div', {'id': 'deal-of-the-day'})

        title = div_target.select('div.dotd-title > h2')[0].text.strip()
        self.info['title'] = title
        self.info['filename'] = title.encode('ascii', 'ignore').replace(' ', '_')
        self.info['description'] = div_target.select('div.dotd-main-book-summary > div')[2].text.strip()
        self.info['url_image'] = 'https:' + div_target.select('div.dotd-main-book-image img')[0]['data-original']
        self.info['url_claim'] = self.__url_base + div_target.select('a.twelve-days-claim')[0]['href']
        # remove useless info
        self.info.pop('form_build_id', None)
        self.info.pop('form_id', None)
    def get_library_list(self):
        self.__GET_login()
        wait(self.__delay)
        self.__POST_login()
        wait(self.__delay)

        url = self.__url_base + self.__config.get('url', 'url.myebooks')
        response = self.__session.get(url, headers=self.__headers)
        self.__log_response(response)
        soup = make_soup(response)
        for a in soup.findAll('div', attrs={'class': 'product-line unseen'}):
            print "Title:     " + a.attrs.get('title')
            print "Directory: " + a.attrs.get('title')[:-8].replace(' ', '_')
            # print a
            cover_url = a.find('img', attrs={'class': ' imagecache imagecache-thumbview'}).get('src').replace('thumbview', 'dotd_main_image')
            print "Cover URL: " "http:" + cover_url

            links = []
            for link in a.findAll('a', href=True):
                url = link.attrs.get('href')
                if '#' not in url:
                    links.append(url)
            for i in range(1, len(links)):
                if "cart" not in links[i] or not '#' or None:
                    if  links[i].split("/")[-1] == 'pdf':
                        print "Download pdf:   " + self.__url_base + links[i]
                    elif links[i].split("/")[-1] == 'epub':
                        print "Download epub:   " + self.__url_base + links[i]
                    elif links[i].split("/")[-1] == 'mobi':
                        print "Download mobi:   " + self.__url_base + links[i]
                    else:
                        print "Download extras:   " + self.__url_base + links[i]
Example #3
    def __GET_claim(self):
        if self.__dev:
            url = self.__url_base + self.__config.get('url', 'url.account')
        else:
            url = self.info['url_claim']

        response = self.__session.get(url, headers=self.__headers)
        self.__log_response(response, 'GET', self.__dev)

        soup = make_soup(response)
        div_target = soup.find('div', {'id': 'product-account-list'})

        if div_target is None:
            raise Exception('Could not access claim page. This is most likely caused by invalid credentials')

        errorMessage = soup.find(id='messages-container')

        if errorMessage is not None and errorMessage.text.strip() == 'You have already claimed this promotion.':
            raise AlreadyClaimedException()

        # only last one just claimed
        div_claimed_book = div_target.select('.product-line')[0]
        self.info['book_id'] = div_claimed_book['nid']
        self.info['author'] = div_claimed_book.find(class_='author').text.strip()

        source_code = div_claimed_book.find(href=re.compile('/code_download/*'))
        if source_code is not None:
            self.info['url_source_code'] = self.__url_base + source_code['href']
    def export(self, url):
        file_name = 'output/products/' + shorten_url(url) + '.json'
        if os.path.exists(file_name):
            return

        response = requests.get(url)
        soup = make_soup(response.text)

        if self.get_product_name(soup) is None:
            print('*** Error:', url, '***')
            return

        if self.get_specifications(soup) is None:
            print('*** Error Table:', url, '***')
            return

        if self.get_applications(soup) is None:
            print('*** Error Application:', url, '***')
            return

        details = {
            'product_name': self.get_product_name(soup),
            'specifications': self.get_specifications(soup),
            'applications': self.get_applications(soup),
            'image_url': self.get_image_url(soup)
        }

        save_json(details, file_name)
        print(file_name)
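The export method above also depends on shorten_url and save_json helpers that are not part of the listing. A minimal sketch of save_json, assuming it simply serialises the dict to the given path (shorten_url is omitted because its behaviour cannot be inferred from the call site):

# Hypothetical save_json helper matching how it is called above; the real one may differ.
import json
import os


def save_json(data, file_name):
    # Ensure the parent directory exists, then write the dict as pretty-printed JSON.
    directory = os.path.dirname(file_name)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    with open(file_name, 'w') as handle:
        json.dump(data, handle, indent=2)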
Example #6
    def __POST_login(self):
        data = self.info.copy()
        data['email'] = self.__config.get('credential', 'credential.email')
        data['password'] = self.__config.get('credential', 'credential.password')
        data['op'] = 'Login'
        # print '[-] data: {0}'.format(urllib.urlencode(data))

        url = self.__url_base
        response = None
        if self.__dev:
            url += self.__config.get('url', 'url.loginPost')
            response = self.__session.get(url, headers=self.__headers, data=data)
            self.__log_response(response)
        else:
            url += self.__config.get('url', 'url.login')
            response = self.__session.post(url, headers=self.__headers, data=data)
            self.__log_response(response, 'POST', True)

        soup = make_soup(response)
        div_target = soup.find('div', {'id': 'deal-of-the-day'})

        title = div_target.select('div.dotd-title > h2')[0].text.strip()
        self.info['title'] = title
        self.info['filename'] = title.encode('ascii', 'ignore').replace(' ', '_')
        self.info['description'] = div_target.select('div.dotd-main-book-summary > div')[2].text.strip()
        self.info['url_image'] = 'https:' + div_target.select('div.dotd-main-book-image img')[0]['src']
        self.info['url_claim'] = self.__url_base + div_target.select('a.twelve-days-claim')[0]['href']
        # remove useless info
        self.info.pop('form_build_id', None)
        self.info.pop('form_id', None)
Example #7
    def export(self, meta_item):
        url = meta_item['product_url']
        output_dir = 'output/products/' + meta_item['category']
        make_dir(output_dir)

        if url is None:
            return

        output_file_name = output_dir + '/' + meta_item['product_code'] + '.json'

        response = requests.get(url)
        soup = make_soup(response.text)

        if self.get_image_url(soup, url) is None:
            return

        details = {
            'url': url,
            'name': soup.find('h2', attrs={'id': 'title'}).text.strip(),
            'image_url': self.get_image_url(soup, url),
            'specifications': self.get_specifications(soup),
            'cross_reference': self.get_cross_reference(soup),
            'applications': self.get_applications(soup)
        }

        save_json(details, output_file_name)
        print(output_file_name)
Example #8
    def __POST_login(self, url):
        data = self.info.copy()
        data['email'] = self.__config.get('credential', 'credential.email')
        data['password'] = self.__config.get('credential',
                                             'credential.password')
        data['op'] = 'Login'
        # print '[-] data: {0}'.format(urllib.urlencode(data))

        response = None
        if self.__dev:
            response = self.__session.get(url,
                                          headers=self.__headers,
                                          data=data)
            self.__log_response(response, 'GET', self.__dev)
        else:
            response = self.__session.post(url,
                                           headers=self.__headers,
                                           data=data)
            self.__log_response(response, 'POST', self.__dev)

        soup = make_soup(response)

        error_node = soup.find('div', {'class': 'messages error'})

        if error_node is not None:
            raise Exception(error_node.text.strip())
Example #9
    def _get_markup(self):
        """
        retrieve the html to extract the lines info from

        return(BeautifulSoup object) -> the soup made from the recovered html
        """
        data = self.driver.find_element_by_class_name("grouped-events")
        return make_soup(data.get_attribute("innerHTML"))
Example #10
    def fetch_product_list(self, brand_item, model_id, class_item):
        url = 'http://www.jsfilter.jp/application/get_applications/'
        response = requests.post(url,
                                 data={
                                     'modelId': model_id,
                                     'classId': class_item['app_class_id'],
                                     'year': '',
                                     'eng_vol': ''
                                 })
        soup = make_soup(response.text)
        skip = True
        model = ''
        result = []
        for child in soup.find('table').findAll(recursive=False):
            if skip:
                skip = False
                continue
            if child.get('class') == ['model-title']:
                model = child.text.split('»')[-1].strip()
                continue

            result.append({
                'brand': brand_item['name'],
                'class': class_item['app_class_name'],
                'model': model,
                'year': child.find('td', attrs={'data-title': 'YEAR'}).text.strip(),
                'engine_vol': child.find('td', attrs={'data-title': 'ENG VOL'}).text.strip(),
                'engine_no': child.find('td', attrs={'data-title': 'ENG NO'}).text.strip(),
                'body_no': child.find('td', attrs={'data-title': 'BODY NO'}).text.strip(),
                'oil': self.get_filter_data(child, 'OIL'),
                'air': self.get_filter_data(child, 'AIR'),
                'fuel': self.get_filter_data(child, 'FUEL'),
                'cabin': self.get_filter_data(child, 'CABIN'),
                'trans': self.get_filter_data(child, 'TRANS'),
            })

        return result
Example #11
    def _get_markup(self):
        """
        retrieve the html to extract the lines info from

        return(BeautifulSoup object) -> the soup made from the recovered html
        """
        return make_soup(
            self.driver.find_element_by_id("contestDetailTable").get_attribute(
                "innerHTML"))
    def fetch_year_list(self):
        response = requests.get(base_url)
        soup = make_soup(response.text)

        result = []
        container = soup.find('select', attrs={'id': 'year_select'})
        for item in container.findAll('option'):
            if item['value'] != 'default':
                result.append(item.text)
        return result
Example #13
    def __GET_login(self):
        url = self.__url_base
        if self.__dev:
            url += self.__config.get('url', 'url.loginGet')
        else:
            url += self.__config.get('url', 'url.login')

        response = self.__session.get(url, headers=self.__headers)
        self.__log_response(response)

        soup = make_soup(response)
        form = soup.find('form', {'id': 'packt-user-login-form'})
        self.info['form_build_id'] = form.find('input', attrs={'name': 'form_build_id'})['value']
        self.info['form_id'] = form.find('input', attrs={'name': 'form_id'})['value']
Example #14
    def fetch_brand_list(self):
        url = 'http://www.jsfilter.jp/catalogue'
        response = requests.get(url)
        soup = make_soup(response.text)

        result = []
        for option in soup.find('select', attrs={'id': 'selBrand'}).findAll('option'):
            result.append({
                'name': option.text.strip(),
                'value': option['value']
            })
        return result
Example #16
    def __GET_login(self, url):
        response = self.__session.get(url, headers=self.__headers)
        self.__log_response(response, 'GET', self.__dev)

        soup = make_soup(response)

        form = soup.find('form', {'id': 'packt-user-login-form'})

        if form is None:
            raise Exception('Could not find login form')

        self.info['form_build_id'] = form.find('input', attrs={'name': 'form_build_id'})['value']
        self.info['form_id'] = form.find('input', attrs={'name': 'form_id'})['value']

        return soup
Example #17
    def dump_all_library(self):
        # self.__GET_login()
        # wait(self.__delay)
        # self.__POST_login()
        # wait(self.__delay)

        url = self.__url_base + self.__config.get('url', 'url.myebooks')
        response = self.__session.get(url, headers=self.__headers)
        self.__log_response(response)
        soup = make_soup(response)
        for a in soup.findAll('div', attrs={'class': 'product-line unseen'}):
            log_info("[+] Downloading :     " + a.attrs.get('title'))
            #print "Downloading :     " + a.attrs.get('title')
            directory = a.attrs.get('title')[:-8].replace(' ', '_')
            directory = directory.encode('ascii', 'ignore').replace('/', '-')   ##### Error -  UnicodeEncodeError: 'ascii' codec can't encode character u'\u2019' in position
            filename = directory

            #print "Directory: " + a.attrs.get('title')[:-8].replace(' ', '_')
            # print a
            # print "Cover URL: " "http:" + a.find('img', attrs={'class': ' imagecache imagecache-thumbview'}).get('src').replace('thumbview', 'dotd_main_image')

            cover_url = a.find('img', attrs={'class': ' imagecache imagecache-thumbview'}).get('src').replace('thumbview', 'dotd_main_image')
            download_file(self.__session, 'http:' + cover_url, self.__config.get('path', 'path.dumps') + '/' + directory, filename +'.jpg')

            links = []
            for link in a.findAll('a', href=True):
                url = link.attrs.get('href')
                if '#' not in url:
                    links.append(url)
            for i in range(1, len(links)):
                if "cart" not in links[i] or not '#' or None:
                    if  links[i].split("/")[-1] == 'pdf':
                    #    print "Download pdf:   " + self.__url_base + links[i]
                        download_file(self.__session, self.__url_base + links[i], self.__config.get('path', 'path.dumps') + '/' + directory, filename + '.pdf')
                    elif links[i].split("/")[-1] == 'epub':
                    #    print "Download epub:   " + self.__url_base + links[i]
                        download_file(self.__session, self.__url_base + links[i], self.__config.get('path', 'path.dumps') + '/' + directory, filename + '.epub')
                    elif links[i].split("/")[-1] == 'mobi':
                    #    print "Download mobi:   " + self.__url_base + links[i]
                        download_file(self.__session, self.__url_base + links[i], self.__config.get('path', 'path.dumps') + '/' + directory, filename + '.mobi')
                    else:
                    #    print "Download extras:   " + self.__url_base + links[i]
                        download_file(self.__session, self.__url_base + links[i], self.__config.get('path', 'path.dumps') + '/' + directory, filename +'.zip')

            wait(self.__delay)	
    def fetch_product_list(self, make_item, model_item, year_item):
        url = 'https://kfebrakes.com/wp-content/plugins/kfe-catalog/kfe-catalog-get-data.php' \
              '?make=' + make_item + \
              '&model=' + model_item.replace(' ', '+') + \
              '&year=' + year_item

        result = []

        response = requests.get(url)
        soup = make_soup(response.text)

        for item in soup.findAll('tr'):
            cells = item.findAll('td')

            if len(cells) != 6:
                return None

            front_pad = cells[3]
            rear_pad = cells[4]

            result.append({
                'model': cells[0].text.strip(),
                'year': cells[1].text.strip(),
                'trim': cells[2].text.strip(),
                'front_pad': front_pad.text.strip(),
                'front_pad_url': front_pad.find('a')['href'] if front_pad.find('a') is not None else '',
                'rear_pad': rear_pad.text.strip(),
                'rear_pad_url': rear_pad.find('a')['href'] if rear_pad.find('a') is not None else '',
                'note': cells[5].text.strip()
            })

        return result
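dump_all_library above calls a download_file helper that is not included in the listing. A minimal sketch of such a helper, assuming its parameters are the authenticated session, the file URL, the target directory, and the output filename (matching the call sites); the real implementation may differ:

# Hypothetical download_file helper inferred from the call sites above.
import os


def download_file(session, url, directory, filename):
    # Create the target directory if needed and stream the response body to disk.
    if not os.path.isdir(directory):
        os.makedirs(directory)
    response = session.get(url, stream=True)
    with open(os.path.join(directory, filename), 'wb') as handle:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                handle.write(chunk)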
Example #20
    def __GET_claim(self):
        if self.__dev:
            url = self.__url_base + self.__config.get('url', 'url.account')
        else:
            url = self.info['url_claim']

        response = self.__session.get(url, headers=self.__headers)
        self.__log_response(response)

        soup = make_soup(response)
        div_target = soup.find('div', {'id': 'product-account-list'})

        # only last one just claimed
        div_claimed_book = div_target.select('.product-line')[0]
        self.info['book_id'] = div_claimed_book['nid']
        self.info['author'] = div_claimed_book.find(class_='author').text.strip()

        source_code = div_claimed_book.find(href=re.compile('/code_download/*'))
        if source_code is not None:
            self.info['url_source_code'] = self.__url_base + source_code['href']
Example #22
    def init_seed(self):
        #get frontier
        seed_soup = utils.make_soup(self.seed_url,
                                    class_val=self.content_class)
        self.frontier = utils.get_page_urls(seed_soup,
                                            url_prefix=self.url_prefix)

        #store seed page
        seed_hash_val = utils.hash_url(self.seed_url)
        seed_doc_path = utils.store_doc(seed_hash_val,
                                        seed_soup,
                                        self.store_docs_at,
                                        if_store=self.if_store_doc)
        utils.store_url(self.conn,
                        self.table_name,
                        seed_hash_val,
                        self.seed_url,
                        seed_doc_path,
                        url_file=self.url_file)
        self.depth += 1
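init_seed above, and the DFS and BFS crawlers below, deduplicate URLs through a utils.hash_url helper. A minimal sketch, assuming a stable hex digest of the URL string is all that is needed; the real utility may use a different hash:

# Hypothetical hash_url helper; assumes an MD5 hex digest is enough for deduplication.
import hashlib


def hash_url(url):
    return hashlib.md5(url.encode('utf-8')).hexdigest()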
Example #23
    def __POST_login(self, url):
        data = self.info.copy()
        data['email'] = self.__config.get('credential', 'credential.email')
        data['password'] = self.__config.get('credential', 'credential.password')
        data['op'] = 'Login'
        # print '[-] data: {0}'.format(urllib.urlencode(data))

        response = None
        if self.__dev:
            response = self.__session.get(url, headers=self.__headers, data=data)
            self.__log_response(response, 'GET', self.__dev)
        else:
            response = self.__session.post(url, headers=self.__headers, data=data)
            self.__log_response(response, 'POST', self.__dev)

        soup = make_soup(response)

        error_node = soup.find('div', {'class': 'messages error'})

        if error_node is not None:
            raise Exception(error_node.text.strip())
Example #24
    def get_offhire_dates(self):
        containers = self.__get_containers_from_db()

        url = INTERPORTURL.format(containers)
        html = make_soup(url)
        table_row_elements = html.find_all('tr')

        container_statuses = []
        for row in table_row_elements:
            row_text = row.text.split()
            container_number = row_text[0] + row_text[1].replace('-', '')
            print('Updating container no. {}'.format(container_number))

            if row_text[2]:
                offhire_date = datetime.strptime(
                    row_text[2], '%m/%d/%Y').strftime('%Y-%m-%d')
                container_statuses.append(
                    dict(container_number=container_number,
                         offhire_date=offhire_date))

        return container_statuses
Example #25
    changed_date = datetime.strptime(changed_date, "%d.%m.%y, %H:%M")
    if changed_date < EARLIEST_PUBLISHED:
        return None  # only fetch stories newer than 2015
    if since and changed_date <= since: return None

    lead_p = story_div.find(find_lead_p)
    if not lead_p or not lead_p.string: return None
    subtitle = lead_p.string
    if not subtitle: return None

    text = ""
    for text_content in story_div.find_all(find_text_content):
        if text_content.string:
            text += text_content.string + "\n"
    if not text: return None

    return {
        "title": title_txt,
        "subtitle": subtitle,
        "text": text,
        "published": changed_date
    }


if __name__ == "__main__":
    from utils import make_soup
    story_soup = make_soup(
        "https://www.watson.ch/Digital/Wissen/532340777-Roboter-und-virtuelle-Restaurants-%E2%80%93-wie-das-Silicon-Valley-unsere-Esskultur-revolutioniert"
    )
    assert story_soup, "Could not make soup!"
    print(index(story_soup))
Example #26
    def DFS(self):
        self.dfs_tree[1] = self.frontier

        while True:
            #check to break
            if self.url_count >= self.max_url_count or self.depth < 0:
                self.pickle_self()
                break  #end if reach max

            #check to go back up a level
            if self.depth > self.max_depth:
                self.depth -= 1

            #get current url
            if len(self.dfs_tree[self.depth]) != 0:
                url = self.dfs_tree[self.depth].pop(0)
            else:
                self.depth -= 1  #if current level is done, go up a level
                continue  #skip to the next iteration so the stale url is not re-crawled

            #do crawl
            hash_val = utils.hash_url(url)
            if utils.check_unique(
                    self.conn, self.table_name,
                    hash_val):  #query db to check if the url is unique
                doc_soup = utils.make_soup(url, class_val=self.content_class)
                utils.delay(self.sleep_time)

                if self.focused:
                    if_relevant = utils.check_relevant(
                        doc_soup,
                        self.keywords)  #read the document and match key words
                else:
                    if_relevant = True

                if if_relevant:
                    doc_path = utils.store_doc(
                        hash_val,
                        doc_soup,
                        self.store_docs_at,
                        if_store=self.if_store_doc
                    )  #store document content on disk
                    utils.store_url(self.conn,
                                    self.table_name,
                                    hash_val,
                                    url,
                                    doc_path,
                                    url_file=self.url_file
                                    )  #store url & path to doc content to db
                    self.depth += 1  #go down a level

                    #track total depth
                    if self.depth > self.depth_reached:
                        self.depth_reached = self.depth

                    self.dfs_tree[self.depth] = utils.get_page_urls(
                        doc_soup, url_prefix=self.url_prefix
                    )  #create url list for lower level
                    self.url_count += 1
                    print('url count:', self.url_count)

            else:
                self.duplicate_count += 1
Example #27
    def BFS(self):
        self.frontier.append(self.level_end_str)

        while True:
            #get current url
            if len(self.frontier) != 0:
                url = self.frontier.pop(0)
            else:
                self.pickle_self()
                break  #end if no more url in frontier

            #check to break
            if self.depth > self.max_depth or self.url_count >= self.max_url_count:
                self.pickle_self()
                break  #end if reach max

            #check to increment depth
            if url == self.level_end_str:
                self.frontier.append(self.level_end_str)
                self.depth += 1

                #track total depth
                if self.depth > self.depth_reached:
                    self.depth_reached = self.depth

                continue

            #do crawl
            hash_val = utils.hash_url(url)
            if utils.check_unique(
                    self.conn, self.table_name,
                    hash_val):  #query db to check if the url is unique

                doc_soup = utils.make_soup(url, class_val=self.content_class)
                utils.delay(self.sleep_time)

                if self.focused:
                    if_relevant = utils.check_relevant(
                        doc_soup,
                        self.keywords)  #read the document and match key words
                else:
                    if_relevant = True

                if if_relevant:
                    doc_path = utils.store_doc(
                        hash_val,
                        doc_soup,
                        self.store_docs_at,
                        if_store=self.if_store_doc
                    )  #store document content on disk
                    utils.store_url(self.conn,
                                    self.table_name,
                                    hash_val,
                                    url,
                                    doc_path,
                                    url_file=self.url_file
                                    )  #store url & path to doc content to db
                    self.frontier += utils.get_page_urls(
                        doc_soup, url_prefix=self.url_prefix
                    )  #append urls in current page to frontier
                    self.url_count += 1
                    print('url count:', self.url_count)

            else:
                self.duplicate_count += 1
Example #28
from typing import Set


def find_article_containers(tag):
    if tag.name == "div":
        if tag.has_attr("class") and tag.a:
            if "g6Slead" in tag["class"] or "standard_teaser" in tag["class"]:
                return True
    return False


def crawl(base_link, sec_soup) -> Set[str]:
    return {
        tag.a["href"]
        for tag in sec_soup.find_all(find_article_containers)
    }  # sec_soup.find(find_articles)


if __name__ == "__main__":
    from utils import make_soup
    urls = crawl("http://www.blick.ch",
                 make_soup("http://www.blick.ch/news/wirtschaft/"))
    for url in urls:
        print(url)
    print(str(len(urls)))
Example #29
    date1 = article_body.span.text
    if not date1: return None

    date2 = get_stripped_date(date1)
    try:
        if ":" in date2:
            # Publiziert am 05.02.2017 | Aktualisiert um 14:59 Uhr
            published_date = datetime.strptime(date2, "%d.%m.%Y%H:%M")
        else:
            # or Publiziert am 04.02.2017 | Akt... am 04.02.2017
            published_date = datetime.strptime(date2[:10], "%d.%m.%Y")
    except ValueError:
        logger.error(f"Could not convert from string to date: {date1} => {date2}")
        return None

    text = " ".join([tag.text for tag in article_body.find_all(find_article_text)])


    return {
        "title": title,
        "subtitle": subtitle,
        "text": text,
        "published": published_date
    }


if __name__ == "__main__":
    from utils import make_soup
    index(make_soup("http://www.blick.ch/news/schweiz/neue-komplizen-der-polizei-apotheker-sollen-bombenbauer-entlarven-id6171409.html"))
Example #30
from typing import List


def find_wrapper(tag):
    if tag.name == "div" and tag.has_attr("class") and "wrapper" in tag["class"]:
        return True
    return False


def find_storylink(tag):
    if tag.name == "a" and tag.has_attr("class") and "storylink" in tag["class"]:
        return True
    return False


def crawl(base_link, sec_soup) -> List[str]:
    story_urls = set()
    wrapper = sec_soup.find(find_wrapper)
    for storylink in wrapper.find_all(find_storylink):
        story_urls.add(storylink["href"])
    story_urls = {base_link + url for url in story_urls}
    return list(story_urls)


if __name__ == "__main__":
    from utils import make_soup
    soup = make_soup("http://www.watson.ch/Wirtschaft")
    assert soup, "Couldn't make soup!"
    urls = crawl("http://www.watson.ch", soup)
    for url in urls:
        print(url)
    print(str(len(urls)))