def __POST_login(self):
    data = self.info.copy()
    data['email'] = self.__config.get('credential', 'credential.email')
    data['password'] = self.__config.get('credential', 'credential.password')
    data['op'] = 'Login'
    # print '[-] data: {0}'.format(urllib.urlencode(data))

    url = self.__url_base
    response = None
    if self.__dev:
        url += self.__config.get('url', 'url.loginPost')
        response = self.__session.get(url, headers=self.__headers, data=data)
        self.__log_response(response)
    else:
        url += self.__config.get('url', 'url.login')
        response = self.__session.post(url, headers=self.__headers, data=data)
        self.__log_response(response, 'POST', True)

    soup = make_soup(response)
    div_target = soup.find('div', {'id': 'deal-of-the-day'})
    title = div_target.select('div.dotd-title > h2')[0].text.strip()
    self.info['title'] = title
    self.info['filename'] = title.encode('ascii', 'ignore').replace(' ', '_')
    self.info['description'] = div_target.select('div.dotd-main-book-summary > div')[2].text.strip()
    self.info['url_image'] = 'https:' + div_target.select('div.dotd-main-book-image img')[0]['data-original']
    self.info['url_claim'] = self.__url_base + div_target.select('a.twelve-days-claim')[0]['href']

    # remove useless info
    self.info.pop('form_build_id', None)
    self.info.pop('form_id', None)
def get_library_list(self):
    self.__GET_login()
    wait(self.__delay)
    self.__POST_login()
    wait(self.__delay)

    url = self.__url_base + self.__config.get('url', 'url.myebooks')
    response = self.__session.get(url, headers=self.__headers)
    self.__log_response(response)

    soup = make_soup(response)
    for a in soup.findAll('div', attrs={'class': 'product-line unseen'}):
        print "Title: " + a.attrs.get('title')
        print "Directory: " + a.attrs.get('title')[:-8].replace(' ', '_')
        cover_url = a.find('img', attrs={'class': ' imagecache imagecache-thumbview'}).get('src').replace('thumbview', 'dotd_main_image')
        print "Cover URL: http:" + cover_url

        links = []
        for link in a.findAll('a', href=True):
            url = link.attrs.get('href')
            if '#' not in url:
                links.append(url)

        # skip the first link (the book page itself) and list the download links
        for i in range(1, len(links)):
            if 'cart' not in links[i]:  # '#' links were already filtered out above
                if links[i].split('/')[-1] == 'pdf':
                    print "Download pdf: " + self.__url_base + links[i]
                elif links[i].split('/')[-1] == 'epub':
                    print "Download epub: " + self.__url_base + links[i]
                elif links[i].split('/')[-1] == 'mobi':
                    print "Download mobi: " + self.__url_base + links[i]
                else:
                    print "Download extras: " + self.__url_base + links[i]
def __GET_claim(self):
    if self.__dev:
        url = self.__url_base + self.__config.get('url', 'url.account')
    else:
        url = self.info['url_claim']

    response = self.__session.get(url, headers=self.__headers)
    self.__log_response(response, 'GET', self.__dev)

    soup = make_soup(response)
    div_target = soup.find('div', {'id': 'product-account-list'})
    if div_target is None:
        raise Exception('Could not access claim page. This is most likely caused by invalid credentials')

    errorMessage = soup.find(id='messages-container')
    if errorMessage is not None and errorMessage.text.strip() == 'You have already claimed this promotion.':
        raise AlreadyClaimedException()

    # only the last one, just claimed
    div_claimed_book = div_target.select('.product-line')[0]
    self.info['book_id'] = div_claimed_book['nid']
    self.info['author'] = div_claimed_book.find(class_='author').text.strip()
    source_code = div_claimed_book.find(href=re.compile('/code_download/*'))
    if source_code is not None:
        self.info['url_source_code'] = self.__url_base + source_code['href']
def export(self, url):
    file_name = 'output/products/' + shorten_url(url) + '.json'
    if os.path.exists(file_name):
        return

    response = requests.get(url)
    soup = make_soup(response.text)
    if self.get_product_name(soup) is None:
        print('*** Error:', url, '***')
        return
    if self.get_specifications(soup) is None:
        print('*** Error Table:', url, '***')
        return
    if self.get_applications(soup) is None:
        print('*** Error Application:', url, '***')
        return

    details = {
        'product_name': self.get_product_name(soup),
        'specifications': self.get_specifications(soup),
        'applications': self.get_applications(soup),
        'image_url': self.get_image_url(soup)
    }
    save_json(details, file_name)
    print(file_name)
def __POST_login(self):
    data = self.info.copy()
    data['email'] = self.__config.get('credential', 'credential.email')
    data['password'] = self.__config.get('credential', 'credential.password')
    data['op'] = 'Login'
    # print '[-] data: {0}'.format(urllib.urlencode(data))

    url = self.__url_base
    response = None
    if self.__dev:
        url += self.__config.get('url', 'url.loginPost')
        response = self.__session.get(url, headers=self.__headers, data=data)
        self.__log_response(response)
    else:
        url += self.__config.get('url', 'url.login')
        response = self.__session.post(url, headers=self.__headers, data=data)
        self.__log_response(response, 'POST', True)

    soup = make_soup(response)
    div_target = soup.find('div', {'id': 'deal-of-the-day'})
    title = div_target.select('div.dotd-title > h2')[0].text.strip()
    self.info['title'] = title
    self.info['filename'] = title.encode('ascii', 'ignore').replace(' ', '_')
    self.info['description'] = div_target.select('div.dotd-main-book-summary > div')[2].text.strip()
    self.info['url_image'] = 'https:' + div_target.select('div.dotd-main-book-image img')[0]['src']
    self.info['url_claim'] = self.__url_base + div_target.select('a.twelve-days-claim')[0]['href']

    # remove useless info
    self.info.pop('form_build_id', None)
    self.info.pop('form_id', None)
def export(self, meta_item):
    url = meta_item['product_url']
    output_dir = 'output/products/' + meta_item['category']
    make_dir(output_dir)
    if url is None:
        return

    output_file_name = output_dir + '/' + meta_item['product_code'] + '.json'
    response = requests.get(url)
    soup = make_soup(response.text)
    if self.get_image_url(soup, url) is None:
        return

    details = {
        'url': url,
        'name': soup.find('h2', attrs={'id': 'title'}).text.strip(),
        'image_url': self.get_image_url(soup, url),
        'specifications': self.get_specifications(soup),
        'cross_reference': self.get_cross_reference(soup),
        'applications': self.get_applications(soup)
    }
    save_json(details, output_file_name)
    print(output_file_name)
def __POST_login(self, url):
    data = self.info.copy()
    data['email'] = self.__config.get('credential', 'credential.email')
    data['password'] = self.__config.get('credential', 'credential.password')
    data['op'] = 'Login'
    # print '[-] data: {0}'.format(urllib.urlencode(data))

    response = None
    if self.__dev:
        response = self.__session.get(url, headers=self.__headers, data=data)
        self.__log_response(response, 'GET', self.__dev)
    else:
        response = self.__session.post(url, headers=self.__headers, data=data)
        self.__log_response(response, 'POST', self.__dev)

    soup = make_soup(response)
    error_node = soup.find('div', {'class': 'messages error'})
    if error_node is not None:
        raise Exception(error_node.text.strip())
def _get_markup(self):
    """Retrieve the HTML to extract the lines info from.

    return (BeautifulSoup object) -> the soup made from the recovered html
    """
    data = self.driver.find_element_by_class_name("grouped-events")
    return make_soup(data.get_attribute("innerHTML"))
def fetch_product_list(self, brand_item, model_id, class_item):
    url = 'http://www.jsfilter.jp/application/get_applications/'
    response = requests.post(url, data={
        'modelId': model_id,
        'classId': class_item['app_class_id'],
        'year': '',
        'eng_vol': ''
    })
    soup = make_soup(response.text)

    skip = True
    model = ''
    result = []
    for child in soup.find('table').findAll(recursive=False):
        if skip:
            skip = False
            continue
        if child.get('class') == ['model-title']:
            model = child.text.split('»')[-1].strip()
            continue
        result.append({
            'brand': brand_item['name'],
            'class': class_item['app_class_name'],
            'model': model,
            'year': child.find('td', attrs={'data-title': 'YEAR'}).text.strip(),
            'engine_vol': child.find('td', attrs={'data-title': 'ENG VOL'}).text.strip(),
            'engine_no': child.find('td', attrs={'data-title': 'ENG NO'}).text.strip(),
            'body_no': child.find('td', attrs={'data-title': 'BODY NO'}).text.strip(),
            'oil': self.get_filter_data(child, 'OIL'),
            'air': self.get_filter_data(child, 'AIR'),
            'fuel': self.get_filter_data(child, 'FUEL'),
            'cabin': self.get_filter_data(child, 'CABIN'),
            'trans': self.get_filter_data(child, 'TRANS'),
        })
    return result
def _get_markup(self):
    """Retrieve the HTML to extract the lines info from.

    return (BeautifulSoup object) -> the soup made from the recovered html
    """
    return make_soup(
        self.driver.find_element_by_id("contestDetailTable").get_attribute(
            "innerHTML"))
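# Both _get_markup variants above, like most snippets in this section, hand the
# recovered HTML to a make_soup helper that is not shown here. Different call
# sites pass it a requests response, a response body, a raw HTML string, or a
# URL, so the helper clearly differs between modules. A minimal sketch of the
# string-based variant, assuming it simply wraps BeautifulSoup; the parser
# choice is an assumption, only the function name comes from the snippets.
from bs4 import BeautifulSoup


def make_soup(markup):
    """Parse raw HTML markup into a BeautifulSoup tree (illustrative sketch only)."""
    return BeautifulSoup(markup, 'html.parser')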
def fetch_year_list(self):
    response = requests.get(base_url)
    soup = make_soup(response.text)
    result = []
    container = soup.find('select', attrs={'id': 'year_select'})
    for item in container.findAll('option'):
        if item['value'] != 'default':
            result.append(item.text)
    return result
def __GET_login(self):
    url = self.__url_base
    if self.__dev:
        url += self.__config.get('url', 'url.loginGet')
    else:
        url += self.__config.get('url', 'url.login')

    response = self.__session.get(url, headers=self.__headers)
    self.__log_response(response)

    soup = make_soup(response)
    form = soup.find('form', {'id': 'packt-user-login-form'})
    self.info['form_build_id'] = form.find('input', attrs={'name': 'form_build_id'})['value']
    self.info['form_id'] = form.find('input', attrs={'name': 'form_id'})['value']
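# The self.__config.get(section, key) calls in the login, claim and dump helpers
# read dotted option names such as credential.email and url.loginGet from named
# sections. A minimal sketch of the configuration layout those calls appear to
# assume, using configparser; only the section and option names are taken from
# the snippets, every value below is a placeholder.
import configparser

SAMPLE_CONFIG = """
[credential]
credential.email = user@example.com
credential.password = change-me

[url]
url.login = /packt/offers/free-learning
url.loginGet = /login
url.loginPost = /login-post
url.myebooks = /account/my-ebooks
url.account = /account

[path]
path.dumps = dumps
"""

config = configparser.ConfigParser()
config.read_string(SAMPLE_CONFIG)
print(config.get('credential', 'credential.email'))  # -> user@example.com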
def fetch_brand_list(self):
    url = 'http://www.jsfilter.jp/catalogue'
    response = requests.get(url)
    soup = make_soup(response.text)
    result = []
    for option in soup.find('select', attrs={'id': 'selBrand'}).findAll('option'):
        result.append({
            'name': option.text.strip(),
            'value': option['value']
        })
    return result
def __GET_login(self, url):
    response = self.__session.get(url, headers=self.__headers)
    self.__log_response(response, 'GET', self.__dev)

    soup = make_soup(response)
    form = soup.find('form', {'id': 'packt-user-login-form'})
    if form is None:
        raise Exception('Could not find login form')

    self.info['form_build_id'] = form.find('input', attrs={'name': 'form_build_id'})['value']
    self.info['form_id'] = form.find('input', attrs={'name': 'form_id'})['value']
    return soup
def dump_all_library(self):
    # self.__GET_login()
    # wait(self.__delay)
    # self.__POST_login()
    # wait(self.__delay)
    url = self.__url_base + self.__config.get('url', 'url.myebooks')
    response = self.__session.get(url, headers=self.__headers)
    self.__log_response(response)

    soup = make_soup(response)
    for a in soup.findAll('div', attrs={'class': 'product-line unseen'}):
        log_info("[+] Downloading : " + a.attrs.get('title'))
        directory = a.attrs.get('title')[:-8].replace(' ', '_')
        # drop non-ascii characters (e.g. u'\u2019') to avoid UnicodeEncodeError
        directory = directory.encode('ascii', 'ignore').replace('/', '-')
        filename = directory

        cover_url = a.find('img', attrs={'class': ' imagecache imagecache-thumbview'}).get('src').replace('thumbview', 'dotd_main_image')
        download_file(self.__session, 'http:' + cover_url,
                      self.__config.get('path', 'path.dumps') + '/' + directory, filename + '.jpg')

        links = []
        for link in a.findAll('a', href=True):
            url = link.attrs.get('href')
            if '#' not in url:
                links.append(url)

        # skip the first link (the book page itself) and download the rest
        for i in range(1, len(links)):
            if 'cart' not in links[i]:  # '#' links were already filtered out above
                if links[i].split('/')[-1] == 'pdf':
                    download_file(self.__session, self.__url_base + links[i],
                                  self.__config.get('path', 'path.dumps') + '/' + directory, filename + '.pdf')
                elif links[i].split('/')[-1] == 'epub':
                    download_file(self.__session, self.__url_base + links[i],
                                  self.__config.get('path', 'path.dumps') + '/' + directory, filename + '.epub')
                elif links[i].split('/')[-1] == 'mobi':
                    download_file(self.__session, self.__url_base + links[i],
                                  self.__config.get('path', 'path.dumps') + '/' + directory, filename + '.mobi')
                else:
                    download_file(self.__session, self.__url_base + links[i],
                                  self.__config.get('path', 'path.dumps') + '/' + directory, filename + '.zip')
        wait(self.__delay)
def fetch_product_list(self, make_item, model_item, year_item):
    url = 'https://kfebrakes.com/wp-content/plugins/kfe-catalog/kfe-catalog-get-data.php' \
          '?make=' + make_item + \
          '&model=' + model_item.replace(' ', '+') + \
          '&year=' + year_item
    result = []
    response = requests.get(url)
    soup = make_soup(response.text)
    for item in soup.findAll('tr'):
        cells = item.findAll('td')
        if len(cells) != 6:
            return None
        front_pad = cells[3]
        rear_pad = cells[4]
        result.append({
            'model': cells[0].text.strip(),
            'year': cells[1].text.strip(),
            'trim': cells[2].text.strip(),
            'front_pad': front_pad.text.strip(),
            'front_pad_url': front_pad.find('a')['href'] if front_pad.find('a') is not None else '',
            'rear_pad': rear_pad.text.strip(),
            'rear_pad_url': rear_pad.find('a')['href'] if rear_pad.find('a') is not None else '',
            'note': cells[5].text.strip()
        })
    return result
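# fetch_product_list above returns a list of flat dicts. A small self-contained
# sketch of how such rows could be persisted to CSV; the field names are taken
# from the dicts built above, while save_rows_to_csv and the output path are
# hypothetical helpers, not part of the original scraper.
import csv

FIELDS = ['model', 'year', 'trim', 'front_pad', 'front_pad_url',
          'rear_pad', 'rear_pad_url', 'note']


def save_rows_to_csv(rows, path):
    """Write the pad-catalogue dicts to a CSV file with a header row."""
    with open(path, 'w', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=FIELDS)
        writer.writeheader()
        writer.writerows(rows)


# Example with a stub row; real rows would come from fetch_product_list.
save_rows_to_csv([{field: '' for field in FIELDS}], 'kfe_products.csv')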
def __GET_claim(self):
    if self.__dev:
        url = self.__url_base + self.__config.get('url', 'url.account')
    else:
        url = self.info['url_claim']

    response = self.__session.get(url, headers=self.__headers)
    self.__log_response(response)

    soup = make_soup(response)
    div_target = soup.find('div', {'id': 'product-account-list'})
    # only the last one, just claimed
    div_claimed_book = div_target.select('.product-line')[0]
    self.info['book_id'] = div_claimed_book['nid']
    self.info['author'] = div_claimed_book.find(class_='author').text.strip()
    source_code = div_claimed_book.find(href=re.compile('/code_download/*'))
    if source_code is not None:
        self.info['url_source_code'] = self.__url_base + source_code['href']
def init_seed(self):
    # get frontier
    seed_soup = utils.make_soup(self.seed_url, class_val=self.content_class)
    self.frontier = utils.get_page_urls(seed_soup, url_prefix=self.url_prefix)

    # store seed page
    seed_hash_val = utils.hash_url(self.seed_url)
    seed_doc_path = utils.store_doc(seed_hash_val, seed_soup, self.store_docs_at,
                                    if_store=self.if_store_doc)
    utils.store_url(self.conn, self.table_name, seed_hash_val, self.seed_url,
                    seed_doc_path, url_file=self.url_file)
    self.depth += 1
def get_offhire_dates(self):
    containers = self.__get_containers_from_db()
    url = INTERPORTURL.format(containers)
    html = make_soup(url)
    table_row_elements = html.find_all('tr')
    container_statuses = []
    for row in table_row_elements:
        row_text = row.text.split()
        container_number = row_text[0] + row_text[1].replace('-', '')
        print('Updating container no. {}'.format(container_number))
        if row_text[2]:
            offhire_date = datetime.strptime(
                row_text[2], '%m/%d/%Y').strftime('%Y-%m-%d')
            container_statuses.append(
                dict(container_number=container_number,
                     offhire_date=offhire_date))
    return container_statuses
    changed_date = datetime.strptime(changed_date, "%d.%m.%y, %H:%M")
    if changed_date < EARLIEST_PUBLISHED:
        return None  # only fetch stories newer than 2015
    if since and changed_date <= since:
        return None

    lead_p = story_div.find(find_lead_p)
    if not lead_p or not lead_p.string:
        return None
    subtitle = lead_p.string
    if not subtitle:
        return None

    text = ""
    for text_content in story_div.find_all(find_text_content):
        if text_content.string:
            text += text_content.string + "\n"
    if not text:
        return None

    return {
        "title": title_txt,
        "subtitle": subtitle,
        "text": text,
        "published": changed_date
    }


if __name__ == "__main__":
    from utils import make_soup

    story_soup = make_soup(
        "https://www.watson.ch/Digital/Wissen/532340777-Roboter-und-virtuelle-Restaurants-%E2%80%93-wie-das-Silicon-Valley-unsere-Esskultur-revolutioniert"
    )
    assert story_soup, "Could not make soup!"
    print(index(story_soup))
def DFS(self):
    self.dfs_tree[1] = self.frontier
    while True:
        # check to break
        if self.url_count >= self.max_url_count or self.depth < 0:
            self.pickle_self()
            break  # end if reached max

        # check to go back up a level
        if self.depth > self.max_depth:
            self.depth -= 1

        # get current url
        if len(self.dfs_tree[self.depth]) != 0:
            url = self.dfs_tree[self.depth].pop(0)
        else:
            self.depth -= 1  # if the current level is done, go up a level
            continue

        # do crawl
        hash_val = utils.hash_url(url)
        if utils.check_unique(self.conn, self.table_name, hash_val):  # query db to check if url is unique
            doc_soup = utils.make_soup(url, class_val=self.content_class)
            utils.delay(self.sleep_time)
            if self.focused:
                if_relevant = utils.check_relevant(doc_soup, self.keywords)  # read the document and match key words
            else:
                if_relevant = True
            if if_relevant:
                doc_path = utils.store_doc(hash_val, doc_soup, self.store_docs_at,
                                           if_store=self.if_store_doc)  # store document content on disk
                utils.store_url(self.conn, self.table_name, hash_val, url, doc_path,
                                url_file=self.url_file)  # store url & path to doc content in db
                self.depth += 1  # go down a level
                # track total depth
                if self.depth > self.depth_reached:
                    self.depth_reached = self.depth
                self.dfs_tree[self.depth] = utils.get_page_urls(
                    doc_soup, url_prefix=self.url_prefix)  # create url list for lower level
                self.url_count += 1
                print('url count:', self.url_count)
        else:
            self.duplicate_count += 1
def BFS(self):
    self.frontier.append(self.level_end_str)
    while True:
        # get current url
        if len(self.frontier) != 0:
            url = self.frontier.pop(0)
        else:
            self.pickle_self()
            break  # end if no more urls in frontier

        # check to break
        if self.depth > self.max_depth or self.url_count >= self.max_url_count:
            self.pickle_self()
            break  # end if reached max

        # check to increment depth
        if url == self.level_end_str:
            self.frontier.append(self.level_end_str)
            self.depth += 1
            # track total depth
            if self.depth > self.depth_reached:
                self.depth_reached = self.depth
            continue

        # do crawl
        hash_val = utils.hash_url(url)
        if utils.check_unique(self.conn, self.table_name, hash_val):  # query db to check if url is unique
            doc_soup = utils.make_soup(url, class_val=self.content_class)
            utils.delay(self.sleep_time)
            if self.focused:
                if_relevant = utils.check_relevant(doc_soup, self.keywords)  # read the document and match key words
            else:
                if_relevant = True
            if if_relevant:
                doc_path = utils.store_doc(hash_val, doc_soup, self.store_docs_at,
                                           if_store=self.if_store_doc)  # store document content on disk
                utils.store_url(self.conn, self.table_name, hash_val, url, doc_path,
                                url_file=self.url_file)  # store url & path to doc content in db
                self.frontier += utils.get_page_urls(
                    doc_soup, url_prefix=self.url_prefix)  # append urls in current page to frontier
                self.url_count += 1
                print('url count:', self.url_count)
        else:
            self.duplicate_count += 1
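# BFS above tracks crawl depth by keeping a sentinel string in the frontier and
# bumping the depth counter each time the sentinel is popped. A standalone
# sketch of that pattern on a toy graph; LEVEL_END, GRAPH and bfs_with_depth are
# hypothetical names, only the sentinel technique mirrors the method above.
from collections import deque

LEVEL_END = '__LEVEL_END__'          # stands in for self.level_end_str
GRAPH = {'a': ['b', 'c'], 'b': ['d'], 'c': ['d'], 'd': []}  # stands in for get_page_urls()


def bfs_with_depth(start, max_depth=2):
    frontier = deque([start, LEVEL_END])
    seen = {start}
    depth = 0
    while frontier:
        node = frontier.popleft()
        if node == LEVEL_END:
            depth += 1                   # everything queued so far is one level deeper
            if depth > max_depth or not frontier:
                break
            frontier.append(LEVEL_END)   # mark the end of the next level
            continue
        print(depth, node)
        for neighbour in GRAPH[node]:
            if neighbour not in seen:
                seen.add(neighbour)
                frontier.append(neighbour)


bfs_with_depth('a')                      # prints: 0 a, 1 b, 1 c, 2 d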
from typing import Set


def find_article_containers(tag):
    if tag.name == "div":
        if tag.has_attr("class") and tag.a:
            if "g6Slead" in tag["class"] or "standard_teaser" in tag["class"]:
                return True
    return False


def crawl(base_link, sec_soup) -> Set[str]:
    return {
        tag.a["href"]
        for tag in sec_soup.find_all(find_article_containers)
    }


if __name__ == "__main__":
    from utils import make_soup

    urls = crawl("http://www.blick.ch",
                 make_soup("http://www.blick.ch/news/wirtschaft/"))
    for url in urls:
        print(url)
    print(str(len(urls)))
    date1 = article_body.span.text
    if not date1:
        return None
    date2 = get_stripped_date(date1)
    try:
        if ":" in date2:
            # Publiziert am 05.02.2017 | Aktualisiert um 14:59 Uhr
            published_date = datetime.strptime(date2, "%d.%m.%Y%H:%M")
        else:
            # or: Publiziert am 04.02.2017 | Akt... am 04.02.2017
            published_date = datetime.strptime(date2[:10], "%d.%m.%Y")
    except ValueError:
        logger.error(f"Could not convert from string to date: {date1} => {date2}")
        return None

    text = " ".join([tag.text for tag in article_body.find_all(find_article_text)])

    return {
        "title": title,
        "subtitle": subtitle,
        "text": text,
        "published": published_date
    }


if __name__ == "__main__":
    from utils import make_soup

    index(make_soup("http://www.blick.ch/news/schweiz/neue-komplizen-der-polizei-apotheker-sollen-bombenbauer-entlarven-id6171409.html"))
from typing import List


def find_wrapper(tag):
    if tag.name == "div" and tag.has_attr("class") and "wrapper" in tag["class"]:
        return True
    return False


def find_storylink(tag):
    if tag.name == "a" and tag.has_attr("class") and "storylink" in tag["class"]:
        return True
    return False


def crawl(base_link, sec_soup) -> List[str]:
    story_urls = set()
    wrapper = sec_soup.find(find_wrapper)
    for storylink in wrapper.find_all(find_storylink):
        story_urls.add(storylink["href"])
    story_urls = {base_link + url for url in story_urls}
    return list(story_urls)


if __name__ == "__main__":
    from utils import make_soup

    soup = make_soup("http://www.watson.ch/Wirtschaft")
    assert soup, "Couldn't make soup!"
    urls = crawl("http://www.watson.ch", soup)
    for url in urls:
        print(url)
    print(str(len(urls)))