def get_meaning_with_etymology(self, header: BeautifulSoup) -> Dict[str, Any]:
    result = {'etymology': None, 'values': []}
    next_sibling: BeautifulSoup = header.find_next_sibling()
    # p is etymology details, capture it
    while next_sibling.name == 'p':
        p: BeautifulSoup = next_sibling
        etymology = result.get('etymology')
        result['etymology'] = p.get_text().strip() if etymology is None \
            else etymology + '\n' + p.get_text().strip()
        next_sibling = next_sibling.find_next_sibling()
    # Skip pronunciation headers
    while WiktionaryTrEnScraper.is_pronunciation_header(
            next_sibling) or next_sibling.name == 'ul':
        next_sibling = next_sibling.find_next_sibling()
    # h4 is the header for parts of speech
    if WiktionaryTrEnScraper.is_part_of_speech_header(next_sibling):
        span: BeautifulSoup = next_sibling.find('span')
        if span:
            result['part_of_speech'] = span.get_text().strip().lower()
        if next_sibling.name == 'h3':
            self.processed_headers.append(str(next_sibling))
        next_sibling = next_sibling.find_next_sibling()
        WiktionaryTrEnScraper.process_meaning_values(next_sibling, result)
    return result
def get_pronunciation(header: BeautifulSoup) -> List[Dict[str, Any]]:
    results = []
    if header.find_next_sibling().name == 'ul':
        ul: BeautifulSoup = header.find_next_sibling()
        WiktionaryTrEnScraper.get_pronunciation_type(
            ul, results, 'IPA', 'IPA')
        WiktionaryTrEnScraper.get_pronunciation_type(
            ul, results, 'Hyphenation', 'Latn')
    return results
def crawl_pwca(self, task):
    date = datetime.strptime(task.date, '%Y-%m-%d')
    tag = re.compile(f'T.*{datetime.strftime(date, "%a %d %b %y")}')
    results_page = 'results/' if datetime.now().year == date.year else f'results_{date.year}/'
    URL = FLIGHT_PROVIDERS['PWCA']['BASE_URL'] + results_page
    page = requests.get(URL + 'results.htm')
    event = BeautifulSoup(page.text, 'lxml').find('b', string=tag)
    tracks_link = event.find_next_sibling('a').attrs.get('href', None)
    tracks = requests.get(URL + tracks_link, stream=True)
    file_size = int(tracks.headers.get('content-length', 0))
    with open(f'/tmp/{task.date}.zip', 'wb') as f:
        downloaded = 0
        with tqdm(total=file_size, desc='downloading_tracks',
                  disable=self._progress != 'gui') as pbar:
            for data in tracks.iter_content(32 * 1024):
                f.write(data)
                if self._progress == 'ratio':
                    downloaded += len(data)
                    print(f'{downloaded/file_size:.0%}', file=sys.stderr, flush=True)
                else:
                    pbar.update(len(data))
    tracks_dir = f'/tmp/{task.date}'
    z = zipfile.ZipFile(f'/tmp/{task.date}.zip')
    z.extractall(tracks_dir)
    os.remove(f'/tmp/{task.date}.zip')
    return tracks_dir
def _():
    response = request(plugin.pathquery)
    document = BeautifulSoup(response.text, 'html.parser').find(
        'ul', {'class': ['list-episode-item', 'list-star']})
    items = []
    if document is not None:
        if plugin.path == '/list-star.html' or (
                'type' in plugin.query and 'stars' in plugin.query['type']):
            for li in document.find_all('li', recursive=False):
                plot = li.find('ul')
                item = ListItem(li.find('img').attrs['alt'])
                item.setArt({'poster': li.find('img').attrs['data-original']})
                item.setInfo('video', {'plot': '' if plot is None else plot.text})
                items.append(
                    (plugin.url_for(li.find('a').attrs['href']), item, True))
        elif plugin.path in ('/most-popular-drama', '/search'):
            idb.connect()
            for a in document.find_all('a'):
                path = a.attrs['href']
                drama = drama_detail(path)
                item = ListItem(drama['title'])
                item.setArt({'poster': drama.pop('poster')})
                item.setInfo('video', drama)
                items.append((plugin.url_for(path), item, True))
            idb.close()
        else:
            for a in document.find_all('a'):
                item = ListItem(u'[{}] {} {}'.format(
                    a.find('span', {'class': 'type'}).text,
                    a.find('h3').text,
                    a.find('span', {'class': 'ep'}).text))
                item.setArt({'poster': a.find('img').attrs['data-original']})
                item.setInfo('video', {})
                item.setProperty('IsPlayable', 'true')
                items.append((plugin.url_for(a.attrs['href']), item, False))
        document = document.find_next_sibling()
        if document is not None:
            for li in document.find_all('li', {'class': ['next', 'previous']}):
                item = ListItem(
                    localized_str(33600 if li.text == 'Next >' else 33601))
                items.append(
                    (plugin.url_for(plugin.path + li.find('a').attrs['href']),
                     item, True))
                print(' ----- ' + plugin.url_for(plugin.path + li.find('a').attrs['href']))
    xbmcplugin.setContent(plugin.handle, 'videos')
    xbmcplugin.addDirectoryItems(plugin.handle, items, len(items))
    xbmcplugin.endOfDirectory(plugin.handle)
def is_part_of_speech_header(header: BeautifulSoup) -> bool:
    next_sibling: BeautifulSoup = header.find_next_sibling()
    while next_sibling and next_sibling.name == 'table':
        next_sibling = next_sibling.find_next_sibling()
    if next_sibling and next_sibling.name == 'p':
        next_sibling = next_sibling.find_next_sibling()
    if next_sibling.name == 'ol':
        return True
    return False
def get_meaning_without_etymology(
        header: BeautifulSoup) -> Dict[str, Any]:
    result = {'etymology': None, 'values': []}
    span: BeautifulSoup = header.find('span')
    if span:
        result['part_of_speech'] = span.get_text().strip().lower()
    next_sibling: BeautifulSoup = header.find_next_sibling()
    while next_sibling.name == 'table':
        next_sibling = next_sibling.find_next_sibling()
    WiktionaryTrEnScraper.process_meaning_values(next_sibling, result)
    return result
def getip(url):
    index = requests.get(url)
    i = BeautifulSoup(index.content, 'lxml')
    # matches images whose title is "非常危险" ("very dangerous")
    iplist = i.find_all("img", {"title": "非常危险"})
    ipall = []
    for i in iplist:
        u = i.find_next_sibling('a')
        if u is None:
            continue
        ipall.append(u.getText())
    return ipall
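# A small sketch of the markup shape getip() appears to expect: an <img title="非常危险">
# followed by an <a> sibling holding the IP text. The sample HTML below is made up for
# illustration; the real page layout may differ.
from bs4 import BeautifulSoup

sample = '''
<div><img title="非常危险"/><a>1.2.3.4:8080</a></div>
<div><img title="安全"/><a>5.6.7.8:3128</a></div>
'''
doc = BeautifulSoup(sample, 'html.parser')
ips = []
for img in doc.find_all('img', {'title': '非常危险'}):
    a = img.find_next_sibling('a')
    if a is not None:
        ips.append(a.getText())
print(ips)  # ['1.2.3.4:8080']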
def init_tab(self):
    driver.switch_to.window(self.tab)
    print(f'{self.title}: Waiting the page to load')
    driver.get(self.link)  # visit the link
    while True:
        try:
            if 'Stock' in self.title:
                driver.execute_script('$("#stocksFilter").val("#all");')
                driver.execute_script("doStocksFilter('select',this)")
            if 'Crypto' in self.title:
                desired_quanity = BeautifulSoup(driver.page_source, 'html.parser')
                desired_quanity = desired_quanity.find(
                    'span', text='Number of Currencies')
                desired_quanity = int(
                    desired_quanity.find_next_sibling().get_text().replace(
                        ',', ''))
            else:
                desired_quanity = self.object_.objects.count()
            break
        except Exception as e:
            print(e)
    while True:
        try:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table', class_=self.table_class)
            len_table = int(len(table.find_all('tr')) * 1.1)
            if len_table < desired_quanity:
                print(
                    f'{self.title}: Waiting more... {len_table}/{desired_quanity}'
                )
                sleep(1)
                continue
            break
        except AttributeError:
            sleep(1)
    # removing unnecessary elements
    classes_to_remove = ['generalOverlay', 'signupWrap', 'midHeader']
    driver.execute_script("$('header').remove()")
    driver.execute_script("$('footer').remove()")
    driver.execute_script("$('#rightColumn').remove()")
    for cl in classes_to_remove:
        # for skipping TimeoutException: Message: script timeout
        try:
            driver.execute_script(f"$('.{cl}').remove()")
        except:
            pass
    print(f'{self.title}: Tab is initializated!')
def scrape_student_office():
    """Get info about the student service office"""
    scraped_info = {}
    student_office_url = "http://www.univaq.it/section.php?id=607"
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "accept-encoding": "gzip,deflate,sdch",
        "accept-language": "en-US,en;q=0.8",
    }
    request = requests.get(student_office_url, headers=headers)
    if request.status_code != 200:
        print("Error! Status " + str(request.status_code))
        return
    first_row = BeautifulSoup(request.text, "html.parser").find(string="AREA SCIENTIFICA")\
        .parent.parent.find_next_sibling().find("tr")
    address = first_row.find(class_="address_table_description").text
    phone = first_row.find_next_sibling().find(
        class_="address_table_description").text
    email = first_row.find_next_sibling().find_next_sibling()\
        .find(class_="address_table_description").text
    hours = first_row.find_next_sibling().find_next_sibling().find_next_sibling()\
        .find(class_="address_table_description").text.replace('\n', '')\
        .replace("13", "13, ")
    scraped_info.update({
        "indirizzo": address,
        "telefono": phone,
        "e-mail": email,
        "orari": hours
    })
    utils.write_json(scraped_info, "../json/student_office.json")
def _link_to_responce(self, link):
    """
    We only want to scrape Canada's public health pages. Many other links
    go to responses for financial aid and other public sites.
    """
    if link[0] != '/':
        return None
    try:
        html = requests.get('https://www.canada.ca' + link)
        soup = BeautifulSoup(html.content, 'lxml').find(['h2', 'h3'], {
            'id': link.split('#')[1]
        }).find_next_sibling()
        responce = str(soup)
        while soup.find_next_sibling() is not None and \
                soup.find_next_sibling().name not in ['h2', 'h3', 'div']:
            soup = soup.find_next_sibling()
            responce += " " + str(soup)
        return responce
    except BaseException:
        return None
def extrairPartesProcesso(self, sopa):
    try:
        self.PARTES_PROCESSO = []
        partes_processo = sopa.find(id='tableTodasPartes').prettify()
        soupa = BeautifulSoup(partes_processo, 'html.parser').tr
        filho = BeautifulSoup(soupa.prettify(), 'html.parser').td
        filhos = filho.find_next_sibling()
        irmaos = soupa.find_next_siblings()
        junta = ''
        junta += filho.span.text.replace(":", ":|").strip()
        junta += filhos.text.strip().split('\n')[0].strip() + "|"
        for indice in range(1, len(filhos.text.strip().split('\n'))):
            conteudo = filhos.text.strip().split('\n')
            if conteudo[indice]:
                junta += (conteudo[indice].strip() + "'")
        tratamento = junta.replace("''", "','")
        self.PARTES_PROCESSO.append(tratamento)
        for irmao in irmaos:
            filho = BeautifulSoup(irmao.prettify(), 'html.parser').td
            filhos = filho.find_next_sibling()
            junta = ''
            junta += filho.span.text.replace(":", ":|").strip()
            junta += filhos.text.strip().split('\n')[0].strip() + "|"
            for indice in range(1, len(filhos.text.strip().split('\n'))):
                conteudo = filhos.text.strip().split('\n')
                if conteudo[indice]:
                    junta += (conteudo[indice].strip() + "'")
            tratamento = junta.replace("''", "','").replace("'", '')
            self.PARTES_PROCESSO.append(tratamento)
    except Exception as e:
        self.STATUS = False
        # "Could not collect the parties to the case"
        print("Nao foi possivel coletar partes do processo")
        print(e)
def find_next_sibling(self, soup: BeautifulSoup, *args, **kwargs):
    return soup.find_next_sibling(*args, **kwargs)
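# A minimal sketch of how a thin wrapper like the one above might be exercised;
# the SoupHelper class name and the sample markup are assumptions made up for
# illustration.
from bs4 import BeautifulSoup

class SoupHelper:
    def find_next_sibling(self, soup: BeautifulSoup, *args, **kwargs):
        return soup.find_next_sibling(*args, **kwargs)

markup = '<ul><li>one</li><li>two</li><li>three</li></ul>'
first_li = BeautifulSoup(markup, 'html.parser').find('li')
helper = SoupHelper()
print(helper.find_next_sibling(first_li))        # <li>two</li>
print(helper.find_next_sibling(first_li, 'li'))  # <li>two</li>, filtered by tag name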
def pagination():
    if plugin.path == '/search' and 'keyword' not in plugin.query:
        keyboard = Keyboard()
        keyboard.doModal()
        if keyboard.isConfirmed():
            response = request(plugin.pathqs + '&keyword=' + keyboard.getText())
        else:
            return
    else:
        response = request(plugin.pathqs)
    document = BeautifulSoup(response.text, 'html.parser').find(
        'ul', {'class': ['list-episode-item', 'list-star']})
    items = []
    if document is not None:
        if plugin.path in ('/list-star.html', '/most-popular-drama', '/search'):
            if plugin.path == '/list-star.html' or (
                    'type' in plugin.query and 'stars' in plugin.query['type']):
                for li in document.find_all('li', recursive=False):
                    plot = li.find('ul')
                    item = ListItem(li.find('img').attrs['alt'])
                    item.setArt(
                        {'poster': li.find('img').attrs['data-original']})
                    item.setInfo('video', {'plot': '' if plot is None else plot.text})
                    items.append((plugin.url_for(li.find('a').attrs['href']),
                                  item, True))
            else:
                InternalDatabase.connect()
                for a in document.find_all('a'):
                    path = a.attrs['href']
                    drama = drama_detail(path)
                    item = ListItem(drama['title'])
                    item.setArt({'poster': drama.pop('poster')})
                    item.setInfo('video', drama)
                    items.append((plugin.url_for(path), item, True))
                InternalDatabase.close()
        else:
            for a in document.find_all('a'):
                item = ListItem(u'[{}] {} {}'.format(
                    a.find('span', {'class': 'type'}).text,
                    a.find('h3').text,
                    a.find('span', {'class': 'ep'}).text))
                item.setArt({'poster': a.find('img').attrs['data-original']})
                item.setInfo('video', {})
                item.setProperty('IsPlayable', 'true')
                items.append((plugin.url_for(a.attrs['href']), item, False))
        document = document.find_next_sibling()
        if document is not None:
            for li in document.find_all('li', {'class': ['next', 'previous']}):
                item = ListItem(li.text)
                items.append(
                    (plugin.url_for(plugin.path + li.find('a').attrs['href']),
                     item, True))
    xbmcplugin.setContent(plugin.handle, 'videos')
    xbmcplugin.addDirectoryItems(plugin.handle, items, len(items))
    xbmcplugin.endOfDirectory(plugin.handle)
def live_on(self):
    driver.switch_to.window(self.tab)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', class_=self.table_class)
    if 'Crypto' in self.title:
        desired_quanity = BeautifulSoup(driver.page_source, 'html.parser')
        desired_quanity = desired_quanity.find('span',
                                               text='Number of Currencies')
        desired_quanity = int(
            desired_quanity.find_next_sibling().get_text().replace(',', ''))
    else:
        desired_quanity = self.object_.objects.count()
    while True:
        try:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table', class_=self.table_class)
            len_table = int(len(table.find_all('tr')) * 1.1)
            if len_table < desired_quanity:
                print(
                    f'{self.title}: Waiting more... {len_table}/{desired_quanity}'
                )
                sleep(1)
                continue
            break
        except AttributeError:
            sleep(1)
    link_list = self.link_list
    for tr in table.find_all('tr')[1:]:
        if tr.find('a') is None:
            continue
        link = 'https://www.investing.com' + tr.find('a')['href']
        if link not in link_list:
            continue
        tds = []
        if self.type_ == 'crncy':
            for td in tr.find_all('td')[2:]:
                tds.append(td.get_text().strip())
        elif self.type_ == 'crptcrncy':
            for td in tr.find_all('td')[4:]:
                tds.append(td.get_text().strip())
        elif self.type_ == 'etf' or self.type_ == 'fnd':
            for td in tr.find_all('td')[3:-1]:
                tds.append(td.get_text().strip())
        else:
            for td in tr.find_all('td')[2:-1]:
                tds.append(td.get_text().strip())
        now = timezone.now()
        is_closed = True
        if self.type_ == 'crncy' and len(
                tr.find_all('td')[-1].get_text()) <= 5:
            is_closed = True
        elif self.type_ == 'crncy' or self.type_ == 'crptcrncy' or \
                'greenClockIcon' in tr.find_all('td')[-1].span['class']:
            is_closed = False
        elif 'redClockIcon' in tr.find_all('td')[-1].span['class']:
            is_closed = True
        if not is_closed:
            # if the market is open, collect the live data
            live_data = {}
            l = []
            for key, value in live_fields.items():
                l += value
            all_live_fields = list(set(l))
            for field in all_live_fields:
                live_data[field] = None
            # Overriding necessary fields
            for key, value in zip(self.live_fields, tds):
                if value in ' -N/A':
                    live_data[key] = None
                else:
                    live_data[key] = value
            if self.type_ != 'bnd':
                if live_data['Last'] == self.last_price:
                    # if the market is closed but the clock icon hasn't changed,
                    # check the market condition by checking whether the price is moving:
                    # 1. Navigate the current tab to the blank page
                    # 2. call init_tab()
                    # 3. break - go to the next tab
                    driver.get('about:blank')
                    self.__class__.init_tab(self)
                    break
            if self.type_ not in ['bnd', 'crptcrncy']:
                try:
                    live_data['Prev. Close'] = round(
                        float(live_data['Last'].replace(',', '')) -
                        float(live_data['Chg.']), 2)
                except:
                    pass
            models.AllAssetsLive.objects.filter(link=link).delete()
            try:
                if self.type_ == 'crptcrncy':
                    time_ = now
                elif len(live_data['Time']) <= 5:
                    time_ = datetime.datetime.strptime(
                        str(now.year) + str(live_data['Time']), '%Y%d/%m')
                else:
                    time_ = datetime.datetime.strptime(
                        timezone.now().date().strftime('%Y:%m:%d:') +
                        str(live_data['Time']), '%Y:%m:%d:%H:%M:%S')
                time_ = timezone.make_aware(time_)
            except:
                time_ = None
            try:
                if self.type_ == 'cmdty':
                    if live_data['Month'] is None:
                        pass
                    elif live_data['Month'] in ' ':
                        live_data['Month'] = None
                    else:
                        live_data['Month'] = datetime.datetime.strptime(
                            live_data['Month'], '%b %y')
            except:
                live_data['Month'] = None
            if '%' not in live_data['Chg. %']:
                live_data['Chg. %'] += '%'
            models.AllAssetsLive(
                Type=self.type_,
                link=link,
                prev_close=validate_price(live_data['Prev. Close']),
                last_price=validate_price(live_data['Last']),
                month=validate_price(live_data['Month']),
                Open=validate_price(live_data['Open']),
                high=validate_price(live_data['High']),
                low=validate_price(live_data['Low']),
                change=validate_price(live_data['Chg.']),
                change_7d=validate_price(live_data['Chg. (7D)']),
                change_perc=validate_price(live_data['Chg. %']),
                volume=validate_price(live_data['Vol.']),
                market_cap=validate_price(live_data['Market Cap']),
                Yield=validate_price(live_data['Yield']),
                total_vol=validate_price(live_data['Total Vol.']),
                total_assets=validate_price(live_data['Total Assets']),
                time=time_).save(force_insert=True)
            self.last_price = live_data['Last']
            print(f'{self.title}: saved Live')
            for time_frame, hist_model in self.__class__.hist_objects.items():
                if time_frame[-1] == 'D':
                    if self.last_obj_count[time_frame] == 0 or \
                            now - datetime.timedelta(
                                minutes=self.__class__.minutes_[time_frame]
                            ) > self.last_obj[time_frame].date:
                        # if there's no data at all or the latest data is older
                        # (smaller) than needed, send (save) the data
                        hist_model(
                            Type=self.type_,
                            link=link,
                            date=now,
                            price=validate_price(live_data['Last']),
                            Open=validate_price(live_data['Open']),
                            high=validate_price(live_data['High']),
                            low=validate_price(live_data['Low']),
                            volume=validate_price(live_data['Vol.']),
                        ).save(force_insert=True)
                        print(f'{self.title}: saved HISTORICAL{time_frame}')
                else:
                    # 6M1M, 1Y, 5Y, Max: update the last value
                    if self.last_obj[time_frame]:
                        if now.date == self.last_obj[time_frame].date:
                            hist_model.objects.filter(link=link).reverse(
                            ).first().delete()  # remove the newest model
                    hist_model(
                        Type=self.type_,
                        link=link,
                        date=now,
                        price=validate_price(live_data['Last']),
                        Open=validate_price(live_data['Open']),
                        high=validate_price(live_data['High']),
                        low=validate_price(live_data['Low']),
                        volume=validate_price(live_data['Vol.']),
                    ).save(force_insert=True)
                    print(f'{self.title}: saved HISTORICAL{time_frame}')
                if time_frame != 'Max':
                    # delete outdated data
                    if self.last_obj_count[time_frame]:
                        data1 = self.last_obj[time_frame].date
                        if time_frame[-1] == 'D':
                            data2 = now
                        else:
                            data2 = now.date()
                        diff = data2 - data1
                        days, seconds = diff.days, diff.seconds
                        hours = days * 24 + seconds // 3600
                        if hours > self.__class__.hours_[time_frame]:
                            hist_model.objects.filter(link=link).first(
                            ).delete()  # remove the oldest model
        elif is_closed:
            # check whether "after live data" for today is available
            last_obj_after_count = models.AllAssetsAfterLive.objects.filter(
                link=link).count()
            if last_obj_after_count > 0:
                last_obj_after = models.AllAssetsAfterLive.objects.filter(
                    link=link).order_by('-id')[0]
            if last_obj_after_count == 0 or (
                    (now.date() - last_obj_after.date).days >= 1):
                after_live_threads.append({
                    'link': link,
                    'type': self.type_,
                    'after fields': self.after_fields,
                    'title': self.title
                })
        else:
            print('Time Icon is not found/recognized')
# coding=utf-8
from bs4 import BeautifulSoup
import requests
import pyquery

content = requests.get('http://www.baidu.com').text
soup = BeautifulSoup(content, 'lxml')
soup.find_next_sibling()
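# The snippet above calls find_next_sibling() on the BeautifulSoup object itself,
# which always returns None because the document root has no siblings. A minimal
# sketch of calling it on a tag instead (the markup below is made up for illustration):
from bs4 import BeautifulSoup

html = '<div><h1>Title</h1><p>First paragraph</p><p>Second paragraph</p></div>'
doc = BeautifulSoup(html, 'html.parser')
h1 = doc.find('h1')
print(h1.find_next_sibling())       # <p>First paragraph</p>
print(h1.find_next_sibling('p'))    # the same element, filtered to <p> tags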
import re
import requests
from bs4 import BeautifulSoup

r = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(r.text, "html.parser")
print(soup.prettify())

# Print every tag name in the document:
for tag in soup.find_all(True):
    print(tag.name)

# Find all <a> tags
for i in soup.find_all("a"):
    print(i.attrs["href"])

# Find <a> and <b> tags
for i in soup.find_all(["a", "b"]):
    print(i)

# Find all tags whose names start with "b"
for i in soup(re.compile("b")):
    print(i.name)

# Find data whose tag is <p> and attribute value is "title"
for i in soup.find_all("a", "py1"):
    print(i)

# Find tags with class="py1"
for i in soup.find_all(id="link1"):
    print(i.name)

print(soup.find_next_sibling("p", "title"))
print("*" * 50)

# Find the parent tags of the <p> tag
for i in soup.p.find_parents():
    print(i.name)

print(soup.p.find_next_sibling())
print("*" * 20)
print(soup.p.string)

for i in soup.find_all("p", "course")[0].children:
    print(i)