def test_select(self):
    """Test select."""

    markup = """
    <!-- before header -->
    <html>
    <head>
    </head>
    <body>
    <!-- comment -->
    <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
    <pre id="4"></pre>
    <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
    <pre id="6" class='ignore'>
    <!-- don't ignore -->
    </pre>
    </body>
    </html>
    """

    soup = bs4.BeautifulSoup(markup, 'html5lib')

    ids = []
    for el in sv.select('span[id]', soup):
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(['5', 'some-id']), sorted(ids))

    ids = []
    for el in sv.select('span[id]', soup, limit=1):
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(['5']), sorted(ids))

    self.assertEqual(
        sv.select('span[id]', soup, limit=1)[0].attrs['id'],
        sv.select_one('span[id]', soup).attrs['id'])

    self.assertEqual(None, sv.select_one('h1', soup))

    ids = []
    for el in sv.iselect('span[id]', soup):
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(['5', 'some-id']), sorted(ids))

    span = sv.select('span[id]', soup)[0]

    ids = []
    for el in sv.select('span[id]:not(#some-id)', span.parent):
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(['5']), sorted(ids))
def test_select_one(self):
    """Test select one."""

    markup = """
    <!-- before header -->
    <html>
    <head>
    </head>
    <body>
    <!-- comment -->
    <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
    <pre id="4"></pre>
    <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
    <pre id="6" class='ignore'>
    <!-- don't ignore -->
    </pre>
    </body>
    </html>
    """

    soup = self.soup(markup, 'html5lib')
    self.assertEqual(
        sv.select('span[id]', soup, limit=1)[0].attrs['id'],
        sv.select_one('span[id]', soup).attrs['id']
    )
def test_closest(self):
    """Test closest."""

    markup = """
    <article id="article">
      <div id="div-01">Here is div-01
        <div id="div-02">Here is div-02
          <div id="div-04">Here is div-04</div>
          <div id="div-03">Here is div-03</div>
        </div>
        <div id="div-05">Here is div-05</div>
      </div>
    </article>
    """

    soup = bs4.BeautifulSoup(markup, 'html5lib')
    el = sv.select_one('#div-03', soup)
    self.assertTrue(sv.closest('#div-02', el).attrs['id'] == 'div-02')
    self.assertTrue(sv.closest('div div', el).attrs['id'] == 'div-03')
    self.assertTrue(
        sv.closest('article > div', el).attrs['id'] == 'div-01')
    self.assertTrue(sv.closest(':not(div)', el).attrs['id'] == 'article')
    self.assertTrue(sv.closest('div #div-05', el) is None)
    self.assertTrue(sv.closest('a', el) is None)
def test_closest_match_self(self):
    """Test closest match self."""

    markup = """
    <article id="article">
      <div id="div-01">Here is div-01
        <div id="div-02">Here is div-02
          <div id="div-04">Here is div-04</div>
          <div id="div-03">Here is div-03</div>
        </div>
        <div id="div-05">Here is div-05</div>
      </div>
    </article>
    """

    soup = self.soup(markup, 'html5lib')
    el = sv.select_one('#div-03', soup)
    self.assertTrue(sv.closest('div div', el).attrs['id'] == 'div-03')
def test_closest_must_be_parent(self):
    """Test that closest only matches parents or self."""

    markup = """
    <article id="article">
      <div id="div-01">Here is div-01
        <div id="div-02">Here is div-02
          <div id="div-04">Here is div-04</div>
          <div id="div-03">Here is div-03</div>
        </div>
        <div id="div-05">Here is div-05</div>
      </div>
    </article>
    """

    soup = self.soup(markup, 'html5lib')
    el = sv.select_one('#div-03', soup)
    self.assertTrue(sv.closest('div #div-05', el) is None)
    self.assertTrue(sv.closest('a', el) is None)
def test_closest_match_complex_parent(self):
    """Test closest match complex parent."""

    markup = """
    <article id="article">
      <div id="div-01">Here is div-01
        <div id="div-02">Here is div-02
          <div id="div-04">Here is div-04</div>
          <div id="div-03">Here is div-03</div>
        </div>
        <div id="div-05">Here is div-05</div>
      </div>
    </article>
    """

    soup = self.soup(markup, 'html5lib')
    el = sv.select_one('#div-03', soup)
    self.assertTrue(sv.closest('article > div', el).attrs['id'] == 'div-01')
    self.assertTrue(sv.closest(':not(div)', el).attrs['id'] == 'article')
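# A minimal sketch (not part of the test suite above) of the behavior those
# closest() tests rely on: sv.closest() checks the element itself first, then
# walks up through its ancestors, and returns None when nothing on that upward
# path matches. Assumes bs4 plus soupsieve imported as sv, as in the tests.
import bs4
import soupsieve as sv

soup = bs4.BeautifulSoup('<div id="outer"><p id="inner">text</p></div>', 'html.parser')
p = sv.select_one('#inner', soup)

print(sv.closest('p', p).attrs['id'])    # 'inner'  (the element matches itself)
print(sv.closest('div', p).attrs['id'])  # 'outer'  (first matching ancestor)
print(sv.closest('span', p))             # None     (nothing on the ancestor path matches)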
def scrape(city):
    city_url_id = re.sub(r'[\s,]+', "-", city, 0)
    city_url = "https://www.tide-forecast.com/locations/" + city_url_id + "/tides/latest"
    r = requests.get(url=city_url)
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')
    tr = sv.select(".tide-table > tr", soup)
    curr_date = ""
    timeofday = ""
    for el in tr:
        dateInst = sv.select_one(".date", el)
        if dateInst is not None:
            curr_date = dateInst.text.strip()

        tide_time = ""
        tide_time_inst = sv.select_one(".time", el)
        if tide_time_inst is not None:
            tide_time = tide_time_inst.text.strip()

        timezone = ""
        timezone_inst = sv.select_one(".time-zone", el)
        # Guard on timezone_inst (not tide_time_inst), otherwise a row with a
        # time but no time-zone cell raises AttributeError.
        if timezone_inst is not None:
            timezone = timezone_inst.text.strip()

        level = ""
        level_inst = sv.select_one(".level", el)
        if level_inst is not None:
            level = level_inst.text.strip()

        tide_phase = ""
        tide_phase_inst = sv.select_one(".tide:last-child", el)
        if tide_phase_inst is not None:
            tide_phase = tide_phase_inst.text.strip()
        else:
            timeofday_inst = sv.select_one("td:last-child", el)
            if timeofday_inst is not None:
                timeofday_val = timeofday_inst.text.strip()
                if timeofday_val == "Sunrise":
                    timeofday = timeofday_val
                elif timeofday_val == "Sunset":
                    timeofday = timeofday_val
                else:
                    timeofday = ""

        if tide_phase == "Low Tide" and (timeofday == "Sunrise" or timeofday == "Sunset"):
            print('{0} {1} {2} {3} {4}'.format(city, curr_date, tide_time, timezone, level))
def test_select_one_none(self):
    """Test select one returns none for no match."""

    markup = """
    <!-- before header -->
    <html>
    <head>
    </head>
    <body>
    <!-- comment -->
    <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
    <pre id="4"></pre>
    <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
    <pre id="6" class='ignore'>
    <!-- don't ignore -->
    </pre>
    </body>
    </html>
    """

    soup = self.soup(markup, 'html5lib')
    self.assertEqual(None, sv.select_one('h1', soup))
soundBody: Tag
for soundBody in sv.select('div.sound__body', beautifulSoapContent):
    # print("sound__body Element : " + str(soundBody) + "\n")

    # coverArtElement = sv.select_one('a.sound__coverArt', soundBody)
    # print("sound__coverArt Element : " + str(coverArtElement))
    # print("Track Page : https://soundcloud.com" + coverArtElement.get('href'))
    # # TODO : Wait to load Cover Arts
    # # coverArtUrl = get_backgroundImage_url(
    # #     get_first_span_element_by_custom_attribute(coverArtElement, 'aria-role', 'img'))  # type: str
    # # print "Cover Art Url : " + coverArtUrl

    contentElement = sv.select_one('div.sound__content', soundBody)
    # print("sound__content Element : " + str(contentElement))

    trackElement = sv.select_one('a.soundTitle__title.sc-link-dark', contentElement)
    trackTitle = sv.select_one('span', trackElement).text
    print("Track Title : " + trackTitle)
    print("Track Page : https://soundcloud.com" + trackElement.get('href'))
    print("Track Station Page : https://soundcloud.com/stations/track" + trackElement.get('href'))

    artistElement = sv.select_one('a.soundTitle__username.sc-link-light', contentElement)
    print("Artist : " + sv.select_one('span.soundTitle__usernameText', artistElement).text)
    print("Artist Page : https://soundcloud.com" + artistElement.get('href'))
def scrape_posts(html):
    soup = BeautifulSoup(html, 'lxml')

    next_link = soup.select_one('a[href*="?bacr="], a[href*="&bacr="]')
    if next_link is not None:
        next_link = resolve_relative_url(next_link.get('href'))

    posts = []
    post_elements = soup.select('#m_group_stories_container > div > [data-ft]')

    for el in post_elements:
        full_story_link = soupsieve.select_one('a:-soup-contains("Full Story")', el)
        if not full_story_link:
            continue

        post_url = cleanup_post_link(full_story_link.get('href'))
        user_label, user = extract_user_information_from_link(el.select_one('h3 a'))

        formatted_date = el.select_one('abbr').get_text().strip()
        parsed_date = parse_date(formatted_date)

        reactions_item = el.select_one('[id^="like_"]')
        reactions = '0'
        if reactions_item:
            reactions_text = reactions_item.get_text()
            if reactions_text.count('·') > 1:
                reactions = reactions_text.split('·', 1)[0].strip()

        comments_item = soupsieve.select_one('a:-soup-contains(" Comment")', el)
        comments = '0'
        if comments_item:
            comments = comments_item.get_text().split('Comment', 1)[0].strip()

        text_root = el.select_one('[data-ft=\'{"tn":"*s"}\']')
        additional_html_roots = []

        img_root = el.select_one('[data-ft=\'{"tn":"H"}\']')
        if img_root:
            additional_html_roots.append(img_root)

        all_text_elements = text_root.find_all('div', recursive=False)

        text_elements = []
        translated_text_elements = []
        translation_link = None

        for text_el in all_text_elements:
            translation_link = text_el.select_one('a[href^="/basic/translation_preferences/"]')
            if translation_link is None:
                text_elements.append(text_el)
            else:
                translation_link.extract()
                translated_text_elements.append(text_el)

        html_elements = text_elements + additional_html_roots

        comment_text = get_display_text(text_elements)
        comment_html = ''.join(str(el) for el in html_elements)
        translated_comment_text = get_display_text(translated_text_elements)
        translated_comment_html = ''.join(str(el) for el in translated_text_elements)
        translated_from = translation_link.get_text().rsplit('from ', 1)[-1].strip() if translation_link else None

        post = FacebookPost(
            url=post_url,
            user_id=getattr(user, 'id', ''),
            user_handle=getattr(user, 'handle', ''),
            user_url=getattr(user, 'url', ''),
            user_label=user_label,
            text=comment_text,
            html=comment_html,
            translated_text=translated_comment_text,
            translated_html=translated_comment_html,
            translated_from=translated_from,
            formatted_date=formatted_date,
            date=parsed_date,
            reactions=reactions,
            comments=comments
        )

        posts.append(post)

    return next_link, posts
def interpret_scraper(scraper, element, root=None, context=None, path=[], scope=None):
    if scope is None:
        scope = EvaluationScope()

    # Is this a tail call of item?
    if isinstance(scraper, str):
        if scraper in EXTRACTOR_NAMES:
            return extract(element, scraper)

        return element.get(scraper)

    sel = get_sel(scraper)
    iterator = get_iterator(scraper)

    # First we need to solve local selection
    if sel is not None:
        element = soupsieve.select_one(sel, element)
    elif 'sel_eval' in scraper:
        evaluated_sel = eval_expression(
            scraper['sel_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['sel_eval'],
            expect=(Tag, str),
            allow_none=True,
            scope=scope
        )

        if isinstance(evaluated_sel, str):
            element = soupsieve.select_one(evaluated_sel, element)
        else:
            element = evaluated_sel

    if element is None:
        return None

    # Then we need to solve iterator
    single_value = True

    if iterator is not None:
        single_value = False
        elements = soupsieve.select(iterator, element)
    elif 'iterator_eval' in scraper:
        single_value = False
        evaluated_elements = eval_expression(
            scraper['iterator_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['iterator_eval'],
            check=is_valid_iterator_eval_output,
            scope=scope
        )

        if isinstance(evaluated_elements, str):
            elements = soupsieve.select(evaluated_elements, element)
        else:
            elements = evaluated_elements
    else:
        elements = [element]

    # Handling local context
    if 'set_context' in scraper:
        local_context = {}

        for k, field_scraper in scraper['set_context'].items():
            local_context[k] = interpret_scraper(
                field_scraper,
                element,
                root=root,
                context=context,
                path=path + ['set_context', k],
                scope=scope
            )

        context = merge_contexts(context, local_context)

    # Actual iteration
    acc = None if single_value else []

    already_seen = set() if 'uniq' in scraper and not single_value else None

    for element in elements:
        value = None

        # Do we have fields?
        if 'fields' in scraper:
            value = {}

            for k, field_scraper in scraper['fields'].items():
                value[k] = interpret_scraper(
                    field_scraper,
                    element,
                    root=root,
                    context=context,
                    path=path + ['fields', k],
                    scope=scope
                )

        # Do we have a scalar?
        elif 'item' in scraper:

            # Default value is text
            value = interpret_scraper(
                scraper['item'],
                element,
                root=root,
                context=context,
                path=path + ['item'],
                scope=scope
            )

        else:
            if 'attr' in scraper:
                value = element.get(scraper['attr'])
            elif 'extract' in scraper:
                value = extract(element, scraper['extract'])
            elif 'get_context' in scraper:
                value = nested_get(scraper['get_context'], context)
            elif 'default' not in scraper:

                # Default value is text
                value = extract(element, 'text')

        # Eval?
        if 'eval' in scraper:
            value = eval_expression(
                scraper['eval'],
                element=element,
                elements=elements,
                value=value,
                context=context,
                root=root,
                path=path + ['eval'],
                expect=DATA_TYPES,
                allow_none=True,
                scope=scope
            )

        # Default value after all?
        if 'default' in scraper and value is None:
            value = scraper['default']

        if single_value:
            acc = value
        else:

            # Filtering?
            if 'filter_eval' in scraper:
                passed_filter = eval_expression(
                    scraper['filter_eval'],
                    element=element,
                    elements=elements,
                    value=value,
                    context=context,
                    root=root,
                    path=path + ['filter_eval'],
                    expect=bool,
                    allow_none=True,
                    scope=scope
                )

                if not passed_filter:
                    continue

            if 'filter' in scraper:
                filtering_clause = scraper['filter']

                if filtering_clause is True and not value:
                    continue

                if isinstance(filtering_clause, str) and not nested_get(filtering_clause, value):
                    continue

            if 'uniq' in scraper:
                uniq_clause = scraper['uniq']
                k = value

                if uniq_clause is True and value in already_seen:
                    continue

                if isinstance(uniq_clause, str):
                    k = nested_get(uniq_clause, value)

                    if k in already_seen:
                        continue

                already_seen.add(k)

            acc.append(value)

    return acc
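# A hypothetical declaration of the kind interpret_scraper consumes, sketched
# only from the keys the interpreter above actually reads ('fields', 'item',
# 'attr', 'uniq'). It assumes get_sel()/get_iterator() look up 'sel'/'iterator'
# keys; the selectors and field names themselves are made up for illustration.
example_scraper = {
    'iterator': 'ul#results > li',   # one output value per matched <li>
    'uniq': 'url',                   # deduplicate items on their 'url' key
    'fields': {
        'title': {
            'sel': 'h2',             # scope this field to the first <h2>
            'item': 'text'           # 'text' is an extractor name -> extract(element, 'text')
        },
        'url': {
            'sel': 'a',
            'attr': 'href'           # pull the raw attribute instead of text
        }
    }
}

# result = interpret_scraper(example_scraper, some_bs4_element)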
def googleTranslate(self, translateString):
    # Google Translate
    driver = self.driver
    wait = self.wait

    driver.find_element_by_css_selector(
        'textarea#source.orig.tlid-source-text-input.goog-textarea').clear()
    wait.until(
        expected.invisibility_of_element_located(
            (By.CSS_SELECTOR, 'span.tlid-translation.translation')))

    # driver.get('https://translate.google.com/#auto|ko|{}'.format(translateString))
    driver.find_element_by_css_selector(
        'textarea#source.orig.tlid-source-text-input.goog-textarea'
    ).send_keys(translateString)

    try:
        wait.until(
            expected.visibility_of_element_located(
                (By.CSS_SELECTOR, 'span.tlid-translation.translation')))
    except:
        html = driver.page_source
        driver.find_element_by_css_selector(
            'textarea#source.orig.tlid-source-text-input.goog-textarea'
        ).clear()
        if sv.select_one('div.result-error', BeautifulSoup(html, 'html.parser')):
            return {
                "data1":
                    sv.select_one('span.tlid-result-error',
                                  BeautifulSoup(html, 'html.parser')).text
            }

    html = driver.page_source
    driver.find_element_by_css_selector(
        'textarea#source.orig.tlid-source-text-input.goog-textarea').clear()
    # driver.implicitly_wait(5)

    # Isolate the translation pane
    select1 = str(
        sv.select_one('div.homepage-content-wrap',
                      BeautifulSoup(html, 'html.parser')))

    # data1
    resultString1 = ''
    select4 = sv.select_one('span.tlid-translation.translation',
                            BeautifulSoup(select1, 'html.parser'))
    resultString1 = str(select4).replace('<br/>', '<span>\n</span>')
    resultString1 = BeautifulSoup(resultString1, 'html.parser').text
    # print(resultString1)

    # data2
    resultString2 = ''
    if '<div class="gt-cd gt-cd-mbd gt-cd-baf" style="display: none;"' in select1:
        pass
    elif '<div class="gt-cd gt-cd-mbd gt-cd-baf" style=""' in select1:
        select2 = str(
            sv.select_one('table.gt-baf-table',
                          BeautifulSoup(select1, 'html.parser')))
        select3 = sv.select(
            'span.gt-cd-pos, span.gt-baf-cell.gt-baf-word-clickable',
            BeautifulSoup(select2, 'html.parser'))
        for data in select3:
            strData = str(data)
            if 'gt-baf-cell' in strData:
                resultString2 += "{}, ".format(data.text)
            elif 'gt-cd-pos' in strData:
                resultString2 = resultString2.rstrip(", ")
                resultString2 += "\n* {} *\n: ".format(data.text)
        resultString2 = resultString2.lstrip("\n")
        resultString2 = resultString2.rstrip(", ")
    # print(resultString2)

    return {"data1": resultString1, "data2": resultString2}
def googleTranslate(self):
# def googleTranslate(self, translateString):
    # Google Translate
    # driver = self.driver
    # wait = self.wait
    # # driver.get('https://translate.google.com/#auto|ko|{}'.format(translateString))
    # driver.find_element_by_css_selector('textarea#source.orig.tlid-source-text-input.goog-textarea').send_keys(translateString)
    # wait.until(expected.visibility_of_element_located((By.CSS_SELECTOR, 'span.tlid-translation.translation span')))
    # html = driver.page_source
    # driver.find_element_by_css_selector('textarea#source.orig.tlid-source-text-input.goog-textarea').clear()
    # # driver.implicitly_wait(5)

    with open('./string.html', 'r', encoding='utf-8') as f:
        html = f.read()

    # Isolate the translation pane
    select1 = str(
        sv.select_one('div.homepage-content-wrap',
                      BeautifulSoup(html, 'html.parser')))
    with open('./test.html', 'w', encoding='utf-8') as f:
        f.write(select1)

    # data1
    resultString1 = ''
    select4 = sv.select_one('span.tlid-translation.translation',
                            BeautifulSoup(select1, 'html.parser'))
    resultString1 = str(select4).replace('<br/>', '<span>\n</span>')
    resultString1 = BeautifulSoup(resultString1, 'html.parser').text
    print(resultString1)

    # data2
    resultString2 = ''
    if '<div class="gt-cd gt-cd-mbd gt-cd-baf" style="display: none;"' in select1:
        pass
    elif '<div class="gt-cd gt-cd-mbd gt-cd-baf" style=""' in select1:
        select2 = str(
            sv.select_one('table.gt-baf-table',
                          BeautifulSoup(select1, 'html.parser')))
        select3 = sv.select(
            'span.gt-cd-pos, span.gt-baf-cell.gt-baf-word-clickable',
            BeautifulSoup(select2, 'html.parser'))
        for data in select3:
            strData = str(data)
            if 'gt-baf-cell' in strData:
                resultString2 += "{}, ".format(data.text)
            elif 'gt-cd-pos' in strData:
                resultString2 = resultString2.rstrip(", ")
                resultString2 += "\n* {} *\n: ".format(data.text)
        resultString2 = resultString2.lstrip("\n")
        # resultString2 = resultString2.rstrip(", ")
    print(resultString2)

    # # print(html)
    # select11 = sv.select_one('span.tlid-translation.translation', BeautifulSoup(select1, 'html.parser'))
    # tmp1 = str(select11).replace('<br/>', '<span>\n</span>')
    # tmp1 = BeautifulSoup(tmp1, 'html.parser').text
    # print(tmp1)
    # resultString2 = ''
    # if('<div class="gt-lc gt-lc-mobile" style="display: none;">' in select1):
    #     pass
    # elif('<div class="gt-lc gt-lc-mobile" style="">' in select1):
    #     select2 = str(sv.select_one('table.gt-baf-table', BeautifulSoup(select1, 'html.parser')))
    #     select3 = sv.select('span.gt-cd-pos, span.gt-baf-cell.gt-baf-word-clickable', BeautifulSoup(select2, 'html.parser'))
    #     for data in select3:
    #         strData = str(data)
    #         if('gt-baf-cell' in strData):
    #             resultString2 += "{}, ".format(data.text)
    #         elif('gt-cd-pos' in strData):
    #             resultString2 = resultString2.rstrip(", ")
    #             resultString2 += "\n* {} *\n: ".format(data.text)
    #     resultString2 = resultString2.lstrip("\n")
    #     resultString2 = resultString2.rstrip(", ")
    # print(resultString2)

    # return {"data1": resultString1, "data2": resultString2}
    return
    artist_id = sys.argv[2]
    search_term = sys.argv[3]
else:
    artist = 'Spice-Girls'
    artist_id = 199833
    search_term = 'good sheep'

print(str(artist_id) + ' : ' + artist)
print('Search for : ' + search_term)

url = 'https://www.lyrics.com/artist.php?name=' + artist + '&aid=' + str(
    artist_id) + '&o=1'
page_source = requests.get(url).text
beautiful_soap_content = BeautifulSoup(page_source, "lxml")

for song in sv.select('tr', sv.select_one('tbody', beautiful_soap_content)):
    song_element = sv.select_one('a', song)
    print('\n\nSong Title : ' + song_element.text)
    song_url = 'https://www.lyrics.com' + song_element.get('href')
    print('Song URL : ' + song_url + '\n')

    song_page_source = requests.get(song_url).text
    song_page_content = BeautifulSoup(song_page_source, "lxml")

    # print('Song Lyrics')
    song_lyrics = sv.select_one('pre', song_page_content).text
    print(song_lyrics)

    # if search_term in song_lyrics:
    #     break
    if re.search(search_term, song_lyrics, re.IGNORECASE):
        print(search_term + ' Found On ' + song_element.text)