def first(self, journal_temp):
    urls = []
    data = requests.get(journal_temp[Row_Name.TEMP_URL], verify=False)
    soup = BeautifulSoup(data.text, "html.parser")
    div = soup.find("div", id="primarycontent")
    h4 = div.find_all("h4")
    p = div.find_all("p")
    soup.extract()
    if len(h4) == len(p):
        for title, i in zip(h4, p):
            article_info = dict(journal_temp)
            a = i.find("a")
            title = title.get_text()
            # Strip a leading "1." style numbering from the title.
            t = re.match(r"\d*\.", title)
            if t is not None:
                title = title.replace(t.group(), "").strip()
            a.extract()
            au = i.get_text().replace("Author:", "").replace("Authors: ", "") \
                .replace(" and ", "##").replace(",", "##")
            article_info[Row_Name.TITLE] = title
            article_info[Row_Name.AUTHOR_NAME] = au
            article_info[Row_Name.FULLTEXT_URL] = "http://bmathaa.org/" + a["href"]
            urls.append(article_info)
    return urls

def getText(link):
    print(" - ", link)
    try:
        r = requests.get(link, verify=False, timeout=5)
    except KeyboardInterrupt:
        raise
    except Exception:
        return ""
    b = BeautifulSoup(r.text, "lxml")
    # Drop non-content tags, then pull out the text-bearing elements.
    [tag.extract() for tag in b(['script', 'link', 'meta', 'style'])]
    text = [tag.extract() for tag in b(['p', 'div'])]
    outputList = []
    for excerpt in text:
        # Turn hard breaks into sentence boundaries, then normalise whitespace.
        found = excerpt.get_text(separator=" ").strip().replace(
            "\xa0", ". ").replace("\n", ". ").replace("\t", ". ").replace("\r", ". ").strip()
        while "  " in found:  # Remove double spaces
            found = found.replace("  ", " ")
        while ". . " in found:
            found = found.replace(". . ", ". ")
        if found.count(" ") > 5:
            outputList.append(found)
    return outputList

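# getText() above normalises whitespace with repeated str.replace() passes.
# A minimal sketch of the same clean-up with one regex per rule (hedged: an
# equivalent alternative, not the snippet author's helper):
import re

def squeeze(found):
    found = re.sub(r' {2,}', ' ', found)      # collapse runs of spaces
    found = re.sub(r'(?:\. )+', '. ', found)  # collapse ". . . " runs into ". "
    return found.strip()
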
def get_lyric(self, singer, song):
    # Replace spaces with _
    singer = singer.replace(' ', '_')
    song = song.replace(' ', '_')
    url = 'http://lyrics.wikia.com/{0}:{1}'.format(singer, song)
    req = requests.get(url)
    s = BeautifulSoup(req.text, "lxml")
    # Get main lyrics holder
    lyrics = s.find("div", {'class': 'lyricbox'})
    if lyrics is None:
        return None
    # Remove scripts
    [tag.extract() for tag in lyrics('script')]
    # Remove comments
    comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    # Remove unnecessary tags
    for tag in ['div', 'i', 'b', 'a']:
        for match in lyrics.findAll(tag):
            match.replaceWithChildren()
    # TODO: check if you need the encode/decode thing; if you do, wrap it in a try/except
    # output = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode('utf-8').replace('\n','').replace('<br/>','\n')
    # Get output as a string, strip the wrapping <div> markup ([22:-6]),
    # and replace <br> with newlines.
    output = str(lyrics).replace('\n', '').replace('<br/>', '\n')[22:-6:]
    try:
        return output
    except Exception:
        return output.encode('utf-8')

def getLyrics(singer, song):
    # Replace spaces with _
    singer = singer.replace(' ', '_')
    song = song.replace(' ', '_')
    r = requests.get('http://lyrics.wikia.com/{0}:{1}'.format(singer, song))
    s = BeautifulSoup(r.text, 'html.parser')
    # Get main lyrics holder
    lyrics = s.find("div", {'class': 'lyricbox'})
    if lyrics is None:
        raise ValueError("Song or Singer does not exist or the API does not have Lyrics")
    # Remove scripts
    [tag.extract() for tag in lyrics('script')]
    # Remove comments
    comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    # Remove unnecessary tags
    for tag in ['div', 'i', 'b', 'a']:
        for match in lyrics.findAll(tag):
            match.replaceWithChildren()
    # Get output as a string, strip the wrapping <div> markup ([22:-6]),
    # and replace <br> with newlines.
    output = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode("utf-8") \
        .replace('\n', '').replace('<br/>', '\n')
    try:
        return output
    except Exception:
        return output.encode('utf-8')

def lyricswikia(artist, song):
    # original code found @
    # https://github.com/geekpradd/PyLyrics/blob/master/PyLyrics/functions.py
    song = song.split(' - ', 1)[0]
    artist = artist.replace(' ', '_')
    song = song.replace(' ', '_')
    url = 'http://lyrics.wikia.com/{0}:{1}'.format(artist, song)
    print('Trying:', url)
    r = requests.get(url)
    s = BeautifulSoup(r.text, 'html.parser')
    # Get main lyrics holder
    lyrics = s.find("div", {'class': 'lyricbox'})
    if lyrics is not None:
        # Remove scripts
        [e.extract() for e in lyrics('script')]
        # Remove comments
        comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
        # Remove unnecessary tags
        for tag in ['div', 'i', 'b', 'a']:
            for match in lyrics.findAll(tag):
                match.replaceWithChildren()
        # Get output as a string, strip the wrapping markup ([22:-6]),
        # and replace <br> with newlines.
        lyrics = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode("utf-8") \
            .replace('\n', '').replace('<br/>', '\n')
        try:
            return lyrics
        except Exception:
            return lyrics.encode('utf-8')

def getLyrics(singer, song):
    # Replace spaces with _
    singer = singer.replace(' ', '_')
    song = song.replace(' ', '_')
    r = requests.get('http://lyrics.wikia.com/{0}:{1}'.format(singer, song))
    s = BeautifulSoup(r.text, 'html.parser')
    # Get main lyrics holder
    lyrics = s.find("div", {'class': 'lyricbox'})
    if lyrics is None:
        raise ValueError(
            "Song or Singer does not exist or the API does not have Lyrics")
    # Remove scripts
    [tag.extract() for tag in lyrics('script')]
    # Remove comments
    comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    # Remove unnecessary tags
    for tag in ['div', 'i', 'b', 'a']:
        for match in lyrics.findAll(tag):
            match.replaceWithChildren()
    # Get output as a string, strip the wrapping markup ([22:-6]),
    # and replace <br> with newlines.
    output = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode("utf-8") \
        .replace('\n', '').replace('<br/>', '\n')
    try:
        return output
    except Exception:
        return output.encode('utf-8')

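# The lyrics.wikia.com scrapers above all share one pattern: find the
# "lyricbox" div, strip <script> tags and HTML comments, unwrap the remaining
# inline tags, then slice the serialized string to drop the wrapping
# <div ...>/</div> markup. A minimal sketch of an alternative that relies only
# on documented bs4 behaviour and trades the brittle [22:-6] slice for
# get_text() (hedged: output spacing may differ slightly from the snippets):
from bs4 import BeautifulSoup, Comment

def lyricbox_text(html):
    soup = BeautifulSoup(html, 'html.parser')
    box = soup.find('div', {'class': 'lyricbox'})
    if box is None:
        return None
    for script in box('script'):
        script.extract()  # detach each <script> from the tree
    for comment in box.find_all(text=lambda t: isinstance(t, Comment)):
        comment.extract()  # drop HTML comments
    for br in box.find_all('br'):
        br.replace_with('\n')  # keep line breaks as newlines
    return box.get_text()
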
def test_extract_with_arg(self):
    '''
    Test extracting by calling extract() with a random argument.
    '''
    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
    extr_arg_soup = BeautifulSoup(markup, "html.parser")
    with self.assertRaises(TypeError):
        extr_arg_soup_extracted = extr_arg_soup.extract("arg")

def getSalesRank(salesRankFrom, asin, salesRankTo):
    """
    method to check if the sales rank of the product lies in the range
    provided by the user

    salesRankFrom : lower limit for sales rank entered by the user
    asin : asin of the product
    salesRankTo : upper limit for sales rank entered by the user
    """
    if salesRankFrom and salesRankTo:
        url = 'http://www.amazon.co.uk/dp/' + asin
        mech = Browser()
        session = requests.Session()
        proxy = check_proxy(session)
        if proxy != 'NA':
            mech.set_proxies({'http': proxy})
        page = mech.open(url)
        html = page.read()
        # Try to get the sales rank, else return True.
        # Does not work sometimes; needs to be more robust.
        try:
            soup = BeautifulSoup(str(html))
            data = soup.extract()  # extract() on the root returns the soup itself
            salesRankElem = data.find(attrs={'id': 'SalesRank'}) \
                .find(attrs={'class': 'value'}).getText()
            salesRank = re.findall(r'\n(.*?)\sin', salesRankElem)[0] \
                .replace('in', '').replace(',', '').strip()
            return salesRankFrom <= int(salesRank) <= salesRankTo
        except Exception as e:
            return True

def buildInteractions(self):
    f = open(self.__name + ".txt", "a")
    page = requests.get(self.__link)
    # The three interaction levels share identical handling, so loop over them.
    for n in (1, 2, 3):
        soup = BeautifulSoup(page.content, 'html.parser')
        div = soup.select("ul.interactions.ddc-list-unstyled li.int_%d a" % n)
        soup = BeautifulSoup(str(div)[1:-1], 'html.parser')
        soup = BeautifulSoup(str(soup.extract()), 'html.parser')
        if str(soup).strip() == "":
            f.write("No level %d interactions\n" % n)
        else:
            f.write("Level %d interactions:\n" % n)
            for level in soup:
                curr = re.sub(r'<a href=[^>]+>', '', str(level))
                if curr[0] != ',':
                    f.write(curr.replace('</a>', "") + "\n")
    f.close()

def test_extract_raises(self):
    '''
    Test extracting the soup object itself, ensuring that the
    original isn't changed.
    '''
    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
    extr_soup = BeautifulSoup(markup, "html.parser")
    extr_null_soup = extr_soup.extract()
    self.assertEqual(extr_soup, extr_null_soup)

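# The two tests above exercise extract() on the soup object itself. On a
# child tag, extract() detaches the element from the parse tree and returns
# it, which is the behaviour most snippets in this section rely on. A minimal,
# self-contained sketch (the markup is made up for illustration):
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>Hello <b>world</b></p>', 'html.parser')
bold = soup.b.extract()        # detach <b>world</b> and keep a handle to it
print(soup)                    # <p>Hello </p>  -- the tag is gone
print(bold)                    # <b>world</b>   -- but still usable
print(soup.extract() is soup)  # True: on the root, extract() just returns the soup
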
def getSalesRank(salesRankFrom, asin, salesRankTo):
    """
    method to check if the sales rank of the product lies in the range
    provided by the user

    salesRankFrom : lower limit for sales rank entered by the user
    asin : asin of the product
    salesRankTo : upper limit for sales rank entered by the user
    """
    salesRank = 'NA'
    url = 'http://www.amazon.co.uk/dp/' + unidecode(asin)
    mech = Browser()
    session = requests.Session()
    proxy = check_proxy(session)
    if proxy != 'NA':
        mech.set_proxies({'http': proxy})
    page = mech.open(url)
    if page.code == 503:
        print(page.geturl())
    html = page.read()
    # Try to get the sales rank, else return True.
    # Does not work sometimes; needs to be more robust.
    try:
        soup = BeautifulSoup(str(html))
        data = soup.extract()  # extract() on the root returns the soup itself
        salesRankElem = data.find(attrs={'id': 'SalesRank'}) \
            .find(attrs={'class': 'value'}).getText()
        salesRank = re.findall(r'\n(.*?)\sin', salesRankElem)[0] \
            .replace('in', '').replace(',', '').strip()
        if salesRankFrom and salesRankTo:
            return (salesRankFrom <= int(salesRank) <= salesRankTo), salesRank
        else:
            return True, salesRank
    except Exception as e:
        try:
            salesRankElem = data.find(attrs={'class': 'zg_hrsr_item'}).getText()
            salesRank = salesRankElem.split()[0].replace('#', '').strip()
            if salesRankFrom and salesRankTo:
                return (salesRankFrom <= int(salesRank) <= salesRankTo), salesRank
            else:
                return True, salesRank
        except Exception:
            try:
                product = amazon_uk.lookup(ItemId=unidecode(asin))
                salesRank = product.sales_rank
                if salesRank is not None:
                    return (salesRankFrom <= int(salesRank) <= salesRankTo), salesRank
                else:
                    return True, 'NA'
            except Exception:
                return True, salesRank

def get_text(filepath, file_name):
    output_dir = os.path.join('/Users/hzxue/output1', filepath.split('/', 3)[3])
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    read_file_path = os.path.join(filepath, file_name)
    f = open(os.path.join(output_dir, file_name), 'w')
    soup = BeautifulSoup(open(read_file_path), 'lxml')
    print(soup.title.string)
    # Remove every <script> tag from the tree.
    for s in soup("script"):
        s.extract()
    queue = [([], soup)]  # queue of (path, element) pairs
    while queue:
        path, element = queue.pop(0)
        if hasattr(element, 'children'):  # check for leaf elements
            for child in element.children:
                queue.append((path + [child.name if child.name is not None
                                      else type(child)], child))
        if element.string and is_text(element) == 1:
            if is_used(element.string) == 1:
                f.write(element.string.strip() + '\n')

def getSalesRank(salesRankFrom, asin, salesRankTo):
    url = 'http://www.amazon.co.uk/dp/' + asin
    mech = Browser()
    page = mech.open(url)
    html = page.read()
    soup = BeautifulSoup(html)
    data = soup.extract()  # equivalent to BeautifulSoup.extract(soup); returns the soup
    salesRankElem = data.find(attrs={'id': 'SalesRank'}) \
        .find(attrs={'class': 'value'}).getText()
    salesRank = re.findall(r'\d+\sin', salesRankElem)[0].replace('in', '').strip()
    return salesRankFrom <= int(salesRank) <= salesRankTo

def get_text(cls, text):
    text = Function.remove_carriage_return(text)
    text = Function.replace_tab_to_space(text)
    text = Function.remove_tags(text, 'script')
    text = Function.remove_tags(text, 'head')
    text = Function.remove_tags(text, 'footer')
    text = Function.remove_tags(text, 'style')
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.extract().get_text()
    text = Function.replace_duplicate_to_space(text)
    return text

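# Function.remove_tags above strips tags at the string level before parsing.
# The same effect is usually achieved inside BeautifulSoup itself; a minimal
# sketch (hedged: standard bs4 calls, not the Function helper's internals):
from bs4 import BeautifulSoup

def get_visible_text(html):
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(['script', 'head', 'footer', 'style']):
        tag.extract()  # detach each unwanted subtree
    return soup.get_text()
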
def parse(self, item, *args, **kwargs):
    self.exporter.fields_to_export = [
        'uuid', 'domain', 'url', 'word', 'word_point', 'date', 'section',
        'pdate'
    ]
    try:
        tm = textmine.textmine()
        soup = BeautifulSoup(item['body'], 'html.parser')
        text = soup.extract().get_text()
        if self.conf.has_option(self.section, 'exclude_keywords'):
            exclude_keywords = ast.literal_eval(
                self.conf.get(self.section, 'exclude_keywords'))
            for ex_word in exclude_keywords:
                text = text.replace(ex_word, '')
        tm_result = tm.get(text)
        if len(tm_result) > 0 and len(tm_result[1]) > 0:
            for word in tm_result[1]:
                # Register every exported field on a single item instead of
                # recreating the item once per field.
                new_item = CommonItem()
                for field in ('uuid', 'domain', 'url', 'word', 'word_point',
                              'date', 'section', 'pdate'):
                    new_item.fields[field] = CommonField()
                new_item["encoding"] = item["encoding"]
                new_item["uuid"] = item["uuid"]
                new_item["domain"] = item["domain"]
                new_item["url"] = item["url"]
                new_item["word"] = word[0]
                new_item["word_point"] = str(word[1])
                new_item["date"] = item["date"]
                new_item["section"] = item["section"]
                new_item["pdate"] = datetime.datetime.now().strftime('%Y%m%d%H%M00')
                if self.start_pdate is None:
                    self.start_pdate = new_item["pdate"]
                yield new_item
    except Exception as ex:
        print(ex)

def parse(self, item, *args, **kwargs):
    try:
        tm = textmine.textmine()
        soup = BeautifulSoup(item['body'], 'html.parser')
        text = soup.extract().get_text()
        if self.conf.has_option(self.section, 'exclude_keywords'):
            exclude_keywords = ast.literal_eval(
                self.conf.get(self.section, 'exclude_keywords'))
            for ex_word in exclude_keywords:
                text = text.replace(ex_word, '')
        tm_result = tm.get(text)
        item.fields["top_sentence"] = CommonField()
        item.fields["top_word"] = CommonField()
        item.fields["sentences"] = CommonField()
        item.fields["words"] = CommonField()
        if len(tm_result) > 0 and len(tm_result[0]) > 0:
            item["top_sentence"] = str(tm_result[0][0][2]).replace('\n', ' ').strip()
        if len(tm_result) > 0 and len(tm_result[1]) > 0:
            item["top_word"] = str(tm_result[1][0][0]).replace('\n', ' ').strip()
        if len(tm_result) > 0:
            item["sentences"] = str(tm_result[0]).replace('\n', ' ').strip()
        if len(tm_result) > 1:
            item["words"] = str(tm_result[1]).replace('\n', ' ').strip()
        self.exporter.fields_to_export = [
            'uuid', 'domain', 'url', 'keyword', 'top_sentence', 'top_word',
            'sentences', 'words', 'date', 'section'
        ]
        yield item
    except Exception as ex:
        print(ex)

def clean_data(text, origin_url=None):
    # Pre-clean the raw text.
    text = pre_clean(text)
    # Clean up tags.
    soup = BeautifulSoup(text, 'lxml')
    soup = clean_attrs(soup)
    soup = clean_tags(soup)
    soup = clean_extra(soup)
    # Restore relative addresses to absolute ones.
    if origin_url:
        soup = join_url(soup, origin_url)
    # Convert back to text for further processing.
    text = str(soup.extract())
    text = clean_text(text)
    return text

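# clean_data() above restores relative addresses via a join_url helper. A
# minimal sketch of what such a helper could look like with the standard
# library (hedged: join_url's real implementation is not shown in this
# section, so this is an illustrative stand-in):
from urllib.parse import urljoin

def join_url_sketch(soup, origin_url):
    for a in soup.find_all('a', href=True):
        a['href'] = urljoin(origin_url, a['href'])  # resolve relative hrefs
    for img in soup.find_all('img', src=True):
        img['src'] = urljoin(origin_url, img['src'])  # resolve relative srcs
    return soup
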
def webcapture(url):
    headers = {
        # pretend I am a browser
        'User-Agent': 'Mozilla/5.0',
    }
    session = requests.Session()                    # set up session
    data = session.get(url, headers=headers)        # scrape the data
    soup = BeautifulSoup(data.text, 'html.parser')  # parse the data
    ext = soup.extract()                            # return the parsed data
    ii = _remove_attrs(soup)
    p = soup.find_all('p')
    gettext = soup.get_text()
    tt = ''
    for d in p:
        tt += extract(str(d))
    return tt

def getLyrics(singer, song):
    # Replace spaces with _
    singer = singer.replace(' ', '_')
    song = song.replace(' ', '_')
    r = requests.get('http://lyrics.wikia.com/{0}:{1}'.format(singer, song))
    s = BeautifulSoup(r.text, features="html.parser")
    if s.find('i') is None:
        # The API could not find the song.
        return None
    album = str(s.find('i').find('a').text)
    lyrics = s.find("div", {'class': 'lyricbox'})
    if lyrics is None:
        raise ValueError("API could not find the lyrics to the song.")
    # Remove scripts
    [tag.extract() for tag in lyrics('script')]
    # Remove comments
    comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    # Remove unnecessary tags
    for tag in ['div', 'i', 'b', 'a']:
        for match in lyrics.findAll(tag):
            match.replaceWithChildren()
    # Get output as a string, strip the wrapping markup ([22:-6]),
    # and replace <br> with newlines.
    if sys.version_info.major > 2:  # Python 3
        output = str(lyrics).encode('utf-8', errors='replace')[22:-6:] \
            .decode("utf-8").replace('\n', '').replace('<br/>', '\n')
    else:  # Python 2
        output = str(lyrics)[22:-6:].decode("utf-8").replace('\n', '') \
            .replace('<br/>', '\n')
    try:
        return [album, output]
    except Exception:
        return [album, output.encode('utf-8')]

def hooker(self):
    """ hookers gonna hook """
    try:
        req = requests.get(DATABASES_URL.format(self.query.lower()),
                           headers=self.headers, proxies=self.proxies)
        soup = BeautifulSoup(req.content, "html.parser")
        self.content = soup.extract()
        results = self._parse_html()
        if results:
            self._find_database_links()
            if len(self.database_links) != 0:
                return self._download_database()
            else:
                return []
    except Exception:
        return []

def downloadAndChangeImgPath(self, html_have_img, newsDate) -> str:
    '''
    :param html_have_img: HTML of the news article body
    :param newsDate: date of the news item (used as the folder name for downloaded images)
    :return: the HTML with each img src rewritten to the locally downloaded path
    '''
    print("Downloading article body")
    soup = BeautifulSoup(html_have_img, 'lxml')
    for img in soup.find_all("img"):
        tempSrc = img['src']
        if tempSrc.find("http:") == -1:
            # The scheme may have been omitted by default; add it back.
            tempSrc = "http:" + tempSrc
        fixedSrc = self.downloadTool.downloadImg(
            img_url=tempSrc, imgName=None, referer=None,
            now_date=newsDate)  # folder created below; dates default to a one-day lag
        img['src'] = fixedSrc  # put the local path back into the tree
        print(img['src'])
    print("Images downloaded and src attributes rewritten.")
    return [str(soup.extract()).replace("'", '"')]

def getSKUData(prod, queue):
    global data
    data = []
    asin = prod['asin']
    image = prod['image']
    title = prod['title']
    price = prod['price_new']
    url = 'http://www.amazon.com/dp/' + unidecode(asin)
    mech = Browser()
    session = requests.Session()
    page = mech.open(url)
    if page.code == 503:
        print('503', page.geturl())
    html = page.read()
    soup = BeautifulSoup(str(html))
    data = soup.extract()  # extract() on the root returns the soup itself
    # Try to get the sales rank, else fall back.
    # Does not work sometimes; needs to be more robust.
    try:
        salesRankElem = data.find(attrs={'id': 'SalesRank'}) \
            .find(attrs={'class': 'value'}).getText()
        salesRank = re.findall(r'\n(.*?)\sin', salesRankElem)[0] \
            .replace('in', '').replace(',', '').strip()
    except Exception as e:
        try:
            salesRankElem = data.find(attrs={'class': 'zg_hrsr_item'}).getText()
            salesRank = salesRankElem.split()[0].replace('#', '').strip()
        except Exception:
            try:
                product = amazon.lookup(ItemId=unidecode(asin))
                salesRank = product.sales_rank
            except Exception:
                salesRank = 'NA'

def getAddress(self, response):
    sel = scrapy.Selector(response)
    results = sel.xpath("//*[contains(@class, 'media-text')]")[0]
    # Note: .extract() here is scrapy's Selector.extract(), which returns
    # the matched HTML as a string.
    address = BeautifulSoup(results.extract(), 'html.parser').get_text()
    address = address.replace("A wine and liquor store located in ", "")
    address = address.replace("A store located at ", "")
    if address.find("USA") >= 0:
        addresses = open('addresses.txt', 'a')
        addresses.write("%s\n" % address)
        appname = sel.xpath("//*[contains(@class, 'media-heading text-center')]")[0]
        appname = BeautifulSoup(appname.extract(), 'html.parser').get_text()
        appname = appname.replace('\' mobile app', "")
        appname = appname.replace('\'s mobile app', "")
        appnames = open('appnames.txt', 'a')
        appnames.write("%s\n" % appname)
    else:
        other = open('other.txt', 'a')
        other.write("%s\n" % address)

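# Two different extract()s meet in getAddress() above: scrapy's
# Selector.extract() serialises the selection to a string, while bs4's
# PageElement.extract() detaches a node from the tree. A minimal sketch of
# the distinction (hedged: illustrative markup only):
from bs4 import BeautifulSoup
from scrapy import Selector

html = '<div class="media-text">A store located at 1 Main St, USA</div>'
sel_html = Selector(text=html).css('.media-text').extract_first()  # -> str of HTML
node = BeautifulSoup(html, 'html.parser').div.extract()            # -> detached Tag
print(sel_html)         # '<div class="media-text">...</div>'
print(node.get_text())  # 'A store located at 1 Main St, USA'
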
linkSite = omgSite.find_all('a')

# <codecell>

saveLinkz = open('htmldoc', 'w')
saveLinkz.write(siteData)
saveLinkz.close()

# <codecell>

openLinkz = open('htmldoc', 'r')
openLinkz.read()

# <codecell>

print(omgSite.extract())

# <codecell>

print(omgSite.setup)

# <codecell>

print(omgSite.title)

# <codecell>

print(omgSite.wrap)

# <codecell>

def get_item(self, response):
    title = response.css('#productTitle::text').extract_first()
    title = title.strip() if title is not None else None

    details_output = dict()
    ASIN = None
    details = response.css('#detail-bullets .content > ul > li')
    for detail in details:
        detail_name = detail.css('b::text').extract_first()
        detail_name = detail_name.replace(':', '').strip()
        detail = BeautifulSoup(detail.extract(), 'lxml')
        # Remove detail name's tag in each detail
        for span in detail.find_all('b'):
            span.extract()
        detail = Selector(text=str(detail))
        detail_values = detail.css('li ::text').extract()
        detail_values = utils.normalize_str_array(detail_values)
        detail_value = detail_values[0] if len(detail_values) > 0 else None
        # Parse ranks number
        if 'Amazon Best Sellers Rank' in detail_name:
            detail_value = detail_value.strip().split(' ')[0]
            detail_value = utils.parse_int(detail_value)
        if 'ASIN' in detail_name:
            ASIN = detail_value
        details_output[detail_name] = detail_value

    alt_images = response.css('#altImages img::attr(src)').extract()

    brand = response.css('#bylineInfo::text').extract_first()
    brand_url = response.css('#bylineInfo::attr(href)').extract_first()
    brand_url = response.urljoin(brand_url) if brand_url is not None else None

    price = response.css('.snsPriceBlock .a-color-price::text').extract_first()
    if price is None:
        price = response.css('#priceblock_ourprice::text').extract_first()
    price = price.strip() if price is not None else None

    description = response.css(
        '#productDescription p::text, #productDescription h3::text').extract()
    description = utils.normalize_str_array(description)

    plus_desc = response.css('#aplus')
    plus_desc_html = plus_desc.css('.aplus-v2').extract_first()
    plus_desc_texts = plus_desc.css('*:not(script):not(style)::text').extract()
    plus_desc_texts = utils.normalize_str_array(plus_desc_texts)
    plus_desc_text = '\n'.join(plus_desc_texts)

    features = response.css('#feature-bullets ul li ::text').extract()
    features = [feature.strip() for feature in features]

    videos = response.css(
        '#vse-rel-videos-carousel .vse-video-item::attr(data-video-url)').extract()

    return {
        'ASIN': ASIN,
        'url': response.url,
        'title': title,
        'brand': {'name': brand, 'url': brand_url},
        'alt_images': alt_images,
        'details': details_output,
        'price': price,
        'description': description,
        'plus_description': {'text': plus_desc_text, 'html': plus_desc_html},
        'features': features,
        'videos': videos,
    }

class SimpleRender(object):
    def __init__(self, style="default"):
        self.__md = ""
        self.__soup = None

    def __soup_append_style(self, css_select, style):
        for cs in css_select.split(","):
            for i in self.__soup.select(cs.strip()):
                i["style"] = i.get("style", "") + style

    def __soup_render_table_row(self):
        for table in self.__soup.select(".markdown-body table"):
            i = 0
            for row in table.select("tbody tr"):
                if i % 2 == 0:
                    row["style"] = row.get("style", "") + \
                        "background-color:#f8f8f8;"
                i += 1

    def render(self):
        html = markdown.markdown(
            self.__md, extensions=[exttables.TableExtension()])
        html = "<div class=\"markdown-body\">%s</div>" % html
        self.__soup = BeautifulSoup(html)
        self.__soup_append_style(
            ".markdown-body",
            ('font-family:"Helvetica Neue",Helvetica,"Segoe UI",Arial,'
             'freesans,sans-serif,"Apple Color Emoji","Segoe UI Emoji",'
             '"Segoe UI Symbol";'
             'font-size:16px;'
             'line-height:1.6;'
             'word-wrap:break-word;'
             'width:1280px;'
             'margin:auto;'
             'color:#333;')
        )
        self.__soup_append_style(
            (".markdown-body p,"
             ".markdown-body blockquote,"
             ".markdown-body ul,"
             ".markdown-body ol,"
             ".markdown-body dl,"
             ".markdown-body table,"
             ".markdown-body pre"),
            'margin-top:0;margin-bottom:16px;'
        )
        self.__soup_append_style(
            "h1, h2, h3, h4",
            ("margin-top:1em;"
             "margin-bottom:16px;"
             "font-weight:bold;"
             "line-height:1.4;")
        )
        self.__soup_append_style(
            ".markdown-body h1",
            ("padding-bottom:0.3em;"
             "font-size:2.25em;"
             "line-height:1.2;"
             "border-bottom:1px solid #eee;")
        )
        self.__soup_append_style(
            ".markdown-body h2",
            ("padding-bottom:0.3em;"
             "font-size:1.75em;"
             "line-height:1.225;"
             "border-bottom:1px solid #eee;")
        )
        self.__soup_append_style(
            ".markdown-body h3",
            ("font-size:1.5em;"
             "line-height:1.43;")
        )
        self.__soup_append_style(".markdown-body h4", "font-size:1.25em;")
        self.__soup_append_style(
            ".markdown-body table",
            ("display:block;"
             "width:100%;"
             "overflow:auto;"
             "word-break:normal;"
             "word-break:keep-all;"
             "border-spacing:0;"
             "border-collapse:collapse;")
        )
        self.__soup_append_style(
            ".markdown-body table th, .markdown-body table td",
            "padding:6px 13px;border:1px solid #ddd;"
        )
        self.__soup_append_style(
            ".markdown-body table th",
            "font-weight:bold;"
        )
        self.__soup_append_style(
            ".markdown-body ul",
            "padding-left:2em;"
        )
        self.__soup_append_style(
            ".markdown-body code",
            ('font-family:Consolas,"Liberation Mono",Menlo,Courier,monospace;'
             "padding:0.2em 0.4em;"
             "margin:0;"
             "font-size:85%;"
             "background-color:rgba(0,0,0,0.04);"
             "border-radius:3px;")
        )
        self.__soup_append_style(
            ".markdown-body span.red",
            ("color:#f33;"
             "padding:0.2em 0.4em;"
             "margin:0;"
             "background-color:rgba(200,32,32,0.15);"
             "border-radius:3px;")
        )
        self.__soup_render_table_row()
        return self.__soup.extract()

    def add_header1(self, header=""):
        self.__md += ("# %s\n" % header)

    def add_header2(self, header=""):
        self.__md += ("## %s\n" % header)

    def add_header3(self, header=""):
        self.__md += ("### %s\n" % header)

    def add_header4(self, header=""):
        self.__md += ("#### %s\n" % header)

    def add_text(self, text=""):
        self.__md += ("\n%s\n\n" % text)

    def add_table(self, theader=[], trows=[{}], align=None):
        table = "|%s|\n" % "|".join(theader)
        style = {"c": ":---:", "l": ":---", "r": "---:"}
        if align is None:
            table += "|%s|\n" % "|".join(["---"] * len(theader))
        else:
            table += "|%s|\n" % "|".join([style.get(a, "---") for a in align])
        for row in trows:
            l = []
            for h in theader:
                l.append(str(row.get(h, " ")))
            table += "|%s|\n" % "|".join(l)
        self.__md += (table + "\n")

    def add_list(self, text_list=[]):
        def f(ll, wcnt):
            text = ""
            for l in ll:
                if isinstance(l, tuple):
                    text += ("%s* %s\n" % (" " * wcnt, l[0]))
                    text += f(l[1], wcnt + 4)
                else:
                    text += ("%s* %s\n" % (" " * wcnt, l))
            return text
        self.__md += f(text_list, 0)

    def add_md_text(self, text):
        self.__md += text

try:
    salesRank = product.sales_rank
except Exception:
    salesRank = 'NA'

Main_Image = data.find(attrs={'id': 'imgTagWrapperId'}).find('img')['src']
Num_Images = len(
    data.find(attrs={'id': 'altImages'})
    .findAll(attrs={'class': 'a-spacing-small item'}))
Title = title
Price = price
Average_Customer_Review = data.find(
    attrs={'class': 'reviewCountTextLinkedHistogram noUnderline'})['title']

page1 = mech.open('http://www.amazon.com/product-reviews/' + asin)
html1 = page1.read()
soup1 = BeautifulSoup(str(html1))
data1 = soup1.extract()  # extract() on the root returns the soup itself

Ratings = data1.findAll(attrs={'class': 'a-histogram-row'})
Stars_and_numbers = {}
for rating in Ratings:
    key = rating.findAll(attrs={'class': 'a-text-right aok-nowrap'})[0].text
    # assumed target: the per-rating row; the source referenced an undefined `s` here
    value = rating.findAll(attrs={'class': 'a-text-right aok-nowrap'})[1].text
    Stars_and_numbers[key] = value
# Stars_and_numbers = data.find(attrs={'class': 'a-icon-alt'}).getText()

Product_Link = url
product_0 = amazon.lookup(ItemId=asin)
Description = product_0.editorial_review
features = data.find(attrs={'id': 'feature-bullets'}).findAll('li')
Feature1 = features[1].getText()
Feature2 = features[2].getText()

# coding: UTF-8
from readability.readability import Document
from bs4 import BeautifulSoup
import html2text
import urllib
import codecs

f = open('input.html')
htmldata = f.read()
readable_article = Document(htmldata).summary()
readable_title = Document(htmldata).short_title()

soup = BeautifulSoup(readable_article, "lxml")
soup.get_text()
soup.find("img")
soup.find("a")
soup.extract()
readable_article = soup.get_text()

f = codecs.open('output.html', 'w', 'utf-8')
f.write(readable_title)
f.write(readable_article)

readable_article = html2text.html2text(readable_article, )
f = codecs.open('output.txt', 'w', 'utf-8')
f.write(readable_title)
f.write(readable_article)
f.close()

# using this header to prevent mod security error
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0)'
}

url = "http://www.clivebanks.co.uk/X-Files%20Timeline.htm"
site_url = "/".join(url.split("/")[:3])
response = requests.get(url, headers=headers)

# Obtaining the front page, writing it to disk instead of keeping it in memory.
with open("front_page.html", mode="wb") as file:
    file.write(response.content)

# Getting the links from the whole front page.
with open("front_page.html", mode="rb") as file:
    soup = BeautifulSoup(file, features="html.parser")

# Looping through links of season one.
urls = []
for a in soup.find_all("a", href=True):
    urls.append(site_url + "/" + a["href"])
    if a["href"] == "X-Files/Truth.htm":
        break

# Saving out the text file to disk as a dataset.
with open("x_files_dataset_new.txt", mode="w") as file:
    for url_ in urls[1:]:
        response = requests.get(url_, headers=headers)
        soup = BeautifulSoup(response.content, features="html.parser")
        file.write(soup.extract().text)

import re

import mechanize
from mechanize import Browser
from bs4 import BeautifulSoup

url = 'http://www.amazon.co.uk/dp/B00NVDNDUW'
mech = Browser()
page = mech.open(url)
html = page.read()
soup = BeautifulSoup(html)


def extract(soup):
    table = soup.find("table", attrs={
        'id': 'ctl00_TemplateBody_WebPartManager1_gwpste_container_SearchForm_ciSearchForm_RTable'})
    data = []
    for row in table.findAll("tr"):
        s = row.getText()
        data.append(s)
    # `data` is a plain list, so find() has to run against the soup itself.
    salesRankElem = soup.find(attrs={'id': 'SalesRank'}) \
        .find(attrs={'class': 'value'}).getText()
    salesRank = re.findall(r'\d+\sin', salesRankElem)[0].replace('in', '').strip()
    return int(salesRank)


data = BeautifulSoup.extract(soup)  # equivalent to soup.extract(); returns the soup

for serie in series[3:]:
    for episode in range(serie[1], serie[2]):
        if len(str(episode)) < serie[3]:
            # Enterprise naming nonsense. God, the inconsistency
            episode = "0" * (serie[3] - len(str(episode))) + str(episode)
        url = "http://chakoteya.net/" + serie[0] + str(episode) + ".htm"
        response = requests.post(url, data="{Test data}")
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            time.sleep(.01)
            directory = "./" + serie[0]
            if not os.path.exists(directory):
                os.makedirs(directory)
            file = open("./" + serie[0] + str(episode) + ".txt", 'w+')
            lines = soup.extract().text
            file.write("".join(lines[75:-260]))
            file.close()
        else:
            print("Missed: ", url)
    print("Through", serie[0])

def getText(link, keywords, haveKeyword):
    print("---", link)
    try:
        r = requests.get(link, verify=False, timeout=5)
    except Exception:
        return []
    b = BeautifulSoup(r.text, "lxml")
    [tag.extract() for tag in b(['script', 'link', 'meta', 'style'])]
    # assume the text will only be in p and div
    text = [tag.extract() for tag in b(['p', 'div'])]
    for excerpt in text:  # iterate through each para/div
        # Change newline tags to sentences
        found = excerpt.get_text(separator=" ").strip().replace(
            "\xa0", ". ").replace("\n", ". ").replace("\t", ". ").replace("\r", ". ").strip()
        while "  " in found:  # Remove double spaces
            found = found.replace("  ", " ")
        if found.count("/") > 2:  # likely a link
            continue
        words = found.split(" ")
        if len(words) > 15:  # Time to add it in
            # Check if it has the name of the product
            foundCleaned = [word for word in words if word not in stops]
            uniqueWords = set(foundCleaned)
            maxCount = 0
            for word in uniqueWords:
                # count the number of unique words (excluding stopwords)
                maxCount = max(maxCount, found.count(word))
            if maxCount / len(foundCleaned) > 0.15:
                continue
            elif len(foundCleaned) / len(words) > 0.73:  # Too many unique words
                continue
            else:
                haveKeywords = any(keyword in found.lower() for keyword in keywords)
                sentences = re.split(r'\. |! |; ', found)
                for sentence in sentences:
                    # Remove parentheses
                    while "(" in sentence and ")" in sentence:
                        startBracket = sentence.find("(")
                        endBracket = sentence.find(")")
                        if startBracket > -1 and endBracket > startBracket:
                            sentence = sentence[0:startBracket].strip() + " " + \
                                sentence[endBracket + 1:].strip()
                        else:
                            break
                    sentence = sentence.strip()
                    # Remove ending punctuation
                    while len(sentence) > 0 and (sentence[-1] == "." or sentence[-1] == ","):
                        sentence = sentence[0:len(sentence) - 1]
                    if len(sentence) == 0:
                        continue
                    words = sentence.split(" ")
                    if len(words) > 2:
                        if haveKeywords:
                            if sentence not in haveKeyword:
                                sentenceToAdd = ""
                                for word in words:
                                    if len(word) > 0:
                                        sentenceToAdd += word + " "
                                haveKeyword.append(sentenceToAdd.strip())

html_doc = """
<html>
 <head>
  <title>Titre de votre site</title>
 </head>
 <body>
  <p class="c1 c2">texte a lire1</p>
  <p class="c3">texte a lire2</p>
 </body>
</html>
"""

soup = BeautifulSoup(html_doc)
print(soup.extract())

for p in soup.find_all('p'):
    print(p.get("class"))

print("---------------")

for p in soup.find_all('p'):
    print(p)
    p.string = "NV T"
    print(p)

print(soup)
print("-----------------------")