def getCategoryUrl(site="",url=""): catDb = openTable(tableName=global_setting['catTable']) r = session.get(url) if not r.text: return False soup = BeautifulSoup(r.text) for level1 in soup.select('.classify_books'): curLevel1 = level1.select('.classify_title')[0].text curLevel1 = re.sub('\s', '', curLevel1) for level2 in level1.select('.classify_kind'): curLevel2 = level2.select('.classify_kind_name')[0].text curLevel2 = re.sub('\s', '', curLevel2) for level3 in level2.select('ul li a'): #curLevel3 = re.sub('\s', '', level3.text) curLevel3 = level3.text.strip() curlUrl = level3['href'] retFind = re.findall(r'\/cp(.*)\.html',curlUrl) if retFind: curCatID = retFind[0] catType = 'book' else: retFind = re.findall(r'\/cid(.*)\.html',curlUrl) if retFind: curCatID = retFind[0] catType = 'nonbook' if retFind: if catDb.find({'catId':curCatID}).count() >0: logger.debug('catetogy %s exists,skip\n'%(curCatID)) else: catDb.insert({'catId':curCatID,'level1':curLevel1, 'level2':curLevel2, 'level3':curLevel3, 'catUrl':curlUrl,'catType':catType, 'site':site}) return True
def show_options(id): r = requests.get("https://interaktiv.mx.dk/toolbox/" + votetype + "/get/" + id) soup2 = BeautifulSoup(r.text, "lxml") clear_console() print_logo() print "(Interaktiv version. Kør scriptet med -h eller --help for flere indstillinger.)" print vote_text = soup2.find("div", attrs={"id": "vote_text"}).text print vote_text print if votetype == "advancedvotes": for option in soup2.find_all("div", attrs={"class": "vote_button"}): number = option.get("data-vote") text = option.text print "(%s) %s" % (number, text) print else: for option in soup2.find_all("div", attrs={"class": "vote_button"}): if option.get("id") == "vote_yes": number = "1" else: number = "0" text = option.text print "(%s) %s" % (number, text) print
def replace_links_with_text(html):
    """any absolute links will be replaced with the
    url in plain text, same with any img tags
    """
    soup = BeautifulSoup(html, 'html5lib')
    abs_url_re = r'^http(s)?://'

    images = soup.find_all('img')
    for image in images:
        url = image.get('src', '')
        text = image.get('alt', '')
        if url == '' or re.match(abs_url_re, url):
            image.replaceWith(format_url_replacement(url, text))

    links = soup.find_all('a')
    for link in links:
        url = link.get('href', '')
        text = ''.join(link.text) or ''

        if text == '':  # this is due to an issue with url inlining in comments
            link.replaceWith('')
        elif url == '' or re.match(abs_url_re, url):
            link.replaceWith(format_url_replacement(url, text))

    return force_text(soup.find('body').renderContents(), 'utf-8')
def get_Comics(self, name, comic_url): if not self.mkdir(name): again = '' while (1): again = str(input('Directory ' + name + ' already exists, do you wanna to download again? (Y/N)')) if again == 'Y' or again == 'N': break if again == 'N': print('Folder \'BLEACH/' + name + '\' already exists!') return else: shutil.rmtree(self.path) self.mkdir(name) # Parse html page_url = self.prefix + comic_url data = urllib.request.urlopen(page_url).read().decode('utf-8', 'ignore') data.encode('utf-8') soup = BeautifulSoup(data, 'lxml') lists = soup.findAll('img', {'class': 'BDE_Image'}) print('Downloading: ' + name) # Define progress bar's length progress_bar = tqdm(unit='Pic', total=len(lists)) count = 0 for each in lists: pic_url = each['src'] filename = '%03d.txt' % count + '.' + pic_url.split('.')[-1] urllib.request.urlretrieve(pic_url, filename = self.path + '/' + filename) progress_bar.update(1) count = count + 1 # Close bar progress_bar.close()
def getMoviesActors(movieList): """ :param A list containing formatted movie list :return: A list containing ID of the movie and all actors in that movie including actors ID """ actorsInMovies = {} for x in movieList: req = urllib.request.Request(BASE_URL+movieList[x]["Url"]+"/fullcredits") #print(req.full_url) # Header is necessary to get the right movie titles, as in the english title req.add_header('Accept-Language', 'en-US,en') # Send the request and get response response = urllib.request.urlopen(req) bsoup = BeautifulSoup(response) findCastList = bsoup.find("table", {"class": "cast_list"}) findAllActors = findCastList.findAll("td", itemprop="actor") actors = {} for d in findAllActors: actorName = d.find("span", itemprop="name") actorNumber = d.find("a", href=re.compile("\/name\/nm")) actorID = re.match("(?:\/name\/nm)(?P<userid>\d+)", actorNumber["href"]).group("userid") actors[actorID] = actorName.contents[0] actorsInMovies[movieList[x]["ID"]] = actors return actorsInMovies
def parse(html):
    '''
    Page analysis. If the received content is ERROR_NUM, the request timed out and
    there is nothing to parse. Otherwise, extract each item's id, name, price and
    stat, to be written to a file named after the current date.
    '''
    if not html:
        logger.info('======pass parse=====')
        return {}
    items = {}
    # print isinstance(html, str)
    parse_page = BeautifulSoup(html)
    goods = parse_page.find_all('div', class_='goods-content')
    for good in goods:
        good_id = good['nctype_goods'][1:]  # the attribute value starts with a space
        good_name = good.select('div[class="goods-name"]')[0].a.text.replace(',', '_')
        good_price = good.select('em[class="sale-price"]')[0].text
        if re.findall(u'\u4e07', good_price):  # handle prices like '1.3万' (1.3 * 10,000)
            good_price = str(float(good_price[:-1]) * 10000)
        else:  # strip the leading RMB currency symbol
            good_price = good_price[1:]
        good_stat = good.select('a[class="status"]')[0].text
        items[good_id] = good_name + ',' + good_price + ',' + good_stat
    return items
def get_visible_text(html):
    """returns visible text from html
    http://stackoverflow.com/a/19760007/110274
    """
    soup = BeautifulSoup(html, 'html5lib')
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    return soup.get_text()
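A quick way to sanity-check get_visible_text() is to feed it a small inline document; this usage sketch (the sample markup is invented here, not from the original code) assumes html5lib is installed, as the function itself requires.

sample = "<html><head><title>t</title><style>p {color: red}</style></head><body><p>Hello <b>world</b></p></body></html>"
print(get_visible_text(sample))  # prints only the visible text, roughly "Hello world"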
def from_pmml(self, pmml):
    """Returns a model with the intercept and coefficients represented in PMML file."""
    model = self()

    # Reads the input PMML file with BeautifulSoup.
    with open(pmml, "r") as f:
        lm_soup = BeautifulSoup(f, "xml")

    if not lm_soup.RegressionTable:
        raise ValueError("RegressionTable not found in the input PMML file.")
    else:
        # Pulls out the intercept from the PMML file and assigns it to the
        # model. If the intercept does not exist, default it to zero.
        intercept = 0
        if "intercept" in lm_soup.RegressionTable.attrs:
            intercept = lm_soup.RegressionTable['intercept']
        model.intercept_ = float(intercept)

        # Pulls out the coefficients from the PMML file, and assigns them
        # to the model.
        if not lm_soup.find_all('NumericPredictor'):
            raise ValueError("NumericPredictor not found in the input PMML file.")
        else:
            coefs = []
            numeric_predictors = lm_soup.find_all('NumericPredictor')
            for i in numeric_predictors:
                i_coef = float(i['coefficient'])
                coefs.append(i_coef)
            model.coef_ = numpy.array(coefs)

    return model
def get_text_from_html(html_text): """Returns the content part from an HTML document retains links and references to images and line breaks. """ soup = BeautifulSoup(html_text, 'html5lib') # replace <a> links with plain text links = soup.find_all('a') for link in links: url = link.get('href', '') text = ''.join(link.text) or '' link.replaceWith(format_url_replacement(url, text)) # replace <img> tags with plain text images = soup.find_all('img') for image in images: url = image.get('src', '') text = image.get('alt', '') image.replaceWith(format_url_replacement(url, text)) # extract and join phrases body_element = soup.find('body') filter_func = lambda s: bool(s.strip()) phrases = map( lambda s: s.strip(), filter(filter_func, body_element.get_text().split('\n')) ) return '\n\n'.join(phrases)
def moderate_tags(html):
    """replaces instances of <a> and <img>
    with "item in moderation" alerts
    """
    from askbot.conf import settings
    soup = BeautifulSoup(html, 'html5lib')
    replaced = False
    if settings.MODERATE_LINKS:
        links = soup.find_all('a')
        if links:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            # plain loop rather than map(), so the replacement runs eagerly on Python 3
            for link in links:
                link.replaceWith(aviso)
            replaced = True

    if settings.MODERATE_IMAGES:
        images = soup.find_all('img')
        if images:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            for image in images:
                image.replaceWith(aviso)
            replaced = True

    if replaced:
        return force_text(soup.find('body').renderContents(), 'utf-8')

    return html
def prettify(which, id): prefix = which[0] bs = BeautifulSoup(open(os.path.join(root,which, i+"-" + prefix + ".xml")), 'xml') sgm = i + "-" + prefix + ".sgm" out = bs.prettify(encoding='utf-8') [first, rest] = out.split("\n",1) return rest.replace(sgm, i) # the ID in the files look like "atwoma-b.sgm" rather than "atwoma"
def extract_images(base, html):
    images = []
    soup = BeautifulSoup(html)
    for img in soup.find_all("img"):
        if img.has_attr("src"):
            images.append(urljoin(base, img["src"]))
    return images
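As a usage sketch for extract_images() (the URL and markup below are placeholders, not from the original snippet), relative src values are resolved against the base while images with no src attribute are skipped; urljoin is assumed to come from urllib.parse.

example_html = '<p><img src="/static/logo.png"> <img alt="decorative, no src"></p>'
print(extract_images("https://example.com/articles/1", example_html))
# expected: ['https://example.com/static/logo.png']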
def get_sp500_symbols(): page_html = wiki_html('List_of_S%26P_500_companies', 'SP500.html') wiki_soup = BeautifulSoup(page_html, "html.parser") symbol_table = wiki_soup.find(attrs={'class': 'wikitable sortable'}) symbol_data_list = list() for symbol in symbol_table.find_all("tr"): symbol_data_content = dict() symbol_raw_data = symbol.find_all("td") td_count = 0 for symbol_data in symbol_raw_data: if(td_count == 0): symbol_data_content[ 'symbol'] = symbol_data.text elif(td_count == 1): symbol_data_content[ 'company'] = symbol_data.text elif(td_count == 3): symbol_data_content[ 'sector'] = symbol_data.text elif(td_count == 4): symbol_data_content[ 'industry'] = symbol_data.text elif(td_count == 5): symbol_data_content[ 'headquarters'] = symbol_data.text td_count += 1 symbol_data_list.append(symbol_data_content) return symbol_data_list[1::]
def test_23_admin_add_category(self): """Test ADMIN add category works""" self.create() category = {'name': 'cat', 'short_name': 'cat', 'description': 'description'} # Anonymous user url = '/admin/categories' res = self.app.post(url, data=category, follow_redirects=True) dom = BeautifulSoup(res.data) err_msg = "Anonymous users should be redirected to sign in" assert dom.find(id='signin') is not None, err_msg # Authenticated user but not admin self.signin(email=self.email_addr2, password=self.password) res = self.app.post(url, data=category, follow_redirects=True) err_msg = "Non-Admin users should get 403" assert res.status_code == 403, err_msg self.signout() # Admin self.signin(email=self.root_addr, password=self.root_password) res = self.app.post(url, data=category, follow_redirects=True) err_msg = "Category should be added" assert "Category added" in res.data, err_msg assert category['name'] in res.data, err_msg category = {'name': 'cat', 'short_name': 'cat', 'description': 'description'} self.signin(email=self.root_addr, password=self.root_password) res = self.app.post(url, data=category, follow_redirects=True) err_msg = "Category form validation should work" assert "Please correct the errors" in res.data, err_msg
def getWeibos(self, keyword, page=1, count=None): url = 'http://t.hexun.com/k/topic.html?type=1&value=%s&pg=%d' % (json.dumps(keyword).replace('\\', '%').replace('"', ''), page) result = WeiboCrawler.request(self, url, self.headers) if 'result' in result and result['result']: infos = result['info'].decode('gb2312') soup = BeautifulSoup(infos) total_soup = soup.select('.headerR1')[0] total_num = total_soup.get_text().split('共')[-1].split('条')[0].strip() return_val = {'total_count': int(total_num), 'msgs':[]} allmsgs = [] msgs_soup = soup.select('.nr_con') for msg_soup in msgs_soup: avatar = 'http://t.hexun.com%s' % msg_soup.select('.nr_conLa > a')[0].get('href') nickandtext = msg_soup.select('.nr_shuo')[0].get_text().split(':') nickname = nickandtext[0] text = nickandtext[1] ts = msg_soup.select('.nr_tan > h3 > a')[0].get_text() allmsgs.append({ 'avatar': avatar, 'nickname': nickname, 'text': text, 'datetime': ts, }) return_val['msgs'] = allmsgs return return_val
def __call__(self, url, count_of_crawler): """ Function which fetch the content from the given URL and collect all the URL in the content and pass the first url of the page to fetch the content. """ try: page = urllib2.urlopen(url) soup = BeautifulSoup(page.read()) links_on_page = map(lambda anchor: anchor.get('href'), soup.find_all('a')) cleaned_url = map(lambda link: link if urlparse(link).scheme and urlparse(url).netloc else (urlparse(url) .scheme+"://"+urlparse(url).netloc+link if link[0] == "/" else url+link), links_on_page) visited_url.append(url) total_collected_url.append(cleaned_url) next_url_to_visit = [next_url for next_url in cleaned_url\ if not next_url in visited_url and not "#" in next_url][0] if count_of_crawler and next_url_to_visit: count_of_crawler = crawler(next_url_to_visit, count_of_crawler-1) except: print "It seems there is some issue in URL "+url return count_of_crawler
def get_page_info(id_no, s=None): ''' Extract restaurant information from Charlotte's health inspection website INPUT: id_no = int, id # for ESTABLISHMENT s = request.Session(), [OPTIONAL] OUTPUT: out = dict, establishment-level information ''' if s is None: s = requests.Session() link = 'https://public.cdpehs.com/NCENVPBL/INSPECTION/ShowESTABLISHMENTPage.aspx' payload = {'ESTABLISHMENT':id_no, 'esttst_cty':60} z = s.get(link, params=payload) soup = BeautifulSoup(z.content, from_encoding='UTF-8') t = soup.findAll('table')[0] insp_info = np.array([y.text for y in t.findAll('td', attrs={'class':'ttc'})]).reshape(-1,4) if insp_info.shape[0] < 1: return None r = t.findAll('td', attrs={'class':'dfv'}) rest_info = [x.text for x in r] return {'name' :rest_info[0], 'address' :rest_info[2], 'city' :rest_info[8], 'state' :rest_info[9], 'zip' :rest_info[10], 'type' :rest_info[16], 'county' :rest_info[19], 'inspections':insp_info}
def scrap_items(): for itemlist in ITEMLIST: soup = BS(urllib2.urlopen(''.join([LOLWIKI, itemlist])).read()) item_table = soup.find('table', class_='stdt sortable') for tr in item_table.find_all('tr'): tds = tr.find_all('td') if len(tds) < 1: continue if tr.find('p') == None: continue item_name = tr.find('p').text.strip() item_url = tr.find('img')['src'] if item_url.split(':')[0] == 'data': item_url = tr.find('img')['data-src'] if not HOOKED: continue #store item in database d_item = Item() d_item.name = item_name t_img = NamedTemporaryFile(delete=True) t_img.write(urllib2.urlopen(item_url).read()) t_img.flush() t_img.name = '.'.join([item_name, 'jpg']) d_item.picture = File(t_img) d_item.save()
def get_games(date, output_file=None): # games_url = base + '/scoreboard/' + format_date(date) + '/games.json' games_url = si_base + 'schedule' #print format_date(date) result = requests.get(games_url, params={'date': format_date(date)}) #print games_url + format_date(date) soup = BeautifulSoup(result.text) #date_string = date.strftime('%B %d,%Y') games = soup.find_all('tr', 'component-scoreboard-list final') game_ids = [] for game in games: game_date_elem = game.find('div', 'game-anchor') game_date_text = game_date_elem['id'] game_date = date_parser.parse(game_date_text).date() if game_date == date: game_id = int(game['data-id']) game_ids.append(game_id) if output_file is not None: of = open(output_file, 'w') of.write(json.dumps({'game_date': format_date(date), 'game_ids': game_ids})) of.close() return game_ids
def parse(self, response):
    logger.info("Parsing {}".format(response.url))
    soup = BeautifulSoup(response.body, "html.parser")
    trs = soup.find_all("tr", "item")
    if trs:
        for tr in trs:
            link = tr.find("a")
            article_url = DETAIL_URL.format(link["href"])
            r = scrapy.Request(article_url, callback=self.parse_article)
            yield r

        # next urls
        try:
            next_url = soup.find(class_="next").a
            cat_url = response.url
            u = urlparse(cat_url)
            query = None
            # Strip the query part
            u = u._replace(query=query)
            follow_url = urlunparse(u) + next_url["href"]
            r = scrapy.Request(follow_url, callback=self.parse)
            yield r
        except AttributeError:
            logger.info("Done with {}".format(response.url))
            pass
def Get_All_Teams(): data_path = '../data/' # get the teams url = 'http://espn.go.com/nba/teams' html = urllib.urlopen(url).read() soup = BeautifulSoup(html, 'lxml') # print (soup.prettify()) tables = soup.find_all('ul', class_ = 'medium-logos') tables[0].find_all('li')[0].h5.a name_pref_Tuples = [] city_name_Dict = {} for table in tables: lis = table.find_all('li') for li in lis: info = li.h5.a team_url = info['href'] team_name = info.text pref = team_url.split('/')[-2] city_name = ' '.join(info.text.split()[:-1]) if team_name == 'Portland Trail Blazers': city_name = 'Portland' city_name_Dict[city_name] = team_name name_pref_Tuples.append((team_name, pref)) print 'output two files: city_name.pickle and name_pref.pickle' print 'city_name.pickle is a dict with (city, team_name) pairs' print 'name_pref.pickle is a list of (team_name, team_name_prefix) tuples' pk.dump(city_name_Dict, open(data_path + 'city_name.pickle', 'wb')) pk.dump(name_pref_Tuples, open(data_path + 'name_pref.pickle', 'wb'))
def _get_new_brunswick_flows(requests_obj):
    """
    Gets current electricity flows in and out of New Brunswick.

    There is no reported data timestamp in the page. The page returns current time
    and says "Times at which values are sampled may vary by as much as 5 minutes."
    """
    url = 'https://tso.nbpower.com/Public/en/SystemInformation_realtime.asp'
    response = requests_obj.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', attrs={'bordercolor': '#191970'})
    rows = table.find_all('tr')
    headers = rows[1].find_all('td')
    values = rows[2].find_all('td')

    flows = {headers[i].text.strip(): float(row.text.strip())
             for i, row in enumerate(values)}

    return flows
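A hedged usage sketch: _get_new_brunswick_flows() only needs an object exposing a requests-style get(), so the requests module itself can be passed in directly; the result is a dict mapping the table's column labels to float values.

import requests
print(_get_new_brunswick_flows(requests))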
def _login(self, username=None, store_password=False): if username is None: if self.USERNAME == "": raise LoginError("If you do not pass a username to login(), you should configure a default one!") else: username = self.USERNAME # Get password from keyring or prompt password_from_keyring = keyring.get_password("astroquery:www.eso.org", username) if password_from_keyring is None: if system_tools.in_ipynb(): log.warn("You may be using an ipython notebook:" " the password form will appear in your terminal.") password = getpass.getpass("{0}, enter your ESO password:\n".format(username)) else: password = password_from_keyring # Authenticate log.info("Authenticating {0} on www.eso.org...".format(username)) # Do not cache pieces of the login process login_response = self._request("GET", "https://www.eso.org/sso/login", cache=False) login_result_response = self._activate_form(login_response, form_index=-1, inputs={'username': username, 'password': password}) root = BeautifulSoup(login_result_response.content, 'html5lib') authenticated = not root.select('.error') if authenticated: log.info("Authentication successful!") else: log.exception("Authentication failed!") # When authenticated, save password in keyring if needed if authenticated and password_from_keyring is None and store_password: keyring.set_password("astroquery:www.eso.org", username, password) return authenticated
def crawlLinkScoial(url): try: pages=[] arr=[] source_code=requests.get(url) plain_text=source_code.text soup=BeautifulSoup(plain_text) for link in soup.findAll('a'): href=link.get('href') href_test=str(href) #if href_test[0]!='/' and href_test[0]!='j' and href_test!='none' and href_test[0]!='#': if is_in_arr(pages,str(href))==False: if "facebook" in href_test or "twitter" in href_test or "google" in href_test: lin=getGoodLink(url) pages.append(lin+str(href)) newArr=deleteDuplicates(pages) for page in newArr: socialFile.write(page) socialFile.write("\n") allFile.write("Social-Media-Links: \n") for page in newArr: allFile.write(page) allFile.write("\n") except: print "Error at: "+str(url)
def crawlSearch(url,pages): try: arr=[] source_code=requests.get(url) plain_text=source_code.text soup=BeautifulSoup(plain_text) for link in soup.findAll('a'): href=link.get('href') href_test=str(href) #if href_test[0]!='/' and href_test[0]!='j' and href_test!='none' and href_test[0]!='#': if is_in_arr(pages,str(href))==False: if "microsoft" not in href_test and "facebook" not in href_test and "twitter" not in href_test and "google" not in href_test: if href_test.startswith("http"): if "bing" not in href_test: if "scholarships.com" not in href_test: pages.append(href) print str(href) else: if countS<2: crawl(href,pages) print "Crawling "+str(href) countS=countS+1 else: print "Skiping "+str(href) else: pass except: print "Error at: "+str(url)
def convert_links(text, quote="\""): soup = BeautifulSoup(text, "html.parser") for t in soup.findAll(text=True): if has_link_parent(t): continue split = re.split(r"(?:(https?://)|(www\.))([\S]+\.[^\s<>\"\']+)", t) if len(split) == 1: continue r = "" n = 0 split = [s or "" for s in split] while split: if n % 2 == 0: r += split[0] split.pop(0) else: r += "<a href=%shttp://%s%s%s>%s%s%s</a>" % ( quote, split[1], split[2], quote, split[0], split[1], split[2] ) split.pop(0) split.pop(0) split.pop(0) n += 1 t.replaceWith(BeautifulSoup(r, "html.parser")) return str(soup)
def reverseIP(self):
    # Normalize the URL into the form we need (www.url.com)
    if self.url.startswith("http://"):
        url = self.url.replace("http://", "")  # replace the scheme with an empty string
    else:
        url = self.url
    # The request is sent via POST because the page uses a form to ask for the URL to scan.
    # `data` is the POST payload (the URL); `remoteHost` is the parameter name expected
    # by the endpoint given in `connection`.
    data = {"remoteHost": url}
    connection = requests.post(
        # parameters required for the connection
        url="http://www.ipfingerprints.com/scripts/getReverseIP.php",
        data=data
    )
    # connection.text is the HTML returned by the request;
    # BeautifulSoup parses it into something easier to work with,
    # and html.parser gives cleaner output.
    beautifulOut = BeautifulSoup(connection.text, "html.parser")
    # Collect every link found in the anchor tags.
    response = list()
    # find_all scans all tags; 'a' restricts the search to anchor tags only.
    for link in beautifulOut.find_all("a"):
        # href holds the domain name (the only part of the tag we care about)
        currentLink = link.get("href")
        response.append(currentLink[11:-2])
    return response
def htmlfile(url):
    r = urllib2.urlopen(url)
    soup = BeautifulSoup(r)
    html = []
    # html - title, css (body width 960px)
    html.append('<html><head><title>' + soup.title.string + '</title><link rel="stylesheet" type="text/css" href="page.css"></head><body>')
    # parses for content only in the article div - depends on the site, obviously
    content = soup.find('div', {'class': 'layout-block-a'})
    # gets html paragraphs and h1 headings - should be altered for the website's style
    for text in content.find_all(['p', 'h1']):
        html.append(str(text).decode("ascii", "ignore"))
    html.append('</body></html>')
    # creates the html file here
    out = open(soup.title.string + '.html', 'a')
    for line in html:
        out.write(line)
    out.close()

if __name__ == '__main__':
    main()
def parse_data(data): page = BeautifulSoup(data) results = page.find("div", id="res") if results is None: raise NoResultsException calc = results.find("img", src="/images/icons/onebox/calculator-40.gif") if calc is not None: calc = results.find("h2", {"class": "r"}) if calc is not None: superscripts = calc.find_all("sup") if superscripts is not None and len(superscripts): for x in superscripts: x.contents[0].replaceWith("^" + x.contents[0]) return [dict(type="string", string=util.strip_html(calc).decode("utf-8"))] nresults = results.find_all("li", {"class": "g"}) if len(nresults) == 0: raise NoResultsException processed_results = [] for x in nresults: a_tag = x.find("a") if a_tag is not None: processed_results.append( dict(type="result", href=urlparse.parse_qs(urlparse.urlparse(a_tag["href"]).query)["q"][0], text=util.strip_html(a_tag).decode("utf-8"))) return processed_results
def insert_push(self): uw = user_website.UserWebsite() userids = uw.get_user_ids_by_website_id(self.website_id) for id in userids: p = push.Push() p.website_id = self.website_id p.user_id = id p.title = "has new notice" soup_diff = BeautifulSoup(self.get_different()) new_link_list = soup_diff.find_all('a') new_link_count = len(new_link_list) if new_link_count == 1: content = "one notice is published:\n" else: content = str(new_link_count) + " notices are published:\n" content += self.get_different() p.content = content p.content = p.content.replace('"',"'") p.date = datetime.now().strftime('%Y-%m-%d %H:%M:%S') p.website_id = self.website_id p.content_url = "" p.insert()
    'Chrome/80.0.3987.132 Safari/537.36'
}
url = 'https://www.ptt.cc/bbs/Gossiping/index.html'
url_head = 'https://www.ptt.cc'
# create a separate folder for the output
path = './pttGossiping/'
if not os.path.exists(path):
    os.mkdir(path)

ss = requests.session()
ss.cookies['over18'] = '1'

for i in range(0, 3):
    res = ss.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    last_page_url = url_head + soup.select('a.btn.wide')[1]['href']
    title = soup.select('div.title a')
    for t in title:
        article_title = t.text
        article_url = url_head + t['href']
        # visit each article in turn
        article_res = ss.get(article_url, headers=headers)
        soup = BeautifulSoup(article_res.text, 'html.parser')
        try:
            # title, author and date information
            result = soup.select('span.article-meta-value')
from bs4 import BeautifulSoup
import requests
import os

page_link = 'https://www.vocabulary.com/lists/274832'
page_response = requests.get(page_link, timeout=5)
page_content = BeautifulSoup(page_response.content, "html.parser")

wordContent = []
for i in range(0, 499):
    wordEntry = page_content.find_all("a", class_="word")[i].text
    wordDef = page_content.find_all("div", class_="definition")[i].text
    wordContent.append((wordEntry, wordDef))

f = open("words.txt", "w+")
for i in range(len(wordContent)):
    f.write("{ ")
    f.write("{},{}\n".format(wordContent[i][0], wordContent[i][1]))
    # f.write( " %s", " %s", ) % (wordContent[i][0], wordContent[i][1])
print("should have worked")

path = os.path.abspath(f.name)  # abspath needs the file name, not the file object
directory = os.path.dirname(path)
print("path: {}, directory: {}".format(path, directory))
f.close()
def create_soup(url):
    res = requests.get(url)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "lxml")
    return soup
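Minimal call sketch for create_soup() (example.com is a stand-in URL, not from the original code); because of raise_for_status(), any non-2xx response surfaces as an exception rather than an empty soup.

soup = create_soup("https://example.com")
print(soup.title.string if soup.title else "no <title> found")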
#5 from bs4 import BeautifulSoup import os import glob import sys from xlrd import open_workbook from xlwt import Workbook import xlsxwriter workbook = xlsxwriter.Workbook('EN4th.xlsx') #NAME OF GENERATED FILE worksheet = workbook.add_worksheet() row = 1 for filename in glob.glob('*.html'): soup = BeautifulSoup(open(filename), 'html.parser') n = 0 c = 0 for b in soup.table(): if (str(b.get('id')) != "None"): n = n + 1 x = str(b.get('id')) for b in soup.table(): if (str(b.get('id')) != "None"): c = c + 1 if (c == n - 1): x = str(b.get('id')) id_selector = x[3:5] print(id_selector) rollnumber = str(soup.find(id='lblRollNo').text) name = str(soup.find(id='lblFullName').text)
def get_book(url):
    html = request_dangdang(url)
    soup = BeautifulSoup(html, 'lxml')
    parse_html(soup)
def get_base_html(self):
    resp = self._get_url(self.base_url)
    return BeautifulSoup(resp.text, 'html.parser')
def main(url): with open('files/URL.txt', 'r') as file: soup_string = file.read() soup = BeautifulSoup(soup_string, 'html.parser') status = [] hostname = url h = [(x.start(0), x.end(0)) for x in re.finditer('https://|http://|www.|https://www.|http://www.', hostname)] z = int(len(h)) if z != 0: y = h[0][1] hostname = hostname[y:] h = [(x.start(0), x.end(0)) for x in re.finditer('/', hostname)] z = int(len(h)) if z != 0: hostname = hostname[:h[0][0]] status.append(having_ip_address(url)) status.append(url_length(url)) status.append(shortening_service(url)) status.append(having_at_symbol(url)) status.append(double_slash_redirecting(url)) status.append(prefix_suffix(hostname)) status.append(having_sub_domain(url)) status.append(SSLfinal_State(url)) dns = 1 try: domain = whois.query(hostname) except: dns = -1 if dns == -1: status.append(-1) else: status.append(domain_registration_length(domain)) status.append(favicon(url, soup, hostname)) status.append(https_token(url)) status.append(request_url(url, soup, hostname)) status.append(url_of_anchor(url, soup, hostname)) status.append(links_in_tags(url, soup, hostname)) status.append(sfh(url, soup, hostname)) status.append(submitting_to_email(soup)) if dns == -1: status.append(-1) else: status.append(abnormal_url(domain, url)) status.append(i_frame(soup)) if dns == -1: status.append(-1) else: status.append(age_of_domain(domain)) status.append(dns) status.append(web_traffic(soup)) status.append(page_rank(url)) status.append(google_index(url)) status.append(links_pointing_to_page(url)) status.append(statistical_report(url, hostname)) """ print('\n1. Having IP address\n2. URL Length\n3. URL Shortening service\n4. Having @ symbol\n' '5. Having double slash\n6. Having dash symbol(Prefix Suffix)\n7. Having multiple subdomains\n' '8. SSL Final State\n8. Domain Registration Length\n9. Favicon\n10. HTTP or HTTPS token in domain name\n' '11. Request URL\n12. URL of Anchor\n13. Links in tags\n14. SFH\n15. Submitting to email\n16. Abnormal URL\n' '17. IFrame\n18. Age of Domain\n19. DNS Record\n20. Web Traffic\n21. Google Index\n22. Statistical Reports\n') """ print(status) return status
def main(): parser = ArgumentParser() parser.add_argument("--folder", type=str, dest="folder") args = parser.parse_args() hotelname = args.folder print hotelname + " review start" print open_file = "data/file_hotel_review-" + hotelname + "-" + city + "-" + state + ".html" print open_file input_file = open(open_file, 'r') for line in input_file: # print line url = line # time.sleep(1) review_html = urllib2.urlopen(url) review_bsObj = BeautifulSoup(review_html.read()) #print url # print for link in review_bsObj.findAll("span", {"class": "ratingDate"}): if 'content' in link.attrs: review_date = link.attrs['content'] # print review_date, type(review_date) Date = datetime.datetime.strptime(review_date, "%Y-%m-%d") # print Date, type(Date) index_time = datetime.datetime(2006, 1, 1) if Date > index_time: hotel_id = re.findall(r"-d([0-9]*)", url) # print hotel_id[0], type(hotel_id[0]) r_id = re.findall(r"-r([0-9]*)", url) # print r_id[0] review_id = "review_" + r_id[0] # print review_id Crating = 0 for link in review_bsObj.findAll("div", {"id": review_id}, {"class": "v"}): # for link in review_bsObj.findAll("div", {"class":"rating-list"}): # print link for link1 in link.findAll( "li", {"class": "recommend-answer"}): # print link1 # print link1.text , type(link1.text) l0 = str(link1) # print l0 , type(l0) if "Cleanliness" in l0: # print "l0 is "+ l0 hotel_id = re.findall(r"-d([0-9]*)", url) index_s = "<img alt=\"" r0 = re.findall(r"<img alt=\"[0-9*]", l0) # print r0[0][10:] Crating = int(r0[0][10:]) for link in review_bsObj.findAll( "p", {"id": review_id}, ): # for link in review_bsObj.findAll("div",{"class":"col2of2"},): # print link r0 = str(link) # print r0 , type(r0) r1 = re.compile(r'<.*?>') review = r1.sub('', r0) # r1=r0.replace("^<[A-za-z0-9*\"\=\\]$>","") # print review hotel_review_rate = [] hotel_review_rate = [ hotel_id[0], hotelname, Date, review_id, r_id[0], review, Crating ] hotel_review_rate_list.append(hotel_review_rate) # print hotel_review_rate_list heading = [ 'ID', 'HotelName', 'ReviewDate', 'ReviewId', 'ReviewId_number', 'Review', 'Cleanliness' ] print " hotel_review_rate_list saving in csv file .. " outfile = "data/hotel_review_rate_list_" + hotel_id[ 0] + "_" + hotelname + ".csv" with open(outfile, 'wb') as f: writer = csv.writer(f, delimiter=';') writer.writerow(heading) for row in hotel_review_rate_list: writer.writerow(row) f.close()
reviews = pd.DataFrame(results) reviews.to_excel('reviewscount-es-error.xlsx', index=False) sys.exit() else: pass if etreeee1.xpath("*//form[@action='/errors/validateCaptcha']"): print('需要验证码') reviews = pd.DataFrame(results) reviews.to_excel('reviewscount-es-error.xlsx', index=False) sys.exit() else: print('爬取成功') html1 = r1.content amazonreviews = BeautifulSoup(html1, 'lxml') fmt_vrp_reviews = amazonreviews.find_all('div', attrs={'class': 'a-section a-spacing-medium'}) for c1 in fmt_vrp_reviews: try: contents1 = c1.span.string fmt_vrp_review = contents1.split(' ', 4)[3] except: print('error') continue ''' time.sleep(1.0) a2 = (url1 + asin + url3) r2 = requests.get(url =a2, headers = headers) etreeee2 = fromstring(r1.text)
from bs4 import BeautifulSoup import requests import html5lib #india-times r = requests.get("https://timesofindia.indiatimes.com/briefs") rc = r.content soup = BeautifulSoup(rc , "html5lib") soup = soup.find_all('h2') #hindu-times r2 = requests.get("https://www.hindustantimes.com/india-news/") rc2 = r2.content soup2 = BeautifulSoup(rc2 , "html5lib") soup2 = soup2.find_all("div" , {"class":"headingfour"})
def convert(self): result = self.create_metadata() root = BeautifulSoup(self.content, 'html.parser') result += self.handle_element(root) return result
def scrape_info(): browser = init_browser() # Visit https://mars.nasa.gov/news/ url1 = 'https://mars.nasa.gov/news/' browser.visit(url1) time.sleep(3) # Scrape page into Soup html = browser.html soup = BeautifulSoup(html, "html.parser") news_titles = soup.find('div', class_="content_title") news_title = news_titles.text print(news_title) time.sleep(3) news_ps = soup.find('div', class_="article_teaser_body") news_p = news_ps.text print(news_p) #Find the src for the featured image url2 = 'http://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url2) time.sleep(2) html2 = browser.html soup = BeautifulSoup(html2, 'html.parser') img = soup.find_all('a', class_="button fancybox") for a in img: print(a["data-fancybox-href"]) url9 = "http://www.jpl.nasa.gov/" featured_image_url = url9 + a["data-fancybox-href"] url3 = 'https://twitter.com/marswxreport?lang=en' browser.visit(url3) time.sleep(3) soup = BeautifulSoup(browser.html, 'html.parser') mars_weather = soup.find(class_='tweet-text').text url4 = 'https://space-facts.com/mars/' browser.visit(url4) time.sleep(10) html4 = browser.html soup = BeautifulSoup(html4, 'html.parser') marsfacts = soup.find_all('table', class_="tablepress tablepress-id-p-mars") marsfacts url5 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(url5) time.sleep(5) html5 = browser.html soup = BeautifulSoup(html5, 'html.parser') hemis_search = soup.find_all('a', class_="itemLink product-item") url10 = "https://astrogeology.usgs.gov" img_url = [] for a in hemis_search: print(a['href']) img_url.append(a['href']) url11 = url10 + img_url[0] url12 = url10 + img_url[2] url13 = url10 + img_url[4] url14 = url10 + img_url[6] browser.visit(url11) html11 = browser.html time.sleep(5) soup = BeautifulSoup(html11, 'html.parser') hemis_search2 = soup.find_all('img', class_="wide-image") for a in hemis_search2: print(a['src']) url15 = url10 + (a['src']) print(url15) browser.visit(url12) html12 = browser.html time.sleep(5) soup = BeautifulSoup(html12, 'html.parser') hemis_search3 = soup.find_all('img', class_="wide-image") for a in hemis_search3: print(a['src']) url16 = url10 + (a['src']) print(url16) browser.visit(url13) html13 = browser.html time.sleep(5) soup = BeautifulSoup(html13, 'html.parser') hemis_search4 = soup.find_all('img', class_="wide-image") for a in hemis_search4: print(a['src']) url17 = url10 + (a['src']) print(url17) browser.visit(url14) html14 = browser.html time.sleep(5) soup = BeautifulSoup(html14, 'html.parser') hemis_search4 = soup.find_all('img', class_="wide-image") for a in hemis_search4: print(a['src']) url18 = url10 + (a['src']) print(url18) hemisphere_image_url = [ {"title": "Cerberus Hemisphere", "img_url": url15}, {"title": "Schiaparelli Hemisphere", "img_url": url16}, {"title": "Syrtis Major Hemisphere", "img_url": url17}, {"title": "Valles Marineris Hemisphere", "img_url": url18} ] # Store data in a dictionary mars_data = { "news_title": news_title, "news_p": news_p, "featured_image_url": featured_image_url, "mars_weather": mars_weather, "url15": url15, "url16": url16, "url17": url17, "url18": url18 } # Close the browser after scraping browser.quit() # Return results return mars_data
# -----------Web Scraping Program------------#
import requests
from bs4 import BeautifulSoup
import csv

response = requests.get("https://www.rithmschool.com/blog")
soup = BeautifulSoup(response.text, "html.parser")
articles = soup.find_all("article")

with open("blog_data.csv", "w") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["title", "link", "date"])
    for article in articles:
        a_tag = article.find("a")
        url = a_tag["href"]
        title = a_tag.get_text()
        datetime = article.find("time")["datetime"]
        csv_writer.writerow([title, url, datetime])
import urllib.request
import time
import pyautogui
from bs4 import BeautifulSoup  # required by the parsing below, missing from the original imports

if __name__ == "__main__":
    loopIdx = 0
    loopLimit = 12000
    while True:
        # uptempo = "https://smartstore.naver.com/neulhaerangmask/products/4632987981?site_preference=device&NaPm="
        uptempo = 'https://smartstore.naver.com/hana-water/products/4832110630?NaPm='
        req = urllib.request.Request(uptempo)
        res = urllib.request.urlopen(req)
        data = res.read()
        soup = BeautifulSoup(data.decode("utf-8"), 'html.parser')

        ready = False
        for span in soup.find_all("span"):
            if span.get('class') == None:
                continue
            if span.get('class')[0] == 'cart':
                for s in span:
                    if s.get('class')[0] == 'mask2':
                        continue
                    elif s.get('class')[0] == '_stopDefault':
                        ready = False
                    else:
                        ready = True
                        break
def loadPage():
    html = chrome.page_source
    soup = BeautifulSoup(html, 'lxml')
import os
os.system("sudo pip install --upgrade beautifulsoup4")
from bs4 import BeautifulSoup as BS

html = input("HTML file? \t")
f = open(html, "r+")
soup = BS(f.read(), "html.parser")
f.seek(0, 0)
f.write(soup.prettify())
f.truncate()  # drop any leftover bytes if the prettified output is shorter than the original
f.close()
def home(request): """ Parameters: request[HttpRequest] -------------------------------------------- Returns: render(request, 'init.html') [HttpResponse] => init.html is returned as HttpResponse Logic: it takes the inputted code from frontend request and sends it to hackerearth API if the code doesn't compile, then it finds the necessary keyword from error messages and searches for it on google with regex matching and suggests debug links """ if request.method == 'POST': # POST goes here . is_ajax is must to capture ajax requests. if request.is_ajax(): lang = request.POST.get('lang') source = request.POST.get('source') inputl = request.POST.get('input') data = {"lang": lang, "source": source, "input": inputl} data = { 'client_secret': CLIENT_SECRET, 'async': 0, 'source': source, 'lang': lang, 'input': inputl, 'time_limit': 5, 'memory_limit': 262144, } # Post data to HackerEarth API s = requests.Session() s.mount("http://", requests.adapters.HTTPAdapter(max_retries=5)) s.mount("https://", requests.adapters.HTTPAdapter(max_retries=5)) r = s.post(RUN_URL, data=data) key_words = [] compile_status = r.json()['compile_status'].strip() current_json = r.json() if compile_status != 'OK': rk = Rake() rk.extract_keywords_from_text(compile_status) for keyword in rk.get_ranked_phrases(): if 'hackerearth' in keyword: continue key_words.append(keyword) # filter extra information if len(key_words) >= 3: key_words = key_words[-2:] key_words = list(reversed(key_words)) key_words.append(compile_status) links = [] desc = [] import re for word in key_words: page = s.get("https://www.google.co.in/search?q=" + word) soup = BeautifulSoup(page.content, 'lxml') for link in soup.find_all("a", href=re.compile("(?<=/url\?q=)(htt.*://.*)")): debug_url = link["href"].replace("/url?q=", "").split('&')[0] if 'webcache.googleusercontent.com' in debug_url: continue links.append(debug_url) desc.append(link.text + ":" + get_domain(debug_url)) current_json['debug_urls'] = links[:10] current_json['descriptions'] = desc[:10] return JsonResponse(current_json, safe=False) # A normal get request goes here return render(request, 'init.html')
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
from bs4 import BeautifulSoup import requests, pprint, random, time, string url = requests.get( "https://www.imdb.com/india/top-rated-indian-movies/?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=8a7876cd-2844-4017-846a-2c0876945b7b&pf_rd_r=C6ZKX5N78115F6BM14Y3&pf_rd_s=right-5&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_india_tr_rhs_1" ) soup = BeautifulSoup(url.text, 'lxml') table = soup.find('tbody', class_='lister-list') body = table.find_all('tr') random_var = random.randint(1, 5) time.sleep(random_var) _list = [] for i in body: _dict = {} data = i.find('td', class_="titleColumn") no = '' for j in data.text: no += j if j == '.': break _dict['No'] = no.strip() _dict['Movie'] = data.find('a').text _dict['Year'] = int(data.find('span').text.strip('(').strip(')')) _dict['Rating'] = i.find('strong').text _dict['Link'] = "https://www.imdb.com" + i.find('a')['href'] _list.append(_dict) pprint.pprint(_list) def scrapped_movie(mov_link): new_url = requests.get(mov_link).text soup = BeautifulSoup(new_url, 'lxml')
raise Exception() # 크롬 드라이버 인스턴스 생성 chrome = generate_chrome( driver_path=driver_path, headless=False, download_path=DOWNLOAD_DIR) # 페이지 요청 url = 'https://news.naver.com/main/ranking/popularDay.nhn' chrome.get(url) time.sleep(3) html = chrome.page_source soup = BeautifulSoup(html, 'lxml') collecttime = str(datetime.utcnow().replace(microsecond=0) + timedelta(hours=9))[:16] es=Elasticsearch() def loadPage(): html = chrome.page_source soup = BeautifulSoup(html, 'lxml') def backToMainPage(): chrome.get(url) def getTopFive(i): loadPage() elements=soup.select('#wrap > table > tbody > tr > td.content > div > div:nth-child('+str(i)+') > ol> li')
from urllib.request import urlopen from bs4 import BeautifulSoup url = 'https://www.apple.com/itunes/charts/songs' conn = urlopen(url) raw_data = conn.read() text = raw_data.decode('utf8') soup = BeautifulSoup(text, "html.parser") ul = soup.find('section', 'section chart-grid') li_list = ul.find_all("li") item_list = [] for li in li_list: a = li.h3.a b = li.h4.a song_name = a.string artist = b.string item = {"Song_names": song_name, "Artist": artist} item_list.append(item) import pyexcel pyexcel.save_as(records=item_list, dest_file_name="itunes100.xlsx") from youtube_dl import YoutubeDL for song in item_list: options = { 'default_search': 'ytsearch', 'max_dowloads': 10, 'format': 'bestaudio/audio' } dl = YoutubeDL(options)
import urllib import pymysql import db from bs4 import BeautifulSoup params = urllib.parse.urlencode({'page' :1}) url='https://movie.naver.com/movie/point/af/list.nhn?&%s' % params print(url) response = urllib.request.urlopen(url) navigator = BeautifulSoup(response, 'html.parser') table = navigator.find('table', class_ = 'list_netizen') print(table) list_records=[] for i,r in enumerate(table.find_all('tr')): for j,c in enumerate(r.find_all('td')): if j==0: record=int(c.text.strip()) elif j==2: record1=int(c.text.strip()) elif j==3: record2= str(c.find('a', class_ = 'movie').text.strip()) record3= str(c.text).split('\n')[2] elif j==4: record4 = str(c.find('a', class_ = 'author').text.strip()) record5= str(c.text).split('****')[1] try: record_t=tuple([record,record1,record2,record3,record4,record5]) list_records.append(record_t)
from requests import get
from bs4 import BeautifulSoup

url = "https://helion.pl/search?qa=&serwisyall=&szukaj=python&wprzyg=&wsprzed=&wyczerp="
response = get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')

books = html_soup.find_all('div', class_="book-info")
for b in books:
    print(b.a.text)
def main(): merchantFilePath = os.path.dirname( os.path.abspath(__file__)) + "/merchants.json" if os.path.exists(merchantFilePath): json_open = open(merchantFilePath, "r", encoding="utf8") merchants = json.load(json_open) else: merchants = {"data": [], "names": []} findMerchants = [] page = 0 while True: page += 1 print("----- Page {page} -----".format(page=page)) html = requests.get( "https://www.gotoeat-tochigi.jp/merchant/index.php?word=&sort=2&page={page}" .format(page=page)) html.encoding = html.apparent_encoding soup = BeautifulSoup(html.content, "html.parser") lists = soup.find("ul", { "class": "serch_result" }).findChildren("li", recursive=False) if (len(lists) == 0): break for merchant in lists: merchant_name = merchant.find("p", {"class": "name"}).text merchant_type = merchant.find("p", { "class": "name" }).find("span").text merchant_name = re.sub( r"{merchant_type}$".format(merchant_type=merchant_type), "", merchant_name) _merchant_address = merchant.find("div", { "class": "add" }).findAll("p")[0].text merchant_postal_code = re.sub(r"所在地〒([0-9\-]+) (.+)", r"\1", _merchant_address) merchant_address = re.sub(r"所在地〒([0-9\-]+) (.+)", r"\2", _merchant_address) if len(merchant.find("div", {"class": "add"}).findAll("p")) >= 2: merchant_tel = merchant.find("div", { "class": "add" }).findAll("p")[1].text merchant_tel = re.sub(r"TEL(.+)", r"\1", merchant_tel) print(merchant_name + " - " + merchant_address) findMerchants.append(merchant_name) if merchant_name in merchants["names"]: continue lat, lng = getLatLng(merchant_address) print(str(lat) + " " + str(lng)) merchants["data"].append({ "name": merchant_name, "type": merchant_type, "address": merchant_address, "postal_code": merchant_postal_code, "tel": merchant_tel, "lat": lat, "lng": lng }) merchants["names"].append(merchant_name) with open(merchantFilePath, mode="w", encoding="utf8") as f: f.write(json.dumps(merchants, indent=4, ensure_ascii=False)) if (soup.find("li", {"class": "next"}) == None): break else: time.sleep(1) merchants = checkRemovedMerchant(merchants, findMerchants) with open(merchantFilePath, mode="w", encoding="utf8") as f: f.write(json.dumps(merchants, indent=4, ensure_ascii=False))
def get_urls_for_date(date):
    response = requests.get(
        'http://www.allsides.com/?date_filter[value][date]=' + date)
    soup = BeautifulSoup(response.content, "html.parser")
    return [el.a.get('href') for el in soup.find_all('div', "news-title")]
username = "******" url = "http://www.twitter.com/" + username response = None try: response = requests.get(url) except Exception as e: print(repr(e)) sys.exit(1) if response.status_code != 200: print("Non success status code returned "+str(response.status_code)) sys.exit(1) soup = BeautifulSoup(response.text, 'html.parser') if soup.find("div", {"class": "errorpage-topbar"}): print("\n\n Error: Invalid username.") sys.exit(1) tweets = soup.find_all("p", {"class": "TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"}) tweetList = [] for tweet in tweets: text = tweet.text.encode('utf-8') tweetList.append(text) for tweetText in tweetList: print(tweetText)
<tr> <th>전화번호</th> <td>02-2345-2323</td> <td>033-223-2323</td> <td>051-121-1212</td> </tr> <tr> <th>대표메일</th> <td>[email protected]</td> <td>[email protected]</td> <td>[email protected]</td> </tr> </table> ''' soup = BeautifulSoup(html, 'html.parser') companies = {} data = {} for i, tr in enumerate(soup.select('tr')): if i == 0: for j, th in enumerate(tr.select('th')): if j == 0: continue companies[th.text] = j - 1 else: item_name = tr.select_one('th').text lst = [] print(item_name) for td in tr.select('td'): lst.append(td.text)
import csv
import os
import requests
from bs4 import BeautifulSoup

url = 'https://karki23.github.io/Weather-Data/assignment.html'
page = requests.get(url)
src = BeautifulSoup(page.content, "html.parser")
all_cities = src.find_all('a')
os.mkdir("dataset")

for i in all_cities:
    s = i.get('href')[:-5]  # strip the trailing ".html" from the link target
    url1 = 'https://karki23.github.io/Weather-Data/' + i.get('href')
    page1 = requests.get(url1)
    src1 = BeautifulSoup(page1.content, "html.parser")
    rows = src1.find_all('tr')
    rows.pop(0)
    file_name = "dataset\\" + s + ".csv"
    f = open(file_name, "w", newline="")
    headings = src1.find_all('th')
    headings_new = [h.text for h in headings]
    writer = csv.writer(f)
    writer.writerow(headings_new)
    for row in rows:
        columns = row.find_all('td')
        column_new = [c.text for c in columns]
        writer.writerow(column_new)
    f.close()
urls = ['https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:machine_learning&after_author=0LcBAH5D_v8J&astart=0', 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:machine_learning&after_author=0LcBAH5D_v8J&astart=10', 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:machine_learning&after_author=0LcBAH5D_v8J&astart=20', 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:machine_learning&after_author=0LcBAH5D_v8J&astart=30', 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:machine_learning&after_author=0LcBAH5D_v8J&astart=40', 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:machine_learning&after_author=0LcBAH5D_v8J&astart=50', 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:machine_learning&after_author=0LcBAH5D_v8J&astart=60', 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:machine_learning&after_author=0LcBAH5D_v8J&astart=70', 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:machine_learning&after_author=0LcBAH5D_v8J&astart=80', 'https://scholar.google.com/citations?view_op=search_authors&hl=en&mauthors=label:machine_learning&after_author=0LcBAH5D_v8J&astart=90', ] for index, url in enumerate(urls): print(url) content = urlopen(url).read() soup = BeautifulSoup(content, "lxml") for author in soup.find_all('h3'): AUTHOR_LIST.append(author.a.contents[0]) for email in soup.find_all('div', class_='gsc_oai_eml'): if len(email.contents) == 1: EMAIL_LIST.append(re.search("Verified email at (.*)",str(email.contents[0])).group(1)) else: EMAIL_LIST.append("N/A") for topic in soup.find_all('div', class_='gsc_oai'): temp = [] for i in topic.find_all('a', class_='gsc_oai_one_int'): temp += i.contents TOPIC_LIST.append(temp) workbook = xlsxwriter.Workbook('google_scholar_label:ai.xlsx')
def build(self, run_epubcheck: bool, build_kobo: bool, build_kindle: bool, output_directory: Path, proof: bool, build_covers: bool) -> None: """ Entry point for `se build` """ # Check for some required tools if build_kindle: which_ebook_convert = shutil.which("ebook-convert") if which_ebook_convert: ebook_convert_path = Path(which_ebook_convert) else: # Look for default Mac calibre app path if none found in path ebook_convert_path = Path("/Applications/calibre.app/Contents/MacOS/ebook-convert") if not ebook_convert_path.exists(): raise se.MissingDependencyException("Couldn’t locate [bash]ebook-convert[/]. Is [bash]calibre[/] installed?") if run_epubcheck: if not shutil.which("java"): raise se.MissingDependencyException("Couldn’t locate [bash]java[/]. Is it installed?") # Check the output directory and create it if it doesn't exist try: output_directory = output_directory.resolve() output_directory.mkdir(parents=True, exist_ok=True) except Exception: raise se.FileExistsException(f"Couldn’t create output directory: [path][link=file://{output_directory}]{output_directory}[/][/].") # All clear to start building! metadata_xml = self.metadata_xml with tempfile.TemporaryDirectory() as temp_directory: work_directory = Path(temp_directory) work_epub_root_directory = work_directory / "src" copy_tree(self.path, str(work_directory)) try: shutil.rmtree(work_directory / ".git") except Exception: pass # By convention the ASIN is set to the SHA-1 sum of the book's identifying URL try: identifier = self.metadata_dom.xpath("//dc:identifier")[0].inner_xml().replace("url:", "") asin = sha1(identifier.encode("utf-8")).hexdigest() except: raise se.InvalidSeEbookException(f"Missing [xml]<dc:identifier>[/] element in [path][link=file://{self.metadata_file_path}]{self.metadata_file_path}[/][/].") if not self.metadata_dom.xpath("//dc:title"): raise se.InvalidSeEbookException(f"Missing [xml]<dc:title>[/] element in [path][link=file://{self.metadata_file_path}]{self.metadata_file_path}[/][/].") output_filename = identifier.replace("https://standardebooks.org/ebooks/", "").replace("/", "_") url_author = "" for author in self.metadata_dom.xpath("//dc:creator"): url_author = url_author + se.formatting.make_url_safe(author.inner_xml()) + "_" url_author = url_author.rstrip("_") epub_output_filename = f"{output_filename}{'.proof' if proof else ''}.epub" epub3_output_filename = f"{output_filename}{'.proof' if proof else ''}.epub3" kobo_output_filename = f"{output_filename}{'.proof' if proof else ''}.kepub.epub" kindle_output_filename = f"{output_filename}{'.proof' if proof else ''}.azw3" # Clean up old output files if any se.quiet_remove(output_directory / f"thumbnail_{asin}_EBOK_portrait.jpg") se.quiet_remove(output_directory / "cover.jpg") se.quiet_remove(output_directory / "cover-thumbnail.jpg") se.quiet_remove(output_directory / epub_output_filename) se.quiet_remove(output_directory / epub3_output_filename) se.quiet_remove(output_directory / kobo_output_filename) se.quiet_remove(output_directory / kindle_output_filename) # Are we including proofreading CSS? 
if proof: with open(work_epub_root_directory / "epub" / "css" / "local.css", "a", encoding="utf-8") as local_css_file: with importlib_resources.open_text("se.data.templates", "proofreading.css", encoding="utf-8") as proofreading_css_file: local_css_file.write(proofreading_css_file.read()) # Update the release date in the metadata and colophon if self.last_commit: last_updated_iso = regex.sub(r"\.[0-9]+$", "", self.last_commit.timestamp.isoformat()) + "Z" last_updated_iso = regex.sub(r"\+.+?Z$", "Z", last_updated_iso) # In the line below, we can't use %l (unpadded 12 hour clock hour) because it isn't portable to Windows. # Instead we use %I (padded 12 hour clock hour) and then do a string replace to remove leading zeros. last_updated_friendly = f"{self.last_commit.timestamp:%B %e, %Y, %I:%M <abbr class=\"time eoc\">%p</abbr>}".replace(" 0", " ") last_updated_friendly = regex.sub(r"\s+", " ", last_updated_friendly).replace("AM", "a.m.").replace("PM", "p.m.").replace(" <abbr", " <abbr") # Set modified date in content.opf self.metadata_xml = regex.sub(r"<meta property=\"dcterms:modified\">[^<]+?</meta>", f"<meta property=\"dcterms:modified\">{last_updated_iso}</meta>", self.metadata_xml) with open(work_epub_root_directory / "epub" / "content.opf", "w", encoding="utf-8") as file: file.seek(0) file.write(self.metadata_xml) file.truncate() # Update the colophon with release info with open(work_epub_root_directory / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file: xhtml = file.read() xhtml = xhtml.replace("<p>The first edition of this ebook was released on<br/>", f"<p>This edition was released on<br/>\n\t\t\t<b>{last_updated_friendly}</b><br/>\n\t\t\tand is based on<br/>\n\t\t\t<b>revision {self.last_commit.short_sha}</b>.<br/>\n\t\t\tThe first edition of this ebook was released on<br/>") file.seek(0) file.write(xhtml) file.truncate() # Output the pure epub3 file se.epub.write_epub(work_epub_root_directory, output_directory / epub3_output_filename) # Now add epub2 compatibility. # Include compatibility CSS with open(work_epub_root_directory / "epub" / "css" / "core.css", "a", encoding="utf-8") as core_css_file: with importlib_resources.open_text("se.data.templates", "compatibility.css", encoding="utf-8") as compatibility_css_file: core_css_file.write(compatibility_css_file.read()) # Simplify CSS and tags total_css = "" # Simplify the CSS first. Later we'll update the document to match our simplified selectors. # While we're doing this, we store the original css into a single variable so we can extract the original selectors later. 
for root, _, filenames in os.walk(work_epub_root_directory): for filename_string in fnmatch.filter(filenames, "*.css"): filename = Path(root) / filename_string with open(filename, "r+", encoding="utf-8") as file: css = file.read() # Before we do anything, we process a special case in core.css if filename.name == "core.css": css = regex.sub(r"abbr{.+?}", "", css, flags=regex.DOTALL) total_css = total_css + css + "\n" file.seek(0) file.write(se.formatting.simplify_css(css)) file.truncate() # Now get a list of original selectors # Remove @supports(){} total_css = regex.sub(r"@supports.+?{(.+?)}\s*}", "\\1}", total_css, flags=regex.DOTALL) # Remove CSS rules total_css = regex.sub(r"{[^}]+}", "", total_css) # Remove trailing commas total_css = regex.sub(r",", "", total_css) # Remove comments total_css = regex.sub(r"/\*.+?\*/", "", total_css, flags=regex.DOTALL) # Remove @ defines total_css = regex.sub(r"^@.+", "", total_css, flags=regex.MULTILINE) # Construct a dictionary of the original selectors selectors = {line for line in total_css.splitlines() if line != ""} # Get a list of .xhtml files to simplify for root, _, filenames in os.walk(work_epub_root_directory): for filename_string in fnmatch.filter(filenames, "*.xhtml"): filename = (Path(root) / filename_string).resolve() # Don't mess with the ToC, since if we have ol/li > first-child selectors we could screw it up if filename.name == "toc.xhtml": continue with open(filename, "r+", encoding="utf-8") as file: # We have to remove the default namespace declaration from our document, otherwise # xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python xhtml = file.read().replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "") processed_xhtml = xhtml try: tree = etree.fromstring(str.encode(xhtml)) except Exception as ex: raise se.InvalidXhtmlException(f"Error parsing XHTML file: [path][link=file://{filename}]{filename}[/][/]. Exception: {ex}") # Now iterate over each CSS selector and see if it's used in any of the files we found for selector in selectors: try: # Add classes to elements that match any of our selectors to simplify. For example, if we select :first-child, add a "first-child" class to all elements that match that. for selector_to_simplify in se.SELECTORS_TO_SIMPLIFY: while selector_to_simplify in selector: # Potentially the pseudoclass we’ll simplify isn’t at the end of the selector, # so we need to temporarily remove the trailing part to target the right elements. split_selector = regex.split(fr"({selector_to_simplify}(\(.*?\))?)", selector, 1) target_element_selector = ''.join(split_selector[0:2]) replacement_class = split_selector[1].replace(":", "").replace("(", "-").replace("n-", "n-minus-").replace("n+", "n-plus-").replace(")", "") selector = selector.replace(split_selector[1], "." + replacement_class, 1) sel = se.easy_xml.css_selector(target_element_selector) for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES): current_class = element.get("class") if current_class is not None and replacement_class not in current_class: current_class = current_class + " " + replacement_class else: current_class = replacement_class element.set("class", current_class) except lxml.cssselect.ExpressionError: # This gets thrown if we use pseudo-elements, which lxml doesn't support pass except lxml.cssselect.SelectorSyntaxError as ex: raise se.InvalidCssException(f"Couldn’t parse CSS in or near this line: [css]{selector}[/]. 
Exception: {ex}") # We've already replaced attribute/namespace selectors with classes in the CSS, now add those classes to the matching elements if "[epub|type" in selector: for namespace_selector in regex.findall(r"\[epub\|type\~\=\"[^\"]*?\"\]", selector): sel = se.easy_xml.css_selector(namespace_selector) for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES): new_class = regex.sub(r"^\.", "", se.formatting.namespace_to_class(namespace_selector)) current_class = element.get("class", "") if new_class not in current_class: current_class = f"{current_class} {new_class}".strip() element.set("class", current_class) processed_xhtml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + etree.tostring(tree, encoding=str, pretty_print=True) # We do this round in a second pass because if we modify the tree like this, it screws up how lxml does processing later. # If it's all done in one pass, we wind up in a race condition where some elements are fixed and some not tree = etree.fromstring(str.encode(processed_xhtml)) for selector in selectors: try: sel = se.easy_xml.css_selector(selector) except lxml.cssselect.ExpressionError: # This gets thrown if we use pseudo-elements, which lxml doesn't support continue except lxml.cssselect.SelectorSyntaxError as ex: raise se.InvalidCssException(f"Couldn’t parse CSS in or near this line: [css]{selector}[/]. Exception: {ex}") # Convert <abbr> to <span> if "abbr" in selector: for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES): # Why would you want the tail to output by default?!? raw_string = etree.tostring(element, encoding=str, with_tail=False) # lxml--crap as usual--includes a bunch of namespace information in every element we print. # Remove it here. raw_string = raw_string.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "") raw_string = raw_string.replace(" xmlns:epub=\"http://www.idpf.org/2007/ops\"", "") raw_string = raw_string.replace(" xmlns:m=\"http://www.w3.org/1998/Math/MathML\"", "") # Now lxml doesn't let us modify the tree, so we just do a straight up regex replace to turn this into a span processed_string = raw_string.replace("<abbr", "<span") processed_string = processed_string.replace("</abbr", "</span") # Now we have a nice, fixed string. But, since lxml can't replace elements, we write it ourselves. processed_xhtml = processed_xhtml.replace(raw_string, processed_string) tree = etree.fromstring(str.encode(processed_xhtml)) # Now we just remove all stray abbr tags that were not styled by CSS processed_xhtml = regex.sub(r"</?abbr[^>]*?>", "", processed_xhtml) # Remove datetime="" attribute in <time> tags, which is not always understood by epubcheck processed_xhtml = regex.sub(r" datetime=\"[^\"]+?\"", "", processed_xhtml) tree = etree.fromstring(str.encode(processed_xhtml)) if processed_xhtml != xhtml: file.seek(0) file.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + etree.tostring(tree, encoding=str, pretty_print=True).replace("<html", "<html xmlns=\"http://www.w3.org/1999/xhtml\"")) file.truncate() # Done simplifying CSS and tags! # Extract cover and cover thumbnail cover_svg_file = work_epub_root_directory / "epub" / "images" / "cover.svg" if not os.path.isfile(cover_svg_file): raise se.MissingDependencyException("Cover image is missing. 
Did you run [bash]se build-images[/]?") svg2png(url=str(cover_svg_file), write_to=str(work_directory / "cover.png")) cover = Image.open(work_directory / "cover.png") cover = cover.convert("RGB") # Remove alpha channel from PNG if necessary cover.save(work_epub_root_directory / "epub" / "images" / "cover.jpg") (work_directory / "cover.png").unlink() if build_covers: shutil.copy2(work_epub_root_directory / "epub" / "images" / "cover.jpg", output_directory / "cover.jpg") shutil.copy2(cover_svg_file, output_directory / "cover-thumbnail.svg") # Path arguments must be cast to string svg2png(url=str(output_directory / "cover-thumbnail.svg"), write_to=str(work_directory / "cover-thumbnail.png")) cover = Image.open(work_directory / "cover-thumbnail.png") cover = cover.resize((COVER_THUMBNAIL_WIDTH, COVER_THUMBNAIL_HEIGHT)) cover = cover.convert("RGB") # Remove alpha channel from PNG if necessary cover.save(output_directory / "cover-thumbnail.jpg") (work_directory / "cover-thumbnail.png").unlink() (output_directory / "cover-thumbnail.svg").unlink() cover_svg_file.unlink() # Massage image references in content.opf metadata_xml = metadata_xml.replace("cover.svg", "cover.jpg") metadata_xml = metadata_xml.replace(".svg", ".png") metadata_xml = metadata_xml.replace("id=\"cover.jpg\" media-type=\"image/svg+xml\"", "id=\"cover.jpg\" media-type=\"image/jpeg\"") metadata_xml = metadata_xml.replace("image/svg+xml", "image/png") metadata_xml = regex.sub(r" properties=\"([^\"]*?)svg([^\"]*?)\"", r''' properties="\1\2"''', metadata_xml) # We may also have the `mathml` property metadata_xml = regex.sub(r" properties=\"([^\s]*?)\s\"", r''' properties="\1"''', metadata_xml) # Clean up trailing white space in property attributes introduced by the above line metadata_xml = regex.sub(r" properties=\"\s*\"", "", metadata_xml) # Remove any now-empty property attributes # Add an element noting the version of the se tools that built this ebook metadata_xml = regex.sub(r"<dc:publisher", f"<meta property=\"se:built-with\">{se.VERSION}</meta>\n\t\t<dc:publisher", metadata_xml) # Google Play Books chokes on https XML namespace identifiers (as of at least 2017-07) metadata_xml = metadata_xml.replace("https://standardebooks.org/vocab/1.0", "http://standardebooks.org/vocab/1.0") # Output the modified content.opf so that we can build the kobo book before making more epub2 compatibility hacks with open(work_epub_root_directory / "epub" / "content.opf", "w", encoding="utf-8") as file: file.write(metadata_xml) file.truncate() # Recurse over xhtml files to make some compatibility replacements for root, _, filenames in os.walk(work_epub_root_directory): for filename_string in filenames: filename = Path(root) / filename_string if filename.suffix == ".svg": # For night mode compatibility, give the titlepage a 1px white stroke attribute if filename.name in("titlepage.svg", "logo.svg"): with open(filename, "r+", encoding="utf-8") as file: svg = file.read() paths = svg # What we're doing here is faking the `stroke-align: outside` property, which is an unsupported draft spec right now. # We do this by duplicating all the SVG paths, and giving the duplicates a 2px stroke. The originals are directly on top, # so the 2px stroke becomes a 1px stroke that's *outside* of the path instead of being *centered* on the path border. # This looks much nicer, but we also have to increase the image size by 2px in both directions, and re-center the whole thing. 
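A rough standalone sketch of the cover-conversion step above, assuming cairosvg and Pillow are available; the function name, paths, and the optional resize are illustrative.

from pathlib import Path
from cairosvg import svg2png
from PIL import Image

def svg_to_jpg(svg_path: Path, jpg_path: Path, size=None) -> None:
    png_path = svg_path.with_suffix(".png")
    # cairosvg wants string paths
    svg2png(url=str(svg_path), write_to=str(png_path))
    image = Image.open(png_path)
    if size:
        image = image.resize(size)
    # JPEG has no alpha channel, so drop it before saving
    image.convert("RGB").save(jpg_path)
    png_path.unlink()

# e.g. svg_to_jpg(Path("epub/images/cover.svg"), Path("epub/images/cover.jpg"))
# and again with a (width, height) tuple for the thumbnail.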
if filename.name == "titlepage.svg": stroke_width = SVG_TITLEPAGE_OUTER_STROKE_WIDTH else: stroke_width = SVG_OUTER_STROKE_WIDTH # First, strip out non-path, non-group elements paths = regex.sub(r"<\?xml[^<]+?\?>", "", paths) paths = regex.sub(r"</?svg[^<]*?>", "", paths) paths = regex.sub(r"<title>[^<]+?</title>", "", paths) paths = regex.sub(r"<desc>[^<]+?</desc>", "", paths) # `paths` is now our "duplicate". Add a 2px stroke. paths = paths.replace("<path", f"<path style=\"stroke: #ffffff; stroke-width: {stroke_width}px;\"") # Inject the duplicate under the old SVG paths. We do this by only replacing the first regex match for <g> or <path> svg = regex.sub(r"(<g|<path)", f"{paths}\\1", svg, 1) # If this SVG specifies height/width, then increase height and width by 2 pixels and translate everything by 1px try: height = int(regex.search(r"<svg[^>]+?height=\"([0-9]+)\"", svg).group(1)) + stroke_width svg = regex.sub(r"<svg([^<]*?)height=\"[0-9]+\"", f"<svg\\1height=\"{height}\"", svg) width = int(regex.search(r"<svg[^>]+?width=\"([0-9]+)\"", svg).group(1)) + stroke_width svg = regex.sub(r"<svg([^<]*?)width=\"[0-9]+\"", f"<svg\\1width=\"{width}\"", svg) # Add a grouping element to translate everything over 1px svg = regex.sub(r"(<g|<path)", "<g transform=\"translate({amount}, {amount})\">\n\\1".format(amount=(stroke_width / 2)), svg, 1) svg = svg.replace("</svg>", "</g>\n</svg>") except AttributeError: # Thrown when the regex doesn't match (i.e. SVG doesn't specify height/width) pass file.seek(0) file.write(svg) file.truncate() # Convert SVGs to PNGs at 2x resolution # Path arguments must be cast to string svg2png(url=str(filename), write_to=str(filename.parent / (str(filename.stem) + ".png")), scale=2) (filename).unlink() if filename.suffix == ".xhtml": with open(filename, "r+", encoding="utf-8") as file: xhtml = file.read() processed_xhtml = xhtml # Check if there's any MathML to convert. # We expect MathML to be the "content" type (versus the "presentational" type). # We use an XSL transform to convert from "content" to "presentational" MathML. # If we start with presentational, then nothing will be changed. # Kobo supports presentational MathML. After we build kobo, we convert the presentational MathML to PNG for the rest of the builds. 
mathml_transform = None for line in regex.findall(r"<(?:m:)?math[^>]*?>(.+?)</(?:m:)?math>", processed_xhtml, flags=regex.DOTALL): mathml_content_tree = se.easy_xml.EasyXhtmlTree("<?xml version=\"1.0\" encoding=\"utf-8\"?><math xmlns=\"http://www.w3.org/1998/Math/MathML\">{}</math>".format(regex.sub(r"<(/?)m:", "<\\1", line))) # Initialize the transform object, if we haven't yet if not mathml_transform: with importlib_resources.path("se.data", "mathmlcontent2presentation.xsl") as mathml_xsl_filename: mathml_transform = etree.XSLT(etree.parse(str(mathml_xsl_filename))) # Transform the mathml and get a string representation # XSLT comes from https://github.com/fred-wang/webextension-content-mathml-polyfill mathml_presentation_tree = mathml_transform(mathml_content_tree.etree) mathml_presentation_xhtml = etree.tostring(mathml_presentation_tree, encoding="unicode", pretty_print=True, with_tail=False).strip() # Plop our string back in to the XHTML we're processing processed_xhtml = regex.sub(r"<(?:m:)?math[^>]*?>\{}\</(?:m:)?math>".format(regex.escape(line)), mathml_presentation_xhtml, processed_xhtml, flags=regex.MULTILINE) if filename.name == "endnotes.xhtml": # iOS renders the left-arrow-hook character as an emoji; this fixes it and forces it to render as text. # See https://github.com/standardebooks/tools/issues/73 # See http://mts.io/2015/04/21/unicode-symbol-render-text-emoji/ processed_xhtml = processed_xhtml.replace("\u21a9", "\u21a9\ufe0e") # Since we added an outlining stroke to the titlepage/publisher logo images, we # want to remove the se:color-depth.black-on-transparent semantic if filename.name in ("colophon.xhtml", "imprint.xhtml", "titlepage.xhtml"): processed_xhtml = regex.sub(r"\s*se:color-depth\.black-on-transparent\s*", "", processed_xhtml) # Add ARIA roles, which are just mostly duplicate attributes to epub:type for role in ARIA_ROLES: processed_xhtml = regex.sub(fr"(epub:type=\"[^\"]*?{role}[^\"]*?\")", f"\\1 role=\"doc-{role}\"", processed_xhtml) # Some ARIA roles can't apply to some elements. # For example, epilogue can't apply to <article> processed_xhtml = regex.sub(r"<article ([^>]*?)role=\"doc-epilogue\"", "<article \\1", processed_xhtml) if filename.name == "toc.xhtml": landmarks_xhtml = regex.findall(r"<nav epub:type=\"landmarks\">.*?</nav>", processed_xhtml, flags=regex.DOTALL) landmarks_xhtml = regex.sub(r" role=\"doc-.*?\"", "", landmarks_xhtml[0]) processed_xhtml = regex.sub(r"<nav epub:type=\"landmarks\">.*?</nav>", landmarks_xhtml, processed_xhtml, flags=regex.DOTALL) # But, remove ARIA roles we added to h# tags, because tyically those roles are for sectioning content. # For example, we might have an h2 that is both a title and dedication. But ARIA can't handle it being a dedication. # See The Man Who Was Thursday by G K Chesterton processed_xhtml = regex.sub(r"(<h[1-6] [^>]*) role=\".*?\">", "\\1>", processed_xhtml) # Google Play Books chokes on https XML namespace identifiers (as of at least 2017-07) processed_xhtml = processed_xhtml.replace("https://standardebooks.org/vocab/1.0", "http://standardebooks.org/vocab/1.0") # We converted svgs to pngs, so replace references processed_xhtml = processed_xhtml.replace("cover.svg", "cover.jpg") processed_xhtml = processed_xhtml.replace(".svg", ".png") # To get popup footnotes in iBooks, we have to change epub:endnote to epub:footnote. # Remember to get our custom style selectors too. 
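The ARIA-role injection above reduces to a single regex per role; here is a minimal sketch, using a small illustrative subset of roles rather than the tool's full ARIA_ROLES constant.

import regex

ARIA_ROLES = ["dedication", "epigraph", "foreword", "noteref"]

def add_aria_roles(xhtml: str) -> str:
    # Mirror each known role found in epub:type="..." with a role="doc-..." attribute
    for role in ARIA_ROLES:
        xhtml = regex.sub(fr"(epub:type=\"[^\"]*?{role}[^\"]*?\")", f"\\1 role=\"doc-{role}\"", xhtml)
    return xhtml

print(add_aria_roles('<section epub:type="dedication">To my friends.</section>'))
# <section epub:type="dedication" role="doc-dedication">To my friends.</section>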
processed_xhtml = regex.sub(r"epub:type=\"([^\"]*?)endnote([^\"]*?)\"", "epub:type=\"\\1footnote\\2\"", processed_xhtml) processed_xhtml = regex.sub(r"class=\"([^\"]*?)epub-type-endnote([^\"]*?)\"", "class=\"\\1epub-type-footnote\\2\"", processed_xhtml) # Include extra lang tag for accessibility compatibility. processed_xhtml = regex.sub(r"xml:lang\=\"([^\"]+?)\"", "lang=\"\\1\" xml:lang=\"\\1\"", processed_xhtml) # Typography: replace double and triple em dash characters with extra em dashes. processed_xhtml = processed_xhtml.replace("⸺", f"—{se.WORD_JOINER}—") processed_xhtml = processed_xhtml.replace("⸻", f"—{se.WORD_JOINER}—{se.WORD_JOINER}—") # Typography: replace some other less common characters. processed_xhtml = processed_xhtml.replace("⅒", "1/10") processed_xhtml = processed_xhtml.replace("℅", "c/o") processed_xhtml = processed_xhtml.replace("✗", "×") processed_xhtml = processed_xhtml.replace(" ", f"{se.NO_BREAK_SPACE}{se.NO_BREAK_SPACE}") # em-space to two nbsps # Many e-readers don't support the word joiner character (U+2060). # They DO, however, support the now-deprecated zero-width non-breaking space (U+FEFF) # For epubs, do this replacement. Kindle now seems to handle everything fortunately. processed_xhtml = processed_xhtml.replace(se.WORD_JOINER, se.ZERO_WIDTH_SPACE) # Some minor code style cleanup processed_xhtml = processed_xhtml.replace(" >", ">") processed_xhtml = regex.sub(r"""\s*epub:type=""\s*""", "", processed_xhtml) if processed_xhtml != xhtml: file.seek(0) file.write(processed_xhtml) file.truncate() if filename.suffix == ".css": with open(filename, "r+", encoding="utf-8") as file: css = file.read() processed_css = css # To get popup footnotes in iBooks, we have to change epub:endnote to epub:footnote. # Remember to get our custom style selectors too. processed_css = processed_css.replace("endnote", "footnote") # page-break-* is deprecated in favor of break-*. Add page-break-* aliases for compatibility in older ereaders. processed_css = regex.sub(r"(\s+)break-(.+?:\s.+?;)", "\\1break-\\2\t\\1page-break-\\2", processed_css) # `page-break-*: page;` should be come `page-break-*: always;` processed_css = regex.sub(r"(\s+)page-break-(before|after):\s+page;", "\\1page-break-\\2: always;", processed_css) if processed_css != css: file.seek(0) file.write(processed_css) file.truncate() if build_kobo: with tempfile.TemporaryDirectory() as temp_directory: kobo_work_directory = Path(temp_directory) copy_tree(str(work_epub_root_directory), str(kobo_work_directory)) for root, _, filenames in os.walk(kobo_work_directory): # Add a note to content.opf indicating this is a transform build for filename_string in fnmatch.filter(filenames, "content.opf"): with open(Path(root) / filename_string, "r+", encoding="utf-8") as file: xhtml = file.read() xhtml = regex.sub(r"<dc:publisher", "<meta property=\"se:transform\">kobo</meta>\n\t\t<dc:publisher", xhtml) file.seek(0) file.write(xhtml) file.truncate() # Kobo .kepub files need each clause wrapped in a special <span> tag to enable highlighting. # Do this here. Hopefully Kobo will get their act together soon and drop this requirement. for filename_string in fnmatch.filter(filenames, "*.xhtml"): kobo.paragraph_counter = 1 kobo.segment_counter = 1 filename = (Path(root) / filename_string).resolve() # Don't add spans to the ToC if filename.name == "toc.xhtml": continue with open(filename, "r+", encoding="utf-8") as file: xhtml = file.read() # Note: Kobo supports CSS hyphenation, but it can be improved with soft hyphens. 
# However we can't insert them, because soft hyphens break the dictionary search when # a word is highlighted. # Kobos don't have fonts that support the ↩ character in endnotes, so replace it with ← if filename.name == "endnotes.xhtml": # Note that we replaced ↩ with \u21a9\ufe0e in an earlier iOS compatibility fix xhtml = regex.sub(r"epub:type=\"backlink\">\u21a9\ufe0e</a>", "epub:type=\"backlink\">←</a>", xhtml) # We have to remove the default namespace declaration from our document, otherwise # xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python try: tree = etree.fromstring(str.encode(xhtml.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", ""))) except Exception as ex: raise se.InvalidXhtmlException(f"Error parsing XHTML file: [path][link=file://{filename}]{filename}[/][/]. Exception: {ex}") kobo.add_kobo_spans_to_node(tree.xpath("./body", namespaces=se.XHTML_NAMESPACES)[0]) xhtml = etree.tostring(tree, encoding="unicode", pretty_print=True, with_tail=False) xhtml = regex.sub(r"<html:span", "<span", xhtml) xhtml = regex.sub(r"html:span>", "span>", xhtml) xhtml = regex.sub(r"<span xmlns:html=\"http://www.w3.org/1999/xhtml\"", "<span", xhtml) xhtml = regex.sub(r"<html", "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\"", xhtml) file.seek(0) file.write(xhtml) file.truncate() # All done, clean the output # Note that we don't clean .xhtml files, because the way kobo spans are added means that it will screw up spaces inbetween endnotes. for filepath in se.get_target_filenames([kobo_work_directory], (".svg", ".opf", ".ncx")): se.formatting.format_xml_file(filepath) se.epub.write_epub(kobo_work_directory, output_directory / kobo_output_filename) # Now work on more epub2 compatibility # Recurse over css files to make some compatibility replacements. for root, _, filenames in os.walk(work_epub_root_directory): for filename_string in filenames: filename = Path(root) / filename_string if filename.suffix == ".css": with open(filename, "r+", encoding="utf-8") as file: css = file.read() processed_css = css processed_css = regex.sub(r"(page\-break\-(before|after|inside)\s*:\s*(.+))", "\\1\n\t-webkit-column-break-\\2: \\3 /* For Readium */", processed_css) processed_css = regex.sub(r"^\s*hyphens\s*:\s*(.+)", "\thyphens: \\1\n\tadobe-hyphenate: \\1\n\t-webkit-hyphens: \\1\n\t-epub-hyphens: \\1\n\t-moz-hyphens: \\1", processed_css, flags=regex.MULTILINE) processed_css = regex.sub(r"^\s*hyphens\s*:\s*none;", "\thyphens: none;\n\tadobe-text-layout: optimizeSpeed; /* For Nook */", processed_css, flags=regex.MULTILINE) if processed_css != css: file.seek(0) file.write(processed_css) file.truncate() # Sort out MathML compatibility has_mathml = "mathml" in metadata_xml if has_mathml: # We import this late because we don't want to load selenium if we're not going to use it! from se import browser # pylint: disable=import-outside-toplevel # We wrap this whole thing in a try block, because we need to call # driver.quit() if execution is interrupted (like by ctrl + c, or by an unhandled exception). If we don't call driver.quit(), # Firefox will stay around as a zombie process even if the Python script is dead. 
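The CSS compatibility pass a little earlier expands a single hyphens declaration into vendor-prefixed aliases; the sketch below shows that expansion in isolation on a tiny illustrative stylesheet string.

import regex

css = "p{\n\thyphens: auto;\n}\n"

css = regex.sub(
    r"^\s*hyphens\s*:\s*(.+)",
    "\thyphens: \\1\n\tadobe-hyphenate: \\1\n\t-webkit-hyphens: \\1\n\t-epub-hyphens: \\1\n\t-moz-hyphens: \\1",
    css,
    flags=regex.MULTILINE,
)
print(css)
# The single "hyphens: auto;" line is now followed by adobe-hyphenate,
# -webkit-hyphens, -epub-hyphens and -moz-hyphens aliases with the same value.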
try: driver = browser.initialize_selenium_firefox_webdriver() mathml_count = 1 for root, _, filenames in os.walk(work_epub_root_directory): for filename_string in filenames: filename = Path(root) / filename_string if filename.suffix == ".xhtml": with open(filename, "r+", encoding="utf-8") as file: xhtml = file.read() processed_xhtml = xhtml replaced_mathml: List[str] = [] # Check if there's MathML we want to convert # We take a naive approach and use some regexes to try to simplify simple MathML expressions. # For each MathML expression, if our round of regexes finishes and there is still MathML in the processed result, we abandon the attempt and render to PNG using Firefox. for line in regex.findall(r"<(?:m:)?math[^>]*?>(?:.+?)</(?:m:)?math>", processed_xhtml, flags=regex.DOTALL): if line not in replaced_mathml: replaced_mathml.append(line) # Store converted lines to save time in case we have multiple instances of the same MathML mathml_tree = se.easy_xml.EasyXhtmlTree("<?xml version=\"1.0\" encoding=\"utf-8\"?>{}".format(regex.sub(r"<(/?)m:", "<\\1", line))) processed_line = line # If the mfenced element has more than one child, they are separated by commas when rendered. # This is too complex for our naive regexes to work around. So, if there is an mfenced element with more than one child, abandon the attempt. if not mathml_tree.css_select("mfenced > * + *"): processed_line = regex.sub(r"</?(?:m:)?math[^>]*?>", "", processed_line) processed_line = regex.sub(r"<!--.+?-->", "", processed_line) processed_line = regex.sub(r"<(?:m:)?mfenced/>", "()", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi)>(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "<i>\\4</i><\\2><i>\\6</i></\\2>", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi)>(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "<i>\\4</i><\\2>\\6</\\2>", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mn)>(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "\\4<\\2>\\6</\\2>", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mn)>(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "\\4<\\2><i>\\6</i></\\2>", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi) mathvariant=\"normal\">(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "\\4<\\2><i>\\6</i></\\2>", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi) mathvariant=\"normal\">(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "\\4<\\2>\\6</\\2>", processed_line) processed_line = regex.sub(fr"<(?:m:)?mo>{se.FUNCTION_APPLICATION}</(?:m:)?mo>", "", processed_line, flags=regex.IGNORECASE) # The ignore case flag is required to match here with the special FUNCTION_APPLICATION character, it's unclear why processed_line = regex.sub(r"<(?:m:)?mfenced><((?:m:)(?:mo|mi|mn|mrow))>(.+?)</\1></(?:m:)?mfenced>", "(<\\1>\\2</\\1>)", processed_line) processed_line = regex.sub(r"<(?:m:)?mrow>([^>].+?)</(?:m:)?mrow>", "\\1", processed_line) processed_line = regex.sub(r"<(?:m:)?mi>([^<]+?)</(?:m:)?mi>", "<i>\\1</i>", processed_line) processed_line = regex.sub(r"<(?:m:)?mi mathvariant=\"normal\">([^<]+?)</(?:m:)?mi>", "\\1", processed_line) processed_line = regex.sub(r"<(?:m:)?mo>([+\-−=×])</(?:m:)?mo>", " \\1 ", processed_line) processed_line = regex.sub(r"<((?:m:)?m[no])>(.+?)</\1>", "\\2", processed_line) processed_line = regex.sub(r"</?(?:m:)?mrow>", "", processed_line) processed_line = processed_line.strip() processed_line = regex.sub(r"</i><i>", "", processed_line, flags=regex.DOTALL) # Did we 
succeed? Is there any more MathML in our string? if regex.findall("</?(?:m:)?m", processed_line): # Failure! Abandon all hope, and use Firefox to convert the MathML to PNG. se.images.render_mathml_to_png(driver, regex.sub(r"<(/?)m:", "<\\1", line), work_epub_root_directory / "epub" / "images" / f"mathml-{mathml_count}.png", work_epub_root_directory / "epub" / "images" / f"mathml-{mathml_count}-2x.png") processed_xhtml = processed_xhtml.replace(line, f"<img class=\"mathml epub-type-se-image-color-depth-black-on-transparent\" epub:type=\"se:image.color-depth.black-on-transparent\" src=\"../images/mathml-{mathml_count}.png\" srcset=\"../images/mathml-{mathml_count}-2x.png 2x, ../images/mathml-{mathml_count}.png 1x\" />") mathml_count = mathml_count + 1 else: # Success! Replace the MathML with our new string. processed_xhtml = processed_xhtml.replace(line, processed_line) if processed_xhtml != xhtml: file.seek(0) file.write(processed_xhtml) file.truncate() except KeyboardInterrupt as ex: # Bubble the exception up, but proceed to `finally` so we quit the driver raise ex finally: try: driver.quit() except Exception: # We might get here if we ctrl + c before selenium has finished initializing the driver pass # Include epub2 cover metadata cover_id = self.metadata_dom.xpath("//item[@properties=\"cover-image\"]/@id")[0].replace(".svg", ".jpg") metadata_xml = regex.sub(r"(<metadata[^>]+?>)", f"\\1\n\t\t<meta content=\"{cover_id}\" name=\"cover\" />", metadata_xml) # Add metadata to content.opf indicating this file is a Standard Ebooks compatibility build metadata_xml = metadata_xml.replace("<dc:publisher", "<meta property=\"se:transform\">compatibility</meta>\n\t\t<dc:publisher") # Add any new MathML images we generated to the manifest if has_mathml: for root, _, filenames in os.walk(work_epub_root_directory / "epub" / "images"): filenames = natsorted(filenames) filenames.reverse() for filename_string in filenames: filename = Path(root) / filename_string if filename.name.startswith("mathml-"): metadata_xml = metadata_xml.replace("<manifest>", f"<manifest><item href=\"images/{filename.name}\" id=\"{filename.name}\" media-type=\"image/png\"/>") metadata_xml = regex.sub(r"properties=\"([^\"]*?)mathml([^\"]*?)\"", "properties=\"\\1\\2\"", metadata_xml) metadata_xml = regex.sub(r"properties=\"\s*\"", "", metadata_xml) # Generate our NCX file for epub2 compatibility. # First find the ToC file. 
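The manifest bookkeeping above is simple string surgery; here is a compact sketch of it with an illustrative OPF fragment: the generated mathml-*.png files are declared as manifest items and the now-unneeded "mathml" property is dropped.

import regex

metadata_xml = '<manifest><item href="text/chapter-1.xhtml" id="chapter-1.xhtml" media-type="application/xhtml+xml" properties="mathml"/></manifest>'

for name in ["mathml-1.png", "mathml-2.png"]:
    metadata_xml = metadata_xml.replace(
        "<manifest>",
        f"<manifest><item href=\"images/{name}\" id=\"{name}\" media-type=\"image/png\"/>",
    )

# The MathML was replaced with images, so the property no longer applies
metadata_xml = regex.sub(r"properties=\"([^\"]*?)mathml([^\"]*?)\"", "properties=\"\\1\\2\"", metadata_xml)
metadata_xml = regex.sub(r"properties=\"\s*\"", "", metadata_xml)
print(metadata_xml)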
toc_filename = self.metadata_dom.xpath("//item[@properties=\"nav\"]/@href")[0] metadata_xml = metadata_xml.replace("<spine>", "<spine toc=\"ncx\">") metadata_xml = metadata_xml.replace("<manifest>", "<manifest><item href=\"toc.ncx\" id=\"ncx\" media-type=\"application/x-dtbncx+xml\" />") # Now use an XSLT transform to generate the NCX with importlib_resources.path("se.data", "navdoc2ncx.xsl") as navdoc2ncx_xsl_filename: toc_tree = se.epub.convert_toc_to_ncx(work_epub_root_directory, toc_filename, navdoc2ncx_xsl_filename) # Convert the <nav> landmarks element to the <guide> element in content.opf guide_xhtml = "<guide>" for element in toc_tree.xpath("//nav[@epub:type=\"landmarks\"]/ol/li/a"): element_xhtml = element.tostring() element_xhtml = regex.sub(r"epub:type=\"([^\"]*)(\s*frontmatter\s*|\s*backmatter\s*)([^\"]*)\"", "type=\"\\1\\3\"", element_xhtml) element_xhtml = regex.sub(r"epub:type=\"[^\"]*(acknowledgements|bibliography|colophon|copyright-page|cover|dedication|epigraph|foreword|glossary|index|loi|lot|notes|preface|bodymatter|titlepage|toc)[^\"]*\"", "type=\"\\1\"", element_xhtml) element_xhtml = element_xhtml.replace("type=\"copyright-page", "type=\"copyright page") # We add the 'text' attribute to the titlepage to tell the reader to start there element_xhtml = element_xhtml.replace("type=\"titlepage", "type=\"title-page text") element_xhtml = regex.sub(r"type=\"\s*\"", "", element_xhtml) element_xhtml = element_xhtml.replace("<a", "<reference") element_xhtml = regex.sub(r">(.+)</a>", " title=\"\\1\" />", element_xhtml) # Replace instances of the `role` attribute since it's illegal in content.opf element_xhtml = regex.sub(r" role=\".*?\"", "", element_xhtml) guide_xhtml = guide_xhtml + element_xhtml guide_xhtml = guide_xhtml + "</guide>" metadata_xml = metadata_xml.replace("</package>", "") + guide_xhtml + "</package>" # Guide is done, now write content.opf and clean it. # Output the modified content.opf before making more epub2 compatibility hacks. with open(work_epub_root_directory / "epub" / "content.opf", "w", encoding="utf-8") as file: file.write(metadata_xml) file.truncate() # All done, clean the output for filepath in se.get_target_filenames([work_epub_root_directory], (".xhtml", ".svg", ".opf", ".ncx")): se.formatting.format_xml_file(filepath) # Write the compatible epub se.epub.write_epub(work_epub_root_directory, output_directory / epub_output_filename) if run_epubcheck: # Path arguments must be cast to string for Windows compatibility. with importlib_resources.path("se.data.epubcheck", "epubcheck.jar") as jar_path: try: epubcheck_result = subprocess.run(["java", "-jar", str(jar_path), "--quiet", str(output_directory / epub_output_filename)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=False) epubcheck_result.check_returncode() except subprocess.CalledProcessError: output = epubcheck_result.stdout.decode().strip() # Get the epubcheck version to print to the console version_output = subprocess.run(["java", "-jar", str(jar_path), "--version"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=False).stdout.decode().strip() version = regex.search(r"[0-9]+\.([0-9]+\.?)*", version_output, flags=regex.MULTILINE).group(0) # The last two lines from epubcheck output are not necessary. Remove them here. # Remove them as lines instead of as a matching regex to work with localized output strings. 
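The epubcheck invocation described above boils down to a java subprocess call whose output is trimmed on failure; a minimal sketch, assuming a local epubcheck.jar and a built epub (paths and the helper name are illustrative):

import subprocess

def run_epubcheck(jar_path: str, epub_path: str) -> str:
    """Return epubcheck's output, raising if validation fails."""
    result = subprocess.run(
        ["java", "-jar", jar_path, "--quiet", epub_path],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        check=False,
    )
    output = result.stdout.decode().strip()
    if result.returncode:
        # Drop the last two summary lines, as the build code does, so only the
        # actual messages remain; done by line count to survive localized output.
        raise RuntimeError("\n".join(output.split("\n")[:-2]))
    return output

# run_epubcheck("epubcheck.jar", "some-ebook.epub")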
split_output = output.split("\n") output = "\n".join(split_output[:-2]) # Try to linkify files in output if we can find them try: output = regex.sub(r"(ERROR\(.+?\): )(.+?)(\([0-9]+,[0-9]+\))", lambda match: match.group(1) + "[path][link=file://" + str(self.path / "src" / regex.sub(fr"^\..+?\.epub{os.sep}", "", match.group(2))) + "]" + match.group(2) + "[/][/]" + match.group(3), output) except: # If something goes wrong, just pass through the usual output pass raise se.BuildFailedException(f"[bash]epubcheck[/] v{version} failed with:\n{output}") if build_kindle: # There's a bug in Calibre <= 3.48.0 where authors who have more than one MARC relator role # display as "unknown author" in the Kindle interface. # See: https://bugs.launchpad.net/calibre/+bug/1844578 # Until the bug is fixed, we simply remove any other MARC relator on the dc:creator element. # Once the bug is fixed, we can remove this block. with open(work_epub_root_directory / "epub" / "content.opf", "r+", encoding="utf-8") as file: xhtml = file.read() processed_xhtml = xhtml for match in regex.findall(r"<meta property=\"role\" refines=\"#author\" scheme=\"marc:relators\">.*?</meta>", xhtml): if ">aut<" not in match: processed_xhtml = processed_xhtml.replace(match, "") if processed_xhtml != xhtml: file.seek(0) file.write(processed_xhtml) file.truncate() # Kindle doesn't go more than 2 levels deep for ToC, so flatten it here. with open(work_epub_root_directory / "epub" / toc_filename, "r+", encoding="utf-8") as file: xhtml = file.read() soup = BeautifulSoup(xhtml, "lxml") for match in soup.select("ol > li > ol > li > ol"): match.parent.insert_after(match) match.unwrap() file.seek(0) file.write(str(soup)) file.truncate() # Rebuild the NCX with importlib_resources.path("se.data", "navdoc2ncx.xsl") as navdoc2ncx_xsl_filename: toc_tree = se.epub.convert_toc_to_ncx(work_epub_root_directory, toc_filename, navdoc2ncx_xsl_filename) # Clean just the ToC and NCX for filepath in [work_epub_root_directory / "epub" / "toc.ncx", work_epub_root_directory / "epub" / toc_filename]: se.formatting.format_xml_file(filepath) # Convert endnotes to Kindle popup compatible notes if (work_epub_root_directory / "epub/text/endnotes.xhtml").is_file(): with open(work_epub_root_directory / "epub/text/endnotes.xhtml", "r+", encoding="utf-8") as file: xhtml = file.read() # We have to remove the default namespace declaration from our document, otherwise # xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python try: tree = etree.fromstring(str.encode(xhtml.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", ""))) except Exception as ex: raise se.InvalidXhtmlException(f"Error parsing XHTML [path][link=file://{(work_epub_root_directory / 'epub/text/endnotes.xhtml').resolve()}]endnotes.xhtml[/][/]. 
Exception: {ex}") notes = tree.xpath("//li[@epub:type=\"endnote\" or @epub:type=\"footnote\"]", namespaces=se.XHTML_NAMESPACES) processed_endnotes = "" for note in notes: note_id = note.get("id") note_number = note_id.replace("note-", "") # First, fixup the reference link for this endnote try: ref_link = etree.tostring(note.xpath("p[last()]/a[last()]")[0], encoding="unicode", pretty_print=True, with_tail=False).replace(" xmlns:epub=\"http://www.idpf.org/2007/ops\"", "").strip() except Exception: raise se.InvalidXhtmlException(f"Can’t find ref link for [url]#{note_id}[/].") new_ref_link = regex.sub(r">.*?</a>", ">" + note_number + "</a>.", ref_link) # Now remove the wrapping li node from the note note_text = regex.sub(r"^<li[^>]*?>(.*)</li>$", r"\1", etree.tostring(note, encoding="unicode", pretty_print=True, with_tail=False), flags=regex.IGNORECASE | regex.DOTALL) # Insert our new ref link result = regex.subn(r"^\s*<p([^>]*?)>", "<p\\1 id=\"" + note_id + "\">" + new_ref_link + " ", note_text) # Sometimes there is no leading <p> tag (for example, if the endnote starts with a blockquote # If that's the case, just insert one in front. note_text = result[0] if result[1] == 0: note_text = "<p id=\"" + note_id + "\">" + new_ref_link + "</p>" + note_text # Now remove the old ref_link note_text = note_text.replace(ref_link, "") # Trim trailing spaces left over after removing the ref link note_text = regex.sub(r"\s+</p>", "</p>", note_text).strip() # Sometimes ref links are in their own p tag--remove that too note_text = regex.sub(r"<p>\s*</p>", "", note_text) processed_endnotes += note_text + "\n" # All done with endnotes, so drop them back in xhtml = regex.sub(r"<ol>.*</ol>", processed_endnotes, xhtml, flags=regex.IGNORECASE | regex.DOTALL) file.seek(0) file.write(xhtml) file.truncate() # While Kindle now supports soft hyphens, popup endnotes break words but don't insert the hyphen characters. So for now, remove soft hyphens from the endnotes file. with open(work_epub_root_directory / "epub" / "text" / "endnotes.xhtml", "r+", encoding="utf-8") as file: xhtml = file.read() processed_xhtml = xhtml processed_xhtml = processed_xhtml.replace(se.SHY_HYPHEN, "") if processed_xhtml != xhtml: file.seek(0) file.write(processed_xhtml) file.truncate() # Do some compatibility replacements for root, _, filenames in os.walk(work_epub_root_directory): for filename_string in filenames: filename = Path(root) / filename_string if filename.suffix == ".xhtml": with open(filename, "r+", encoding="utf-8") as file: xhtml = file.read() processed_xhtml = xhtml # Kindle doesn't recognize most zero-width spaces or word joiners, so just remove them. # It does recognize the word joiner character, but only in the old mobi7 format. The new format renders them as spaces. 
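The endnote rewrite above can be followed more easily on a single toy note: the wrapping <li> is dropped, the note id moves to the first <p>, and a numbered backlink is moved to the front so Kindle can show the note as a popup. The sample note is illustrative and far simpler than real endnotes (which may lack a leading <p>, span several paragraphs, and so on).

import regex

note = '<li id="note-1" epub:type="endnote"><p>Some note text. <a href="chapter-1.xhtml#noteref-1" epub:type="backlink">↩</a></p></li>'

note_id = regex.search(r"id=\"(note-[0-9]+)\"", note).group(1)
note_number = note_id.replace("note-", "")

ref_link = regex.search(r"<a[^>]*?epub:type=\"backlink\"[^>]*?>.*?</a>", note).group(0)
new_ref_link = regex.sub(r">.*?</a>", ">" + note_number + "</a>.", ref_link)

# Drop the <li> wrapper, then move the numbered link to the front of the first <p>
note_text = regex.sub(r"^<li[^>]*?>(.*)</li>$", r"\1", note, flags=regex.DOTALL)
note_text = regex.sub(r"^\s*<p([^>]*?)>", "<p\\1 id=\"" + note_id + "\">" + new_ref_link + " ", note_text)
note_text = note_text.replace(ref_link, "")
note_text = regex.sub(r"\s+</p>", "</p>", note_text).strip()
print(note_text)
# <p id="note-1"><a href="chapter-1.xhtml#noteref-1" epub:type="backlink">1</a>. Some note text.</p>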
processed_xhtml = processed_xhtml.replace(se.ZERO_WIDTH_SPACE, "") # Remove the epub:type attribute, as Calibre turns it into just "type" processed_xhtml = regex.sub(r"epub:type=\"[^\"]*?\"", "", processed_xhtml) if processed_xhtml != xhtml: file.seek(0) file.write(processed_xhtml) file.truncate() # Include compatibility CSS with open(work_epub_root_directory / "epub" / "css" / "core.css", "a", encoding="utf-8") as core_css_file: with importlib_resources.open_text("se.data.templates", "kindle.css", encoding="utf-8") as compatibility_css_file: core_css_file.write(compatibility_css_file.read()) # Add soft hyphens for filepath in se.get_target_filenames([work_epub_root_directory], (".xhtml",)): se.typography.hyphenate_file(filepath, None, True) # Build an epub file we can send to Calibre se.epub.write_epub(work_epub_root_directory, work_directory / epub_output_filename) # Generate the Kindle file # We place it in the work directory because later we have to update the asin, and the mobi.update_asin() function will write to the final output directory cover_path = work_epub_root_directory / "epub" / self.metadata_dom.xpath("//item[@properties=\"cover-image\"]/@href")[0].replace(".svg", ".jpg") # Path arguments must be cast to string for Windows compatibility. return_code = subprocess.run([str(ebook_convert_path), str(work_directory / epub_output_filename), str(work_directory / kindle_output_filename), "--pretty-print", "--no-inline-toc", "--max-toc-links=0", "--prefer-metadata-cover", f"--cover={cover_path}"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False).returncode if return_code: raise se.InvalidSeEbookException("[bash]ebook-convert[/] failed.") # Success, extract the Kindle cover thumbnail # Update the ASIN in the generated file mobi.update_asin(asin, work_directory / kindle_output_filename, output_directory / kindle_output_filename) # Extract the thumbnail kindle_cover_thumbnail = Image.open(work_epub_root_directory / "epub" / "images" / "cover.jpg") kindle_cover_thumbnail = kindle_cover_thumbnail.convert("RGB") # Remove alpha channel from PNG if necessary kindle_cover_thumbnail = kindle_cover_thumbnail.resize((432, 648)) kindle_cover_thumbnail.save(output_directory / f"thumbnail_{asin}_EBOK_portrait.jpg")
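Finally, a condensed sketch of the Kindle packaging step above: run Calibre's ebook-convert with the same flags, then resize the cover into the 432×648 thumbnail the build emits. Paths, names, and the <asin> placeholder are illustrative; the real code additionally rewrites the ASIN inside the generated file before copying it to the output directory.

import subprocess
from PIL import Image

def build_kindle_file(ebook_convert_path: str, epub_path: str, kindle_path: str, cover_jpg: str) -> None:
    return_code = subprocess.run(
        [ebook_convert_path, epub_path, kindle_path,
         "--pretty-print", "--no-inline-toc", "--max-toc-links=0",
         "--prefer-metadata-cover", f"--cover={cover_jpg}"],
        stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False,
    ).returncode
    if return_code:
        raise RuntimeError("ebook-convert failed.")

def save_kindle_thumbnail(cover_jpg: str, output_path: str) -> None:
    # Drop any alpha channel and scale to the thumbnail dimensions used above
    thumbnail = Image.open(cover_jpg).convert("RGB").resize((432, 648))
    thumbnail.save(output_path)

# build_kindle_file("ebook-convert", "book.epub", "book.azw3", "cover.jpg")
# save_kindle_thumbnail("cover.jpg", "thumbnail_<asin>_EBOK_portrait.jpg")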