class ParserBS(AbstractParser):
    """The custom parser over BeautifulSoup"""

    def __init__(self, html_raw: str, parser_bs_type: str = "html.parser"):
        self.html_parsed = BeautifulSoup(html_raw, parser_bs_type)

    @property
    def html_raw(self) -> str:
        return self.html_parsed.__str__()

    @cached_property
    def title(self) -> str:
        title = self.html_parsed.find("title")
        return title and title.text or ""

    @cached_property
    def anchor_nodes(self) -> Iterable[ResultSet]:
        return self.html_parsed.find_all("a", attrs={"href": True})

    def get_related_anchors_href(self) -> Iterable[str]:
        collection: Set[str] = set()
        for node in self.anchor_nodes:
            href: str = node.attrs.get("href")
            if not ParserBS._is_href_url_related(href):
                continue
            collection.add(href)
        return collection

    def __repr__(self):
        return self.html_parsed.__repr__()
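# A minimal usage sketch for ParserBS above; it assumes the class is importable
# together with its AbstractParser base and the ParserBS._is_href_url_related
# helper referenced by get_related_anchors_href() (both defined elsewhere).
# The URL is an arbitrary example.
import requests

html_raw = requests.get("https://example.com").text
parser = ParserBS(html_raw)
print(parser.title)                        # cached <title> text, or "" if absent
print(parser.get_related_anchors_href())   # hrefs that pass _is_href_url_related()
print(len(parser.html_raw))                # HTML serialized via soup.__str__()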
def catFromContrib(username):
    url = r'https://bn.wikipedia.org/w/index.php?title=%E0%A6%AC%E0%A6%BF%E0%A6%B6%E0%A7%87%E0%A6%B7:%E0%A6%85%E0%A6%AC%E0%A6%A6%E0%A6%BE%E0%A6%A8/' + username + '&offset=&limit=500'
    response = requests.get(url)
    div = BeautifulSoup(unescape(response.text), "html.parser").find_all(
        "ul", {"class": "mw-contributions-list"})
    pagenamelist = re.findall(
        r'<a.*?class="mw-contributions-title".*?>(.*?)</a>', div.__str__())
    pagenamelist = [page for page in pagenamelist if isMainspace(page)]
    pagenamelist = list(dict.fromkeys(pagenamelist))
    catlist = []
    for page in pagenamelist:
        response = requests.get('https://bn.wikipedia.org/w/api.php', params={
            'action': 'query',
            'format': 'json',
            'titles': page,
            'prop': 'revisions',
            'rvprop': 'content'
        }).json()
        data = next(iter(
            response['query']['pages'].values()))['revisions'][0]['*']
        l = re.findall(r'\[\[বিষয়শ্রেণী:(.*?)\]\]', data)
        catlist = catlist + l
    catlist = list(dict.fromkeys(catlist))
    return catlist
def get_detail_data(url_detail):
    time.sleep(2)
    page = urlopen(url_detail)
    soup = BeautifulSoup(page, 'html.parser')
    company_info = soup.findAll("div", {"class": "detail_intro"}).__str__()
    mobile = has_inside(re.findall(r'Mobile.+"([0-9\s]+)"', company_info))
    fax = has_inside(re.findall(r'Fax.+"([0-9\s]+)"', company_info))
    services = list_of_services(
        soup.findAll("a", {"class": "servicesglossary"}))
    # type of surveyor extraction
    tos = soup.findAll("div", {"class": "rCol"})[0].__str__().replace('\n', '')
    buss_type = has_inside(re.findall(r'<h4>Business type<\/h4><p>(.+?)<', tos))
    tos = soup.__str__().replace('\n', '')
    type_of_srv = type_of_surveyor(
        has_inside(re.findall(r'<h4>Type of surveyor<\/h4><p>(.+?)<', tos)))
    # managers
    tos = soup.findAll("p")
    for t in tos:
        if 'Mr' in t.getText() or 'Mrs' in t.getText():
            mng_list = list(mng.rstrip().lstrip()
                            for mng in t.getText().split('•'))
            for _ in range(5 - len(mng_list)):
                mng_list.append('')
            break
    else:
        mng_list = []
    contact1, contact2, contact3, contact4, contact5 = mng_list[:5]
    return mobile, fax, services, buss_type, type_of_srv, contact1, contact2, contact3, contact4, contact5
def crawl_article_content(url, title):
    res = requests.get(url)  # fetch the article content
    bs_obj = BeautifulSoup(res.content, "html.parser")
    while bs_obj.find("img", id="seccodeImage") is not None:
        WechatArticleCrawler.headers["Cookie"] = input("输入新的Cookie")  # prompt for a fresh Cookie
        bs_obj = BeautifulSoup(res.content, "html.parser")
        return ""
    if res.status_code == 404:
        print(url)
        return ""
    js_content = bs_obj.find("div", id="js_content")
    if js_content is None:
        return ""
    # WechatArticleCrawler.save_file(bs_obj.__str__(), title + ".html")  # save the raw page source
    parse_cnt = WechatArticleCrawler.parse_js_content(js_content)  # get the converted form
    # WechatArticleCrawler.save_file(parse_cnt, url[28:])
    cnt_file_name = url.replace("/", "").replace(":", "")
    WechatArticleCrawler.save_file(parse_cnt, cnt_file_name)
    # extract the article abstract and save it
    abstract_cnt = js_content.get_text()[:51].replace("\n", "")
    # WechatArticleCrawler.save_file(abstract_cnt, url[28:] + "_abstract")
    WechatArticleCrawler.save_file(abstract_cnt, cnt_file_name + "_abstract")
    image_url = re.search(WechatArticleCrawler.pattern, bs_obj.__str__()).group(1)
    return image_url
def clean(raw):
    t = BeautifulSoup(raw, "lxml").find('article')
    t.find('h2').decompose()
    [x.decompose() for x in t.find_all('a')]
    [x.decompose() for x in t.find_all('div', style=lambda v: v)]
    [x.decompose() for x in t.find_all('div', {'class': 'spacer'})]
    [x.decompose() for x in t.find_all('nav')]
    [x.decompose() for x in t.find_all('div', {'id': 'comments'})]
    [x.decompose() for x in t.find_all('div', {'class': 'helpers'})]
    [x.decompose() for x in t.find_all('div', {'class': 'cat'})]
    [x.decompose() for x in t.find_all('div', {'class': 'com'})]
    t = t.__str__().replace('\n\n\n\n', '').replace('<p>', '').replace('</p>', '')
    t = t.replace('</article>', '').replace('</div>', '').replace('\r', '')
    t = t.replace('<small>', '').replace('</small>', '')
    t = t.replace('<em>', '').replace('</em>', '')
    t = t.replace('<sup>', '').replace('</sup>', '')
    t = t.replace('<br style="clear:both;"/>', '')
    # special tokens
    t = t.replace('<article>', '[A]')
    t = t.replace('<div class="n">', '[N]')
    t = t.replace('<div class="b">', '[B]')
    t = t.replace('<div class="m1">', '[M1]')
    t = t.replace('<div class="m2">', '[M2]')
    t = t.replace('<div class="b2">', '[B2]')
    return t
def removeHtml(string):
    result = BeautifulSoup(string).findAll(text=True)
    strResult = str(result.__str__())
    strResult = strResult.strip("[u'")
    strResult = strResult.strip("']")
    strResult = strResult.strip("\n")
    strResult = strResult.strip("u'")
    return strResult
def get_content(url):
    r = requests.get(url).content
    str_content = r.decode('utf-8')
    soup = BeautifulSoup(str_content, 'html.parser')
    # content = soup.select('p')
    fp = open("contents.txt", "w", encoding='utf-8')
    # for c in content:
    fp.write(soup.__str__())
    fp.close()
def parse(self, code, filter=None):
    if filter is None:
        filter = BasicFilter()
    soup = BeautifulSoup(code, "html.parser")
    filter.clean(soup)
    # return soup.prettify(encoding=None);
    return soup.__str__()
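# A minimal, self-contained usage sketch of the parse() method above. The
# HtmlCleaner wrapper class and the BasicFilter body are illustrative
# assumptions; only the parse() logic itself comes from the snippet.
from bs4 import BeautifulSoup


class BasicFilter:
    def clean(self, soup):
        # one example cleaning rule: drop <script> tags in place
        for tag in soup.find_all("script"):
            tag.decompose()


class HtmlCleaner:
    def parse(self, code, filter=None):
        if filter is None:
            filter = BasicFilter()
        soup = BeautifulSoup(code, "html.parser")
        filter.clean(soup)
        return soup.__str__()


print(HtmlCleaner().parse("<p>hi<script>x()</script></p>"))  # -> <p>hi</p>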
def find_answer(name, keywords, path):
    logger.info("Start to find sentence.")
    f = open(path, 'r+', encoding='utf8')
    lines = f.read()
    soup = BeautifulSoup(lines)
    for s in soup('script'):
        s.extract()
    for s in soup('style'):
        s.extract()
    lines = soup.__str__()
    reg2 = re.compile('<[^>]*>')
    lines = reg2.sub('', lines)
    reg3 = re.compile('-->')
    lines = reg3.sub('', lines)
    reg4 = re.compile('&(\S)?gt')
    lines = reg4.sub('', lines)
    reg5 = re.compile('New!')
    lines = reg5.sub('', lines)
    reg6 = re.compile(' ')
    lines = reg6.sub('', lines)
    ans = lines.split("\n")
    content = []
    for sentence in ans:
        if not sentence == '':
            content.append(sentence)
        if sentence == '法律声明':
            break
    # print(content)
    num = 0
    sentence_num = 0
    aim_sentence = content[0]
    flag = 0
    for sentence in reversed(content):
        sentence_num += 1
        if sentence_num % 100 == 0:
            logger.info("Find {0:d} sentence.".format(sentence_num))
        if sentence == name:  # skip sentences identical to the node name
            flag = 1
            continue
        # if flag == 1:
        #     aim_sentence = sentence
        #     break
        match_key = jieba.analyse.extract_tags(sentence, topK=10, withWeight=True)  # extract keywords from the sentence
        match_key = [word for word in match_key if word not in stoplist]  # drop stop words
        if calculate_sentence_vector(keywords, match_key) > num:
            num = calculate_sentence_vector(keywords, match_key)
            aim_sentence = sentence
    logger.info("Sentence has already been found.")
    return aim_sentence
def process(self):
    self.cursor1.execute(
        """select page_id, text.old_text from page join text on text.old_id = page.page_latest where page_id = 18938265 OR page_id = 3732122"""
    )
    row1 = self.cursor1.fetchone()
    counter = 0
    parsed_content = creole2html(row1[1].decode("utf-8"))
    parsed_content = parsed_content.replace("&lt;", "<")
    parsed_content = parsed_content.replace("&gt;", ">")
    soup = BeautifulSoup(parsed_content)
    raw = nltk.clean_html(soup.__str__())
    row1 = self.cursor1.fetchone()
    parsed_content = creole2html(row1[1].decode("utf-8"))
    parsed_content = parsed_content.replace("&lt;", "<")
    parsed_content = parsed_content.replace("&gt;", ">")
    soup = BeautifulSoup(parsed_content)
    raw2 = nltk.clean_html(soup.__str__())
    # lda.print_lda([raw],[raw2])
    print lda.get_similarity(raw, raw2)
    counter += 1
    self.cursor1.close()
    self.db.close()
def parse_article(article_content):
    article_soup = BeautifulSoup(article_content, 'html.parser')
    for table in article_soup.find_all('table', {'class': 'infobox'}):
        table.decompose()
    [
        a.decompose()
        for a in article_soup.find_all('a', {'class': 'mw-jump-link'})
    ]
    [a.decompose() for a in article_soup.find_all('a', {'class': 'image'})]
    article_soup.find('div', id='catlinks').decompose()
    html_test = os.path.join('/tmp', 'test.html')
    with open(html_test, 'w') as f:
        f.write(article_soup.__str__())
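# A minimal driver sketch for parse_article() above; the article URL is an
# arbitrary example and requests is assumed to be available.
import requests

resp = requests.get("https://en.wikipedia.org/wiki/Beautiful_Soup_(HTML_parser)")
parse_article(resp.text)  # strips infoboxes, jump/image links and catlinks, then writes /tmp/test.html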
def recent_stats(category: str, page_num: int) -> str:
    """
    1. Load the data from DataCollection/TwitterStatsBatch/var/UraakaPickUp/recent.html
    2. JS is needed to fit the content to the screen, and the soup is needed in order to inject it
    """
    if isinstance(page_num, str):
        page_num = int(page_num)
    with open(
            f'{TOP_DIR}/DataCollection/TwitterStatsBatch/var/UraakaPickUp/recents/recent_{category}_50000_{page_num}.html'
    ) as fp:
        html = fp.read()
    soup = BeautifulSoup(html, "lxml")
    # print(BeautifulSoup(ResponsibleDevices.responsible_devices(), "lxml"))
    soup.find("body").insert(
        0, BeautifulSoup(ResponsibleDevices.responsible_devices(), "lxml"))
    return soup.__str__()
def inCategory(categoryname):
    http = re.findall(r'https://', categoryname)
    if http:
        url = categoryname
    else:
        url = 'https://bn.wikipedia.org/wiki/' + categoryname
    response = requests.get(url)
    l = BeautifulSoup(unescape(response.text),
                      "html.parser").find_all("div", {"id": "mw-pages"})
    pages = re.findall('<li><a.*?>(.*?)</a></li>', l.__str__())
    next = [
        i.get("href")
        for i in BeautifulSoup(unescape(response.text), "html.parser").find_all("a")
        if i.text == "পরবর্তী পাতা"
    ]
    if next:
        pages = pages + inCategory('https://bn.wikipedia.org/' + next[0])
    return pages
def load_player_stat(url):
    # PhantomJS files have different extensions
    # under different operating systems
    if platform.system() == 'Windows':
        PHANTOMJS_PATH = './phantomjs.exe'
    else:
        PHANTOMJS_PATH = './phantomjs'
    # here we'll use the pseudo browser PhantomJS,
    # but it can be replaced with browser = webdriver.Firefox(),
    # which is good for debugging.
    browser = webdriver.PhantomJS(PHANTOMJS_PATH)
    # browser.get('http://www.basketball-reference.com/leagues/NBA_2017.html#team-stats-base::none')
    browser.get(url)
    # let's parse our html
    soup = BeautifulSoup(browser.page_source, "html.parser")
    browser.quit()
    ps_df = pd.read_html(soup.__str__())[0]
    # use first row as header
    ps_df.columns = ps_df.iloc[0]
    ps_df = ps_df.reindex(ps_df.index.drop(0))
    # remove special characters in player names
    ps_df['Player'] = ps_df['Player'].str.replace("[',.-]", '')
    # normalize names
    ps_df['Player'] = ps_df['Player'].str.replace('Jose Juan Barea', 'JJ Barea')
    ps_df['Player'] = ps_df['Player'].str.replace('Glenn Robinson', 'Glenn Robinson III')
    ps_df['Player'] = ps_df['Player'].str.replace('Kelly Oubre', 'Kelly Oubre Jr')
    ps_df['Player'] = ps_df['Player'].str.replace('Nene', 'Nene Hilario')
    ps_df['Player'] = ps_df['Player'].str.replace('Juan Hernangomez', 'Juancho Hernangomez')
    ps_df['Player'] = ps_df['Player'].str.replace('Willy Hernangomez', 'Guillermo Hernangomez')
    ps_df['Player'] = ps_df['Player'].str.replace('Luc Mbah a Moute', 'Luc Richard Mbah a Moute')
    return ps_df
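# A minimal usage sketch for load_player_stat() above. The URL (a
# basketball-reference player-totals page) and the column names printed below
# are assumptions; only the function itself comes from the snippet.
stats_df = load_player_stat(
    'http://www.basketball-reference.com/leagues/NBA_2017_totals.html')
print(stats_df[['Player', 'Tm', 'PTS']].head())  # assumes these columns exist in the scraped table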
def cleanup(self, data):
    data = data.replace('</uniqueindex1>', '</uniqueindex>')
    data = data.replace('</fulltextindex2>', '</fulltextindex>')
    data = data.replace('</index3>', '</index>')
    data = data.replace('<uniqueindex1 ', '<uniqueindex ')
    data = data.replace('<fulltextindex2 ', '<fulltextindex ')
    data = data.replace('<index3 ', '<index ')
    data = data.replace(strComma, ",")
    data = data.replace("\n>", ">")
    data = data.replace(strReferenced_tbl_End, "")
    soup = BeautifulSoup(data)
    soup.prettify()
    data = soup.__str__()
    data = data.replace("<property", "\t<property")
    data = data.replace("<primarykey", "\t<primarykey")
    data = data.replace("</primarykey", "\t</primarykey")
    data = data.replace("<index", "\t<index")
    data = data.replace("</index", "\t</index")
    data = data.replace("<column", "\t\t<column")
    data = data.replace("<constraint", "\t<constraint")
    data = data.replace("</constraint", "\t</constraint")
    data = data.replace("<reference", "\t\t<reference")
    data = data.replace("></entity>", ">\n</entity>")
    data = data.replace("<uniqueindex", "\t<uniqueindex")
    data = data.replace("</uniqueindex", "\t</uniqueindex")
    data = data.replace(",=\"\"", "")
    data = data.replace(",=\"\" ", "")
    data = data.replace("'=\"\" ", "")
    newdata = data.replace("\n\n", "\n")
    while data != newdata:
        data = newdata
        newdata = data.replace("\n\n", "\n")
    return data
def handler_audio(act1):
    #query = input('What are you searching for: ')
    url = 'http://www.google.com/search?q='
    page = requests.get(url + str(act1.name))
    soup = BeautifulSoup(page.text, 'html.parser')
    h3 = soup.find_all("h3", class_="r")
    for elem in h3:
        pb['value'] = 100
        time.sleep(1.5)
        elem = elem.contents[0]
        link = ("https://www.google.com" + elem["href"])
        if link.find('music.yandex.ru') != -1:
            print('Его нельзя: ' + link)
        elif link.find('youtube') != -1:
            print('Его нельзя: ' + link)
        elif link.find('text-lyrics.ru') != -1 or link.find('genius.com') != -1:
            f = open('pir.txt', 'a')
            f.write(link + '\n')
            f.close()
            print('Яма: ' + link)
        else:
            print(link)
            response = requests.get(link)
            soup = BeautifulSoup(response.text, 'html.parser').find('div', class_='download')
            print(soup)
            if soup != None:
                soup = soup.__str__()
                for i in BeautifulSoup(soup, 'html.parser').find_all('a', href=True):
                    wget.download(i['href'], act1.name + '_test.mp3')
                    audio = MP3(act1.name)
                    print("Track: " + audio.get("TIT2").text[0])
                    print('Lenght: ' + str(audio.info.length))
                    print('Info: ' + audio.info.pprint())
                    audio2 = MP3(act1.name + "_test.mp3")
                    print('Info: ' + audio2.info.pprint())
                    if audio2.get("TIT2") == audio.get("TIT2") and audio2.info.length == audio.info.length and audio2.info.pprint() == audio.info.pprint():
                        print("Это подлинный")
                        label['text'] = "Это подлинный"
                    else:
                        print('Пиратская копия')
                        label['text'] = 'Пиратская копия'
                        f = open('pir.txt', 'a')
                        f.write(link + '\n')
                        f.close()
                    print(i['href'])
    window = Tk()
    window.title("СПИСОК САЙТОВ С ПИРАТСКИМ КОНТЕНТОМ")
    window.geometry("600x150")
    window.resizable(False, False)
    f = open('pir.txt', 'r')
    for line in f.readlines():
        print(line)
    f.close()
    window.mainloop()
# print(5//2)
import requests
from bs4 import BeautifulSoup

res = requests.get(r'https://sspai.com/post/23631')
soup = BeautifulSoup(res.text, 'lxml')
print(soup.__str__())
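# For the snippet above, str(soup) is equivalent to soup.__str__() and is the
# more idiomatic spelling; soup.prettify() returns an indented version instead.
# A minimal sketch:
from bs4 import BeautifulSoup

soup = BeautifulSoup("<p>hi<b>!</b></p>", "html.parser")
print(str(soup))         # -> <p>hi<b>!</b></p>
print(soup.prettify())   # same markup, one tag per line with indentation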
with open('/tmp/prince.input.html', 'w') as f:
    f.write(des.format(title=title, content=content))

options = Options()
profile = webdriver.FirefoxProfile()
profile.set_preference('permissions.default.image', 2)
profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
ff = webdriver.Firefox(options=options, firefox_profile=profile)
ff.get('file:///tmp/prince.input.html')
while True:
    time.sleep(0.1)
    if ff.execute_script('return document.readyState') == 'complete':
        break
soup = BeautifulSoup(ff.page_source, 'lxml')
ff.quit()
scripts = soup.find_all('script')
for elem in scripts:
    elem.extract()
with open('output.html', 'w') as f:
    f.write(soup.__str__())
subprocess.call(['prince', 'output.html'])
os.remove('output.html')
os.remove('geckodriver.log')
def yj_html_replace(html: str, digest: str) -> str:
    """ Convert to the required format and return the HTML """
    soup = BeautifulSoup(html, 'html5lib')
    try:
        for a in soup.find('head').find_all('script'):
            a.decompose()
        for a in soup.find('body').find_all('script'):
            a.decompose()
        for a in soup.find_all('iframe'):
            a.decompose()

        """ Added 2020/06/09 """
        if soup.find(attrs={"id": "msthd"}):
            soup.find(attrs={"id": "msthd"}).decompose()
        if soup.find(attrs={"id": "yjnHeader_nav"}):
            soup.find(attrs={"id": "yjnHeader_nav"}).decompose()
        if soup.find(attrs={"id": "uamods-also_read"}):
            soup.find(attrs={"id": "uamods-also_read"}).decompose()
        if soup.find(attrs={"id": "newsFeed"}):
            soup.find(attrs={"id": "newsFeed"}).decompose()
        if soup.find(attrs={"id": "yjSLink"}):
            soup.find(attrs={"id": "yjSLink"}).decompose()

        """ Drop any section that contains the string 【関連記事】 (related articles) """
        for section in soup.find_all("section"):
            if "【関連記事】" in section.__str__():
                section.decompose()

        """ comment disable button """
        if soup.find(attrs={"class": "checkbox"}):
            soup.find(attrs={"class": "checkbox"}).decompose()

        """ 2020/06 old removal rule set """
        for key, value in [("class", "listPaneltype"), ("class", "mainYdn"),
                           ("id", "timeline"), ("id", "yjSLink"),
                           ("class", "ynDetailRelArticle"), ("class", "commentBox"),
                           ("id", "contentsFooter"), ("id", "footer"),
                           ("id", "stream_title"), ("id", "contentsHeader"),
                           ("id", "yjnFooter")]:
            if soup.find(attrs={key: value}):
                soup.find(attrs={key: value}).decompose()

        """ Remove the href from the image caption """
        if soup.find(attrs={"class": "photoOffer"}):
            del soup.find(attrs={"class": "photoOffer"}).find("a")["href"]

        """ Remove links inside the contents """
        for key, value in [("id", "uamods"), ("id", "paragraph")]:
            paragraph = soup.find(attrs={key: value})
            if paragraph is None:
                continue
            for a in paragraph.find_all("a"):
                # if a.get("href"):
                #     del a["href"]
                """ change a -> span """
                a.name = "span"

        """ Remove the decoration of text links """
        for a in soup.find_all(attrs={"class": "yjDirectSLinkHl"}):
            del a["class"]

        """ Switch the font to the Mincho web font """
        soup.find("head").insert(-1, BeautifulSoup('<link href="https://fonts.googleapis.com/css?family=Noto+Serif+JP:400,700&display=swap&subset=japanese" rel="stylesheet">', 'lxml'))
        soup.find("body")["style"] = "font-family: 'Noto Serif JP' !important;"

        """ Suppress click events fired via JavaScript """
        for a in soup.find_all("a", {"onmousedown": True}):
            del a["onmousedown"]

        """ Remove part of the stylesheet """
        # soup.find(attrs={"href": "https://s.yimg.jp/images/jpnews/cre/article/pc/css/article_pc_v7.0.css"}).decompose()

        """ Parse the following pages and merge them in """
        next_page_li = soup.find("li", attrs={"class": "next"})
        if next_page_li is None:
            next_page_li = soup.find("li", attrs={"class": "pagination_item pagination_item-next"})
        if next_page_li and next_page_li.find("span"):
            next_paragraphs: List[BeautifulSoup] = []
            get_nexts(next_page_li.find("span").get("href"), next_paragraphs)
            print("total page size", len(next_paragraphs))
            for idx, next_paragraph in enumerate(next_paragraphs):
                if soup.find(attrs={"class": "articleMain"}):
                    soup.find(attrs={"class": "articleMain"}).insert(-1, next_paragraph)
                    soup.find(attrs={"class": "articleMain"}).insert(-1, BeautifulSoup(f"""<p align="center"> Page {idx+2} </p>""", "lxml"))
                elif soup.find(attrs={"id": "uamods"}):
                    soup.find(attrs={"id": "uamods"}).insert(-1, next_paragraph)
                    soup.find(attrs={"id": "uamods"}).insert(-1, BeautifulSoup(f"""<p align="center"> Page {idx+2} </p>""", "lxml"))
                # print(next_paragraph)

        """ Remove the footer that shows the page number """
        # soup.find(attrs={"class": "marT10"}).decompose()
        # soup.find(attrs={"class": "fdFt"}).decompose()

        """ Remove the pagination controls """
        if soup.find(attrs={"class": "pagination_items"}):
            for pagination_item in soup.find_all(attrs={"class": "pagination_items"}):
                pagination_item.decompose()

        """ Remove every footer except the last one """
        footers = soup.find_all("footer")
        if footers.__len__() >= 2:
            for footer in footers[:-1]:
                footer.decompose()

        """ Remove the '次ページは:' (next page is:) label """
        for a in soup.find_all("a", attrs={"class": re.compile("sc-.*?")}):
            if "次ページは:" in a.__str__():
                a.decompose()

        """ remove headers without head """
        for header in soup.find_all("header")[2:]:
            header.decompose()

        """ Insert a link back to the original URL """
        original_url = soup.find("meta", attrs={"property": "og:url"}).get("content")
        if soup.find(attrs={"class": "contentsWrap"}):
            soup.find(attrs={"class": "contentsWrap"}).insert(-1, BeautifulSoup(f"""<a href="{original_url}"><p align="center">オリジナルURL</p></a>""", "lxml"))
        # paragraph.find("a", {"class":None, "href":True}).decompose()
    except Exception as exc:
        tb_lineno = sys.exc_info()[2].tb_lineno
        print(f'[{FILE}] decompose error, exc = {exc}, tb_lineno = {tb_lineno}', file=sys.stderr)

    print(f'[{FILE}] accessing to {TOP_DIR}/var/YJ/comments/{digest}', file=sys.stdout)
    comment_html = ''
    comment_html_below = ''
    fns = sorted(glob.glob(f'{TOP_DIR}/var/YJ/comments/{digest}/*.pkl'))
    if len(fns) == 0:
        comment_html = '誰もコメントしていません'
    else:
        # last is the latest comment
        fn = fns[-1]
        with open(fn, 'rb') as fp:
            try:
                comments: YJComment = pickle.load(fp)
            except EOFError as exc:
                tb_lineno = sys.exc_info()[2].tb_lineno
                print(f"[{FILE}] exc = {exc}, tb_lineno = {tb_lineno}", file=sys.stderr)
                Path(fn).unlink()
                comments = []
        for comment in list(reversed(sorted(comments, key=lambda x: x.ts)))[:20]:
            tmp = f'''<div class="comment">
    <div class="username">😃{comment.username}</div>
    <div class="text">{comment.comment}</div>
    <div class="ts-view" style="font-size:xx-small;text-align:right;">{comment.ts}</div>
    <div class="good-bad">👍x{comment.good} 👎x{comment.bad}</div>
    </div><br>'''
            comment_html += tmp
        for comment in list(reversed(sorted(comments, key=lambda x: x.ts)))[20:]:
            tmp = f'''<div class="comment">
    <div class="username">😃{comment.username}</div>
    <div class="text">{comment.comment}</div>
    <div class="ts-view" style="font-size:xx-small;text-align:right;">{comment.ts}</div>
    <div class="good-bad">👍x{comment.good} 👎x{comment.bad}</div>
    </div><br>'''
            comment_html_below += tmp

    """ 1. Also include comments from users logged in to this site
        2. Also render the comment section
        3. They are stored as JSON under {TOP_DIR}/var/YJ/YJComment/{digest}, one file per timestamp """
    this_site_comments = ""
    for fn in reversed(sorted(glob.glob(f"{TOP_DIR}/var/YJ/YJComment/{digest}/*"))):
        obj = json.load(open(fn))
        tmp = f'''<div class="comment">
    <div class="username">😃{obj["screen_name"]}</div>
    <div class="text">{obj["YJComment"]}</div>
    <div class="ts-view" style="font-size:xx-small;text-align:right;">{obj["datetime"]}</div>
    <div class="good-bad">👍x{0} 👎x{0}</div>
    </div><br>'''
        this_site_comments += tmp

    try:
        # print(soup)
        if soup.find("div", {"id": "sub"}) is not None:
            target_id = "sub"
        else:
            target_id = "yjnSub"
        with open(f"{HOME}/tmp", "w") as fp:
            fp.write(soup.__str__())
        soup.find('div', {'id': target_id}).string = ''
        soup.find('div', {'id': target_id}).insert(1, BeautifulSoup(comment_html, 'html5lib'))
        if soup.find(attrs={"id": "contentsWrap"}):
            target_id = "contentsWrap"
        else:
            target_id = "main"
        soup.find('div', {'id': target_id}).append(BeautifulSoup(get_form_html(digest), 'html5lib'))
        soup.find('div', {'id': target_id}).append(BeautifulSoup(this_site_comments, 'html5lib'))
        soup.find('div', {'id': target_id}).append(BeautifulSoup(comment_html_below, 'html5lib'))
    except Exception as exc:
        tb_lineno = sys.exc_info()[2].tb_lineno
        print(f'[{FILE}] exc = {exc}, tb_lineno = {tb_lineno}', file=sys.stderr)
        return f"[{FILE}] Cannot handle this page, exc = {exc}, tb_lineno = {tb_lineno}"
    return str(soup)
def get_links_from_manga_page(self):
    if not os.path.exists("Links"):
        os.mkdir("Links")
    if self.__manga_page_link is None:
        raise ValueError("panggil set_manga_page_link(manga_page_link) dulu")
    r = requests.get(self.__manga_page_link, headers=header, timeout=10, stream=True)
    content = r.text
    banner_link_image = ''
    links = {}
    if "komikcast" in self.__manga_page_link:
        span_elements = BS(content, 'html.parser').findAll('span', {'class': 'leftoff'})
        a_elements = BS(span_elements.__str__(), 'html.parser').find_all('a')
        for a in a_elements:
            if 'end' in a.text.lower():
                links[a.text + ' END'] = a.attrs['href']
            else:
                links[a.text] = a.attrs['href']
        banner_link_image = BS(content, 'html.parser').find(
            'img', {'class': 'attachment-post-thumbnail'}).attrs['src']
    elif "komikgue" in self.__manga_page_link:
        a_elements = BS(content, 'html.parser').findAll(
            'a', {'style': 'text-decoration:none;'})
        span_elements = BS(a_elements.__str__(), 'html.parser').find_all('span')
        for a, span in zip(a_elements, span_elements):
            if 'end' in a.text.lower():
                links["Chapter {} END".format(span.text)] = a.attrs['href']
            else:
                links["Chapter {}".format(span.text)] = a.attrs['href']
        banner_link_image = \
            BS(content, 'html.parser').find('img', {'class': 'img-responsive', 'itemprop': 'image'}).attrs['src']
    elif "komikone" in self.__manga_page_link:
        span_elements = BS(content, 'html.parser').findAll('span', {'class': 'lchx'})
        a_elements = BS(span_elements.__str__(), 'html.parser').find_all('a')
        for a in a_elements:
            if 'end' in a.text.lower():
                links[a.text + ' END'] = a.attrs['href']
            else:
                links[a.text] = a.attrs['href']
        banner_link_image = BS(content, 'html.parser').find(
            'img', {'class': 'attachment-post-thumbnail'}).attrs['src']
    elif "mangazuki" in self.__manga_page_link or "yomanga" in self.__manga_page_link:
        li_elements = BS(content, 'html.parser').find_all(
            'li', {'class': 'wp-manga-chapter'})
        a_elements = BS(li_elements.__str__(), 'html.parser').find_all('a')
        a_elements = [a for a in a_elements if a.text]
        for a in a_elements:
            chapter_number = re.search(regex_number, a.text).group(0)
            lin = a.attrs['href']
            if "style=list" not in lin:
                lin += "?style=list"
            if 'end' in a.text.lower():
                links["Chapter {} END".format(chapter_number)] = lin
            else:
                links["Chapter {}".format(chapter_number)] = lin
        try:
            banner_link_image = BS(content, 'html.parser').find(
                'div', {'class': 'summary_image'}).__str__()
            banner_link_image = BS(banner_link_image, 'html.parser').find('img')
            if banner_link_image is not None:
                banner_link_image = banner_link_image.attrs['data-src']
            else:
                banner_link_image = BS(content, 'html.parser').find(
                    'div', {'class': 'c-blog__thumbnail'})
                banner_link_image = banner_link_image.find('img')
                banner_link_image = banner_link_image.attrs['data-src']
        except Exception:
            pass
    else:
        raise RuntimeError("Sumber manga tidak didukung")
    if banner_link_image:
        if not os.path.exists(self.__main_dir + self.__manga_name):
            os.mkdir(self.__main_dir + self.__manga_name)
        file_banner = self.__main_dir + self.__manga_name + "/1." + self.__manga_name + ".jpg"
        if not os.path.isfile(file_banner):
            result, url_image_banner, content = urlretrieve(banner_link_image)
            if result:
                with open(file_banner, 'wb') as f:
                    f.write(content)
                # im = PIL.Image.open(file_banner+".jpg")
                # icon_sizes = [(16, 16), (24, 24), (32, 32), (48, 48), (64, 64), (128, 128), (255, 255)]
                # im.save(file_banner+".ico", sizes=icon_sizes)
    links = sorted(links.items(), key=cmp_to_key(sort))
    self.__link = links
    with open(self.manga_dir + "chapters.json", 'w') as f:
        json.dump(links, f, indent=4)
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200
            and content_type is not None
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. This function just prints them,
    but you can make it do anything.
    """
    print(e)


raw_html = simple_get('https://www.davidjaybuckley.com')
# print(raw_html)
html = BeautifulSoup(raw_html, 'html.parser')
f = open("output.html", "w")
f.write(html.__str__())
# print(html)
for i, row in enumerate(html.select('.students div.row')):
    print(i, row.text)
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

scrapeUrl = "https://weixin.sogou.com/weixin?p=01030402&query=%E6%A5%BC%E5%B8%82%E5%8F%82%E8%80%83&type=2&ie=utf8"

req = urllib.request.Request(scrapeUrl)
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
response = urllib.request.urlopen(req)
html = response.read()

bsObj = BeautifulSoup(html, "html.parser")
print(bsObj.__str__())
# set the oldest page number
#1
page_S = div[0].select('a')[0].get('href').strip()[div[0].select('a')[0].get('href').find('index') + 5:
                                                   div[0].select('a')[0].get('href').find('.')]
# set the second-newest page number
page_E = div[0].select('a')[1].get('href').strip()[div[0].select('a')[1].get('href').find('index') + 5:
                                                   div[0].select('a')[1].get('href').find('.')]
print(page_S)
category_end = 0
for tnum in range(int(page_E) + 1, int(page_S) - 1, -1):
    print("頁數: " + str(tnum))
    res = rs.get('https://www.ptt.cc/bbs/' + website + '/index' + str(tnum) + '.html')
    outer_soup = BeautifulSoup(res.text, 'html.parser')
    div = outer_soup.findAll('div', {'class': 'r-ent'})
    # exclude the pinned "hot article" block when counting entries
    page_span = re.sub('<div class=.r-list-sep.>(.*\n.*)*', '', outer_soup.__str__())
    gerneral_tile_num = page_span.count('r-ent')
    for item in div:
        print(gerneral_tile_num)
        # hot articles are not part of the index page proper, so stop there
        if gerneral_tile_num == 0:
            break
        gerneral_tile_num = gerneral_tile_num - 1
        # skip articles that have been deleted
        if u'刪除)' in item.findAll('div', {'class': 'title'})[0].text or '-' == item('div', {'class': 'author'})[0].text:
            #u'[公告]' in item.findAll('div', {'class': 'title'})[0].text or\
            #u'[公告]' in item.findAll('div', {'class': 'title'})[0].text or\
            continue
def tweet_hyoron_(day_name: str, digest: str) -> str:
    """
    Args:
        - day_name: digests are grouped into per-<day_name> folders (possibly redundant)
        - digest: digest of the tweet being reviewed
    Returns:
        - html: HTML
    POSTs:
        - TweetComment: str
    """
    if request.method == 'POST':
        obj = request.form
        if obj.get("TweetComment"):
            TweetComment = obj["TweetComment"]
            out_dir = f"{TOP_DIR}/var/Twitter/TweetComment/{digest}"
            Path(out_dir).mkdir(exist_ok=True, parents=True)
            now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            with open(f"{out_dir}/{now}", "w") as fp:
                if twitter.authorized:
                    json.dump({"TweetComment": TweetComment, "datetime": now,
                               "screen_name": twitter.token["screen_name"]}, fp, ensure_ascii=False)
                else:
                    json.dump({"TweetComment": TweetComment, "datetime": now,
                               "screen_name": "名無しちゃん"}, fp, ensure_ascii=False)
    head = '<html><head><title>Twitter評論</title></head><body>'
    body = ''
    with open(f'{TOP_DIR}/var/Twitter/tweet/{day_name}/{digest}') as fp:
        html = fp.read()
    soup = BeautifulSoup(html, features='lxml')
    div = soup.find('body').find('div')
    if div.find(attrs={'class': 'EmbeddedTweet'}):
        div.find(attrs={'class': 'EmbeddedTweet'})["style"] = "margin: 0 auto; margin-top: 30px;"
    imagegrids = soup.find_all('a', {'class': 'ImageGrid-image'})
    for imagegrid in imagegrids:
        src = imagegrid.find('img').get('src')
        imagegrid['href'] = src
    mediaassets = soup.find_all('a', {'class': 'MediaCard-mediaAsset'})
    for mediaasset in mediaassets:
        if mediaasset.find('img') and mediaasset.find('img').get('alt') != 'Embedded video':
            mediaasset['href'] = mediaasset.find('img').get('src')
    tweetCommentSubmitContainer = BeautifulSoup(soup.find(attrs={"class": "SandboxRoot"}).__str__(), "lxml")
    tweetCommentSubmitContainer.find(attrs={"class": "Tweet-body"}).clear()
    tweetCommentSubmitContainer.find(attrs={"class": "CallToAction"}).clear()
    comment_html = f"""
    <form action="/TweetHyoron/{day_name}/{digest}" class="form" method="post" style="position: relative;"><textarea value="コメント" name="TweetComment" rows="5" id="TweetComment" style="width: 90%; margin: 0 auto; margin-left:5%; margin-top: 5px;" ></textarea><br/>
    <input type="submit" name="TweetSubmit" value="Submit" style="-webkit-appearance: none;-webkit-border-radius: 4px;-moz-border-radius: 4px;-ms-border-radius: 4px;-o-border-radius: 4px;border-radius: 4px;-webkit-background-clip: padding;-moz-background-clip: padding;margin: 0;padding: 3px 10px;text-shadow: white 0 1px 1px;text-decoration: none;vertical-align: top;width: auto; margin-right: 10%; margin-left:80%;">
    </form>
    """
    tweetCommentSubmitContainer.find(attrs={"class": "EmbeddedTweet-tweetContainer"}).insert(-1, BeautifulSoup(comment_html, "lxml"))
    tweetCommentSubmitContainer.find(attrs={"class": "TweetAuthor-name"}).string = f"コメントする"
    tweetCommentSubmitContainer.find(attrs={"class": "TweetAuthor-screenName"}).string = f"@concertion"
    for a in tweetCommentSubmitContainer.find_all("a", {"class": "Tweet-header", "href": True}):
        del a["href"]
        a.name = "p"
    tweetCommentSubmitContainer.find(attrs={"class": "Avatar"})["src"] = "https://pm1.narvii.com/6923/51394fd5f6e385f59bb51efa0f409e253e718a69r1-2048-1536v2_00.jpg"
    buzz_css = soup.find('body').find('style').__str__() if soup.find('body').find('style') else ""

    """ Parse the comments on the tweet
        TODO: needs styling
        TODO: should be factored out """
    comments = []
    for fn in reversed(sorted(glob.glob(f'{TOP_DIR}/var/Twitter/TweetComment/{digest}/*'))):
        try:
            obj = json.load(open(fn))
            comment = f'''<div class="TweetComment">
    <p>{obj["screen_name"]}</p><br/>
    <p>{obj["datetime"]}</p><br/>
    <p>{obj["TweetComment"]}</p><br/>
    </div>'''
            comments.append(comment)
        except Exception as exc:
            print(f"[{FILE}] exc = {exc}", file=sys.stderr)
            Path(fn).unlink()
    other_comments_html = "".join(comments)
    body += div.__str__() + buzz_css + tweetCommentSubmitContainer.__str__() + buzz_css + other_comments_html
    tail = '</body></html>'
    html = head + body + tail
    return html
elem = elem.contents[0]
link = ("https://www.google.com" + elem["href"])
if link.find('music.yandex.ru') != -1:
    print('Его нельзя: ' + link)
elif link.find('youtube') != -1:
    print('Его нельзя: ' + link)
elif link.find('text-lyrics.ru') != -1:
    print('Яма: ' + link)
else:
    print(link)
    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'html.parser').find('div', class_='download')
    print(soup)
    if soup != None:
        soup = soup.__str__()
        for i in BeautifulSoup(soup, 'html.parser').find_all('a', href=True):
            wget.download(i['href'], 'Oxxymiron_where_test.mp3')
            audio = MP3("Oxxymiron_where_test.mp3")
            print("Track: " + audio.get("TIT2").text[0])
            #try: print("Text: " + audio.get("USLT"))
            #except AttributeError: print('Нет текста')
            print('Lenght: ' + str(audio.info.length))
            print('Info: ' + audio.info.pprint())
            audio2 = MP3("Oxxymiron_where.mp3")
            if audio2.get("TIT2") == audio.get("TIT2") and audio2.info.length == audio.info.length and audio2.info.pprint(
def reflect_html(key: int, day: str, digest: str) -> Union[None, bool]:
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait

    """ 1. Skip if an already-processed file exists """
    out_filename = f"{TOP_DIR}/var/Twitter/tweet/{day}/{digest}"
    if Path(out_filename).exists():
        return True
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("window-size=1024x1024")
    options.add_argument(f"user-data-dir=/tmp/{FILE.replace('.py', '')}_{key:06d}")
    options.binary_location = shutil.which("google-chrome")
    try:
        driver = webdriver.Chrome(executable_path=shutil.which("chromedriver"), options=options)
        driver.get(f"http://localhost/twitter/input/{day}/{digest}")
        print('debug', f"http://localhost/twitter/input/{day}/{digest}")
        html = driver.page_source
        time.sleep(5)
        html = driver.page_source
        driver.save_screenshot(f"/home/gimpei/{digest}.png")
        driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))
        # elm = driver.find_element_by_xpath("/html")
        time.sleep(1)
        inner_html = driver.page_source
        # print("inner", inner_html)
        # inner_html = driver.page_source
        # print(html)
        """ get shadow-root """
        # elm = driver.execute_script("""return document.querySelector("twitter-widget").shadowRoot""")
        # elm = driver.execute_script("""return document.querySelector("twitter-widget").shadowRoot""")
        # inner_html = elm.get_attribute("innerHTML")
        cleaner = Cleaner(style=True, links=True, add_nofollow=True,
                          page_structure=False, safe_attrs_only=False)
        # print(inner_html)
        soup = BeautifulSoup(inner_html, "lxml")
        imported_csses = [
            el for el in soup.find_all("style", {"type": "text/css"})
        ]
        # replace css text with local css
        for css in imported_csses:
            if "@import url" in css.text:
                css_url = re.search(r'url\("(.*?)"\)', css.text).group(1)
                css_digest = GetDigest.get_digest(css_url)
                # print(css_url, css_digest)
                with requests.get(css_url) as r:
                    css_text = r.text
                Path(f"{TOP_DIR}/var/Twitter/css").mkdir(exist_ok=True, parents=True)
                with open(f"{TOP_DIR}/var/Twitter/css/{css_digest}", "w") as fp:
                    fp.write(css_text)
                css.string = f'@import url("/twitter/css/{css_digest}")'
        # replace image src
        for img in soup.find_all(attrs={"src": True}):
            url = img.get("src")
            o = urlparse(url)
            if o.scheme == "":
                o = o._replace(scheme="https")
            url = o.geturl()
            url_digest = GetDigest.get_digest(url)
            if "format=jpg" in url or re.search(".jpg$", url) or re.search(".jpeg$", url) or re.search(".JPG$", url):
                with requests.get(url, timeout=30) as r:
                    binary = r.content
                Path(f"{TOP_DIR}/mnt/twitter_jpgs").mkdir(exist_ok=True, parents=True)
                with open(f"{TOP_DIR}/mnt/twitter_jpgs/{url_digest}", "wb") as fp:
                    fp.write(binary)
                # print(f"downloaded! {TOP_DIR}/mnt/twitter_jpgs/{url_digest}")
                img["src"] = f"/twitter/jpgs/{url_digest}"
            elif "format=png" in url or re.search(".png$", url):
                with requests.get(url, timeout=30) as r:
                    binary = r.content
                Path(f"{TOP_DIR}/var/Twitter/pngs").mkdir(exist_ok=True, parents=True)
                with open(f"{TOP_DIR}/var/Twitter/pngs/{url_digest}", "wb") as fp:
                    fp.write(binary)
                img["src"] = f"/twitter/pngs/{url_digest}"
            elif "normal" in url or ".js" in url or ".svg" in url:
                continue
            else:
                continue
                # raise Exception(f"unsupported image! url={url}")
        """ adhoc style edit """
        if soup.find(attrs={"class": "EmbeddedTweet"}):
            soup.find(attrs={"class": "EmbeddedTweet"})["style"] = "margin: 0 auto; margin-top: 150px;"
        out_dir = f"{TOP_DIR}/var/Twitter/tweet/{day}"
        Path(out_dir).mkdir(exist_ok=True, parents=True)
        with open(f"{out_dir}/{digest}", "w") as fp:
            fp.write(soup.__str__())
        driver.close()
        # if E.get("DEBUG"):
        print(
            f"[{NAME}] ordinally done, day = {day} digest = {digest}, filename = {out_dir}/{digest}"
        )
    except Exception as exc:
        tb_lineno = sys.exc_info()[2].tb_lineno
        print(
            f"[{NAME}] exc = {exc}, tb_lineno = {tb_lineno}, day = {day}, digest = {digest}, filename = {out_filename}",
            file=sys.stderr)
        out_filename = f"{TOP_DIR}/var/Twitter/tweet/{day}/{digest}"
        Path(f"{TOP_DIR}/var/Twitter/tweet/{day}").mkdir(exist_ok=True, parents=True)
        # enable this to ignore items that failed to parse
        # Path(out_filename).touch()
        time.sleep(5)
        return None
    return f"/twitter/tweet/{day}/{digest}"
from bs4 import BeautifulSoup
from urllib.request import urlopen as ureq
import sys

# Wikipedia
url = "https://en.wikipedia.org/wiki/Main_Page"

# sending request to wikipedia
client = ureq(url)
page = client.read()
client.close()  # connection closed

# parsing the page
page_soup = BeautifulSoup(page, 'html.parser')
_page_str_ = page_soup.__str__()
print(len(_page_str_))
count = 0
final_str = ''
for c in _page_str_:
    try:
        print(c)
        final_str = final_str + c
    except UnicodeEncodeError:
        print(" Cannot Print this ")
        count = count + 1
print(final_str)
pex12.insert_after(pex13)
pex13.insert_after(pex14)
pex14.insert_after(pex15)
pex15.insert_after(pex16)
pex16.insert_after(pex17)
pex17.insert_after(pex18)
pex18.insert_after(pex19)
pex19.insert_after(pex20)
pex20.insert_after(pex21)
pex21.insert_after(pex22)
pex22.insert_after(pex23)
pex4.insert_after(pex24)

rm_me = soup.find("div", {"class": "loading-indicator"})
rm_me.extract()

# output_html = soup.prettify()
output_html = soup.__str__()

# Create unique ID
full_identifier_string = "{}_{}".format(
    os.path.basename(orig_html_file).split('.')[0],
    datetime.now().strftime("%d/%m/%Y-%H:%M:%S"))
unique_id = hashlib.md5(full_identifier_string.encode('utf-8')).hexdigest()

# Write files
with open(os.path.join(orig_html_dir, 'index.html'), 'w') as ofile:
    ofile.write(output_html)
with open(os.path.join(orig_html_dir, 'unique_id.txt'), 'w') as ofile:
    ofile.write(unique_id)
path = 'C:\\Users\\User\\Desktop\\webservv0'

# read the GET parameter "password" passed in from the HTML form
form = cgi.FieldStorage()
password = form.getfirst("password", "123123")
#password = '******'
if password == '123123':
    a = list()
    n = 0
    login = '******'
    user_secret = ''
    # org_id = ''
    auth_html = s.get('' + login + '&pass='******'html.parser')
    #token = ''
    token = token.__str__().replace('"', '')
    print('token = ', token)
    f = open(path + '\\cgi-bin\\mats\\tokenlogs.txt', 'a', encoding='UTF-8')
    f.write(str(token) + '\n')
    f.close()
    auth_html = s.get('' + token)
    soupez = BS(auth_html.text, 'html.parser')
    # print('Employees = ', soupez)
    f = open(path + '\\cgi-bin\\mats\\sotrudniki.xml', 'w', encoding='UTF-8')  # pull data out of the employee list
    f.write(str(soupez))  # waiter name and his id
    f.close()
    f = open(path + '\\cgi-bin\\mats\\db.xml', 'w', encoding='UTF-8')
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n' + '<r>')
    tree = ET.parse(path + '\\cgi-bin\\mats\\sotrudniki.xml')
    root = tree.getroot()
class PageWork:
    def __init__(self):
        # grab the current page (the common approach seems to be: dump the UI
        # to the phone first, then pull it back)
        self.update()

    def tapNode(self, text='', name='node', attrs={}):
        cooridinate = self.getPos(text, name, attrs)
        if cooridinate:
            cmd = 'adb shell input tap ' + str(cooridinate[0]) + ' ' + str(cooridinate[1])
            os.system(cmd)
        else:
            raise RuntimeError('未能找到指定元素')
        self.update()

    def slide(self, direct=1, len=100, time=50):
        wsize = self.getWindowSize()
        midx = wsize[0] // 2
        midy = wsize[1] // 2
        if direct == 1:
            nextx, nexty = str(midx), str(midy + len)
        elif direct == 2:
            nextx, nexty = str(midx), str(midy - len)
        elif direct == 3:
            nextx, nexty = str(midx - len), str(midy)
        elif direct == 4:
            nextx, nexty = str(midx + len), str(midy)
        else:
            nextx, nexty = str(midx), str(midy)
        cmd = 'adb shell input swipe {} {} {} {} {}'.format(
            str(midx), str(midy), nextx, nexty, time)
        os.system(cmd)

    def getPos(self, text='', name='node', attrs={}):
        attrs['text'] = text
        cell = self.content.find(name, attrs=attrs)
        if cell:
            posstr = cell.attrs.get('bounds')
            match = re.search(
                r'\[(?P<lt0>\d+),(?P<lt1>\d+)\]\[(?P<rb0>\d+),(?P<rb1>\d+)\]',
                posstr)
            point_lt = [int(match.group('lt0')), int(match.group('lt1'))]
            point_rb = [int(match.group('rb0')), int(match.group('rb1'))]
            centerpos = [(point_lt[0] + point_rb[0]) // 2,
                         (point_lt[1] + point_rb[1]) // 2]
            return centerpos
        else:
            return None

    def update(self):
        os.popen('adb shell uiautomator dump /sdcard/ui.xml').read()
        # popen is asynchronous: the next statement would start while the dump
        # is still running, which breaks anything that depends on its result.
        # read() blocks the main process until popen finishes, so the code
        # below is safe to run.
        # os.system('adb shell uiautomator dump /sdcard/ui.xml')
        os.popen(
            r'adb pull /sdcard/ui.xml E:\Workplace\Workplace_Python\wp_project\pyGreat\application\ctrlmobile'
        ).read()
        with open(
                r'E:\Workplace\Workplace_Python\wp_project\pyGreat\application\ctrlmobile\ui.xml',
                encoding='utf8') as f:
            self.content = BeautifulSoup(f.read(), 'lxml')

    def getWindowSize(self):
        cmd = 'adb shell wm size'
        res_size = os.popen(cmd).read()
        match = re.search(r'size: (?P<width>\d+)x(?P<height>\d+)', res_size)
        return (int(match.group('width')), int(match.group('height')))

    def save(self, path):
        with open(path, 'w') as f:
            f.write(self.content.__str__())
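# A minimal usage sketch for PageWork above, assuming adb is on PATH and a
# device is connected; the button text and output path are illustrative only.
page = PageWork()                 # dumps and parses the current UI hierarchy
page.tapNode(text='设置')         # tap the node whose text attribute matches, then re-dump
page.slide(direct=2, len=300)     # swipe upward by 300 px from the screen center
page.save(r'E:\ui_snapshot.xml')  # write the parsed hierarchy via soup.__str__()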
def extractBios():
    global job_list
    global user_bio
    global job_list_lock
    global user_lock_list
    global user_lock_list_lock
    while True:
        time.sleep(random.randint(100, 300))
        job_list_lock.acquire()
        if len(job_list) == 0:
            job_list_lock.release()
            return
        category, url = job_list.pop()
        job_list_lock.release()
        logging.info('grilling category-url pair: %s %s', category, url)
        base_url = url
        # Now go through all the users of the category and store for each user
        # their twitter handle and bio
        try:
            while (True):
                time.sleep(random.randint(0, 50))
                # Get the web page
                # headers = {'User-agent':'Mozilla/5.0'}
                r = requests.get(url, timeout=20)
                logging.debug("Url request successful for url: %s", url)
                # Extract information from webpage. All the users are in
                # the div with class 'search-cat-user', and the first <a>
                # tag in the div has the twitter user handle as the href
                # field. Add tags and super-tags accordingly (super-tags
                # for each category are stored in the global dictionary).
                print r.status_code
                soup = BeautifulSoup(r.text)
                all_users = soup.find_all(attrs={"class": "search-cat-user"})
                for user in all_users:
                    try:
                        user_id = user.find('a')['href'][1:]
                        if (user_id == '') or (user_id in user_bio):
                            continue
                        logging.debug('Operating on user: %s', user_id)
                        # Get user bio now
                        user_soup = BeautifulSoup(user.__str__())
                        [s.extract() for s in user_soup.find_all(attrs={'class': 'search-cat-user-name'})]
                        found = user_soup.find(attrs={'class': 'search-cat-user-bio'})
                        found = BeautifulSoup(found.__str__())
                        bio = re.compile('\n').sub('', found.string)
                        if user_id not in user_lock_list:
                            user_lock_list_lock.acquire()
                            # Test again that it hasn't already been added by another thread
                            if user_id not in user_lock_list:
                                user_lock_list[user_id] = threading.Lock()
                                logging.debug("Creating lock for user: %s", user_id)
                            user_lock_list_lock.release()
                        user_lock_list[user_id].acquire()
                        user_bio[user_id] = bio.strip().encode('ascii', 'ignore')
                        logging.info('user_id:%s bio:%s', user_id, bio.strip().encode('ascii', 'ignore'))
                        print 'user_id:', user_id, 'bio', bio.strip().encode('ascii', 'ignore')
                        user_lock_list[user_id].release()
                    except:
                        logging.error("Error in: %s", user.find('a'))
                        logging.error(traceback.format_exc())
                        # Release any unreleased locks: if already released, it would
                        # throw an exception - hence the try-catch blocks
                        try:
                            user_lock_list_lock.release()
                        except:
                            pass
                        try:
                            user_lock_list[user_id].release()
                        except:
                            pass
                # When all the users of a page are done, go to the next page and
                # the next, till you reach the last page, which is verified from
                # the information at the bottom of the page.
                pagination = soup.find(attrs={"class": "pagination"})
                last_page = int(pagination.find(attrs={'title': 'Last Page'})['href'].split('/')[-1])
                current_page = int(pagination.find(attrs={'class': 'current'})['href'].split('/')[-1])
                logging.debug("Finished operating on url: %s", url)
                if current_page == last_page:
                    logging.info("Last page operation finished for CATEGORY: %s", category)
                    break
                url = base_url + '/page/' + str(current_page + 1)
                logging.debug("Starting on the next page url: %s", url)
        except:
            logging.error(traceback.format_exc())
path = '../../data/data/'
out_file = open('../../data/corpus.txt', 'a', encoding='utf8')
filelist = os.listdir(path)  # all files (and folders) under this directory
print(filelist)
for files in filelist:  # iterate over all files
    if not fnmatch.fnmatch(files, '*.html'):
        continue
    print(path + files)
    f = open(path + files, 'r+', encoding='utf8')
    txt = f.read()
    soup = BeautifulSoup(txt)
    for s in soup('script'):
        s.extract()
    for s in soup('style'):
        s.extract()
    txt = soup.__str__()
    reg2 = re.compile('<[^>]*>')
    txt = reg2.sub('', txt)
    reg3 = re.compile('-->')
    txt = reg3.sub('', txt)
    reg4 = re.compile('&(\S)?gt')
    txt = reg4.sub('', txt)
    reg5 = re.compile('New!')
    txt = reg5.sub('', txt)
    ans = txt.split("\n")
    print(ans)
    for word in ans:
        if not word == '':
            out_file.write(word)
            out_file.write('\n')
        if word == '法律声明':
            break
def saveBlog(self, destHtml, destPdf, realNamePdf, realNameHtml, artical_url):
    htmlContent = self.fixSynaxHighLighter(
        self.login(artical_url, referer=artical_url))
    if htmlContent == False:
        # skip pages whose hidden content has disappeared
        return
    soup = BeautifulSoup(htmlContent, features='html5lib')
    self.print("处理图片中......")
    # convert images to BASE64
    imgSoup = soup.findAll(name="img")
    id = 0
    for img in imgSoup:
        if not str(img['src']).__contains__('file:'):
            if str(img['src']).startswith('//'):
                img['src'] = 'https' + img['src']
            id = id + 1
            self.login(img['src'], isImage=True,
                       imageId=str(self.identifier) + '_' + str(id))
            img['src'] = "data:image/jpeg;base64," + self.get_image_file_as_base64_data(
                self.blogDir + self.bName + self.blogImage + str(self.identifier) + '_' + str(id) + '.jpg').decode('UTF-8')
            try:
                del (img['width'])
                del (img['height'])
            except:
                pass
    # convert formulas to BASE64
    self.print("处理公式中......")
    ind = 0
    for (typename, typefsize) in {
            'math/tex': '16px',
            'math/tex; mode=display': '26px'
    }.items():
        latexsoup = soup.findAll(name="script", attrs={"type": typename})
        for l in latexsoup:
            ind = ind + 1
            if ind % 10 == 0:
                time.sleep(10)
            formulatext = "$$" + str(l.text) + "$$"
            formulatext = formulatext.replace(' ', '')
            all_url = 'http://quicklatex.com/latex3.f'
            Para = {
                'formula': formulatext,
                'fsize': typefsize,
                'fcolor': '000000',
                'mode': '0',
                'out': '1',
                'remhost': 'quicklatex.com',
                'rnd': random.uniform(0, 100)
            }
            start_html = requests.post(all_url, data=Para)
            img_url = start_html.text.replace("\r\n", " ")
            img_url = img_url.split(' ')
            img_url = img_url[1]
            time.sleep(1.0)
            img = requests.get(img_url)
            f = open(
                self.blogDir + self.bName + "images\\" + 'formula_' +
                str(self.identifier) + '_' + str(ind) + '.png', 'wb')
            f.write(img.content)
            f.close()
            latex_img = "<span style=\"position: relative;\"><img style=\"MARGIN:0; PADDING:0\" src=\"data:image/png;base64," + self.get_image_file_as_base64_data(
                self.blogDir + self.bName + "images\\" + 'formula_' +
                str(self.identifier) + '_' + str(ind) + '.png').decode('UTF-8') + "\"/></span>"
            l.insert_after(BeautifulSoup(latex_img, features='html5lib'))
    try:
        hidinginfosoup = soup.findAll(
            name="div",
            attrs={"class": "hide-article-box hide-article-pos text-center"})[0]
        hidinginfosoup['style'] = 'display: none;'
    except:
        pass
    htmlContent = soup.__str__()
    with open(destHtml, 'w', encoding='utf-8') as f:
        f.write(htmlContent)
    self.print("正在转换为PDF......")
    # collect the page cookies
    http_cookie = list()
    cj = self.session.cookies.get_dict()
    for (k, v) in cj.items():
        if '%' in v:
            v = urllib.parse.unquote(v)
        http_cookie.append((k, v))
    pdfkit.from_file(
        destHtml, destPdf,
        options={
            'custom-header': [('Origin', 'https://blog.csdn.net'),
                              ('Referer', 'https://blog.csdn.net'),
                              ('User-Agent',
                               'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0')],
            'cookie': http_cookie,
            'enable-local-file-access': '',
            'images': ''
        })
    while True:
        isExist = os.path.exists(destPdf)
        if isExist:
            break
        time.sleep(1)
    isExist = os.path.exists(realNamePdf)
    if isExist:
        realNamePdf = realNamePdf[:-4] + str(int(time.time())) + '.pdf'
        realNameHtml = realNameHtml[:-5] + str(int(time.time())) + '.html'
    os.rename(destPdf, realNamePdf)
    os.rename(destHtml, realNameHtml)
    self.print("已保存网页: " + realNameHtml)
    time.sleep(2)
    self.print("已保存PDF: " + realNamePdf)