def get_blog_content(file_path, result=None):
    # If the current page returned no results, terminate the program
    if not result:
        sys.exit()
    # Iterate over the search results and fetch the content of each blog post
    for url in result:
        h = get_url_html(url)
        h = html.fromstring(str(h))
        res_title = h.xpath(
            '//*[@id="mainBox"]/main/div[1]/div/div/div[1]/h1/text()')
        if res_title:
            title = res_title[0]
            res_content = h.xpath('//*[@id="content_views"]')
            if res_content:
                res_content = html.tostring(res_content[0])
                content = HTMLParser().unescape(res_content.decode())
                # Strip the HTML tags expected in the article body
                content = re.sub('<p.*?>', '', content)
                content = re.sub('<div.*?>', '', content)
                content = re.sub('<a.*?>', '', content)
                content = re.sub('</.*?>', '', content)
                content = re.sub('<h.*?>', '', content)
                content = re.sub('<strong>', '', content)
                content = re.sub('<ol>', '', content)
                content = re.sub('<li>', '', content)
                content = re.sub('<br>', '', content)
                content = re.sub('<span.*?>', '', content)
                content = content.strip()
                print(title)
                # Drop slashes so the title is a valid file name
                title = re.sub('/', '', title)
                with open(file_path + '/' + title + '.txt',
                          mode='w', encoding='utf-8') as file:
                    file.write(content)
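# get_url_html() is not defined in this snippet. A minimal sketch of what it might look
# like, assuming it simply fetches a page and returns the HTML text (the helper name,
# the use of requests, and the timeout are assumptions, not part of the original code):
import requests

def get_url_html(url):
    # Fetch the page and return its raw HTML; error handling kept minimal.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.text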
def transform_title(title, site):
    """Miscellaneous title transformations.

    Handle some unicode, unescape HTML, simplify hierarchical titles, ...
    """
    title = HTMLParser().unescape(title)
    title = title.strip()
    title = parse_fancy_titles(title, site)
    return title
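# parse_fancy_titles() is site-specific and not shown in this snippet. A purely
# hypothetical stand-in so the function can be exercised in isolation (the real
# helper presumably simplifies hierarchical titles depending on `site`):
def parse_fancy_titles(title, site):
    # e.g. drop a trailing " - Site Name" segment from hierarchical titles
    return title.rsplit(" - ", 1)[0] if " - " in title else title

print(transform_title("Ben &amp; Jerry's - Example Site", "example"))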
def clean_str(string):
    # Strip HTML tags
    dr = re.compile(r'<[^>]+>', re.S)
    string = dr.sub('', string)
    # Normalize punctuation to full-width characters
    for c in en_punctuation_set:
        if c in string:
            string = string.replace(c, semi_angle_to_sbc(c))
    # Unescape HTML entities
    string = HTMLParser().unescape(string)
    # Convert all letters to lowercase
    string = string.lower()
    # Collapse repeated punctuation marks
    string = clean_redundant(string, '?')
    string = clean_redundant(string, ',')
    string = clean_redundant(string, '……')
    string = clean_redundant(string, '。')
    return string.strip()
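# The helpers used above (en_punctuation_set, semi_angle_to_sbc, clean_redundant) are not
# shown in this snippet. A minimal sketch of what they might look like, purely as an
# assumption based on how they are called (half-width to full-width mapping, collapsing repeats):
en_punctuation_set = set('!?,.;:')

def semi_angle_to_sbc(c):
    # Map a half-width ASCII character to its full-width (SBC) counterpart.
    code = ord(c)
    if code == 0x20:
        return chr(0x3000)
    if 0x21 <= code <= 0x7E:
        return chr(code + 0xFEE0)
    return c

def clean_redundant(s, mark):
    # Collapse consecutive repetitions of `mark` into a single occurrence.
    while mark * 2 in s:
        s = s.replace(mark * 2, mark)
    return s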
class XMLParser(object):
    '''Class that tries to parse an XML file, using zenity to display any
    error that occurs in the process.'''

    def __init__(self):
        self.file_name = None
        self.xml_to_be_parsed = None

    def get_file(self):
        try:
            self.file_name = sys.argv[1]
        except IndexError:
            os.system(
                'zenity --info --title="Error" --text="You must specify a file name!"'
            )

    def parse_xml(self):
        if os.path.isfile(self.file_name) and os.access(
                self.file_name, os.R_OK):
            with open(self.file_name, 'r') as read_only_file:
                self.xml_to_be_parsed = HTMLParser().unescape(
                    read_only_file.read())
            try:
                self.xml_to_be_parsed = minidom.parseString(
                    self.xml_to_be_parsed).toprettyxml()
            except ExpatError as bad_xml:
                os.system(
                    'zenity --info --title="Error Parsing File" --text="XML file %s"'
                    % str(bad_xml))
            # Work around minidom adding extra whitespace before/after
            # CDATA sections
            self.xml_to_be_parsed = re.sub(r'>\s+<!', '><!',
                                           self.xml_to_be_parsed)
            self.xml_to_be_parsed = re.sub(r']>\s+<', ']><',
                                           self.xml_to_be_parsed)
            with open(self.file_name, 'w') as writable_file:
                writable_file.write("".join([
                    s for s in self.xml_to_be_parsed.strip().splitlines(True)
                    if s.strip()
                ]))
        else:
            os.system(
                'zenity --info --title="Error" --text="File is missing or is not readable!"'
            )
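# Hypothetical driver code (not part of the original class), assuming the script is
# invoked as `python pretty_xml.py some_file.xml`:
if __name__ == "__main__":
    xml_parser = XMLParser()
    xml_parser.get_file()        # reads the target path from sys.argv[1]
    if xml_parser.file_name:
        xml_parser.parse_xml()   # unescapes, pretty-prints and rewrites the file in place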
def zeroclick(irc, source, msgtarget, args): params = {"q":args[0]} url = "http://duckduckgo.com/lite/?" #try: data = requests.get(url, params=params).content.decode() search = re.findall("""\t<td>.\t\s+(.*?).<\/td>""",data,re.M|re.DOTALL) if search: answer = HTMLParser().unescape(search[-1].replace("<br>"," ").replace("<code>"," ").replace("</code>"," ")) answer = re.sub("<[^<]+?>"," ",answer) out = re.sub("\s+"," ",answer.strip()) if out: #if len(out.split(" More at")[0].split("}")[-1].strip()) < 400: irc.msg(msgtarget, out.split(" More at")[0].split("}")[-1].strip()) #else: # irc.msg(source.split("!")[0], out.split(" More at")[0].split("}")[-1].strip()) else: irc.msg(msgtarget, "No results") else: irc.msg(msgtarget, "No results found.")
def zeroclick(irc, source, msgtarget, args): params = {"q": args[0]} url = "http://duckduckgo.com/lite/?" #try: data = requests.get(url, params=params).content.decode() search = re.findall("""\t<td>.\t\s+(.*?).<\/td>""", data, re.M | re.DOTALL) if search: answer = HTMLParser().unescape(search[-1].replace("<br>", " ").replace( "<code>", " ").replace("</code>", " ")) answer = re.sub("<[^<]+?>", " ", answer) out = re.sub("\s+", " ", answer.strip()) if out: #if len(out.split(" More at")[0].split("}")[-1].strip()) < 400: irc.msg(msgtarget, out.split(" More at")[0].split("}")[-1].strip()) #else: # irc.msg(source.split("!")[0], out.split(" More at")[0].split("}")[-1].strip()) else: irc.msg(msgtarget, "No results") else: irc.msg(msgtarget, "No results found.")
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).
    """

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = HTMLParser().unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc. -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(
        r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge this into date_regex due to some strange behaviour of
    # the re module.
    additional_date_regex = re.compile(
        r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$"
    )
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in (
                additional_date_regex,
                date_regex,
                preceding_square_braces_regex,
                square_braces_regex,
                round_braces_regex,
                angle_braces_regex,
                release_counter_regex,
        ):
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get the shortest possible fingerprint -->
    torrent_name = re.sub(r"^(national\s+geographic\s*:|наука\s+2\.0)\s+", "",
                          torrent_name)
    torrent_name = re.sub(r"^«([^»]{6,})»", r"\1", torrent_name)
    torrent_name = re.sub(r'^"([^»]{6,})"', r"\1", torrent_name)
    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*",
        r"\1", torrent_name)
    # Try to get the shortest possible fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(c for c in torrent_name if c in " " + characters)

    # Drop any additional info: timestamps, release versions, etc. -->
    torrent_name = torrent_name.replace("г.", "")

    while True:
        new_torrent_name = re.sub(
            r"(?:\s|\()(:?выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)",
            "", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    for month in (
            "январь", "января", "февраль", "февраля", "март", "марта",
            "апрель", "апреля", "май", "мая", "июнь", "июня",
            "июль", "июля", "август", "августа", "сентябрь", "сентября",
            "октябрь", "октября", "ноябрь", "ноября", "декабрь", "декабря",
    ):
        torrent_name = re.sub(r"\b" + month + r"\b", "", torrent_name)
    # <--

    # Collapse runs of whitespace
    torrent_name = re.sub(r"\s+", " ", torrent_name).strip()

    return torrent_name.strip()
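# Illustrative call with a made-up torrent title (not from the original code); the exact
# fingerprint depends on the regex chain above, so no specific output is asserted here:
print(get_fingerprint("Интересная передача (эфир от 01.02.2014) [SATRip] 3 из 10"))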
def strip_html(text):
    result = re.sub(r"\<.*?>", " ", text, 0, re.MULTILINE)
    result = HTMLParser().unescape(result)
    result = " ".join(result.split())
    return result.strip()
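# Quick illustrative call on a made-up string. Note that HTMLParser().unescape() was
# removed in Python 3.9; on newer interpreters html.unescape() is the equivalent call.
print(strip_html("<p>Ben &amp; Jerry&#39;s<br/>ice   cream</p>"))  # Ben & Jerry's ice cream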
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).
    """

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = HTMLParser().unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc. -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(
        r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge this into date_regex due to some strange behaviour of
    # the re module.
    additional_date_regex = re.compile(
        r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$"
    )
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in (
                additional_date_regex,
                date_regex,
                preceding_square_braces_regex,
                square_braces_regex,
                round_braces_regex,
                angle_braces_regex,
                release_counter_regex,
        ):
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get the shortest possible fingerprint -->
    torrent_name = re.sub(r"^«([^»]{6,})»", r"\1", torrent_name)
    torrent_name = re.sub(r'^"([^»]{6,})"', r"\1", torrent_name)
    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*",
        r"\1", torrent_name)
    # Try to get the shortest possible fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(c for c in torrent_name if c in " " + characters)

    # Drop any additional info: timestamps, release versions, etc. -->
    torrent_name = torrent_name.replace("г.", "")

    while True:
        new_torrent_name = re.sub(
            r"(?:\s|\()(:?выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)",
            "", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    for month in (
            "январь", "января", "февраль", "февраля", "март", "марта",
            "апрель", "апреля", "май", "мая", "июнь", "июня",
            "июль", "июля", "август", "августа", "сентябрь", "сентября",
            "октябрь", "октября", "ноябрь", "ноября", "декабрь", "декабря",
    ):
        torrent_name = torrent_name.replace(month, "")
    # <--

    # Collapse runs of whitespace
    torrent_name = re.sub(r"\s+", " ", torrent_name).strip()

    return torrent_name.strip()