def read_from_text(self, dat_raw_text):
    """Parse one raw dat-format line and populate this post's fields."""
    self.raw_text = dat_raw_text
    fields = dat_raw_text.rstrip().split('<>')
    if len(fields) != 5:
        return  # todo: throw exception
    self.user_name, self.email, info, body, self.title = fields
    # Split the info field into the date/time part and the poster ID.
    matched = re.match(r'(.*) ID:(.*)', info)
    if matched is None:
        self.info_datetime = info
        self.info_id = ''
    else:
        self.info_datetime = matched.group(1)
        self.info_id = matched.group(2)
    # Decode entities, then turn <br> tags into real newlines.
    self.content = re.sub(r'<br>', '\n', html.unescape(body))
    self.content_html = self.make_html(self.content)
    # Record zero-based anchor targets: ">>12" refers to post index 11.
    for hit in re.finditer(r">>(\d+)", self.content):
        self.anchor.append(int(hit.group(1)) - 1)
def get_css_report(url_datas):
    """For each {pid, url} row, fetch the page and collect its stylesheet
    links, <style> tags, and inline style="..." attributes.

    Rows whose request fails with an SSL error are skipped.
    """
    report = []
    for row in url_datas:
        print(row["pid"], "を処理しています")
        time.sleep(1)  # throttle between requests
        try:
            res = requests.get(row["url"], verify=False)
            doc = res.text
            dom = lxml.html.fromstring(res.content)
            # <link rel="stylesheet" href=...> references
            hrefs = [
                tag.get("href")
                for tag in dom.cssselect("link")
                if tag.get("rel") == "stylesheet"
            ]
            # Inline <style> blocks, entity-decoded
            style_blocks = [
                html.unescape(lxml.html.tostring(tag).decode())
                for tag in dom.cssselect("style")
            ]
            # Comma-terminated concatenation of every style="..." attribute
            inline_styles = "".join(
                attr + "," for attr in re.findall(r"style=\".*?\"", doc, re.DOTALL)
            )
            report.append({
                "pid": row["pid"],
                "link-tag": hrefs,
                "style-tag": style_blocks,
                "style-attr": inline_styles
            })
        except requests.exceptions.SSLError:
            print(row["pid"], "はエラーのためスキップします")
    return report
def fetch_css_datas(self, cr_pid, cr_url):
    """Fetch cr_url and return a one-element list describing its CSS:
    stylesheet link hrefs (absolutized), <style> blocks, and inline
    style attributes. Returns an empty list on SSL failure.
    """
    print(cr_pid, "を処理しています")
    time.sleep(self.shortWait)  # throttle between requests
    collected = []
    try:
        res = requests.get(cr_url, verify=False)
        doc = res.text
        dom = lxml.html.fromstring(res.content)
        # Resolve relative hrefs against the final response URL.
        dom.make_links_absolute(res.url)
        stylesheet_hrefs = [
            tag.get("href")
            for tag in dom.cssselect("link")
            if tag.get("rel") == "stylesheet"
        ]
        style_blocks = [
            html.unescape(lxml.html.tostring(tag).decode())
            for tag in dom.cssselect("style")
        ]
        inline_styles = "".join(
            attr + "," for attr in re.findall(r'style=".*?"', doc, re.DOTALL)
        )
        collected.append({
            "pid": cr_pid,
            "url": cr_url,
            "link_tags": stylesheet_hrefs,
            "style_tags": style_blocks,
            "style_atts": inline_styles
        })
    except requests.exceptions.SSLError:
        print(cr_pid, "はエラーのためスキップします")
    return collected
def net_html_unescape(html_str: str) -> str:
    """Decode HTML character references in *html_str* and return the
    resulting plain string (thin wrapper over ``html.unescape``).
    """
    return html.unescape(html_str)
def strip_html(doc):
    """Reduce an HTML document to plain printable-ASCII text.

    Pipeline: treat ``None`` as an empty ``<div>``, decode HTML entities,
    collapse newlines to spaces, strip markup via lxml, normalize
    ellipses to ".", and drop any non-printable characters.

    NOTE: ``no_nl``, ``ellipsis`` (regex patterns) and ``clean_html``
    are expected to be defined at module level.

    Fixes: the original ``_unescape`` lambda ignored its argument and
    closed over ``doc`` (worked only by accident); the unused
    ``_blank_if_none`` / ``_nbsp_del`` / ``_quote_fix`` helpers are removed.
    """
    if doc is None:
        doc = "<div></div>"
    # Decode entities before stripping so escaped markup is handled too.
    text = html.unescape(doc)
    text = re.sub(no_nl, " ", text)
    text = lxml.html.fromstring(clean_html(text)).text_content()
    text = re.sub(ellipsis, ".", text)
    # Keep only printable ASCII characters.
    return "".join(ch for ch in text if ch in string.printable)
def video(url):
    """Check whether a YouTube video is findable via search from several
    Tor exit nodes and print a per-attempt report.

    Argument:
    url -- a youtu.be or youtube.com/watch link.

    Returns an error string for invalid input or a Tor outage; otherwise
    returns the contents of results.txt.
    """
    attempts = 0    # number of search attempts performed
    accessible = 0  # attempts where the title appeared in the results
    #results_file = open("results.txt", 'w')
    #sys.stdout = results_file
    # Accept only YouTube links, and confirm Tor connectivity up front.
    if "https://youtu.be/" in str(
            url) or "https://www.youtube.com/watch?v=" in str(url):
        try:
            get_tor_session().get("https://ip.seeip.org/")
        except IOError:
            return "Tor service is down serverside. Please try again later."
    else:
        return "Invalid input."
    # Fetch the video page directly (not over Tor) to read its title.
    page_data = requests.get(url).text
    parse_title = str(
        re.findall('<title>(.*?) - YouTube</title><meta name="title" content=',
                   page_data))
    # NOTE(review): split("'")[1] assumes the repr of the findall result
    # keeps the title between the first pair of single quotes — fragile if
    # the title itself contains an apostrophe.
    title = html.unescape(parse_title.split("'")[1])
    if title == "":
        print("Video unavailable")
    else:
        print(title)
        print(url + "\n")
    # Search for the title from up to 5 different Tor circuits.
    while attempts < 5:
        rotate_connection()
        title_query = "https://www.youtube.com/results?search_query=" + "+".join(
            title.split()).replace('\n', '')
        title_search = get_tor_session().get(title_query).text
        # Only pages that actually contain result entries count as attempts.
        if title_search.find('"title":{"runs":[{"text":"') >= 0:
            if title_search.find(title) >= 0:
                accessible += 1
                print("[ ✓ ]", end="")
            else:
                print("[ X ]", end="")
            # Report the exit node's country and IP for this attempt.
            try:
                r = get_tor_session().get("https://ip.seeip.org/geoip")
                r_dict = r.json()
                print(" " + r_dict["country"] + " (" + r_dict["ip"] + ")")
            except IOError:
                print(" Unknown location.")
            attempts += 1
    # Summarize: fully reachable / partially blocked / fully blocked.
    if attempts == accessible and accessible > 0:
        print("\nNo abnormal behavior detected.")
    elif attempts > accessible:
        print("\nQuestionable behavior detected.")
    elif accessible == 0:
        print("\nAlarming behavior detected.")
    #results_file.close()
    # NOTE(review): results.txt is read back although the writes above are
    # commented out — this raises FileNotFoundError unless the file
    # already exists. Confirm intent before relying on the return value.
    return (open("results.txt", "r").read())
def get_under_link_datas(self, dom_obj):
    """Collect {key_addr, title, url} records from the anchor cells found
    under the #ModuleContents tables.
    """
    collected = []
    anchors = dom_obj.xpath(
        '//*[@id="ModuleContents"]/table[2]/tr/td[2]/table/tr/td/a')
    for node in anchors:
        # Serialize the <a> element back to (entity-decoded) markup.
        raw_atag = html.unescape(lxml.html.tostring(node).decode())
        collected.append({
            "key_addr": self.url,
            "title": self.fetch_atag_text(raw_atag),
            "url": self.base_url + self.fetch_atag_href(raw_atag)
        })
    return collected
def get_calendar_td_link_datas(self, dom_obj):
    """Extract non-"raise" calendar links from the calendar table cells,
    returning deduplicated {key_addr, title, url} records.
    """
    collected = []
    cells = dom_obj.xpath(
        '//*[@id="ModuleContents"]/table[1]/tr/td/table[2]/tr/td')
    for cell in cells:
        markup = html.unescape(lxml.html.tostring(cell).decode())
        markup = self.table_tag_raise(markup)
        for anchor in re.findall(r'<a.+?>.+?</a>', markup, re.DOTALL):
            # Skip the "raise"-style calendar links entirely.
            if self.is_calendar_td_raise_link(anchor):
                continue
            collected.append({
                "key_addr": self.url,
                "title": self.fetch_atag_text(anchor),
                "url": self.base_url + self.fetch_atag_href(anchor)
            })
    return self.make_uniq_datas(collected)
def fetch(self):
    """Fetch the index page and return its title entry followed by one
    {key_addr, title, url} record per photo link.
    """
    res = requests.get(self.url)
    dom = lxml.html.fromstring(self.reki_decode(res.text))
    # First entry describes the page itself.
    entries = [{
        "key_addr": "---",
        "title": dom.cssselect("title")[0].text,
        "url": self.url
    }]
    for node in dom.xpath('//*[@id="webphoto_box_photo_a"]/table/tr/td[2]/a[2]'):
        anchor = html.unescape(lxml.html.tostring(node).decode())
        entries.append({
            "key_addr": self.url,
            "title": self.fetch_atag_text(anchor).strip(),
            "url": self.fetch_atag_href(anchor)
        })
    return entries
def unescape(cls, text):
    """Remove HTML or XML character references and entities from a text
    string.

    Argument:
    text -- The HTML (or XML) source text, or None.

    Returns the plain text as a Unicode string, or None when *text* is
    None.
    """
    if text is None:
        return None
    # html.unescape handles all named and numeric references on Python 3;
    # the legacy Python 2 HTMLParser fallback has been dropped.
    import html
    return html.unescape(text)
def untextlink_decode(elm):
    """Serialize *elm* to entity-decoded markup with every <a ...> and
    </a> tag stripped out, keeping the link text.
    """
    markup = html.unescape(lxml.html.tostring(elm).decode())
    stripped = re.sub(r"</*a.*?>", "", markup)
    return stripped
# Exploratory snippet: earlier attempts at pulling LISTDATE / ZGJ_HQ fields
# out of a response and decoding them are kept below for reference.
# listData = re.compile('"LISTDATE":"(.*?)",', re.S).findall(r)
# print('listData, ', len(listData), listData)
# ZGJ = re.compile('"ZGJ_HQ":"(.*?)",', re.S).findall(r)
# print('ZGJ', ZGJ)
# data_parse = HTMLParser.HTMLParser()
# for i in ZGJ:
#     newdata = data_parse.unescape(i)
#     print(newdata)
# num = 0xEA5DF3C3F3C3E4E5
# Decode one sample string; the other sample decodes stay commented out.
ret = html.unescape('.')
# print(ret)
# ret = html.unescape('')
# print(ret)
# ret = html.unescape('')
# print(ret)
# ret = html.unescape('.')
# print(ret)
# ret = html.unescape('')
# print(ret)
# for i in ZGJ:
#     ret = html.unescape(i)
#     print(ret, i)
# for i in range(0, len(myKzz)):
#     kzz = myKzz[i]
def sub_dict(d, keys):
    """Return the sub-dict of *d* restricted to *keys*, with HTML
    entities in each kept value decoded.
    """
    import html
    wanted = set(keys)
    return {key: html.unescape(value) for key, value in d.items() if key in wanted}
"return_label":"ERL", "return_label_coc":"ERL", "WWEFMT":"", "WWEFIT":"", "ctc_second_submit":"10"} headers = { "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", "X-Requested-With": "XMLHttpRequest"} try: r = requests.post(url, data=payload, headers=headers)#data.encode('utf-8'), headers=headers) # print(r.text, file=open('html.txt', 'w')) r = lxml.html.fromstring(r.text); costString = r.xpath("//td[@class='ship_total']/strong")[0].text try: deliveryTime = lxml.html.tostring(r.xpath('//*[@id="servicerow0"]/td[2]/dl/dd')[0]).decode('utf-8') deliveryTime = html.unescape(deliveryTime.split('<br>')[2]).strip().replace('\xa0', ' ') deliveryTime = max(1, (datetime.strptime(deliveryTime, "%B %d, %Y").date() - datetime.now().date()).days) except ValueError: deliveryTime = "" condition = r.xpath('//*[@id="servicerow0"]/td[1]/dl[1]')[0] condition = '\n'.join([(child.text or "").strip() for child in condition.iterdescendants()]) clean = lambda cost: "".join(filter(lambda x: x.isdigit() or x == '.', cost)) print(json.dumps({ "price": clean(costString), "time": str(deliveryTime), "condition": condition, })) except: pass #regexp way #r = re.search("[0-9,]+.\\d{0,2}", costString)
def clean_string(string):
    """Normalize a string for storage: decode HTML entities, remove soft
    hyphens, and replace line breaks with spaces.

    Entities are decoded *before* stripping so that the escaped form
    "&shy;" is removed as well — the original order stripped only literal
    soft-hyphen characters and let "&shy;" entities slip into the db.
    """
    # we don't want soft hyphen in the db
    soft_hyphen = "\u00ad"
    decoded = html.unescape(string)
    # we don't want line breaks in the db
    return decoded.replace(soft_hyphen, "").replace("\n", " ")
def html_unscape(term):
    """Decode HTML character references in *term*.

    (Name kept as-is — callers use this spelling.)
    """
    decoded = html.unescape(term)
    return decoded
def get(self, id):
    """Scrape an Airbnb room page and return a normalized listing dict.

    Argument:
    id -- the Airbnb room id.

    Returns the parsed listing dict, or an error dict / 'Offer not found'
    string when the offer is missing, the proxy looks banned, or the
    request itself failed.
    """
    url = 'https://airbnb.com/rooms/{id}'.format(id=id)
    # In DEMO mode reuse the cached page; otherwise fetch and cache it.
    if DEMO:
        res = open('tmp.txt', 'r').read()
    else:
        res = request(url)
        with open('tmp.txt', 'w') as out_file:
            out_file.write(res)
    # CHECK OFFER EXISTING: a canonical link pointing at the bare homepage
    # means the room page redirected, i.e. the offer does not exist.
    checker = etree.HTML(res).xpath('.//link[@rel="canonical"]/@href')
    checker = checker[0] if checker else False
    if checker and checker == 'https://www.airbnb.com/':
        return {
            'error': True,
            'errors': {
                'airbnb_id': 'Offer not found'
            }
        }
    # CHECK IF IP IS BANNED: a banned proxy gets a page without a <title>.
    has_title = etree.HTML(res).xpath('.//title/text()')
    has_title = has_title[0] if has_title else False
    if not has_title:
        return {
            'error': True,
            'errors': {
                'airbnb_id': 'Proxy is banned'
            }
        }
    # IF STATUS CODE IS ERROR: request() returned an error dict — pass it on.
    if type(res) == dict and 'error' in res:
        return res
    # False if offer not found in airbnb page
    # The listing data lives inside an HTML comment near "bootstrapData".
    content = False
    for r in res.splitlines():
        if '"bootstrapData"' in r:
            content = re.search('<!--{"behavioralUid"(.*?)-->', res).group(0)
            content = re.search('<!--(.*?)-->', content).group(1)
            content = json.loads(content)
            break
            # content = re.search('<!--(.*?)-->', r).group(1)
            # content = json.loads(content)
            # break
    if not content:
        return 'Offer not found'
    listingInfo = content['bootstrapData']['reduxData']['homePDP'][
        'listingInfo']['listing']
    listingExtra = listingInfo['p3_event_data_logging']
    # http://jsonviewer.stack.hu/
    # jprint(listingInfo)
    # GET PRICE
    price = self.getPrice(id=id, currency=listingInfo['native_currency'])
    # tree = etree.HTML(res)
    # price = tree.xpath('.//meta[@name="description"]/@content')[0]
    # price = re.search('\s€(\w+).\s', price)
    # if price:
    #     price = price.group(1)
    # else:
    #     price = False
    # print("PRICE" ,price)
    # AMENTIES: group full amenity records by section, skipping the
    # "Not included" section.
    amenties_arr = []
    listing_amenities = [{
        'id': c['id'],
        'name': c['name'],
        'tag': c['tag']
    } for c in listingInfo['listing_amenities']]
    for c in listingInfo['see_all_amenity_sections']:

        def getAmenties(id):
            # Look up the full amenity record by its id.
            for c in listing_amenities:
                if c['id'] == id:
                    return c

        if not c['title'] == 'Not included':
            amenties_arr.append({
                'group': c['title'],
                'items': [getAmenties(a) for a in c['amenity_ids']]
            })
    # PHOTOS: take the xx_large size, ordered by sort_order ascending.
    photos_arr = [{
        'src': c['xx_large'],
        'index': c['sort_order']
    } for c in listingInfo['photos']]
    photos_arr = [{
        'src': c['src'],
        'index': c['index']
    } for c in sorted(
        photos_arr, key=itemgetter('index'), reverse=False)]
    # DESCRIPTION: plain text plus an HTML variant with <br /> line breaks.
    description = listingInfo['sectioned_description']
    description_text = ''
    description_html = ''
    if description['description']:
        description_text = html.unescape(description['description'])
        description_html = description_text.replace('\r', '<br />').replace(
            '\n', '<br />')
    description = {
        'text': description_text,
        'html': description_html,
        'lang': description['localized_language_name'],
    }
    # print(description)
    results = {
        'id': listingInfo['id'],
        'price': price,
        'name': html.unescape(listingInfo['name']),
        'bathroom_label': listingInfo['bathroom_label'],
        'bed_label': listingInfo['bed_label'],
        'bedroom_label': listingInfo['bedroom_label'],
        'guest_label': listingInfo['guest_label'],
        # 'person_capacity': listingInfo['person_capacity'],
        'star_rating': listingInfo['star_rating'],
        'calendar_last_updated_at': listingInfo['calendar_last_updated_at'],
        'min_nights': listingInfo['min_nights'],
        'location_title': listingInfo['location_title'],
        'lat': listingInfo['lat'],
        'lng': listingInfo['lng'],
        'room_and_property_type': listingInfo['room_and_property_type'],
        'room_type_category': listingInfo['room_type_category'],
        # Keep only the "allows_*" guest-control flags.
        'guest_controls': {
            i: c
            for i, c in listingInfo['guest_controls'].items() if 'allows_' in i
        },
        'photos': photos_arr,
        'description': description,
        'primary_host': listingInfo['primary_host'],
        'amenties': amenties_arr,
        # 'listing_amenities': listing_amenities,
        'check_in': listingInfo['localized_check_in_time_window'],
        'check_out': listingInfo['localized_check_out_time'],
        # EXTRA...
        'description_language': listingExtra['description_language'],
        'is_superhost': listingExtra['is_superhost'],
        'home_tier': listingExtra['home_tier'],
        'checkin_rating': listingExtra['checkin_rating'],
        'cleanliness_rating': listingExtra['cleanliness_rating'],
        'communication_rating': listingExtra['communication_rating'],
        'location_rating': listingExtra['location_rating'],
        'accuracy_rating': listingExtra['accuracy_rating'],
        'value_rating': listingExtra['value_rating'],
        # '': listingInfo[''],
    }
    # jprint(results)
    return results
import urllib.request
import lxml.html
import re, json, html

# Parse a previously saved Kickstarter project page and pretty-print the
# embedded JSON project object.
# (Original download step, kept for reference:)
# urlKs = 'https://www.kickstarter.com/projects/agfa/agfa-and-something-weird'
# htmlRaw = urllib.request.urlopen(urlKs).read()
# text_file = open("project.html", "w")
# text_file.write(htmlRaw.decode("utf-8"))
# text_file.close()

with open("project.html") as page:
    tree = lxml.html.fromstring(page.read())
# print(tree)

# Locate the inline <script> that carries the project JSON.
holder = tree.xpath('//script[contains(text(), "window.current_project")]')[0]
raw_json = re.search('window.current_project = "([^"]*)', holder.text).group(1)
# Undo the page's odd double escaping, then decode the HTML entities.
raw_json = raw_json.replace('\\\\"', '\\"')
raw_json = html.unescape(raw_json)
# Parse and pretty-print.
print(json.dumps(json.loads(raw_json), indent=4))
def _filter(item):
    """Project *item* onto the module/closure-level `required` keys,
    HTML-decoding each value, and return them as an OrderedDict in the
    same key order.
    """
    decoded_values = [html.unescape(item[key]) for key in required]
    return OrderedDict(zip(required, decoded_values))
def process_queue():
    """Drain the shared `urls` list: download each listing page, scrape
    the house fields, and append them to the CSV.

    Relies on module-level `urls`, `downloader`, and pandas (`pd`).

    Fix: the original guarded on ``if html is not None`` — that tested
    the imported `html` module (always truthy) instead of the downloaded
    payload ``h``, so failed downloads fell through into the parser.
    """
    while True:
        try:
            url = urls.pop()
        except IndexError:
            print('download completed')
            break
        else:
            h = downloader(url, 5)
            if h is not None:
                houses = {}
                tree = lxml.html.fromstring(h)
                # Title
                title = tree.cssselect(
                    'li.clear > div.info.clear > div.title')
                title = list(map(lxml.html.HtmlElement.text_content, title))
                houses['title'] = title
                # House summary
                houseInfo = tree.cssselect(
                    'li.clear > div.info.clear > div.address > div.houseInfo'
                )
                houseInfo = list(
                    map(lxml.html.HtmlElement.text_content, houseInfo))
                houses['houseInfo'] = houseInfo
                # Floor / position info
                positionInfo = tree.cssselect(
                    'li.clear > div.info.clear > div.flood > div.positionInfo'
                )
                positionInfo = list(
                    map(lxml.html.HtmlElement.text_content, positionInfo))
                houses['positionInfo'] = positionInfo
                # Subway, tax-free status, and viewing-time tags
                tag = re.findall(
                    '<div class="tag">(.*?)</div><div class="priceInfo">',
                    html.unescape(h.decode()))
                subway = []
                taxfree = []
                haskey = []
                for x in tag:
                    # '无' ("none") is the placeholder when a tag is absent.
                    swy = re.findall('<span class="subway">(.*?)</span>',
                                     str(x))
                    subway.extend(swy or '无')
                    tf = re.findall('<span class="taxfree">(.*?)</span>',
                                    str(x))
                    taxfree.extend(tf or '无')
                    hk = re.findall('<span class="haskey">(.*?)</span>',
                                    str(x))
                    haskey.extend(hk or '无')
                houses['subway'] = subway
                houses['taxfree'] = taxfree
                houses['haskey'] = haskey
                # Total price
                totalPrice = tree.cssselect(
                    'li.clear > div.info.clear > div.priceInfo > div.totalPrice'
                )
                totalPrice = list(
                    map(lxml.html.HtmlElement.text_content, totalPrice))
                houses['totalPrice'] = totalPrice
                # Unit price
                unitPrice = tree.cssselect(
                    'li.clear > div.info.clear > div.priceInfo > div.unitPrice'
                )
                unitPrice = list(
                    map(lxml.html.HtmlElement.text_content, unitPrice))
                houses['unitPrice'] = unitPrice
                # Append the scraped rows to the CSV file.
                df = pd.DataFrame(houses)
                df.to_csv(
                    '/home/wangf/PycharmProjects/lianjiawang/house_infoes.csv',
                    mode='a')
} headers = { "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", "X-Requested-With": "XMLHttpRequest" } try: r = requests.post(url, data=payload, headers=headers) #data.encode('utf-8'), headers=headers) # print(r.text, file=open('html.txt', 'w')) r = lxml.html.fromstring(r.text) costString = r.xpath("//td[@class='ship_total']/strong")[0].text try: deliveryTime = lxml.html.tostring( r.xpath('//*[@id="servicerow0"]/td[2]/dl/dd')[0]).decode('utf-8') deliveryTime = html.unescape( deliveryTime.split('<br>')[2]).strip().replace('\xa0', ' ') deliveryTime = max( 1, (datetime.strptime(deliveryTime, "%B %d, %Y").date() - datetime.now().date()).days) except ValueError: deliveryTime = "" condition = r.xpath('//*[@id="servicerow0"]/td[1]/dl[1]')[0] condition = '\n'.join([(child.text or "").strip() for child in condition.iterdescendants()]) clean = lambda cost: "".join( filter(lambda x: x.isdigit() or x == '.', cost)) print( json.dumps({ "price": clean(costString), "time": str(deliveryTime),