Example #1
    def read_from_text(self, dat_raw_text):
        self.raw_text = dat_raw_text
        resu = dat_raw_text.rstrip().split('<>')
        if len(resu) != 5:
            raise ValueError('unexpected dat line: expected 5 <>-separated fields')
        self.user_name = resu[0]
        self.email = resu[1]
        info = resu[2]
        content = resu[3]
        self.title = resu[4]

        # parse date/time/id field
        ma = re.match(r'(.*) ID:(.*)', info)
        if ma:
            self.info_datetime = ma.group(1)
            self.info_id = ma.group(2)
        else:
            self.info_datetime = info
            self.info_id = ''

        # convert content to plain text
        content = html.unescape(content)
        self.content = re.sub(r'<br>', '\n', content)
        self.content_html = self.make_html(self.content)

        # scan content text to find anchors
        for m in re.finditer(r">>(\d+)", self.content):
            self.anchor.append(int(m.group(1)) - 1)
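
A minimal usage sketch (hypothetical: it assumes the enclosing class, called Post here, initializes self.anchor = [] and imports re and html):

post = Post()
post.read_from_text('name<>mail@example.com<>2020/01/01 00:00:00 ID:abc123<>see >>2<br>hello<>thread title')
print(post.info_datetime)  # '2020/01/01 00:00:00'
print(post.info_id)        # 'abc123'
print(post.anchor)         # [1], anchors are stored zero-based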
Example #2
def get_css_report(url_datas):
    datas = []
    for row in url_datas:
        print(row["pid"], "を処理しています")
        time.sleep(1)
        try:
            res = requests.get(row["url"], verify=False)
            doc = res.text
            dom = lxml.html.fromstring(res.content)
            link_tag_lists = []
            style_tag_lists = []
            style_attr_strs = ""
            for link in dom.cssselect("link"):
                if link.get("rel") == "stylesheet":
                    link_tag_lists.append(link.get("href"))
            for style in dom.cssselect("style"):
                style_str = html.unescape(lxml.html.tostring(style).decode())
                style_tag_lists.append(style_str)
            for style_attr in re.findall(r"style=\".*?\"", doc, re.DOTALL):
                style_attr_strs += style_attr + ","
            datas.append({
                "pid": row["pid"],
                "link-tag": link_tag_lists,
                "style-tag": style_tag_lists,
                "style-attr": style_attr_strs
            })
        except requests.exceptions.SSLError:
            print(row["pid"], "はエラーのためスキップします")
    return datas
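
A minimal call sketch; each input row only needs the "pid" and "url" keys the function reads (the URL below is a placeholder):

pages = [{"pid": 1, "url": "https://example.com/"}]
for entry in get_css_report(pages):
    print(entry["pid"], len(entry["link-tag"]), "stylesheet link(s)")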
Example #3
 def fetch_css_datas(self, cr_pid, cr_url):
     print(cr_pid, "is being processed")
     time.sleep(self.shortWait)
     datas = []
     try:
         res = requests.get(cr_url, verify=False)
         doc = res.text
         dom = lxml.html.fromstring(res.content)
         dom.make_links_absolute(res.url)
         link_tags = []
         style_tags = []
         style_atts = ""
         for link in dom.cssselect("link"):
             if link.get("rel") == "stylesheet":
                 link_tags.append(link.get("href"))
         for style in dom.cssselect("style"):
             style_str = html.unescape(lxml.html.tostring(style).decode())
             style_tags.append(style_str)
         for style_attr in re.findall(r'style=".*?"', doc, re.DOTALL):
             style_atts += style_attr + ","
         datas.append({
             "pid": cr_pid,
             "url": cr_url,
             "link_tags": link_tags,
             "style_tags": style_tags,
             "style_atts": style_atts
         })
     except requests.exceptions.SSLError:
         print(cr_pid, "skipped due to an SSL error")
     return datas
Example #4
def net_html_unescape(html_str: str) -> str:
	'''
	Decodes HTML escaped symbols:
		
		"That&#039;s an example" -> "That's an example"

	'''
	return html.unescape(html_str)
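
The behavior described in the docstring is easy to confirm:

print(net_html_unescape("That&#039;s an example"))   # That's an example
print(net_html_unescape("&lt;b&gt;bold&lt;/b&gt;"))  # <b>bold</b>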
Example #5
def strip_html(doc):
	_blank_if_none = lambda x : "" if x is None else x
	_empty_div_if_none = lambda x : "<div></div>" if x is None else x
	_nbsp_del 	= lambda x : re.sub(no_nbsp,"",x)
	_no_nl 		= lambda x : re.sub(no_nl, " ", x)
	_quote_fix 	= lambda x : re.sub(quote, "'", x)
	_no_ellipsis = lambda x : re.sub(ellipsis, ".", x)
	_unescape 	= lambda x : html.unescape(_empty_div_if_none(x))
	_printable = lambda x : "".join(list(filter(lambda x : x in string.printable, x)))
	return _printable(_no_ellipsis(lxml.html.fromstring(clean_html(_no_nl(_unescape(doc)))).text_content()))
Example #6
def video(url):
    attempts = 0
    accessible = 0
    #results_file = open("results.txt", 'w')
    #sys.stdout = results_file

    if "https://youtu.be/" in str(
            url) or "https://www.youtube.com/watch?v=" in str(url):
        try:
            get_tor_session().get("https://ip.seeip.org/")
        except IOError:
            return "Tor service is down serverside. Please try again later."
    else:
        return "Invalid input."

    page_data = requests.get(url).text
    parse_title = str(
        re.findall('<title>(.*?) - YouTube</title><meta name="title" content=',
                   page_data))
    title = html.unescape(parse_title.split("'")[1])

    if title == "":
        print("Video unavailable")
    else:
        print(title)
    print(url + "\n")

    while attempts < 5:
        rotate_connection()
        title_query = "https://www.youtube.com/results?search_query=" + "+".join(
            title.split()).replace('\n', '')
        title_search = get_tor_session().get(title_query).text
        if title_search.find('"title":{"runs":[{"text":"') >= 0:
            if title_search.find(title) >= 0:
                accessible += 1
                print("[ ✓ ]", end="")
            else:
                print("[ X ]", end="")
            try:
                r = get_tor_session().get("https://ip.seeip.org/geoip")
                r_dict = r.json()
                print(" " + r_dict["country"] + " (" + r_dict["ip"] + ")")
            except IOError:
                print(" Unknown location.")
            attempts += 1

    if attempts == accessible and accessible > 0:
        print("\nNo abnormal behavior detected.")
    elif accessible == 0:
        print("\nAlarming behavior detected.")
    elif attempts > accessible:
        print("\nQuestionable behavior detected.")

    #results_file.close()
    return (open("results.txt", "r").read())
Example #7
 def get_under_link_datas(self, dom_obj):
     datas = []
     for row in dom_obj.xpath(
             '//*[@id="ModuleContents"]/table[2]/tr/td[2]/table/tr/td/a'):
         atag = html.unescape(lxml.html.tostring(row).decode())
         atag_text = self.fetch_atag_text(atag)
         atag_url = self.fetch_atag_href(atag)
         datas.append({
             "key_addr": self.url,
             "title": atag_text,
             "url": self.base_url + atag_url
         })
     return datas
Example #8
 def get_calendar_td_link_datas(self, dom_obj):
     datas = []
     for td in dom_obj.xpath(
             '//*[@id="ModuleContents"]/table[1]/tr/td/table[2]/tr/td'):
         td_tag = html.unescape(lxml.html.tostring(td).decode())
         td_tag = self.table_tag_raise(td_tag)
         for aline in re.findall(r'<a.+?>.+?</a>', td_tag, re.DOTALL):
             if self.is_calendar_td_raise_link(aline):
                 continue
             else:
                 atag_text = self.fetch_atag_text(aline)
                 atag_url = self.fetch_atag_href(aline)
                 datas.append({
                     "key_addr": self.url,
                     "title": atag_text,
                     "url": self.base_url + atag_url
                 })
     return self.make_uniq_datas(datas)
Example #9
    def fetch(self):
        datas = []
        res = requests.get(self.url)
        dom = lxml.html.fromstring(self.reki_decode(res.text))
        datas.append({
            "key_addr": "---",
            "title": dom.cssselect("title")[0].text,
            "url": self.url
        })

        for row in dom.xpath('//*[@id="webphoto_box_photo_a"]/table/tr/td[2]/a[2]'):
            atg = html.unescape(lxml.html.tostring(row).decode())
            title_str = self.fetch_atag_text(atg)
            title_url = self.fetch_atag_href(atg)
            datas.append({
                "key_addr": self.url,
                "title": title_str.strip(),
                "url": title_url
            })
        return datas
Example #10
    def unescape(cls, text):
        """
        Replaces HTML or XML character references and entities in a text string with their Unicode equivalents and returns the result as a plain Unicode string.
        
        Argument:
        text -- The HTML (or XML) source text.
        """

        if text is None:
            return None

        try:
            # Python 2.6-2.7 
            from HTMLParser import HTMLParser
            h = HTMLParser()
            return h.unescape(text)
        except ImportError:
            # Python 3
            import html
            return html.unescape(text)
Example #11
def untextlink_decode(elm):
    ret = html.unescape(lxml.html.tostring(elm).decode())
    ret = re.sub(r"</*a.*?>", "", ret)
    return ret
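
A short demonstration (assuming the example's lxml.html, html, and re imports are in scope): the element is serialized, entities are decoded, and the a-tag markup is stripped while the link text survives:

elm = lxml.html.fromstring('<p>see <a href="/x">this &amp; that</a></p>')
print(untextlink_decode(elm))  # <p>see this & that</p>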
Example #12
# listData = re.compile('"LISTDATE":"(.*?)",', re.S).findall(r)

# print('listData, ', len(listData), listData)

# ZGJ = re.compile('"ZGJ_HQ":"(.*?)",', re.S).findall(r)
# print('ZGJ', ZGJ)


# data_parse = HTMLParser.HTMLParser()
# for i in ZGJ:
#     newdata = data_parse.unescape(i)
#     print(newdata)

# num = 0xEA5DF3C3F3C3E4E5
ret = html.unescape('&#xEA5D;&#xF3C3;.&#xF3C3;&#xE4E5;')
# print(ret)
# ret = html.unescape('&#xEA5D')
# print(ret)
# ret = html.unescape('&#xF3C3;')
# print(ret)
# ret = html.unescape('.&#xF3C3;')
# print(ret)
# ret = html.unescape('&#xE4E5;')
# print(ret)
# for i in ZGJ:
#     ret = html.unescape(i)
#     print(ret, i)

# for i in range(0, len(myKzz)):
#     kzz = myKzz[i]
Example #13
 def sub_dict(d, keys):
     import html
     return {k: html.unescape(d[k]) for k in d if k in keys}
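
Only the values for the requested keys are unescaped and kept:

row = {"title": "Tom &amp; Jerry", "body": "&lt;p&gt;hi&lt;/p&gt;", "id": "42"}
print(sub_dict(row, ["title", "body"]))
# {'title': 'Tom & Jerry', 'body': '<p>hi</p>'}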
Example #14
"return_label":"ERL",
"return_label_coc":"ERL",
"WWEFMT":"",
"WWEFIT":"",
"ctc_second_submit":"10"}
headers = { "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "X-Requested-With": "XMLHttpRequest"}
try:
	r = requests.post(url, data=payload, headers=headers)#data.encode('utf-8'), headers=headers)
	# print(r.text, file=open('html.txt', 'w'))
	r = lxml.html.fromstring(r.text)
	costString = r.xpath("//td[@class='ship_total']/strong")[0].text

	try:
		deliveryTime = lxml.html.tostring(r.xpath('//*[@id="servicerow0"]/td[2]/dl/dd')[0]).decode('utf-8')
		deliveryTime = html.unescape(deliveryTime.split('<br>')[2]).strip().replace('\xa0', ' ')
		deliveryTime = max(1, (datetime.strptime(deliveryTime, "%B %d, %Y").date() - datetime.now().date()).days)
	except ValueError:
		deliveryTime = ""
		
	condition = r.xpath('//*[@id="servicerow0"]/td[1]/dl[1]')[0]
	condition = '\n'.join([(child.text or "").strip() for child in condition.iterdescendants()])
	clean = lambda cost: "".join(filter(lambda x: x.isdigit() or x == '.', cost))
	print(json.dumps({ "price": clean(costString),
						"time": str(deliveryTime),
						"condition": condition,
						}))
except:
	pass
#regexp way
#r = re.search("[0-9,]+.\\d{0,2}", costString)
Example #15
 def clean_string(string):
     # we don't want soft hyphen in the db
     soft_hyphen_html = "&#173;"
     # we don't want line breaks in the db
     return html.unescape(
         string.replace(soft_hyphen_html, "").replace("\n", " "))
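
For example, the soft hyphen is removed before the remaining entities are decoded:

print(clean_string("Ger&#173;many &amp; friends\nwelcome"))
# Germany & friends welcome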
Example #16
def html_unscape(term):
    return html.unescape(term)
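
Despite the misspelled name, this is a plain pass-through wrapper:

print(html_unscape("&quot;q&quot; &amp; a"))  # "q" & a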
Example #17
        def get(self, id):

            url = 'https://airbnb.com/rooms/{id}'.format(id=id)

            if DEMO:
                res = open('tmp.txt', 'r').read()

            else:
                res = request(url)
                with open('tmp.txt', 'w') as out_file:
                    out_file.write(res)

            # CHECK OFFER EXISTING
            checker = etree.HTML(res).xpath('.//link[@rel="canonical"]/@href')
            checker = checker[0] if checker else False
            if checker and checker == 'https://www.airbnb.com/':
                return {
                    'error': True,
                    'errors': {
                        'airbnb_id': 'Offer not found'
                    }
                }

            # CHECK IF IP IS BANNED
            has_title = etree.HTML(res).xpath('.//title/text()')
            has_title = has_title[0] if has_title else False
            if not has_title:
                return {
                    'error': True,
                    'errors': {
                        'airbnb_id': 'Proxy is banned'
                    }
                }

            # IF STATUS CODE IS ERROR
            if isinstance(res, dict) and 'error' in res:
                return res  # False if offer not found in airbnb page

            content = False
            for r in res.splitlines():
                if '"bootstrapData"' in r:

                    content = re.search('<!--{"behavioralUid"(.*?)-->',
                                        res).group(0)
                    content = re.search('<!--(.*?)-->', content).group(1)
                    content = json.loads(content)
                    break

                    # content = re.search('<!--(.*?)-->', r).group(1)
                    # content = json.loads(content)
                    # break

            if not content:
                return 'Offer not found'

            listingInfo = content['bootstrapData']['reduxData']['homePDP'][
                'listingInfo']['listing']
            listingExtra = listingInfo['p3_event_data_logging']

            # http://jsonviewer.stack.hu/
            # jprint(listingInfo)

            # GET PRICE
            price = self.getPrice(id=id,
                                  currency=listingInfo['native_currency'])

            # tree = etree.HTML(res)
            # price = tree.xpath('.//meta[@name="description"]/@content')[0]
            # price = re.search('\s€(\w+).\s', price)
            # if price:
            #     price = price.group(1)
            # else:
            #     price = False

            # print("PRICE" ,price)

            # AMENITIES
            amenties_arr = []
            listing_amenities = [{
                'id': c['id'],
                'name': c['name'],
                'tag': c['tag']
            } for c in listingInfo['listing_amenities']]
            for c in listingInfo['see_all_amenity_sections']:

                def getAmenties(id):
                    for c in listing_amenities:
                        if c['id'] == id:
                            return c

                if not c['title'] == 'Not included':
                    amenties_arr.append({
                        'group':
                        c['title'],
                        'items': [getAmenties(a) for a in c['amenity_ids']]
                    })

            # PHOTOS
            photos_arr = [{
                'src': c['xx_large'],
                'index': c['sort_order']
            } for c in listingInfo['photos']]
            photos_arr = [{
                'src': c['src'],
                'index': c['index']
            } for c in sorted(
                photos_arr, key=itemgetter('index'), reverse=False)]

            # DESCRIPTION
            description = listingInfo['sectioned_description']
            description_text = ''
            description_html = ''
            if description['description']:
                description_text = html.unescape(description['description'])
                description_html = description_text.replace('\r',
                                                            '<br />').replace(
                                                                '\n', '<br />')
            description = {
                'text': description_text,
                'html': description_html,
                'lang': description['localized_language_name'],
            }
            # print(description)

            results = {
                'id': listingInfo['id'],
                'price': price,
                'name': html.unescape(listingInfo['name']),
                'bathroom_label': listingInfo['bathroom_label'],
                'bed_label': listingInfo['bed_label'],
                'bedroom_label': listingInfo['bedroom_label'],
                'guest_label': listingInfo['guest_label'],
                # 'person_capacity': listingInfo['person_capacity'],
                'star_rating': listingInfo['star_rating'],
                'calendar_last_updated_at':
                listingInfo['calendar_last_updated_at'],
                'min_nights': listingInfo['min_nights'],
                'location_title': listingInfo['location_title'],
                'lat': listingInfo['lat'],
                'lng': listingInfo['lng'],
                'room_and_property_type':
                listingInfo['room_and_property_type'],
                'room_type_category': listingInfo['room_type_category'],
                'guest_controls': {
                    i: c
                    for i, c in listingInfo['guest_controls'].items()
                    if 'allows_' in i
                },
                'photos': photos_arr,
                'description': description,
                'primary_host': listingInfo['primary_host'],
                'amenties': amenties_arr,
                # 'listing_amenities': listing_amenities,
                'check_in': listingInfo['localized_check_in_time_window'],
                'check_out': listingInfo['localized_check_out_time'],

                # EXTRA...
                'description_language': listingExtra['description_language'],
                'is_superhost': listingExtra['is_superhost'],
                'home_tier': listingExtra['home_tier'],
                'checkin_rating': listingExtra['checkin_rating'],
                'cleanliness_rating': listingExtra['cleanliness_rating'],
                'communication_rating': listingExtra['communication_rating'],
                'location_rating': listingExtra['location_rating'],
                'accuracy_rating': listingExtra['accuracy_rating'],
                'value_rating': listingExtra['value_rating'],

                # '': listingInfo[''],
            }
            # jprint(results)
            return results
Example #18
import urllib.request
import lxml.html
import re, json, html

# Make a GET request and parse the HTML into a tree
# urlKs = 'https://www.kickstarter.com/projects/agfa/agfa-and-something-weird'
# htmlRaw = urllib.request.urlopen(urlKs).read()
# text_file = open("project.html", "w")
# text_file.write(htmlRaw.decode("utf-8"))
# text_file.close()

text_file = open("project.html")
tree = lxml.html.fromstring(text_file.read())
text_file.close()

# print(tree)

# Find the script that contains the JSON object
script = tree.xpath('//script[contains(text(), "window.current_project")]')[0]
# Get the JSON object from the script
jsonRaw = re.search('window.current_project = "([^"]*)', script.text).group(1)
# Fix some odd double escaping
jsonRaw = jsonRaw.replace('\\\\&quot;', '\\&quot;')
# Unescape
jsonRaw = html.unescape(jsonRaw)
# Parse and print
jsonParsed = json.loads(jsonRaw)
print(json.dumps(jsonParsed, indent=4))
Example #19
 def _filter(item):
     values = []
     for key in required:
         decoded = html.unescape(item[key])
         values.append(decoded)
     return OrderedDict(zip(required, values))
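
A minimal sketch of how _filter behaves, assuming required is a sequence of keys in the enclosing scope (as the closure implies) and OrderedDict is imported:

required = ('title', 'city')
item = {'title': 'A &amp; B', 'city': 'Paris', 'extra': '1'}
print(_filter(item))  # OrderedDict([('title', 'A & B'), ('city', 'Paris')])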
Example #20
    def process_queue():
        while True:
            try:
                url = urls.pop()
            except IndexError:
                print('download completed')
                break
            else:
                h = downloader(url, 5)
                if h is not None:
                    houses = {}
                    tree = lxml.html.fromstring(h)

                    # Title
                    title = tree.cssselect(
                        'li.clear > div.info.clear > div.title')
                    title = list(map(lxml.html.HtmlElement.text_content,
                                     title))
                    houses['title'] = title

                    # House summary
                    houseInfo = tree.cssselect(
                        'li.clear > div.info.clear > div.address > div.houseInfo'
                    )
                    houseInfo = list(
                        map(lxml.html.HtmlElement.text_content, houseInfo))
                    houses['houseInfo'] = houseInfo

                    # Position / floor info
                    positionInfo = tree.cssselect(
                        'li.clear > div.info.clear > div.flood > div.positionInfo'
                    )
                    positionInfo = list(
                        map(lxml.html.HtmlElement.text_content, positionInfo))
                    houses['positionInfo'] = positionInfo

                    # Subway access, tax status, viewing availability
                    tag = re.findall(
                        '<div class="tag">(.*?)</div><div class="priceInfo">',
                        html.unescape(h.decode()))
                    subway = []
                    taxfree = []
                    haskey = []
                    for x in tag:
                        # subway access
                        swy = re.findall('<span class="subway">(.*?)</span>',
                                         str(x))
                        subway.extend(swy) if len(swy) else subway.extend('无')
                        # tax status
                        tf = re.findall('<span class="taxfree">(.*?)</span>',
                                        str(x))
                        taxfree.extend(tf) if len(tf) else taxfree.extend('无')
                        # viewing availability
                        hk = re.findall('<span class="haskey">(.*?)</span>',
                                        str(x))
                        haskey.extend(hk) if len(hk) else haskey.extend('无')

                    houses['subway'] = subway
                    houses['taxfree'] = taxfree
                    houses['haskey'] = haskey

                    # Total price
                    totalPrice = tree.cssselect(
                        'li.clear > div.info.clear > div.priceInfo > div.totalPrice'
                    )
                    totalPrice = list(
                        map(lxml.html.HtmlElement.text_content, totalPrice))
                    houses['totalPrice'] = totalPrice

                    # Price per unit area
                    unitPrice = tree.cssselect(
                        'li.clear > div.info.clear > div.priceInfo > div.unitPrice'
                    )
                    unitPrice = list(
                        map(lxml.html.HtmlElement.text_content, unitPrice))
                    houses['unitPrice'] = unitPrice

                    # Convert the collected house info into a DataFrame
                    df = pd.DataFrame(houses)

                    df.to_csv(
                        '/home/wangf/PycharmProjects/lianjiawang/house_infoes.csv',
                        mode='a')
Example #21
}
headers = {
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "X-Requested-With": "XMLHttpRequest"
}
try:
    r = requests.post(url, data=payload,
                      headers=headers)  #data.encode('utf-8'), headers=headers)
    # print(r.text, file=open('html.txt', 'w'))
    r = lxml.html.fromstring(r.text)
    costString = r.xpath("//td[@class='ship_total']/strong")[0].text

    try:
        deliveryTime = lxml.html.tostring(
            r.xpath('//*[@id="servicerow0"]/td[2]/dl/dd')[0]).decode('utf-8')
        deliveryTime = html.unescape(
            deliveryTime.split('<br>')[2]).strip().replace('\xa0', ' ')
        deliveryTime = max(
            1, (datetime.strptime(deliveryTime, "%B %d, %Y").date() -
                datetime.now().date()).days)
    except ValueError:
        deliveryTime = ""

    condition = r.xpath('//*[@id="servicerow0"]/td[1]/dl[1]')[0]
    condition = '\n'.join([(child.text or "").strip()
                           for child in condition.iterdescendants()])
    clean = lambda cost: "".join(
        filter(lambda x: x.isdigit() or x == '.', cost))
    print(
        json.dumps({
            "price": clean(costString),
            "time": str(deliveryTime),