Example #1
def xmlDoc_from_html(response):
    """Return libxml2 doc for HTMLs"""
    utf8body = _body_as_utf8(response) or ' '
    try:
        lxdoc = libxml2.htmlReadDoc(utf8body, response.url, 'utf-8', \
            html_parser_options)
    except TypeError:  # libxml2 doesn't parse text with null bytes
        lxdoc = libxml2.htmlReadDoc(utf8body.replace("\x00", ""), response.url, \
            'utf-8', html_parser_options)
    return lxdoc
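Note: _body_as_utf8 and html_parser_options are helpers defined elsewhere in the source module and are not shown here. As a rough sketch (an assumption based on the option constants used in the other examples on this page), the parser options are usually assembled from the recover/no-error/no-warning flags:

html_parser_options = (libxml2.HTML_PARSE_RECOVER |
                       libxml2.HTML_PARSE_NOERROR |
                       libxml2.HTML_PARSE_NOWARNING)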
Example #2
def xmlDoc_from_html(response):
    """Return libxml2 doc for HTMLs"""
    utf8body = body_as_utf8(response) or ' '
    try:
        lxdoc = libxml2.htmlReadDoc(utf8body, response.url, 'utf-8', \
            html_parser_options)
    except TypeError:  # libxml2 doesn't parse text with null bytes
        lxdoc = libxml2.htmlReadDoc(utf8body.replace("\x00", ""), response.url, \
            'utf-8', html_parser_options)
    return lxdoc
Example #3
def parse_week_menu(page, year, week, lang):
    print('Parsing menu webpage')
    # replace those pesky non-breakable spaces
    page = page.replace(u'\xa0', ' ')

    doc = libxml2.htmlReadDoc(
        page.encode('utf-8'), None, 'utf-8',
        libxml2.XML_PARSE_RECOVER | libxml2.XML_PARSE_NOERROR)

    dateComponents = doc.xpathEval(
        "//*[@id='parent-fieldname-title']")[0].content.strip().split()
    # Date description is not consistent, sometimes misses year
    if not dateComponents[-1].isdigit():
        dateComponents.append(str(year))

    # always start from the last day of the week, since it will be in the correct year and month
    friday = datetime.strptime("%s %s %s" % tuple(dateComponents[-3:]),
                               "%d %B %Y").date()

    # verify that this is the week we are searching for
    isocalendar = friday.isocalendar()

    if isocalendar[0] != year or isocalendar[1] != week:
        print('Incorrect information retrieved: expected %s-%s, got %s-%s' %
              (year, week, isocalendar[0], isocalendar[1]))
        return None
    menus = doc.xpathEval("//*[starts-with(@id, 'parent-fieldname-text')]")

    week_menu = Week(year, week)
    week_menu.parse(menus, friday, lang)
    return week_menu
Example #4
    def image_scraper(self, image_path):
        print "To Upload Image"
        try:
            result = self.client.upload_from_path(image_path)
        except ImgurClientError as e:
            print(e.error_message)
            print(e.status_code)
            return None  # bail out: 'result' is undefined if the upload failed
        print(result['link'])
        print "Uploaded Image"
        request_url = "https://www.google.com/searchbyimage?&image_url=" + result['link']

        # select user agent
        user_agent = random.choice(self.user_agents).rstrip()


        gis_raw_result = self.get_raw_html_urllib( request_url, user_agent )

        parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING

        doc = libxml2.htmlReadDoc( gis_raw_result, '', None, parse_options)

        scrapeResult = {}
        for key in self.xpath:
            r = self.get_simple_xpath(doc, self.xpath[key])
            scrapeResult[key] = r

        scrapeResults = scrapeResult

        doc.freeDoc()


        # output of the results

        #print json.dumps( scrapeResults, indent=4, sort_keys=False)
        return scrapeResults["bestguess"]
Example #5
def parse_week_menu(page, year, week, lang):
    print('Parsing menu webpage')
    # replace those pesky non-breakable spaces
    page = page.replace(u'\xa0', ' ')

    doc = libxml2.htmlReadDoc(page.encode('utf-8'), None, 'utf-8', libxml2.XML_PARSE_RECOVER | libxml2.XML_PARSE_NOERROR)

    dateComponents = doc.xpathEval("//*[@id='parent-fieldname-title']")[0].content.strip().split()
    # Date description is not consistent, sometimes misses year
    if not dateComponents[-1].isdigit():
        dateComponents.append(str(year))

    # always start from the last day of the week, since it will be in the correct year and month
    friday = datetime.strptime("%s %s %s" % tuple(dateComponents[-3:]), "%d %B %Y").date()

    # verify that this is the week we are searching for
    isocalendar = friday.isocalendar()

    if isocalendar[0] != year or isocalendar[1] != week:
        print('Incorrect information retrieved: expected %s-%s, got %s-%s' %
            (year, week, isocalendar[0], isocalendar[1]))
        return None
    menus = doc.xpathEval("//*[starts-with(@id, 'parent-fieldname-text')]")

    week_menu = Week(year, week)
    week_menu.parse(menus, friday, lang)
    return week_menu
Example #6
def parse_pharms_from_html(page):
	print('Parsing webpage to an object tree')
	# replace those pesky non-breakable spaces
	page = page.replace(u'\xa0', ' ')

	doc = libxml2.htmlReadDoc(page, None, 'utf-8', libxml2.XML_PARSE_RECOVER | libxml2.XML_PARSE_NOERROR)
	menuElement = doc.xpathEval("//*[starts-with(@id, 'listResults')]")
	#1 pharmacist on each row
	rows = menuElement[0].xpathEval('.//tr')[0:]
	pharmacists = {}
	pharmacists['Apotheken']=[]
	for row in rows:
		for name in row.xpathEval('.//b')[0:]:
			#Valid name
			if (str.isupper(name.content[1])):
				name=name.content.strip().title()
				open=row.xpathEval('.//span')[0:][0].content.strip()
				m=re.search(r" {4}([^0-9]*) ([0-9]*) {4}([0-9]{4}) ([A-Z,\-]*)",row.content)
				n=re.search(r"([0-9]{5,})",row.content)
				pharmacist={'name':name,
							'open':open,
							'street': m.group(1).strip(),
							'nr': m.group(2).strip(),
							'zip': m.group(3).strip(),
							'city': m.group(4).strip().title(),
							'tel': n.group(1)
				}
				pharmacists['Apotheken'].extend([pharmacist])
	return pharmacists
Example #7
def resolveTotalPageNum(html):
    doc = libxml2.htmlReadDoc(html, None, 'utf8', PARSE_OPTIONS)
    try:
        pgtxt = doc.xpathEval('//span[@class="fp-text"]/i')[0].content
        page_num = int(pgtxt)
        return page_num
    finally:
        doc.freeDoc()
Example #8
def resolveTotalPageNum(html):
    doc = libxml2.htmlReadDoc(html, None, "utf8", PARSE_OPTIONS)
    try:
        pgtxt = doc.xpathEval('//span[@class="fp-text"]/i')[0].content
        page_num = int(pgtxt)
        return page_num
    finally:
        doc.freeDoc()
Example #9
    def image_scraper(self):
        gis_raw_result = self.search_image_file(self.img_filepath)
        string_for_output = gis_raw_result.encode('utf8', 'replace')
        parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING
        doc = libxml2.htmlReadDoc(string_for_output, '', None, parse_options)

        scrapeResults = self.get_simple_xpath(doc, self.xpath['bestguess'])
        doc.freeDoc()
        
        return scrapeResults 
Example #10
def parse_menu_from_html(page, year, week):
	print('Parsing weekmenu webpage to an object tree')
	# replace those pesky non-breakable spaces
	page = page.replace(u'\xa0', ' ')

	doc = libxml2.htmlReadDoc(page, None, 'utf-8', libxml2.XML_PARSE_RECOVER | libxml2.XML_PARSE_NOERROR)

	dateComponents = doc.xpathEval("//*[@id='parent-fieldname-title']")[0].content.strip().split()
	locale.setlocale(locale.LC_ALL, 'nl_BE.UTF-8')
	friday = datetime.strptime("%s %s %s" % tuple(dateComponents[-3:]), "%d %B %Y").date()

	# verify that this is the week we are searching for
	isocalendar = friday.isocalendar()
	if isocalendar[0] != year or isocalendar[1] != week:
		print('Incorrect information retrieved: expected %s-%s, got %s-%s' %
			(year, week, isocalendar[0], isocalendar[1]))
		return None

	menuElement = doc.xpathEval("//*[starts-with(@id, 'parent-fieldname-text')]")
	rows = menuElement[0].xpathEval('.//tr')[1:]

	menu = {}
	dayOfWeek = 4
	for row in rows:
		cells = row.xpathEval('.//td')
		if len(cells) <= 3:
			continue

		# first row of a day
		if cells[0].content.strip() != '':
			day = str(friday - timedelta(dayOfWeek))
			dayOfWeek -= 1
			menu[day] = {}

			# check if resto is closed
			if cells[2].content.lower().strip() == 'gesloten':
				menu[day]['open'] = False
			else:
				menu[day]['open'] = True
				menu[day]['soup'] = { 'name': cells[1].content.strip() }
				menu[day]['meat'] = get_meat_and_price(cells[2])
				menu[day]['vegetables'] = [cells[3].content.strip()]

		# second row of a day
		elif cells[1].content.strip() != '' and menu[day]['open']:
			menu[day]['soup']['price'] = cells[1].content.strip()
			menu[day]['meat'].extend(get_meat_and_price(cells[2]))
			menu[day]['vegetables'].append(cells[3].content.strip())

		# the third and fourth row of a day (sometimes it's empty)
		elif cells[2].content.strip() != '' and menu[day]['open']:
			menu[day]['meat'].extend(get_meat_and_price(cells[2]))

	return menu
Example #11
def segv_test():
    s = "<html><body><div><a><a></a></a><a></a></div></body></html>"
    options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(s, None, "utf-8", options).doc
    ctxt = doc.xpathNewContext()
    nodes = ctxt.xpathEval("//body/node()")
    nodes.reverse()
    for note in nodes:
        nexts = note.xpathEval("node()")
        note.unlinkNode()
        note.freeNode()  # freeNode releases the node together with its children
        nexts[0].unlinkNode()
        nexts[0].freeNode()  # the resource was already released above; releasing it again causes a segmentation fault
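For contrast, a minimal sketch of the same cleanup without the double free (assumption: each matched node is detached and freed exactly once, since freeNode already releases the whole subtree):

import libxml2

def no_segv_test():
    s = "<html><body><div><a><a></a></a><a></a></div></body></html>"
    options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(s, None, "utf-8", options).doc
    ctxt = doc.xpathNewContext()
    for node in ctxt.xpathEval("//body/node()"):
        node.unlinkNode()
        node.freeNode()  # frees the node and its whole subtree exactly once
    ctxt.xpathFreeContext()
    doc.freeDoc()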
Example #12
def parse(url, body, xpaths):
    options = libxml2.HTML_PARSE_RECOVER + \
              libxml2.HTML_PARSE_NOERROR + \
              libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(body, url, 'utf-8', options)
    results = []
    for node in doc.xpathEval(xpaths['root']):
        result = {}
        for k,v in xpaths.iteritems():
            if k != 'root':
                result[k] = node.xpathEval(v)[0].content
        results.append(result)
    return results
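A minimal usage sketch for the function above (the URL, HTML body, and XPath expressions are hypothetical and only illustrate the expected shape of the xpaths argument: 'root' selects one node per record, and every other entry is evaluated relative to that node):

xpaths = {
    'root': '//div[@class="item"]',  # hypothetical: one node per record
    'title': './/h2',                # hypothetical: field, relative to each record node
    'link': './/a/@href',
}
results = parse('http://example.com/list', html_body, xpaths)  # html_body is the fetched HTML string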
Example #13
 def parse_peer_response_xml(self, string):
     """Parses an XML peer response. 
     """
     snippet_list = snipdata.SnippetList()
     total_results = None
     string = re.sub(r"xmlns=(\'|\")[^\'\"]*\1", " ", string)  # remove default namespace 
     try:
         if self.mimetype == 'text/html':
             xdoc = libxml2.htmlReadDoc(string, '', None, HTML_PARSE_OPTIONS) 
         else:
             xdoc = libxml2.readDoc(string, '', None, XML_PARSE_OPTIONS)
     except libxml2.treeError:
         raise ValueError('Peer output error.')
     ctxt = xdoc.xpathNewContext()   
     for (name, uri) in re.findall("xmlns:([^\=]*)=[\'\"]([^\'\"]*)", string):
         ctxt.xpathRegisterNs(name, uri) # register all namespaces
         if name == 'opensearch':
             total_results = self.xpath_string_value(ctxt, "//opensearch:totalResults")   
     items = ctxt.xpathEval(self.item_path)
     #print "ITEMS:", items, self.item_path
     right_now = snipdata.right_now()
     for item in items:
         ctxt.setContextNode(item) 
         title     = self.xpath_string_value(ctxt, self.title_path)
         title     = bound_text_no_markup(title, 60)
         link      = self.xpath_link(ctxt, self.link_path)
         attributes = list()
         if self.attribute_paths:
             for key_path in self.attribute_paths.split(','):
                 (key, path) = key_path.split('{', 1)
                 path=path[:-1] # remove trailing '}'
                 value = self.xpath_string_value(ctxt, path)
                 if value:
                     attributes.append((key, value))
         if self.thumbnail_path: # xpath_thumbnail changes: ctxt
             thumbnail = self.xpath_thumbnail(ctxt, self.thumbnail_path)
         else:
             thumbnail = None
         if self.summary_path:
             summary   = self.xpath_string_value(ctxt, self.summary_path)
         else:
             for node in ctxt.xpathEval(self.title_path + '|.//script'):  # remove title and (possibly uncommented) javascript
                 node.unlinkNode()
             summary = self.xpath_string_value(ctxt, '.')
         summary   = bound_text_no_markup(summary, 300)
         snippet = snipdata.Snippet([], link, title, right_now, summary, None, thumbnail, attributes=attributes)
         snippet_list.append(snippet)
     ctxt.xpathFreeContext()
     xdoc.freeDoc()
     new_query = snipdata.Query()
     return (new_query, snipdata.PeerList(), snippet_list, total_results)
Example #14
def getCategory(tag, sFile):
    print "tag" + tag
    result = alchemyObj.TextGetCategory(tag)

    # analyze results using xpath queries
    parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING
    # parse_options = libxml2.HTML_PARSE_RECOVER+\libxml2.HTML_PARSE_NOERROR+\libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(result, "", None, parse_options)
    print result
    # use xpath to get the required data

    cauth = doc.xpathEval("//category")
    for ca in cauth:
        print "Category: " + ca.content + " " + tag
        sFile.write(tag + " " + ca.content + "\n")
Example #15
def segv_test():
    s = "<html><body><div><a><a></a></a><a></a></div></body></html>"
    options = libxml2.HTML_PARSE_RECOVER + \
              libxml2.HTML_PARSE_NOERROR + \
              libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(s, None, 'utf-8', options).doc
    ctxt = doc.xpathNewContext()
    nodes = ctxt.xpathEval('//body/node()')
    nodes.reverse()
    for note in nodes:
        nexts = note.xpathEval('node()')
        note.unlinkNode()
        note.freeNode()  # freeNode releases the node together with its children
        nexts[0].unlinkNode()
        nexts[0].freeNode()  # the resource was already released above; releasing it again causes a segmentation fault
Example #16
 def get_url_from_asx(self, asx, asxurl):
     """Simple ASX parser. Return the content of the first <ref>
     tag."""
     try:
         doc = libxml2.htmlReadDoc(asx, asxurl, None,
                                   libxml2.HTML_PARSE_NOERROR |
                                   libxml2.HTML_PARSE_NOWARNING |
                                   libxml2.HTML_PARSE_NONET)
     except libxml2.treeError:
         debug('Can\'t parse ASX:\n' + asx)
         return None
     root = doc.getRootElement()
     ret = self._get_ref_recursive(root).strip()
     doc.freeDoc()
     return ret
Example #17
def read_html_from_url(url):
    f = urllib.urlopen(url)
    doc = libxml2.htmlReadDoc(f.read(), None, 'UTF-8', libxml2.XML_PARSE_RECOVER | libxml2.XML_PARSE_NOERROR)
    return doc
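The caller owns the returned document; as in the other examples on this page, it should be released with freeDoc() once the XPath queries are done. A small usage sketch (the URL is hypothetical):

doc = read_html_from_url('http://example.com/')
try:
    titles = [n.content for n in doc.xpathEval('//title')]
finally:
    doc.freeDoc()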
Example #18
for image_url in args.urls:
    image_url_escaped = urllib.request.quote(image_url, '')
    request_url = args.gisroot + image_url_escaped

    # select user agent
    user_agent = random.choice(user_agents).rstrip()

    if args.verbose:
        print("Search URL: ", request_url)
        print("UA: ", user_agent)

    gis_raw_result = get_raw_html_urllib(request_url, user_agent, args.abuse)

    parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING

    doc = libxml2.htmlReadDoc(gis_raw_result.decode("utf-8"), '', None,
                              parse_options)

    scrapeResult = {}
    for key in xpath:
        r = get_simple_xpath(doc, xpath[key])
        scrapeResult[key] = r

    scrapeResults[image_url] = scrapeResult

    doc.freeDoc()

# output of the results
if args.plainoutput:
    for imagefn in scrapeResults:
        print("")
        print(imagefn)
Example #19
def resolveProductListFromPage(html):
    product_list = []
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    try:
        doc = libxml2.htmlReadDoc(html, None, "utf8", PARSE_OPTIONS)
        sku_docs = doc.xpathEval("//div[@data-sku]")
        for sku in sku_docs:
            # if True:
            try:
                sku_doc = libxml2.htmlReadDoc("%s" % sku, None, "utf8", PARSE_OPTIONS)

                sku_id = int(sku_doc.xpathEval("//@data-sku")[0].content)
                # check whether this is a JD first-party (self-operated) SKU
                if sku_id > 99999999:
                    # third-party SKU, skip it
                    continue

                # print '%s' %sku

                sku_url = sku_doc.xpathEval('//div[@class="p-img"]/a/@href')[0].content
                try:
                    sku_thumnail_url = sku_doc.xpathEval('//div[@class="p-img"]/a/img/@data-lazy-img')[0].content
                except:
                    sku_thumnail_url = sku_doc.xpathEval('//div[@class="p-img"]/a/img/@src')[0].content

                sku_title = ""
                try:
                    sku_title = sku_doc.xpathEval('//div[@class="p-name"]/a/@title')[0].content
                except:
                    pass

                if len(sku_title) == 0:
                    sku_title = sku_doc.xpathEval('//div[@class="p-name"]/a/em')[0].content
                comment_count = int(sku_doc.xpathEval('//div[@class="p-commit"]/strong/a')[0].content)

                sku_icon_url = ""
                icon_doc = sku_doc.xpathEval('//div[@class="p-img"]/a/div/@style')
                if len(icon_doc) > 0:
                    sku_icon_url = url_utils.getStringBetween(icon_doc[0].content, 'url("', '")')

                is_global = is_free_gift = is_pay_on_delivery = 0
                price_items = sku_doc.xpathEval('//div[@class="p-price"]/div/i')
                for pitem in price_items:
                    txt = pitem.content
                    if "全球购" in txt:
                        is_global = 1
                    elif "货到付款" in txt:
                        is_pay_on_delivery = 1
                    elif "赠品" in txt:
                        is_free_gift = 1
                    else:
                        print "new-mark found:"
                        print txt

                sku_stock = -1
                try:
                    sku_stock = int(sku_doc.xpathEval("//div[@data-stock_v]/@data-stock_v")[0].content)
                except:
                    pass

                sku_url = __makeUrl__(sku_url)
                sku_thumnail_url = __makeUrl__(sku_thumnail_url)

                tp = (
                    sku_id,
                    nowdate,
                    nowtime,
                    sku_title,
                    sku_url,
                    sku_thumnail_url,
                    sku_stock,
                    comment_count,
                    is_global,
                    is_pay_on_delivery,
                    is_free_gift,
                    sku_icon_url,
                )
                product_list.append(tp)

            except Exception as e:
                logging.error("resolveProductListError: %s, error = %s", sku, e)
                continue
            finally:
                sku_doc.freeDoc()

        return product_list
    finally:
        doc.freeDoc()
Example #20
 def _get_root(self, encoding='utf8'):
     parse_options = libxml2.HTML_PARSE_RECOVER + \
         libxml2.HTML_PARSE_NOERROR + \
         libxml2.HTML_PARSE_NOWARNING
     return libxml2.htmlReadDoc(self.body, '', encoding, parse_options)
Example #21
import urllib2, libxml2

urlPrima = 'http://24.232.0.118/asp/nivelesPrima.asp'
camposImportantes = ['Tx', 'Rx', 'MER']

f = urllib2.urlopen(urlPrima)
html = f.read()
f.close()

parse_options = libxml2.HTML_PARSE_RECOVER + \
	libxml2.HTML_PARSE_NOERROR + \
	libxml2.HTML_PARSE_NOWARNING

doc = libxml2.htmlReadDoc(html, '', None, parse_options)

campos = doc.xpathEval('//td')

print '----------------'
print 'Niveles FiberTel'
print '----------------'

for campo in campos:
		
	if campo.content in camposImportantes:
		
		valor = campo.next.next
		
		print '\033[93m'+campo.content+': '+valor.content+'\033[0m'
		print '--------------'
	
doc.freeDoc()
Example #22
for image_url in args.urls:
    image_url_escaped = urllib2.quote(image_url, '')
    request_url = args.gisroot + image_url_escaped

    # select user agent
    user_agent = random.choice(user_agents).rstrip()

    if args.verbose:
        print "Search URL: ", request_url
        print "UA: ", user_agent

    gis_raw_result = get_raw_html_urllib(request_url, user_agent)

    parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING

    doc = libxml2.htmlReadDoc(gis_raw_result, '', None, parse_options)

    scrapeResult = {}
    for key in xpath:
        r = get_simple_xpath(doc, xpath[key])
        scrapeResult[key] = r

    scrapeResults[image_url] = scrapeResult

    doc.freeDoc()

# output of the results
if args.plainoutput:
    for imagefn in scrapeResults:
        print
        print imagefn
Example #23
def read_html_from_url(url):
    f = urllib.urlopen(url)
    doc = libxml2.htmlReadDoc(
        f.read(), None, 'UTF-8',
        libxml2.XML_PARSE_RECOVER | libxml2.XML_PARSE_NOERROR)
    return doc
Example #24
def apply_xslt(buf, encoding, url, xsltfile, params=None):
    """Apply xslt transform from file xsltfile to the string buf
    with parameters params. url is the location of buf. Returns
    the transformed file as a string, or None if the
    transformation couldn't be completed."""
    stylesheet = libxslt.parseStylesheetFile(xsltfile)

    if stylesheet is None:
        #self.log_info('Can\'t open stylesheet %s' % xsltfile, 'warning')
        return None
    try:
        # htmlReadDoc fails if the buffer is empty but succeeds
        # (returning an empty tree) if the buffer is a single
        # space.
        if buf == '':
            buf = ' '

        # Guess whether this is an XML or HTML document.
        if buf.startswith('<?xml'):
            doc = libxml2.readDoc(buf, url, None,
                                  libxml2.XML_PARSE_NOERROR |
                                  libxml2.XML_PARSE_NOWARNING |
                                  libxml2.XML_PARSE_NONET)
        else:
            #self.log_info('Using HTML parser', 'debug')
            doc = libxml2.htmlReadDoc(buf, url, encoding,
                                      libxml2.HTML_PARSE_NOERROR |
                                      libxml2.HTML_PARSE_NOWARNING |
                                      libxml2.HTML_PARSE_NONET)
    except libxml2.treeError:
        stylesheet.freeStylesheet()
        #self.log_info('Can\'t parse XML document', 'warning')
        return None
    resultdoc = stylesheet.applyStylesheet(doc, params)
    stylesheet.freeStylesheet()
    doc.freeDoc()
    if resultdoc is None:
        #self.log_info('Can\'t apply stylesheet', 'warning')
        return None

    # Postprocess the document:
    # Resolve relative URLs in srcurl (TODO: this should be done in XSLT)
    root = resultdoc.getRootElement()
    if root is None:
        resultdoc.freeDoc()
        return None

    node2 = root.children
    while node2 is not None:
        if node2.name not in ['link', 'button']:
            node2 = node2.next
            continue

        node = node2.children
        while node is not None:
            if (node.name == 'ref') or (node.name == 'stream') or \
                    (node.name == 'submission'):
                refurl = node.getContent()

                match = re.search(r'\?.*srcurl=([^&]*)', refurl)
                if match is not None:
                    oldurl = urllib.unquote(match.group(1))
                    absurl = urljoin_query_fix(url, oldurl)
                    newurl = refurl[:match.start(1)] + \
                        urllib.quote(absurl) + \
                        refurl[match.end(1):]
                    node.setContent(resultdoc.encodeSpecialChars(newurl))

            node = node.next
        node2 = node2.next

    ret = resultdoc.serialize('UTF-8')
    resultdoc.freeDoc()
    return ret
Example #25
for image_url in args.urls:
    image_url_escaped = urllib2.quote(image_url,'')
    request_url = args.gisroot + image_url_escaped

    # select user agent
    user_agent = random.choice(user_agents).rstrip()

    if args.verbose:
        print "Search URL: ", request_url
        print "UA: ", user_agent

    gis_raw_result = get_raw_html_urllib( request_url, user_agent )

    parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING

    doc = libxml2.htmlReadDoc( gis_raw_result, '', None, parse_options)

    scrapeResult = {}
    for key in xpath:
        r = get_simple_xpath(doc, xpath[key])
        scrapeResult[key] = r

    scrapeResults[image_url] = scrapeResult

    doc.freeDoc()


# output of the results
if args.plainoutput:
    for imagefn in scrapeResults:
        print
        print imagefn
Example #26
def resolveProductListFromPage(html):
    product_list = []
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    try:
        doc = libxml2.htmlReadDoc(html, None, 'utf8', PARSE_OPTIONS)
        sku_docs = doc.xpathEval('//div[@data-sku]')
        for sku in sku_docs:
            #if True:
            try:
                sku_doc = libxml2.htmlReadDoc('%s' % sku, None, 'utf8',
                                              PARSE_OPTIONS)

                sku_id = int(sku_doc.xpathEval('//@data-sku')[0].content)
                # check whether this is a JD first-party (self-operated) SKU
                if sku_id > 99999999:
                    # third-party SKU, skip it
                    continue

                #print '%s' %sku

                sku_url = sku_doc.xpathEval(
                    '//div[@class="p-img"]/a/@href')[0].content
                try:
                    sku_thumnail_url = sku_doc.xpathEval(
                        '//div[@class="p-img"]/a/img/@data-lazy-img'
                    )[0].content
                except:
                    sku_thumnail_url = sku_doc.xpathEval(
                        '//div[@class="p-img"]/a/img/@src')[0].content

                sku_title = ""
                try:
                    sku_title = sku_doc.xpathEval(
                        '//div[@class="p-name"]/a/@title')[0].content
                except:
                    pass

                if len(sku_title) == 0:
                    sku_title = sku_doc.xpathEval(
                        '//div[@class="p-name"]/a/em')[0].content
                comment_count = int(
                    sku_doc.xpathEval('//div[@class="p-commit"]/strong/a')
                    [0].content)

                sku_icon_url = ""
                icon_doc = sku_doc.xpathEval(
                    '//div[@class="p-img"]/a/div/@style')
                if len(icon_doc) > 0:
                    sku_icon_url = url_utils.getStringBetween(
                        icon_doc[0].content, 'url("', '")')

                is_global = is_free_gift = is_pay_on_delivery = 0
                price_items = sku_doc.xpathEval(
                    '//div[@class="p-price"]/div/i')
                for pitem in price_items:
                    txt = pitem.content
                    if '全球购' in txt:
                        is_global = 1
                    elif '货到付款' in txt:
                        is_pay_on_delivery = 1
                    elif '赠品' in txt:
                        is_free_gift = 1
                    else:
                        print 'new-mark found:'
                        print txt

                sku_stock = -1
                try:
                    sku_stock = int(
                        sku_doc.xpathEval('//div[@data-stock_v]/@data-stock_v')
                        [0].content)
                except:
                    pass

                sku_url = __makeUrl__(sku_url)
                sku_thumnail_url = __makeUrl__(sku_thumnail_url)

                tp = (sku_id, nowdate, nowtime, sku_title, sku_url,
                      sku_thumnail_url, sku_stock, comment_count, is_global,
                      is_pay_on_delivery, is_free_gift, sku_icon_url)
                product_list.append(tp)

            except Exception as e:
                logging.error('resolveProductListError: %s, error = %s', sku, e)
                continue
            finally:
                sku_doc.freeDoc()

        return product_list
    finally:
        doc.freeDoc()