def xmlDoc_from_html(response):
    """Return libxml2 doc for HTMLs"""
    utf8body = _body_as_utf8(response) or ' '
    try:
        lxdoc = libxml2.htmlReadDoc(utf8body, response.url, 'utf-8',
                                    html_parser_options)
    except TypeError:  # libxml2 doesn't parse text with null bytes
        lxdoc = libxml2.htmlReadDoc(utf8body.replace("\x00", ""), response.url,
                                    'utf-8', html_parser_options)
    return lxdoc

def xmlDoc_from_html(response):
    """Return libxml2 doc for HTMLs"""
    utf8body = body_as_utf8(response) or ' '
    try:
        lxdoc = libxml2.htmlReadDoc(utf8body, response.url, 'utf-8',
                                    html_parser_options)
    except TypeError:  # libxml2 doesn't parse text with null bytes
        lxdoc = libxml2.htmlReadDoc(utf8body.replace("\x00", ""), response.url,
                                    'utf-8', html_parser_options)
    return lxdoc

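# The two helpers above rely on names that are not shown in this excerpt.
# A minimal sketch of plausible definitions follows; the real projects may
# define _body_as_utf8/body_as_utf8 and html_parser_options differently.
import libxml2

# Recover from broken markup and silence parser errors/warnings.
html_parser_options = (libxml2.HTML_PARSE_RECOVER |
                       libxml2.HTML_PARSE_NOERROR |
                       libxml2.HTML_PARSE_NOWARNING)

def _body_as_utf8(response):
    """Assumed helper: return the response body re-encoded as UTF-8 bytes."""
    charset = getattr(response, 'encoding', None) or 'utf-8'
    return response.body.decode(charset, 'replace').encode('utf-8')
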
def parse_week_menu(page, year, week, lang):
    print('Parsing menu webpage')

    # replace those pesky non-breakable spaces (U+00A0)
    page = page.replace(u'\xa0', ' ')

    doc = libxml2.htmlReadDoc(page.encode('utf-8'), None, 'utf-8',
                              libxml2.XML_PARSE_RECOVER | libxml2.XML_PARSE_NOERROR)

    dateComponents = doc.xpathEval("//*[@id='parent-fieldname-title']")[0].content.strip().split()
    # Date description is not consistent, sometimes misses year
    if not dateComponents[-1].isdigit():
        dateComponents.append(str(year))

    # always start from the last day of the week, since it will be in the correct year and month
    friday = datetime.strptime("%s %s %s" % tuple(dateComponents[-3:]), "%d %B %Y").date()

    # verify that this is the week we are searching for
    isocalendar = friday.isocalendar()
    if isocalendar[0] != year or isocalendar[1] != week:
        print('Incorrect information retrieved: expected %s-%s, got %s-%s' %
              (year, week, isocalendar[0], isocalendar[1]))
        return None

    menus = doc.xpathEval("//*[starts-with(@id, 'parent-fieldname-text')]")

    week_menu = Week(year, week)
    week_menu.parse(menus, friday, lang)
    return week_menu

def image_scraper(self, image_path):
    print "To Upload Image"
    try:
        result = self.client.upload_from_path(image_path)
    except ImgurClientError as e:
        print(e.error_message)
        print(e.status_code)
        # without a successful upload there is no link to search for
        return None
    print(result['link'])
    print "Uploaded Image"

    request_url = "https://www.google.com/searchbyimage?&image_url=" + result['link']

    # select user agent
    user_agent = random.choice(self.user_agents).rstrip()

    gis_raw_result = self.get_raw_html_urllib(request_url, user_agent)

    parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(gis_raw_result, '', None, parse_options)

    scrapeResult = {}
    for key in self.xpath:
        r = self.get_simple_xpath(doc, self.xpath[key])
        scrapeResult[key] = r
    scrapeResults = scrapeResult

    doc.freeDoc()

    # output of the results
    #print json.dumps( scrapeResults, indent=4, sort_keys=False)
    return scrapeResults["bestguess"]

def parse_pharms_from_html(page):
    print('Parsing webpage to an object tree')

    # replace those pesky non-breakable spaces (U+00A0)
    page = page.replace(u'\xa0', ' ')

    doc = libxml2.htmlReadDoc(page, None, 'utf-8',
                              libxml2.XML_PARSE_RECOVER | libxml2.XML_PARSE_NOERROR)

    menuElement = doc.xpathEval("//*[starts-with(@id, 'listResults')]")
    # 1 pharmacist on each row
    rows = menuElement[0].xpathEval('.//tr')[0:]
    pharmacists = {}
    pharmacists['Apotheken'] = []
    for row in rows:
        for name in row.xpathEval('.//b')[0:]:
            # Valid name
            if (str.isupper(name.content[1])):
                name = name.content.strip().title()
                open = row.xpathEval('.//span')[0:][0].content.strip()
                m = re.search(r" {4}([^0-9]*) ([0-9]*) {4}([0-9]{4}) ([A-Z,\-]*)", row.content)
                n = re.search(r"([0-9]{5,})", row.content)
                pharmacist = {'name': name,
                              'open': open,
                              'street': m.group(1).strip(),
                              'nr': m.group(2).strip(),
                              'zip': m.group(3).strip(),
                              'city': m.group(4).strip().title(),
                              'tel': n.group(1)}
                pharmacists['Apotheken'].extend([pharmacist])
    return pharmacists

def resolveTotalPageNum(html):
    doc = libxml2.htmlReadDoc(html, None, 'utf8', PARSE_OPTIONS)
    try:
        pgtxt = doc.xpathEval('//span[@class="fp-text"]/i')[0].content
        page_num = int(pgtxt)
        return page_num
    finally:
        doc.freeDoc()

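# PARSE_OPTIONS is defined elsewhere in the modules that use it (also in the
# JD product-list parser further down); a plausible definition is the usual
# lenient trio of libxml2 HTML flags:
import libxml2

PARSE_OPTIONS = (libxml2.HTML_PARSE_RECOVER |
                 libxml2.HTML_PARSE_NOERROR |
                 libxml2.HTML_PARSE_NOWARNING)
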
def image_scraper(self):
    gis_raw_result = self.search_image_file(self.img_filepath)
    string_for_output = gis_raw_result.encode('utf8', 'replace')

    parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(string_for_output, '', None, parse_options)

    scrapeResults = self.get_simple_xpath(doc, self.xpath['bestguess'])
    doc.freeDoc()
    return scrapeResults

def parse_menu_from_html(page, year, week):
    print('Parsing weekmenu webpage to an object tree')

    # replace those pesky non-breakable spaces (U+00A0)
    page = page.replace(u'\xa0', ' ')

    doc = libxml2.htmlReadDoc(page, None, 'utf-8',
                              libxml2.XML_PARSE_RECOVER | libxml2.XML_PARSE_NOERROR)

    dateComponents = doc.xpathEval("//*[@id='parent-fieldname-title']")[0].content.strip().split()
    locale.setlocale(locale.LC_ALL, 'nl_BE.UTF-8')
    friday = datetime.strptime("%s %s %s" % tuple(dateComponents[-3:]), "%d %B %Y").date()

    # verify that this is the week we are searching for
    isocalendar = friday.isocalendar()
    if isocalendar[0] != year or isocalendar[1] != week:
        print('Incorrect information retrieved: expected %s-%s, got %s-%s' %
              (year, week, isocalendar[0], isocalendar[1]))
        return None

    menuElement = doc.xpathEval("//*[starts-with(@id, 'parent-fieldname-text')]")
    rows = menuElement[0].xpathEval('.//tr')[1:]

    menu = {}
    dayOfWeek = 4
    for row in rows:
        cells = row.xpathEval('.//td')
        if len(cells) <= 3:
            continue

        # first row of a day
        if cells[0].content.strip() != '':
            day = str(friday - timedelta(dayOfWeek))
            dayOfWeek -= 1
            menu[day] = {}

            # check if resto is closed
            if cells[2].content.lower().strip() == 'gesloten':
                menu[day]['open'] = False
            else:
                menu[day]['open'] = True
                menu[day]['soup'] = {'name': cells[1].content.strip()}
                menu[day]['meat'] = get_meat_and_price(cells[2])
                menu[day]['vegetables'] = [cells[3].content.strip()]

        # second row of a day
        elif cells[1].content.strip() != '' and menu[day]['open']:
            menu[day]['soup']['price'] = cells[1].content.strip()
            menu[day]['meat'].extend(get_meat_and_price(cells[2]))
            menu[day]['vegetables'].append(cells[3].content.strip())

        # the third and fourth row of a day (sometimes it's empty)
        elif cells[2].content.strip() != '' and menu[day]['open']:
            menu[day]['meat'].extend(get_meat_and_price(cells[2]))

    return menu

def segv_test():
    s = "<html><body><div><a><a></a></a><a></a></div></body></html>"
    options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(s, None, "utf-8", options).doc
    ctxt = doc.xpathNewContext()
    nodes = ctxt.xpathEval("//body/node()")
    nodes.reverse()
    for note in nodes:
        nexts = note.xpathEval("node()")
        note.unlinkNode()
        note.freeNode()  # freeNode releases this node together with its children
        nexts[0].unlinkNode()
        nexts[0].freeNode()  # the child was already freed above; freeing it again causes a segfault

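# Sketch of a non-crashing counterpart to segv_test above: free each unlinked
# node exactly once and never touch a child after its parent subtree has been
# freed (freeNode already releases the whole subtree).
def safe_cleanup(html):
    options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(html, None, "utf-8", options).doc
    ctxt = doc.xpathNewContext()
    for node in reversed(ctxt.xpathEval("//body/node()")):
        node.unlinkNode()
        node.freeNode()
    ctxt.xpathFreeContext()
    doc.freeDoc()
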
def parse(url, body, xpaths):
    options = libxml2.HTML_PARSE_RECOVER + \
              libxml2.HTML_PARSE_NOERROR + \
              libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(body, url, 'utf-8', options)
    results = []
    for node in doc.xpathEval(xpaths['root']):
        result = {}
        for k, v in xpaths.iteritems():
            if k != 'root':
                result[k] = node.xpathEval(v)[0].content
        results.append(result)
    return results

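# Hypothetical call illustrating the xpaths dict that parse() expects: a
# 'root' expression selecting the repeating nodes plus per-field relative
# expressions (the URL and markup below are made up for the example).
rows = parse(
    'http://example.invalid/list',
    '<html><body><ul>'
    '<li><b>Alpha</b><i>1</i></li>'
    '<li><b>Beta</b><i>2</i></li>'
    '</ul></body></html>',
    {'root': '//li', 'name': './/b', 'value': './/i'})
# rows -> [{'name': 'Alpha', 'value': '1'}, {'name': 'Beta', 'value': '2'}]
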
def parse_peer_response_xml(self, string):
    """Parses an XML peer response."""
    snippet_list = snipdata.SnippetList()
    total_results = None
    string = re.sub(r"xmlns=(\'|\")[^\'\"]*\1", " ", string)  # remove default namespace
    try:
        if self.mimetype == 'text/html':
            xdoc = libxml2.htmlReadDoc(string, '', None, HTML_PARSE_OPTIONS)
        else:
            xdoc = libxml2.readDoc(string, '', None, XML_PARSE_OPTIONS)
    except libxml2.treeError:
        raise ValueError('Peer output error.')
    ctxt = xdoc.xpathNewContext()
    for (name, uri) in re.findall("xmlns:([^\=]*)=[\'\"]([^\'\"]*)", string):
        ctxt.xpathRegisterNs(name, uri)  # register all namespaces
        if name == 'opensearch':
            total_results = self.xpath_string_value(ctxt, "//opensearch:totalResults")
    items = ctxt.xpathEval(self.item_path)
    #print "ITEMS:", items, self.item_path
    right_now = snipdata.right_now()
    for item in items:
        ctxt.setContextNode(item)
        title = self.xpath_string_value(ctxt, self.title_path)
        title = bound_text_no_markup(title, 60)
        link = self.xpath_link(ctxt, self.link_path)
        attributes = list()
        if self.attribute_paths:
            for key_path in self.attribute_paths.split(','):
                (key, path) = key_path.split('{', 1)
                path = path[:-1]  # remove trailing '}'
                value = self.xpath_string_value(ctxt, path)
                if value:
                    attributes.append((key, value))
        if self.thumbnail_path:
            # xpath_thumbnail changes: ctxt
            thumbnail = self.xpath_thumbnail(ctxt, self.thumbnail_path)
        else:
            thumbnail = None
        if self.summary_path:
            summary = self.xpath_string_value(ctxt, self.summary_path)
        else:
            # remove title and (possibly uncommented) javascript
            for node in ctxt.xpathEval(self.title_path + '|.//script'):
                node.unlinkNode()
            summary = self.xpath_string_value(ctxt, '.')
        summary = bound_text_no_markup(summary, 300)
        snippet = snipdata.Snippet([], link, title, right_now, summary,
                                   None, thumbnail, attributes=attributes)
        snippet_list.append(snippet)
    ctxt.xpathFreeContext()
    xdoc.freeDoc()
    new_query = snipdata.Query()
    return (new_query, snipdata.PeerList(), snippet_list, total_results)

def getCategory(tag, sFile):
    print "tag" + tag
    result = alchemyObj.TextGetCategory(tag)

    # analyze results using xpath queries
    parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(result, "", None, parse_options)
    print result

    # use xpath to get the required data
    cauth = doc.xpathEval("//category")
    for ca in cauth:
        print "Category: " + ca.content + " " + tag
        sFile.write(tag + " " + ca.content + "\n")

def get_url_from_asx(self, asx, asxurl):
    """Simple ASX parser. Return the content of the first <ref> tag."""
    try:
        doc = libxml2.htmlReadDoc(asx, asxurl, None,
                                  libxml2.HTML_PARSE_NOERROR |
                                  libxml2.HTML_PARSE_NOWARNING |
                                  libxml2.HTML_PARSE_NONET)
    except libxml2.treeError:
        debug('Can\'t parse ASX:\n' + asx)
        return None
    root = doc.getRootElement()
    ret = self._get_ref_recursive(root).strip()
    doc.freeDoc()
    return ret

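# _get_ref_recursive is not part of the excerpt above. A hypothetical
# depth-first implementation consistent with its use (walk the parsed ASX
# tree and return the first <ref> href, or that node's text) might look
# like this; the real method may differ.
def _get_ref_recursive(self, node):
    while node is not None:
        if node.type == 'element' and node.name.lower() == 'ref':
            href = node.hasProp('href')
            return href.content if href is not None else node.content
        found = self._get_ref_recursive(node.children)
        if found:
            return found
        node = node.next
    return ''
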
def read_html_from_url(url):
    f = urllib.urlopen(url)
    doc = libxml2.htmlReadDoc(f.read(), None, 'UTF-8',
                              libxml2.XML_PARSE_RECOVER | libxml2.XML_PARSE_NOERROR)
    f.close()
    return doc

for image_url in args.urls:
    image_url_escaped = urllib.request.quote(image_url, '')
    request_url = args.gisroot + image_url_escaped

    # select user agent
    user_agent = random.choice(user_agents).rstrip()

    if args.verbose:
        print("Search URL: ", request_url)
        print("UA: ", user_agent)

    gis_raw_result = get_raw_html_urllib(request_url, user_agent, args.abuse)

    parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(gis_raw_result.decode("utf-8"), '', None, parse_options)

    scrapeResult = {}
    for key in xpath:
        r = get_simple_xpath(doc, xpath[key])
        scrapeResult[key] = r
    scrapeResults[image_url] = scrapeResult

    doc.freeDoc()

# output of the results
if args.plainoutput:
    for imagefn in scrapeResults:
        print("")
        print(imagefn)

def resolveProductListFromPage(html):
    product_list = []
    nowtime = timeHelper.getNowLong()
    nowdate = timeHelper.getNow()
    try:
        doc = libxml2.htmlReadDoc(html, None, "utf8", PARSE_OPTIONS)
        sku_docs = doc.xpathEval("//div[@data-sku]")
        for sku in sku_docs:
            try:
                sku_doc = libxml2.htmlReadDoc("%s" % sku, None, "utf8", PARSE_OPTIONS)
                sku_id = int(sku_doc.xpathEval("//@data-sku")[0].content)
                # skip items that are not sold by JD itself (non self-operated)
                if sku_id > 99999999:
                    continue
                sku_url = sku_doc.xpathEval('//div[@class="p-img"]/a/@href')[0].content
                try:
                    sku_thumnail_url = sku_doc.xpathEval('//div[@class="p-img"]/a/img/@data-lazy-img')[0].content
                except:
                    sku_thumnail_url = sku_doc.xpathEval('//div[@class="p-img"]/a/img/@src')[0].content
                sku_title = ""
                try:
                    sku_title = sku_doc.xpathEval('//div[@class="p-name"]/a/@title')[0].content
                except:
                    pass
                if len(sku_title) == 0:
                    sku_title = sku_doc.xpathEval('//div[@class="p-name"]/a/em')[0].content
                comment_count = int(sku_doc.xpathEval('//div[@class="p-commit"]/strong/a')[0].content)
                sku_icon_url = ""
                icon_doc = sku_doc.xpathEval('//div[@class="p-img"]/a/div/@style')
                if len(icon_doc) > 0:
                    sku_icon_url = url_utils.getStringBetween(icon_doc[0].content, 'url("', '")')
                is_global = is_free_gift = is_pay_on_delivery = 0
                price_items = sku_doc.xpathEval('//div[@class="p-price"]/div/i')
                for pitem in price_items:
                    txt = pitem.content
                    if "全球购" in txt:  # "global purchase" badge
                        is_global = 1
                    elif "货到付款" in txt:  # "cash on delivery" badge
                        is_pay_on_delivery = 1
                    elif "赠品" in txt:  # "free gift" badge
                        is_free_gift = 1
                    else:
                        print "new-mark found:"
                        print txt
                sku_stock = -1
                try:
                    sku_stock = int(sku_doc.xpathEval("//div[@data-stock_v]/@data-stock_v")[0].content)
                except:
                    pass
                sku_url = __makeUrl__(sku_url)
                sku_thumnail_url = __makeUrl__(sku_thumnail_url)
                tp = (
                    sku_id,
                    nowdate,
                    nowtime,
                    sku_title,
                    sku_url,
                    sku_thumnail_url,
                    sku_stock,
                    comment_count,
                    is_global,
                    is_pay_on_delivery,
                    is_free_gift,
                    sku_icon_url,
                )
                product_list.append(tp)
            except Exception as e:
                logging.error("resolveProductListError: %s, error = %s" % (sku, e))
                continue
            finally:
                sku_doc.freeDoc()
        return product_list
    finally:
        doc.freeDoc()

def _get_root(self, encoding='utf8'):
    parse_options = libxml2.HTML_PARSE_RECOVER + \
                    libxml2.HTML_PARSE_NOERROR + \
                    libxml2.HTML_PARSE_NOWARNING
    return libxml2.htmlReadDoc(self.body, '', encoding, parse_options)

import urllib2, libxml2

urlPrima = 'http://24.232.0.118/asp/nivelesPrima.asp'
camposImportantes = ['Tx', 'Rx', 'MER']

f = urllib2.urlopen(urlPrima)
html = f.read()
f.close()

parse_options = libxml2.HTML_PARSE_RECOVER + \
                libxml2.HTML_PARSE_NOERROR + \
                libxml2.HTML_PARSE_NOWARNING

doc = libxml2.htmlReadDoc(html, '', None, parse_options)
campos = doc.xpathEval('//td')

print '----------------'
print 'Niveles FiberTel'
print '----------------'
for campo in campos:
    if campo.content in camposImportantes:
        valor = campo.next.next
        print '\033[93m' + campo.content + ': ' + valor.content + '\033[0m'
print '--------------'
doc.freeDoc()

for image_url in args.urls:
    image_url_escaped = urllib2.quote(image_url, '')
    request_url = args.gisroot + image_url_escaped

    # select user agent
    user_agent = random.choice(user_agents).rstrip()

    if args.verbose:
        print "Search URL: ", request_url
        print "UA: ", user_agent

    gis_raw_result = get_raw_html_urllib(request_url, user_agent)

    parse_options = libxml2.HTML_PARSE_RECOVER + libxml2.HTML_PARSE_NOERROR + libxml2.HTML_PARSE_NOWARNING
    doc = libxml2.htmlReadDoc(gis_raw_result, '', None, parse_options)

    scrapeResult = {}
    for key in xpath:
        r = get_simple_xpath(doc, xpath[key])
        scrapeResult[key] = r
    scrapeResults[image_url] = scrapeResult

    doc.freeDoc()

# output of the results
if args.plainoutput:
    for imagefn in scrapeResults:
        print
        print imagefn

def apply_xslt(buf, encoding, url, xsltfile, params=None):
    """Apply xslt transform from file xsltfile to the string buf
    with parameters params. url is the location of buf. Returns the
    transformed file as a string, or None if the transformation
    couldn't be completed."""
    stylesheet = libxslt.parseStylesheetFile(xsltfile)

    if stylesheet is None:
        #self.log_info('Can\'t open stylesheet %s' % xsltfile, 'warning')
        return None
    try:
        # htmlReadDoc fails if the buffer is empty but succeeds
        # (returning an empty tree) if the buffer is a single
        # space.
        if buf == '':
            buf = ' '

        # Guess whether this is an XML or HTML document.
        if buf.startswith('<?xml'):
            doc = libxml2.readDoc(buf, url, None,
                                  libxml2.XML_PARSE_NOERROR |
                                  libxml2.XML_PARSE_NOWARNING |
                                  libxml2.XML_PARSE_NONET)
        else:
            #self.log_info('Using HTML parser', 'debug')
            doc = libxml2.htmlReadDoc(buf, url, encoding,
                                      libxml2.HTML_PARSE_NOERROR |
                                      libxml2.HTML_PARSE_NOWARNING |
                                      libxml2.HTML_PARSE_NONET)
    except libxml2.treeError:
        stylesheet.freeStylesheet()
        #self.log_info('Can\'t parse XML document', 'warning')
        return None
    resultdoc = stylesheet.applyStylesheet(doc, params)
    stylesheet.freeStylesheet()
    doc.freeDoc()
    if resultdoc is None:
        #self.log_info('Can\'t apply stylesheet', 'warning')
        return None

    # Postprocess the document:
    # Resolve relative URLs in srcurl (TODO: this should be done in XSLT)
    root = resultdoc.getRootElement()
    if root is None:
        resultdoc.freeDoc()
        return None

    node2 = root.children
    while node2 is not None:
        if node2.name not in ['link', 'button']:
            node2 = node2.next
            continue

        node = node2.children
        while node is not None:
            if (node.name == 'ref') or (node.name == 'stream') or \
                    (node.name == 'submission'):
                refurl = node.getContent()

                match = re.search(r'\?.*srcurl=([^&]*)', refurl)
                if match is not None:
                    oldurl = urllib.unquote(match.group(1))
                    absurl = urljoin_query_fix(url, oldurl)
                    newurl = refurl[:match.start(1)] + \
                        urllib.quote(absurl) + \
                        refurl[match.end(1):]
                    node.setContent(resultdoc.encodeSpecialChars(newurl))

            node = node.next
        node2 = node2.next

    ret = resultdoc.serialize('UTF-8')
    resultdoc.freeDoc()
    return ret

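# Hypothetical usage of apply_xslt above: write a tiny stylesheet to a
# temporary file (the stylesheet, URL, and markup are made up for the
# example) and run an HTML buffer through it.
import os
import tempfile

_example_xslt = """<?xml version="1.0"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:template match="/">
    <links>
      <xsl:for-each select="//a">
        <ref><xsl:value-of select="@href"/></ref>
      </xsl:for-each>
    </links>
  </xsl:template>
</xsl:stylesheet>"""

fd, _xslt_path = tempfile.mkstemp(suffix='.xsl')
with os.fdopen(fd, 'w') as _f:
    _f.write(_example_xslt)
try:
    print(apply_xslt('<html><body><a href="clip.mp3">clip</a></body></html>',
                     'utf-8', 'http://example.invalid/page', _xslt_path))
finally:
    os.unlink(_xslt_path)
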