def _splitBreadcrumb(line):
    """Split a 'Home » ... » Brand » Model Details' breadcrumb into stripped parts."""
    # U+00BB is the '»' separator.  U+0E22 U+0E1B is what the same UTF-8 byte
    # pair (C2 BB) becomes when mis-decoded as cp874; it is still accepted so
    # that text which previously split on the garbled literal keeps working.
    return [part.strip()
            for part in re.split('(?:\u00bb|\u0e22\u0e1b) ', line)]


def _normalizePropValue(v):
    """Reduce one markdown table cell to the plain value stored in the props.

    Empty cells are returned unchanged.  Unrecognised shapes drop into ipdb
    (as before) and the raw text is returned.
    """
    if not v:
        return v
    v0 = v[0]
    if v0 == '!':
        # Markdown image: the text inside ![...] carries a yes/no flag.
        yn = re.search(r'!\[(.+?)\]', v).group(1)
        if yn == 'yes':
            return "true"
        if yn == 'no':
            return "false"
        ipdb.set_trace()
        uprint(yn)
        return v
    if v0 == '<':
        # "<http://router.asus.com>"
        return re.search(r'<(.+?)>', v).group(1).strip()
    if v0 == '[':
        # Markdown link: keep the first token inside (...) and drop the
        # backslash escapes.
        hreftitle = re.search(r'\((.+?)(?<!\\)\)', v).group(1)
        return hreftitle.split()[0].replace('\\', '')
    # Plain text, e.g.
    #   'Transmit Power: | +30 dBm'
    #   'Receiver Sensitivity: | -76 dBm'
    #   'Street price: | $52'
    #   'Default admin password: | (blank)'
    if not (v0.isalnum() or v0 in "+-$" or (v0 == '(' and v[-1] == ')')):
        ipdb.set_trace()
    return v


def scrapeDetails(curUrl):
    """Scrape one device detail page and upsert it into the TFiles table.

    The page is loaded with pq and its '#content' element is converted to
    markdown text by elmToMd.  From that text the function takes:
      * brand and model    -- 4th and 5th parts of the 'Home ...' breadcrumb
      * prodName           -- second column of the 'details:' line
      * category           -- second column of the 'hardware type:' line
      * description        -- the non-empty lines between the first blank line
                              after that and the 'All ... products' line
    Every '.tblight tr' row that contains ' | ' becomes a name/value property.
    A few well-known names are additionally copied into dedicated columns.
    The image URL comes from the first 'a.piframe img' element.

    NOTE(review): findLineIdxWith is assumed to return the index of the first
    line at or after the given start index that satisfies the predicate --
    confirm against its definition.

    Args:
        curUrl: URL of the detail page.

    Returns:
        None.  Any exception opens an ipdb session and then prints the
        traceback; it is not re-raised.
    """
    global prevTrail
    ulog('curUrl= '+curUrl)
    try:
        d = pq(curUrl)
        md = elmToMd(d('#content')[0], True, True)
        mdl = md.splitlines()

        # Breadcrumb line -> brand / model.
        i = next(idx for idx, line in enumerate(mdl)
                 if line.strip().startswith('Home'))
        brmd = _splitBreadcrumb(mdl[i])
        brand = brmd[3]
        model = brmd[4].replace('Details', '').strip()

        # Product name, e.g. "Dual-band wireless-AC3100 gigabit router".
        i = findLineIdxWith(mdl, i+1,
                            lambda s: s.strip().startswith('details:'))
        prodName = mdl[i].split(' |')[1].strip()

        # Category, e.g. "Wireless Router".
        i = findLineIdxWith(mdl, i+1,
                            lambda s: s.strip().startswith('hardware type:'))
        category = mdl[i].split(' | ')[1].strip()

        # Description: skip to the blank line after the details, then to the
        # next non-empty line, and stop at the "All <brand> products" line.
        i = findLineIdxWith(mdl, i+1, lambda s: not s.strip())
        i = findLineIdxWith(mdl, i+1, lambda s: s.strip())
        j = findLineIdxWith(
            mdl, i+1, lambda s: re.match(r'All .+ products$', s.strip()))
        description = '\n'.join(s.strip() for s in mdl[i:j] if s.strip())

        default_user_name = default_password = wifi_proto = None
        availability = product_page = None
        hw_fla1_amount = hw_ram1_amount = None

        pr = OrderedDict()
        for tr in d('.tblight tr'):
            rowMd = elmToMd(tr, False, False)
            if ' | ' not in rowMd:
                continue
            n, v = [part.strip() for part in rowMd.split(' | ', 1)]
            v = '\n'.join(part.strip() for part in v.splitlines())
            # Normalise the key BEFORE the duplicate check: pr is keyed by the
            # colon-less name, so checking the raw "Name:" could never match.
            n = n.rstrip(':')
            if n in pr:
                raise ValueError('duplicate property name: %r' % n)
            v = _normalizePropValue(v)
            pr[n] = v

            if n == 'Default admin username':
                default_user_name = convertUserPassword(v)
            elif n == 'Default admin password':
                default_password = convertUserPassword(v)
            elif n == 'WiFi standards supported':
                wifi_proto = abgnac_format(v)
            elif n == 'Availability':
                availability = v
            elif n == 'Product page':
                product_page = v
            elif n in ('Flash Memory', 'RAM'):
                # Expect a number followed by a Mb/Kb unit.
                if not re.match(r'[\d]*\.?\d+\s*(Mb|Kb)', v, re.I):
                    raise ValueError('unexpected %s amount: %r' % (n, v))
                if n == 'Flash Memory':
                    hw_fla1_amount = v
                else:
                    hw_ram1_amount = v

        props_hstore = dict2hstore(pr)
        img = d('a.piframe img')[0]
        image_url = urlChangePath(d.base_url, img.attrib['src'])
        trailStr = str(prevTrail)

        # Explicit parameter mapping instead of locals(): only these names are
        # referenced by the statement and the log line below.
        params = {
            'brand': brand,
            'model': model,
            'prodName': prodName,
            'category': category,
            'default_user_name': default_user_name,
            'default_password': default_password,
            'wifi_proto': wifi_proto,
            'availability': availability,
            'description': description,
            'product_page': product_page,
            'hw_fla1_amount': hw_fla1_amount,
            'hw_ram1_amount': hw_ram1_amount,
            'image_url': image_url,
            'props_hstore': props_hstore,
            'trailStr': trailStr,
        }
        sql("INSERT OR REPLACE INTO TFiles(brand,model,prod_name,category"
            ", default_user_name, default_password, wifi_proto, availability"
            ", description, product_page, hw_fla1_amount, hw_ram1_amount"
            ", image_url, props_hstore, tree_trail) VALUES "
            "(:brand,:model,:prodName,:category"
            ",:default_user_name,:default_password,:wifi_proto,:availability"
            ",:description,:product_page,:hw_fla1_amount,:hw_ram1_amount"
            ",:image_url,:props_hstore,:trailStr)", params)
        uprint('UPSERT "%(brand)s", "%(model)s", \'%(props_hstore)s\''
               ', %(trailStr)s ' % params)
    except Exception:
        ipdb.set_trace()
        traceback.print_exc()
def detailScraper(baseUrl):
    """Scrape one ARRIS product detail page and upsert its files into TFiles.

    URLs that do not match the arris.*.com pattern are logged and skipped.
    For a matching page the function extracts:
      * dev_desc   -- markdown of the second 'div.row' element, with one
                      leading '+' removed from each line; its first line is
                      the model name
      * dev_hstore -- name/value pairs taken from the first two text lines of
                      every '.specTbl tr' row
      * image_url  -- the url(...) inside the style attribute of the first
                      '.box.boxProduct' element (must satisfy fileUrlIsCdn)
    Each file entry under '#panel4' is then stored as one TFiles row.  Entries
    without a usable download link only upsert the model itself.  A download
    link that is not a CDN URL is additionally passed to faqScraper.

    Args:
        baseUrl: URL of the product detail page.

    Returns:
        None.  Any exception opens an ipdb session and then prints the
        traceback; it is not re-raised.
    """
    global prevTrail
    try:
        ulog('baseUrl= '+baseUrl)
        # OK:  http://arris.force.com/consumers/ConsumerProductDetail_Ja?p=a0ha000000Rx4I4AAJ&c=Touchstone%20Modems%20and%20Gateways
        # Not: http://shop.surfboard.com/
        if not re.match(r'(http|https)://.*arris\..+\.com/.+', baseUrl):
            ulog('Not arris.force.com')
            return

        d = pq(url=baseUrl)
        try:
            dev_desc = elmToMd(d('div.row')[1])
        except IndexError:
            ulog('no model to harvest')
            return
        # count is passed by keyword: positional count is deprecated in 3.13.
        dev_desc = '\n'.join(re.sub(r'^\+', '', line, count=1).strip()
                             for line in dev_desc.splitlines())
        model = dev_desc.splitlines()[0].strip()
        if not model:
            raise ValueError('empty model name on ' + baseUrl)
        ulog('model= '+model)

        specPairs = []
        for row in d('.specTbl tr'):
            cells = row.text_content().strip().splitlines()
            specPairs.append((cells[0].strip(), cells[1].strip()))
        dev_hstore = dict2hstore(OrderedDict(specPairs))

        style = d('.box.boxProduct')[0].attrib['style']
        # "background: url(https://arris--c.na13.content.force.com/servlet/servlet.ImageServer?id=015a0000003NYHt&oid=00D30000000kUAL&lastMod=1442430676000);"
        image_url = re.search(r'url\((.+)(?<!\\)\)', style).group(1)
        if not fileUrlIsCdn(image_url):
            raise ValueError('image url is not a CDN url: %r' % image_url)

        def upsertModelOnly():
            # Record the model without any firmware file attached.
            upsertModel(model, image_url, dev_desc, dev_hstore, baseUrl,
                        str(prevTrail))

        files = d('#panel4 .small-12.columns:not(.text-center)')
        numFiles = len(files)
        ulog('numFiles=%s'%numFiles)
        if not numFiles:
            upsertModelOnly()
            return

        startIdx = getStartIdx()
        for idx in range(startIdx, numFiles):
            entry = files[idx]
            textLines = [s.strip() for s in entry.text_content().splitlines()
                         if s.strip()]
            file_name = textLines[0]
            ulog('file_name="%s"'%file_name)
            if re.match(r'No .+ Available', file_name, re.I):
                upsertModelOnly()
                continue

            # Use the version-looking token if there is one, otherwise fall
            # back to the whole first line.
            verMatch = re.search(r"\d\.[\w\.\-]+", file_name)
            fw_ver = verMatch.group(0) if verMatch else file_name

            anchors = entry.cssselect('a')
            if not anchors:
                ulog('No files')
                upsertModelOnly()
                continue
            # A default avoids StopIteration (which used to abort every
            # remaining file) when no anchor text starts with 'Download'.
            file_url = next(
                (a.attrib['href'] for a in anchors
                 if a.text_content().strip().startswith('Download')),
                None)
            if file_url is None:
                ulog('No download link')
                upsertModelOnly()
                continue

            if not fileUrlIsCdn(file_url):
                faqScraper(file_url, model, image_url, dev_desc, dev_hstore)

            tree_trail = str(prevTrail+[idx])
            # Explicit parameter mapping instead of locals().
            params = {
                'model': model,
                'image_url': image_url,
                'dev_desc': dev_desc,
                'dev_hstore': dev_hstore,
                'fw_ver': fw_ver,
                'baseUrl': baseUrl,
                'file_url': file_url,
                'tree_trail': tree_trail,
            }
            sql("INSERT OR REPLACE INTO TFiles (model, image_url, dev_desc, dev_hstore, fw_ver, page_url, file_url, tree_trail) VALUES (:model, :image_url, :dev_desc, :dev_hstore, :fw_ver, :baseUrl, :file_url, :tree_trail)", params)
            uprint('UPSERT "%(model)s", "%(fw_ver)s", %(tree_trail)s, %(file_url)s '%params)
    except Exception:
        ipdb.set_trace()
        traceback.print_exc()