def gethtml(url, **dict_args):
    '''
    Fetch the page at url and return its HTML.

    The download is delegated to get_html_urllib(url, 2); the meaning of the
    second argument is defined by that helper (presumably a timeout or retry
    count -- confirm). If the helper raises, the sentinel string 'timeout'
    is returned instead of propagating the error.

    dict_args is accepted for call compatibility and is not used.
    '''
    try:
        html = get_html_urllib(url, 2)
    except Exception:
        # Best-effort fetch: the helper's failure modes are not visible here,
        # so any error is reported to the caller as the 'timeout' sentinel.
        html = 'timeout'
    # The result was previously computed but never returned.
    return html
def get_dict_info(self, hc, **dict_args):
    '''
    Extract part data from page HTML and return it as a dict.

    hc is first passed through filter_html(). Each field is looked up with a
    module-level compiled pattern; a field whose pattern does not match is
    simply left out of the result.

    Keyword arguments:
        is_info_page -- true when hc is the HTML of a detail page; when
            false (the default) hc is treated as one <tr> row of the
            results table.

    Returns a dict keyed by names taken from keys_mouser_tt. Judging by the
    local variable names these are image url, part number, manufacturer,
    description, stock, price breaks and detail-page url (confirm against
    keys_mouser_tt). The entry for self.url_search is always present.

    Price breaks are stored as one string such as '1:$82.08|||25:$61.56',
    built from (quantity-range, price) pairs like ('1-24', '$82.08').
    '''
    dict_info = {}
    hc = filter_html(hc)

    # Detail pages and table rows need different pattern sets.
    iip = dict_args.get('is_info_page', False)
    if iip:
        res_imgurl = p_imgurl_iip.findall(hc)
        res_partno = p_partno_iip.findall(hc)
        res_mfr = p_mfr_iip.findall(hc)
        res_desc = p_desc_iip.findall(hc)
        res_stock = p_stock_iip.findall(hc)
    else:
        res_imgurl = p_imgurl.findall(hc)
        res_partno = p_partno.findall(hc)
        res_mfr = p_mfr.findall(hc)
        res_desc = p_desc.findall(hc)
        res_stock = p_stock.findall(hc)

    # Only the first match of each pattern is used. p_jkh removes unwanted
    # fragments from the match (presumably angle-bracket tags -- confirm).
    if res_imgurl:
        # The image url additionally has double quotes stripped.
        dict_info[keys_mouser_tt[28][1]] = (
            p_jkh.sub('', res_imgurl[0]).replace('"', '').strip())
    plain_fields = (
        (1, res_partno),
        (2, res_mfr),
        (3, res_desc),
        (5, res_stock),
    )
    for key_index, matches in plain_fields:
        if matches:
            dict_info[keys_mouser_tt[key_index][1]] = (
                p_jkh.sub('', matches[0]).strip())

    # Collect the (quantity-range, price) pairs.
    if iip:
        res_priceinfo = p_priceinfo_iip.findall(hc)
    else:
        # Table-row mode: no prices unless one of the paths below finds them.
        res_priceinfo = []
        if p_seemore.findall(hc):
            # The row only shows part of the price list; the complete list
            # has to be read from the detail page this row links to.
            res_uip = p_uip.findall(hc)
            if res_uip:
                dict_info[keys_mouser_tt[27][1]] = res_uip[0]
                # Guard the download the same way gethtml() does, so a failed
                # fetch becomes the 'timeout' sentinel tested below instead
                # of an exception that discards the fields already parsed.
                try:
                    raw_more = get_html_urllib(res_uip[0], 2)
                except Exception:
                    raw_more = 'timeout'
                hc_more = filter_html(raw_more)
                if hc_more != 'timeout':
                    res_priceinfo = p_priceinfo_iip.findall(hc_more)
        else:
            # The row itself already carries the full price list.
            res_priceinfo = p_priceinfo.findall(hc)

    # [('1-24', '$82.08'), ('25 +', '$61.56')] -> '1:$82.08|||25:$61.56'
    # p_invalidnum reduces the quantity range to the number that is kept
    # (apparently the lower bound, per the example above).
    pc_pp_new = ['%s:%s' % (p_invalidnum.sub('', pc).strip(), pp)
                 for pc, pp in res_priceinfo]
    pc_pp_finall = '|||'.join(pc_pp_new)
    if pc_pp_finall:
        dict_info[keys_mouser_tt[10][1]] = pc_pp_finall

    # Always record the part-number search page url.
    dict_info[keys_mouser_tt[19][1]] = self.url_search
    return dict_info