def query_url_data(self, url):
    """
    Fetch the data of each index page, following pagination links.
    :param url: URL of the first index page
    :return: True once all reachable pages have been processed
    """
    if not url:
        return
    try:
        resp = get_data(url)
        url = self.parse_index_page(resp.content.decode())
        while url:
            self.url = url
            self.down()
            resp = get_data(url)
            url = self.parse_index_page(resp.content.decode())
    except Exception as e:
        self.log.error(self.url)
        self.log.exception(e)
        change_ips()  # rotate proxy IPs before the caller retries
    return True
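# Hedged sketch, not from this repo: query_url_data() relies on get_data()
# and change_ips() defined elsewhere. Only the two names come from the code
# above; the bodies below are assumptions (requests plus a naive proxy pool)
# meant to show the expected contract, not the real helpers.
import random
import requests

PROXIES = []  # assumed global proxy pool, refilled by change_ips()

def get_data(url):
    # Fetch a page, going through a random proxy when the pool is non-empty.
    proxy = random.choice(PROXIES) if PROXIES else None
    proxies = {'http': proxy, 'https': proxy} if proxy else None
    return requests.get(url, proxies=proxies, timeout=10)

def change_ips():
    # Swap to fresh IPs after a failure; the real source of new proxies is
    # unknown, so this placeholder only empties the pool.
    PROXIES.clear()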
def NMR(content):
    eles = etree.HTML(content)
    # The last tab link on the detail page points at the NMR page.
    eles = eles.xpath('//ul[@class="mbctabs fix-clear"]/li/a/@href')[-1]
    flag = re.match('.*html', eles)
    if flag:
        eles = fix_url(eles)
        eles = get_data(eles).content.decode()
        ele = etree.HTML(eles)
        try:
            tables = ele.xpath(
                '//div[@style="margin:9px;background-color:#fff;"]')
            nmr_h1 = tables[0].xpath('string(.)').strip()
            nmr_h1_url = filter(tables[0].xpath('./img/@src'))
            nmr_13c = tables[1].xpath('string(.)').strip()
            nmr_13c_url = filter(tables[1].xpath('./img/@src'))
            return dict(nmr_h1=nmr_h1, nmr_h1_url=nmr_h1_url,
                        nmr_13c=nmr_13c, nmr_13c_url=nmr_13c_url)
        except Exception:
            # No spectrum tables were found: either the page genuinely has
            # no data, or the fetch failed and we retry recursively.
            nodata = ele.xpath('//div[@class="nodata"]')
            if len(nodata) == 0:
                print('nmr error')
                return NMR(content)
            return dict(nmr_h1='', nmr_h1_url='', nmr_13c='', nmr_13c_url='')
    return dict(nmr_h1='', nmr_h1_url='', nmr_13c='', nmr_13c_url='')
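# Hedged sketch, not from this repo: NMR() above, and MSDS() and SDS()
# below, call a repo-local filter() that shadows Python's builtin. Its real
# body lives elsewhere; this minimal version is a guess consistent with the
# call sites, which all pass a list and expect a single value back.
def filter(items):
    # Return the first item of a possibly empty list, or '' when empty.
    return items[0] if items else ''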
def save_img(url):
    if not url:
        return ""
    resp = get_data(url)
    name = str(uuid.uuid4()) + '.png'
    with open('base_img/%s' % name, 'wb') as f:
        f.write(resp.content)
    return name
def parse(url):
    content = get_data(url).content.decode()
    dt = {'url': url}
    for func in parses:
        res = func(content)
        if isinstance(res, dict):
            dt[func.__name__] = filter_ele(res)
        else:
            dt[func.__name__] = res
    dt = show(dt)
    return dt
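# Hypothetical usage of the returning parse() above. The URL is made up,
# and `parses` is assumed to hold extractors such as NMR, MSDS and SDS.
if __name__ == '__main__':
    record = parse('http://www.example.com/cas/50-00-0.html')
    print(record['url'])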
def save_Img(url):
    """
    Download and save an image.
    :param url: image URL
    :return: the saved file name, or "" when url is empty
    """
    if not url:
        return ""
    resp = get_data(url)
    name = str(uuid.uuid4()) + '.png'
    with open('imgs/%s' % name, 'wb') as f:
        f.write(resp.content)
    return name
def MSDS(content):
    eles = etree.HTML(content)
    # The second-to-last tab link points at the MSDS page.
    eles = eles.xpath('//ul[@class="mbctabs fix-clear"]/li/a/@href')[-2]
    flag = re.match('.*html', eles)
    if flag:
        eles = fix_url(eles)
        eles = get_data(eles).content.decode()
        msds = etree.HTML(eles).xpath('//div[@class="msds"]')
        if len(msds) > 0:
            content = etree.tostring(filter(msds), encoding='utf-8').decode()
            return trim(content)
        return ''
    return ''
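# Hedged sketch, not from this repo: trim() is another repo-local helper
# used by MSDS() and SDS(). This body is a guess at its intent, collapsing
# the runs of whitespace that etree.tostring() leaves in serialized HTML.
def trim(text):
    import re
    return re.sub(r'\s+', ' ', text).strip()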
def SDS(content):
    eles = etree.HTML(content)
    # The third-to-last tab link points at the SDS page.
    eles = eles.xpath('//ul[@class="mbctabs fix-clear"]/li/a/@href')[-3]
    flag = re.match('.*html', eles)
    if flag:
        content = get_data(sds_url(eles)).content.decode()
        # The endpoint answers with a JSONP-style body a({...}); strip the
        # padding before parsing the JSON payload.
        data = filter(re.findall(r'a\((.*)\)', content))
        if data:
            dt = json.loads(data)
            if dt['code'] == 'error':
                return ''
            return trim(dt['data'])
        return ''
    return ''
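# Illustration only: the SDS endpoint appears to answer with a JSONP-style
# body of the form a({...}). The payload below is fabricated; it just shows
# how the regex strips the padding so json.loads() can parse the rest.
import json
import re

_body = 'a({"code": "ok", "data": "<table>...</table>"})'
_payload = re.findall(r'a\((.*)\)', _body)[0]
assert json.loads(_payload)['code'] == 'ok'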
def parse(url):
    try:
        content = get_data(url).content.decode()
        dt = {'url': url}
        for func in parses:
            res = func(content)
            if isinstance(res, dict):
                dt[func.__name__] = filter_ele(res)
            else:
                dt[func.__name__] = res
        pipline(dt)
    except DuplicateKeyError:
        # Record already stored; skip it silently.
        pass
    except Exception as e:
        log.exception(e)
def parse(url):
    flag = True
    while flag:
        try:
            content = get_data(url).content.decode()
            dt = {'url': url}
            for func in parses:
                res = func(content)
                if isinstance(res, dict):
                    dt[func.__name__] = filter_ele(res)
                else:
                    dt[func.__name__] = res
            pipline(dt)
            flag = False
        except DuplicateKeyError:
            # Record already stored; stop retrying.
            flag = False
        except IndexError:
            # An XPath came back empty, which usually means the page was
            # blocked or truncated: rotate proxies and retry this URL.
            change_ips()
        except Exception as e:
            en_olbase_err().insert({'url': url, 'msg': str(e)})
            log.exception(e)
            flag = False
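# Hedged driver sketch, not in the original: one way the retrying parse()
# above might be fanned out over a URL list. The pool size and the source
# of `urls` are assumptions.
from multiprocessing.dummy import Pool as ThreadPool

def run(urls):
    pool = ThreadPool(8)   # assumed worker count
    pool.map(parse, urls)  # parse() handles its own errors and retries
    pool.close()
    pool.join()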