예제 #1
0
 def query_url_data(self, url):
     """
     Fetch the data of every page reachable from ``url``.

     The page at ``url`` is fetched with ``get_data`` and its decoded body
     is handed to ``self.parse_index_page``. While that call returns a
     truthy value, the value is stored on ``self.url``, ``self.down()`` is
     invoked, and the returned value is fetched in turn.

     :param url: address of the first page; a falsy value is a no-op.
     :return: ``None`` when ``url`` is falsy or the walk ends normally;
         ``True`` when any exception interrupted the walk.
     """
     if not url:
         return
     try:
         while True:
             resp = get_data(url)
             next_url = self.parse_index_page(resp.content.decode())
             if not next_url:
                 break
             self.url = next_url
             self.down()
             url = next_url
     except Exception as err:
         # Any failure is reported, the current self.url is logged, and
         # change_ips() is called before signalling failure to the caller.
         print(err)
         self.log.error(self.url)
         change_ips()
         return True
예제 #2
0
def NMR(content):
    """
    Collect the two NMR text blocks and their image sources for a page.

    The last ``href`` found under the ``mbctabs fix-clear`` tab list of
    ``content`` is followed (presumably the NMR tab - confirm against the
    site markup). From the fetched page, the first two ``div`` elements
    with the inline style ``margin:9px;background-color:#fff;`` supply the
    text and ``img/@src`` values.

    :param content: HTML text of the detail page.
    :return: dict with keys ``nmr_h1``, ``nmr_h1_url``, ``nmr_13c`` and
        ``nmr_13c_url``; every value is ``''`` when the link does not
        match ``.*html`` or the fetched page contains a ``nodata`` div.
    :raises IndexError: when the tab list yields no links at all.
    """
    tab_links = etree.HTML(content).xpath(
        '//ul[@class="mbctabs fix-clear"]/li/a/@href')
    nmr_link = tab_links[-1]
    if not re.match('.*html', nmr_link):
        return dict(nmr_h1='', nmr_h1_url='', nmr_13c='', nmr_13c_url='')

    page = get_data(fix_url(nmr_link)).content.decode()
    tree = etree.HTML(page)
    try:
        tables = tree.xpath(
            '//div[@style="margin:9px;background-color:#fff;"]')
        # NOTE: `filter` is called with a single argument, so it must be a
        # project helper shadowing the builtin (the builtin needs two).
        nmr_h1 = tables[0].xpath('string(.)').strip()
        nmr_h1_url = filter(tables[0].xpath('./img/@src'))
        nmr_13c = tables[1].xpath('string(.)').strip()
        nmr_13c_url = filter(tables[1].xpath('./img/@src'))
        return dict(nmr_h1=nmr_h1,
                    nmr_h1_url=nmr_h1_url,
                    nmr_13c=nmr_13c,
                    nmr_13c_url=nmr_13c_url)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed and turned
        # into another retry.
        if not tree.xpath('//div[@class="nodata"]'):
            # Neither the data blocks nor the "nodata" marker were found:
            # report it and retry from the original page content.
            print('nmr error')
            return NMR(content)
        return dict(nmr_h1='', nmr_h1_url='', nmr_13c='', nmr_13c_url='')
예제 #3
0
def save_img(url):
    """
    Download ``url`` and store the response body under ``base_img/``.

    :param url: address of the image; a falsy value skips the download.
    :return: the generated ``<uuid4>.png`` file name, or ``""`` when
        ``url`` is falsy.
    """
    if url:
        response = get_data(url)
        filename = str(uuid.uuid4()) + '.png'
        with open('base_img/%s' % filename, 'wb') as out:
            out.write(response.content)
        return filename
    return ""
예제 #4
0
def parse(url):
    """
    Fetch ``url`` and run every callable in ``parses`` over the page.

    Each result is stored under the callable's ``__name__``; results that
    are dicts are first passed through ``filter_ele``.

    :param url: address of the page to fetch.
    :return: whatever ``show`` returns for the assembled record.
    """
    page = get_data(url).content.decode()
    record = {'url': url}
    for parser in parses:
        parsed = parser(page)
        record[parser.__name__] = (
            filter_ele(parsed) if isinstance(parsed, dict) else parsed
        )
    return show(record)
예제 #5
0
def save_Img(url):
    """
    Save an image: download ``url`` and write the body under ``imgs/``.

    :param url: address of the image; a falsy value skips the download.
    :return: the generated ``<uuid4>.png`` file name, or ``""`` when
        ``url`` is falsy.
    """
    if url:
        response = get_data(url)
        filename = str(uuid.uuid4()) + '.png'
        with open('imgs/%s' % filename, 'wb') as out:
            out.write(response.content)
        return filename
    return ""
예제 #6
0
def MSDS(content):
    """
    Return the serialized ``div.msds`` markup linked from a detail page.

    The second-to-last ``href`` under the ``mbctabs fix-clear`` tab list
    is followed when it matches ``.*html``.

    :param content: HTML text of the detail page.
    :return: ``trim`` applied to the serialized markup, or ``''`` when the
        link does not match or the fetched page has no ``div.msds``.
    """
    tab_links = etree.HTML(content).xpath(
        '//ul[@class="mbctabs fix-clear"]/li/a/@href')
    link = tab_links[-2]
    if not re.match('.*html', link):
        return ''

    page = get_data(fix_url(link)).content.decode()
    nodes = etree.HTML(page).xpath('//div[@class="msds"]')
    if not nodes:
        return ''

    # `filter` takes a single argument here, so it is a project helper,
    # not the builtin.
    markup = etree.tostring(filter(nodes), encoding='utf-8').decode()
    return trim(markup)
예제 #7
0
def SDS(content):
    """
    Return the SDS payload linked from a detail page.

    The third-to-last ``href`` under the ``mbctabs fix-clear`` tab list is
    followed (via ``sds_url``) when it matches ``.*html``. The response is
    expected to wrap a JSON document as ``a(...)``; the text between the
    parentheses is decoded with ``json.loads``.

    :param content: HTML text of the detail page.
    :return: ``trim`` applied to the decoded ``data`` field, or ``''``
        when the link does not match, no ``a(...)`` wrapper is found, or
        the decoded ``code`` field equals ``'error'``.
    """
    tab_links = etree.HTML(content).xpath(
        '//ul[@class="mbctabs fix-clear"]/li/a/@href')
    link = tab_links[-3]
    if not re.match('.*html', link):
        return ''

    body = get_data(sds_url(link)).content.decode()
    # Raw string: the original non-raw literal relied on the invalid
    # escapes "\(" and "\)", which newer Python versions warn about.
    # `filter` is a project helper here (single argument), not the builtin.
    payload = filter(re.findall(r'a\((.*)\)', body))
    if not payload:
        return ''

    dt = json.loads(payload)
    if dt['code'] == 'error':
        return ''
    return trim(dt['data'])
예제 #8
0
def parse(url):
    """
    Fetch ``url``, run every callable in ``parses`` and hand the record
    to ``pipline``.

    Each result is stored under the callable's ``__name__``; results that
    are dicts are first passed through ``filter_ele``.

    :param url: address of the page to fetch.
    :return: always ``None``. ``DuplicateKeyError`` is ignored silently;
        any other exception is logged with ``log.exception``.
    """
    try:
        page = get_data(url).content.decode()
        record = {'url': url}
        for parser in parses:
            parsed = parser(page)
            record[parser.__name__] = (
                filter_ele(parsed) if isinstance(parsed, dict) else parsed
            )
        pipline(record)
    except DuplicateKeyError:
        # Duplicate key: deliberately ignored.
        return
    except Exception as err:
        log.exception(err)
예제 #9
0
def parse(url):
    """
    Fetch ``url``, run every callable in ``parses`` and hand the record
    to ``pipline``, retrying after ``change_ips()`` on ``IndexError``.

    Each result is stored under the callable's ``__name__``; results that
    are dicts are first passed through ``filter_ele``.

    :param url: address of the page to fetch.
    :return: always ``None``. The attempt loop ends on success, on
        ``DuplicateKeyError``, or on any other non-``IndexError``
        exception (which is recorded via ``en_olbase_err().insert`` and
        logged).
    """
    while True:
        try:
            page = get_data(url).content.decode()
            record = {'url': url}
            for parser in parses:
                parsed = parser(page)
                record[parser.__name__] = (
                    filter_ele(parsed) if isinstance(parsed, dict) else parsed
                )
            pipline(record)
        except DuplicateKeyError:
            # Duplicate key: nothing more to do for this url.
            return
        except IndexError:
            # Call change_ips() and try the same url again.
            change_ips()
            continue
        except Exception as err:
            en_olbase_err().insert({'url': url, 'msg': str(err)})
            log.exception(err)
            return
        else:
            return