Example #1
    def parser_sub(self, response):
        content = response.body # handle mojibake in the raw bytes
        # for i in range(100):
        #     try:
        #         new_content = unicode(content, 'gbk')
        #         break
        #     except Exception, e:
        #         if 'position' in str(e):
        #             error_str = crawlerTool.getRegex('position\s+(\d+-\d+)', str(e))
        #             start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
        #             content = content[:start_index] + content[end_index:]
        response_content = content
        print response.url
        url = response.url
        cfemail = crawlerTool.getXpath('//a[@class="__cf_email__"]/@data-cfemail', content)
        title = crawlerTool.getXpath('//title/text()', content)[0]

        mail = ''
        if cfemail:
            mail = self.get_mail(cfemail[0])
        data_obj = CmocroItem()
        data_obj['url'] = url
        data_obj['mail'] = mail
        data_obj['name'] = title.replace('- CMOCRO', '')  # strip the site suffix from the <title>
       # print lxr,dz,yb,dh,sj
        yield data_obj
    def parse1(self, response):
        base_url = get_base_url(response)

        response_content = response.body # handle mojibake (note: some lists, e.g. https://www.chemicalbook.com/ShowSupplierProductsList6187/51100.htm, run to 90,000+ entries)
        cat_name = response.meta.get('cat_name')
        segs = crawlerTool.getXpath('//div[@class="product_list_left_in"]//li', response_content)
        for seg in segs:
            ChemicalName, CASNumber, MolFormula, SearchImg, Synonyms, url = ['' for _ in range(6)]
            SearchImg = crawlerTool.getXpath1('//div[@class="leftSearchImg"]/a/img/@src', seg)
            SearchImg = 'https://www.trc-canada.com' + SearchImg
            contents = crawlerTool.getXpath('//div[@class="ContentDesc"]', seg)
            for content in contents:
                content = content.replace('\r', '').replace('\n', '')
                if 'Chemical Name:' in content:
                    ChemicalName = crawlerTool.getRegex('</label>(.*?)<', content).strip()
                elif 'CAS number:' in content:
                    CASNumber = crawlerTool.getRegex('</label>(.*?)<', content).strip()
                elif 'Mol. Formula:' in content:
                    MolFormula = crawlerTool.getRegex('</label>(.*?)<', content).strip()
                elif 'Synonyms' in content:
                    Synonyms = crawlerTool.getRegex('</label>(.*?)<', content).strip()

            # primaryVendorId = crawlerTool.getXpath1('//str[@name="primaryVendorId"]/text()', seg)
            data_obj = Trc_Item()
            data_obj['ChemicalName'] = ChemicalName
            data_obj['CASNumber'] = CASNumber
            data_obj['MolFormula'] = MolFormula
            data_obj['SearchImg'] = SearchImg
            data_obj['Synonyms'] = Synonyms
            data_obj['api_name'] = cat_name
            data_obj['url'] = SearchImg  # note: no product URL is extracted; the image URL is reused here
            yield data_obj
    def parse(self, response):
        base_url = get_base_url(response)
        response_content = response.body  # handle mojibake (note: some lists, e.g. https://www.chemicalbook.com/ShowSupplierProductsList6187/51100.htm, run to 90,000+ entries)
        suburls = crawlerTool.getXpath(
            "//table[@id='ContentPlaceHolder1_ProductClassDetail']//tr/td[1]/a/@href",
            response_content)
        for suburl in suburls:
            suburl = urljoin(base_url, suburl)
            yield scrapy.Request(url=suburl, callback=self.parser_sub)
        next_page_url = crawlerTool.getXpath('//div[@align="center"]/a/@href',
                                             response_content)
        if next_page_url:
            for page_url in next_page_url:  # follow every pagination link
                page_url = urljoin(base_url, page_url)
                yield scrapy.Request(url=page_url, callback=self.parse)
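The get_mail helper called in parser_sub above is not shown in this snippet. Cloudflare's data-cfemail attribute is conventionally decoded by XOR-ing each hex-encoded byte with a key taken from the first hex pair; a minimal sketch, assuming get_mail follows that standard scheme:

    def get_mail(self, cfemail):
        # hypothetical sketch of the standard Cloudflare decoding:
        # the first hex pair is the XOR key, the remaining pairs are characters
        key = int(cfemail[:2], 16)
        return ''.join(chr(int(cfemail[i:i + 2], 16) ^ key)
                       for i in range(2, len(cfemail), 2))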
Example #4
    def parse(self, response):
        base_url = get_base_url(response)
        response_content = response.body  # handle mojibake
        cat_name = response.meta.get('cat_name')
        segs = crawlerTool.getXpath('//doc', response_content, xml_type='XML')
        for seg in segs:
            name = crawlerTool.getXpath1('//str[@name="name"]/text()', seg)
            cas = crawlerTool.getXpath1('//str[@name="casNumber"]/text()', seg)
            function = crawlerTool.getXpath1('//str[@name="tagline"]/text()', seg)
            # primaryVendorId = crawlerTool.getXpath1('//str[@name="primaryVendorId"]/text()', seg)
            data_obj = Caymanchem()
            data_obj['name'] = name
            data_obj['cas'] = cas
            data_obj['function'] = function
            data_obj['cat'] = cat_name
            data_obj['url'] = name + cat_name + cas  # composite key, not a real URL
            yield data_obj
        totalnum = int(
            crawlerTool.getXpath1('//result[@name="response"]//@numFound',
                                  response_content,
                                  xml_type='XML'))
        if not response.meta.get('depth'):
            print totalnum
            for i in range(10, totalnum, 10):  # step by the page size (rows=10 in the query below)
                url = 'https://www.caymanchem.com/solr/cchProduct/select?facet=true&facet.field=raptas'+\
                      '&facet.field=newProduct&facet.limit=100000&fl=isEUSellable%2Cname%2CmarkupName%2CcatalogNum%2CproductImage%2Csynonyms%2CcasNumber%2Ctagline%2Cscore%2CitemGroupId%2CprimaryVendorId&spellcheck=true&spellcheck.collate=true&spellcheck.count=10&spellcheck.extendedResults=true&spellcheck.onlyMorePopular=false&facet.mincount=1&rows=10&version=2.2&json.nl=map&'+\
                      'q=*%3A*&start=' + str(i) + '&fq=(' + cats[cat_name] + ')AND(!raptas%3ARAP000101%20AND%20websiteNotSearchable%3Afalse)'
                yield scrapy.Request(url,
                                     callback=self.parse,
                                     meta={'cat_name': cat_name, 'depth': 1})
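The hand-concatenated Solr query above is hard to maintain; the same URL can be assembled with urllib.urlencode. A sketch showing only the paging parameters (the facet/fl/spellcheck parameters from the original would be appended to the list the same way; solr_page_url is an illustrative name):

import urllib

def solr_page_url(start, fq):
    # fq is the filter-query string built from cats[cat_name] above
    params = [('q', '*:*'), ('rows', 10), ('start', start),
              ('version', '2.2'), ('json.nl', 'map'), ('fq', fq)]
    return ('https://www.caymanchem.com/solr/cchProduct/select?'
            + urllib.urlencode(params))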
Example #5
class HxChemSpider(scrapy.Spider):
    name = "hxchem"  # 唯一标识
    # allowed_domains = ["csdn.net"]
    start_urls = [
        "http://www.hxchem.net/company.php?page=%s" % str(i) for i in range(200, 3160, 1)
    ]

    #def start_requests(self):  # cookie test
    #    # send the request with cookies so the site treats us as a logged-in user
    #    yield scrapy.Request(self.start_urls[0], callback=self.parse, cookies={'meng': 1})

    def parse(self, response):
        base_url = get_base_url(response)
        content = response.body # handle mojibake
        new_content = content  # fallback in case decoding never succeeds
        for i in range(100):
            try:
                new_content = unicode(content, 'gbk')
                break
            except Exception, e:
                # the error message reports the bad byte range, e.g. "position 12-13";
                # cut those bytes out and retry
                if 'position' in str(e):
                    error_str = crawlerTool.getRegex('position\s+(\d+-\d+)', str(e))
                    start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                    content = content[:start_index] + content[end_index:]
        response_content = new_content

        suburls = crawlerTool.getXpath("//div[@class='ad_content']//dl/dt/a/@href",response_content)
        if len(suburls)<10:
            print('num error',response.url)
        for suburl in suburls:
            suburl = urljoin(base_url,suburl)
            yield scrapy.Request(url=suburl,callback = self.parser_sub)
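The byte-stripping retry loop above can usually be collapsed by telling the codec to skip (or mark) undecodable bytes directly; a shorter equivalent sketch:

new_content = response.body.decode('gbk', 'ignore')  # or 'replace' to keep a marker per dropped byte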
Example #6
    def parse(self, response):
        base_url = get_base_url(response)
        content = response.body  # handle mojibake
        manufacturers = crawlerTool.getXpath("//h2/a/@href", content)

        for manufacturer in manufacturers:
            sub_url = 'https://www.parkers.co.uk' + manufacturer + 'specs/'
            yield scrapy.Request(url=sub_url, callback=self.parser_sub)
Example #7
    def parser_spec_url(self, response):
        content = response.body
        url = response.url
        FullSpecs_urls = crawlerTool.getXpath('//h3/a/@href', content)
        for spec_url in FullSpecs_urls:
            if 'http' not in spec_url:  # relative link: prefix the site root
                spec_url = 'https://www.parkers.co.uk' + spec_url
                yield scrapy.Request(url=spec_url, callback=self.parser_detail)
Example #8
def keyword_search(keyword):
    keywords = urllib.quote(keyword)
    url = 'https://www.youtube.com/results?search_query=' + keywords
    page = ct.get(url)
    imgurl0 = ct.getXpath('//div[@id="img-preload"]/img/@src', page)[0]
    vid = ct.getRegex('i.ytimg.com/vi/(.*?)/', imgurl0)
    video_url = 'https://www.youtube.com/watch?v=' + vid
    print video_url
    return video_url, imgurl0
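A usage sketch (this relies on the results page still preloading the top result's thumbnail, whose URL embeds the video id, exactly as the regex above assumes):

video_url, thumb_url = keyword_search('python scrapy tutorial')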
Example #9
    def parse(self, response):
        base_url = get_base_url(response)
        url_now = response.url
        response_content = response.body  # handle mojibake
        segs = crawlerTool.getXpath('//div[@class="cas_default_list_star "]//ul', response_content)
        for seg in segs[1:-1]:
            data_obj = SeekchemItem()
            lis = crawlerTool.getXpath('//li', seg)
            data_obj['url'] = crawlerTool.getXpath1('//a/@href', lis[0])
            data_obj['cas'] = crawlerTool.getXpath1('//b/text()', lis[0])
            data_obj['name'] = crawlerTool.getXpath1('//text()', lis[1])
            yield data_obj

        # next_page = crawlerTool.getXpath1("//a[@class='next']/@href", response_content)
        # next_page_url = urljoin(url_now,next_page)
        # yield scrapy.Request(url=next_page_url, callback=self.parse)
        page_urls = crawlerTool.getXpath('//div[@class="pages"]/a/@href', response_content)
        for page_url in page_urls:
            yield scrapy.Request(urljoin(url_now, page_url), callback=self.parse)
Example #10
    def parse(self, response):
        base_url = get_base_url(response)
        response_content = response.body  # handle mojibake (note: some lists, e.g. https://www.chemicalbook.com/ShowSupplierProductsList6187/51100.htm, run to 90,000+ entries)
        # cat_name = response.meta.get('cat_name')
        segs = crawlerTool.getXpath('//li[@class="list-group-item"]/text()',
                                    response_content)
        for seg in segs:
            data_obj = angenechemical_item()
            data_obj['url'] = seg
            yield data_obj

    def parser_sub(self, response):
        base_url = get_base_url(response)
        response_content = response.body  # handle mojibake
        url = response.url
        # detail = crawlerTool.getXpath('//div[@id="ContentPlaceHolder1_SupplierContact"]', response_content)[0]  # "About Us" block
        # response_content = unicode(response_content, 'gbk')  # fails for http://www.hxchem.net/companydetaildesenborn.html
        # lxwm = HTMLParser().unescape(lxwm)
        # lxwm = lxwm.encode('utf8')
        data_obj = ChemicalBook()
        data_obj['url'] = url
        data_obj['name'] = crawlerTool.getXpath(
            '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[2]/td[2]/a/text()',
            response_content)[0]
        data_obj['lxdh'] = crawlerTool.getXpath1(  # contact phone
            '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[3]/td[2]//text()',
            response_content)
        data_obj['email'] = crawlerTool.getXpath1(
            '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[5]/td[2]//text()',
            response_content)
        data_obj['wz'] = crawlerTool.getXpath1(  # website
            '//div[@id="ContentPlaceHolder1_SupplierContact"]/table[2]//tr[6]/td[2]//text()',
            response_content)
        cplb_div = crawlerTool.getXpath(  # product-list tables
            '//div[@id="ContentPlaceHolder1_ProductSupplier"]//table',
            response_content)[3:-1]
        print data_obj['name'].encode('unicode-escape').decode('string_escape')  # debug print
        cplb = []
        for cp in cplb_div:
            chinese_name = crawlerTool.getXpath('//tr/td[2]/text()', cp)
            chinese_name = chinese_name[0] if chinese_name else ''
            cps = crawlerTool.getXpath('//tr/td[3]/text()', cp)
            cps = cps[0] if cps else ''
            cplb.append(' '.join([chinese_name, cps]))
        data_obj['cplb'] = cplb
        # print lxr, dz, yb, dh, sj
        yield data_obj
        page_urls = crawlerTool.getXpath(
            '//div[@id="ContentPlaceHolder1_ProductSupplier"]//table[2]//tr[2]/td[2]//a/@href',
            response_content)
        for page_url in page_urls:
            page_url = urljoin(base_url, page_url)
            yield scrapy.Request(url=page_url, callback=self.parser_sub)
    def parse(self, response):
        base_url = get_base_url(response)
        response_content = response.body  # handle mojibake (note: some lists, e.g. https://www.chemicalbook.com/ShowSupplierProductsList6187/51100.htm, run to 90,000+ entries)
        cat_name = response.meta.get('cat_name')
        segs = crawlerTool.getXpath('//table[@id="product-list"]/tbody/tr',
                                    response_content)

        for seg in segs:
            name, MolecularFormula, MolecularWeight, image, cas, url = [
                '' for i in range(6)
            ]
            SearchImg = crawlerTool.getXpath1(
                '//img[@class="dg-picture-zoom  acc_img_container acc_zoomer"]/@src',
                seg)
            contents = crawlerTool.getXpath('//table//tr', seg)
            for content in contents:
                content = content.replace('\r', '').replace('\n', '')
                if 'Name' in content:
                    name = crawlerTool.getXpath1('//td[2]', content)
                    name = crawlerTool.getRegex('>(.*?)<', name).strip()
                elif 'CAS No' in content:
                    cas = crawlerTool.getXpath1('//td[2]', content)
                    cas = crawlerTool.getRegex('>(.*?)<', cas).strip()
                elif 'Molecular Formula' in content:
                    MolecularFormula = crawlerTool.getXpath1(
                        '//td[2]', content)
                    MolecularFormula = re.sub('<.*?>', '',
                                              MolecularFormula).strip()
                elif 'Molecular Weight' in content:
                    MolecularWeight = crawlerTool.getXpath1('//td[2]', content)
                    MolecularWeight = crawlerTool.getRegex(
                        '>(.*?)<', MolecularWeight).strip()

            # primaryVendorId = crawlerTool.getXpath1('//str[@name="primaryVendorId"]/text()', seg)
            data_obj = acccorporation_Item()
            data_obj['url'] = name  # note: no product URL is extracted; the name is reused here
            data_obj['name'] = name
            data_obj['MolecularFormula'] = MolecularFormula
            data_obj['MolecularWeight'] = MolecularWeight
            data_obj['image'] = SearchImg
            data_obj['cas'] = cas
            yield data_obj
    def parse(self, response):
        response_content = response.body
        cats = crawlerTool.getXpath('//input[@type="checkbox"]/@value', response_content)
        print len(cats)
        # cats = ['Ether']
        for cat in cats:
            first_str = cat[0].lower()
            # if first_str in ('a', 'b', 'c'): continue
            yield scrapy.FormRequest(url='https://www.trc-canada.com/parentdrug-listing/',
                                     formdata={"keyword": " %s " % cat, "t": "product", "advanced": "yes"},
                                     callback=self.parse1, meta={'cat_name': cat})
Example #14
    def parser_detail(self, response):
        content = response.body
        url = response.url
        data_obj = ParkersItem()
        data_obj['title'] = crawlerTool.getXpath('//title/text()', content)[0]
        data_obj['url'] = url  # the name and model are extracted from the URL path below

        urlsplit = url.split('/')
        if len(urlsplit) > 4:
            data_obj['name'] = urlsplit[3]
            data_obj['model'] = urlsplit[4]
        data_obj['power'] = crawlerTool.getRegex('Power</th><td>(.*?)</td>',
                                                 content)
        data_obj['TopSpeed'] = crawlerTool.getRegex(
            'Top Speed</th><td>(.*?)</td>', content)
        data_obj['zerotosixty'] = crawlerTool.getRegex(
            '<th>0-60 mph</th><td>(.*?)</td>', content)
        data_obj['Torque'] = crawlerTool.getRegex(
            '<th>Torque</th><td>(.*?)</td>', content)
        data_obj['co2Emissions'] = crawlerTool.getRegex(
            '<th>CO<sub>2</sub> Emissions</th><td>(.*?)</td>', content)
        data_obj['EuroEmissionsStandard'] = crawlerTool.getRegex(
            '<th>Euro Emissions Standard</th><td>(.*?)</td>', content)
        data_obj['Fuelconsumption'] = crawlerTool.getRegex(
            '<tr><th>Fuel consumption</th><td>(.*?)</td>', content)

        data_obj['Length'] = crawlerTool.getRegex(
            '<tr><th>Length</th><td>(.*?)</td>', content)

        data_obj['Width'] = crawlerTool.getRegex(
            '<tr><th>Width</th><td>(.*?)</td>', content)
        data_obj['Height'] = crawlerTool.getRegex(
            '<tr><th>Height</th><td>(.*?)</td>', content)
        data_obj['EngineSize'] = crawlerTool.getRegex(
            '<tr><th>Engine Size</th><td>(.*?)</td>', content)
        data_obj['Cylinders'] = crawlerTool.getRegex(
            '<tr><th>Cylinders</th><td>(.*?)</td>', content)
        data_obj['FuelType'] = crawlerTool.getRegex(
            '<tr><th>Fuel Type</th><td>(.*?)</td>', content)
        data_obj['Transmission'] = crawlerTool.getRegex(
            '<tr><th>Transmission</th><td>(.*?)</td>', content)
        data_obj['Doors'] = crawlerTool.getRegex(
            '<tr><th>Doors</th><td>(.*?)</td>', content)
        data_obj['Seats'] = crawlerTool.getRegex(
            '<tr><th>Seats</th><td>(.*?)</td>', content)
        data_obj['taxcostBasic'] = crawlerTool.getRegex(
            '<tr><th>Monthly company car tax cost \(Basic Rate\)</th><td>(.*?)</td>',
            content).replace('&#163;', '£')  # &#163; is the HTML entity for £ (pound sterling)

        # print lxr,dz,yb,dh,sj
        yield data_obj
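The long run of near-identical getRegex calls above can be table-driven. A sketch mapping item fields to the <th> headers they are scraped from (SPEC_HEADERS is an illustrative name; the regex pattern is the same one used above):

SPEC_HEADERS = {
    'Length': 'Length', 'Width': 'Width', 'Height': 'Height',
    'EngineSize': 'Engine Size', 'Cylinders': 'Cylinders',
    'FuelType': 'Fuel Type', 'Transmission': 'Transmission',
    'Doors': 'Doors', 'Seats': 'Seats',
}
for field, header in SPEC_HEADERS.items():
    data_obj[field] = crawlerTool.getRegex(
        '<tr><th>%s</th><td>(.*?)</td>' % header, content)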
Example #15
    def parse(self, response):
        base_url = get_base_url(response)
        content = response.body  # 乱码处理
        segs = crawlerTool.getXpath(
            "//table//td[2]//td/table[2]//td//table//tr//td//tr", content)
        for seg in segs[1:]:
            tds = crawlerTool.getXpath("//td", seg)
            if len(tds) < 4:
                continue

            cat_no = tds[0]
            product_name = tds[1]
            cas = tds[2]
            assay = tds[3]

            rovathin_item = RovathinItem()
            rovathin_item['cat_no'] = re.sub('\s*<.*?>\s*', '', cat_no)
            rovathin_item['product_name'] = re.sub('\s*<.*?>\s*', '',
                                                   product_name)
            rovathin_item['cas'] = re.sub('\s*<.*?>\s*', '', cas)
            rovathin_item['assay'] = re.sub('\s*<.*?>\s*', '', assay)
            rovathin_item['url'] = crawlerTool.getXpath1(
                "//a/@href", product_name)
            yield rovathin_item
Example #16
class CmocroSpider(scrapy.Spider):
    name = "cmocro"  # 唯一标识
    # allowed_domains = ["csdn.net"]

    start_urls = []  # populated with two-letter search pages below
    db_connect = MysqlPipeline3()

    url_cache = []  # in-memory URL dedup cache
    for i in xrange(26):
        c1 = chr(i + ord('a'))
        for j in xrange(26):  # use a distinct loop variable instead of shadowing i
            c2 = chr(j + ord('a'))
            start_urls.append("https://www.cmocro.com/company_search.php?company=%s%s" % (c1, c2))
    # start_urls = start_urls[:1]


    #def start_requests(self):  # cookie test
    #    # send the request with cookies so the site treats us as a logged-in user
    #    yield scrapy.Request(self.start_urls[0], callback=self.parse, cookies={'meng': 1})

    def parse(self, response):
        base_url = get_base_url(response)
        content = response.body # handle mojibake
        new_content = content  # fallback in case decoding never succeeds
        for i in range(100):
            try:
                new_content = unicode(content, 'utf8')
                break
            except Exception, e:
                if 'position' in str(e):
                    print str(e)
                    # the error reports either a byte range ("position 12-13")
                    # or a single offset ("position 12"); strip the bad bytes and retry
                    error_str = crawlerTool.getRegex('position\s+(\d+-\d+)', str(e))
                    if '-' in error_str:
                        start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                        content = content[:start_index] + content[end_index:]
                    else:
                        start_index = int(crawlerTool.getRegex('position (\d+)', str(e)))
                        content = content[:start_index] + content[start_index + 1:]

        response_content = new_content

        suburls = crawlerTool.getXpath('//div[@class="company_list"]/a/@href', response_content)
        for suburl in suburls:
            suburl = urljoin(base_url, suburl)
            if not self.db_connect.get_by_unique_value(suburl):  # skip URLs already stored in MySQL
                yield scrapy.Request(url=suburl, callback=self.parser_sub)
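Note that url_cache is declared as an in-memory dedup list but never consulted in parse; wiring it in would save one database lookup per repeated URL. A sketch of the dedup check inside the loop above (a set would be faster than a list for membership tests):

    if suburl in self.url_cache:
        continue  # already queued during this run
    self.url_cache.append(suburl)
    if not self.db_connect.get_by_unique_value(suburl):
        yield scrapy.Request(url=suburl, callback=self.parser_sub)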
Example #17
def extractor_page2(page, code):  # parse the trade-detail page
    # top-5 buyers table
    content_table = crawlerTool.getXpath('//table', page, charset='gbk')[0]
    trs = crawlerTool.getXpath('//tr', content_table, charset='gbk')
    rows = [[], [u'股票代码', code]]  # header row; u'股票代码' means "stock code"
    for tr in trs:
        row = []
        for td in crawlerTool.getXpath('//th', tr, charset='gbk'):
            row.append(re.sub('(<.*?>)', "", td).strip())
        for td in crawlerTool.getXpath('//td', tr, charset='gbk'):
            row.append(re.sub('(<.*?>)', "", td).strip())
        rows.append(row)
    # top-5 sellers table
    content_table = crawlerTool.getXpath('//table', page, charset='gbk')[1]
    trs = crawlerTool.getXpath('//tr', content_table, charset='gbk')
    for tr in trs:
        row = []
        for td in crawlerTool.getXpath('//th', tr, charset='gbk'):
            row.append(re.sub('(<.*?>)', "", td).strip())
        for td in crawlerTool.getXpath('//td', tr, charset='gbk'):
            row.append(re.sub('(<.*?>)', "", td).strip())
        rows.append(row)

    return rows
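The buyer and seller blocks repeat the same row-extraction loop; a helper keeps the two in sync. A sketch using the same crawlerTool calls as above (_table_rows is an illustrative name):

def _table_rows(table):
    # strip tags from every th/td cell of every row
    rows = []
    for tr in crawlerTool.getXpath('//tr', table, charset='gbk'):
        row = []
        for xp in ('//th', '//td'):
            for cell in crawlerTool.getXpath(xp, tr, charset='gbk'):
                row.append(re.sub('(<.*?>)', '', cell).strip())
        rows.append(row)
    return rows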
Example #18
    def parser_sub(self, response):
        content = response.body # handle mojibake
        new_content = content  # fallback in case decoding never succeeds
        for i in range(100):
            try:
                new_content = unicode(content, 'gbk')
                break
            except Exception, e:
                # the error message reports the bad byte range; cut those bytes out and retry
                if 'position' in str(e):
                    error_str = crawlerTool.getRegex('position\s+(\d+-\d+)', str(e))
                    start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                    content = content[:start_index] + content[end_index:]
        response_content = new_content
        print response.url
        url = response.url
        gywm = crawlerTool.getXpath("//td[@class='goscill22']/table[2]//p/text()", response_content)  # "About Us" text
        gywm = ''.join(gywm).replace('\n', '').replace('\r', '')
        # response_content = unicode(response_content, 'gbk')  # fails for http://www.hxchem.net/companydetaildesenborn.html
        lxwm = crawlerTool.getXpath("//td[@class='goscill22']/table[4]", response_content)  # "Contact Us" table
        lxwm = lxwm[0]
        # lxwm = HTMLParser().unescape(lxwm)
        # lxwm=lxwm.encode('utf8')
        data_obj = HxchemItem()
        data_obj['url'] = url
        data_obj['gywm'] = gywm
        data_obj['name'] = crawlerTool.getXpath("//h1/text()",response_content)[0]
        data_obj['lxr'] = crawlerTool.getRegex('联系人:(.*?)<', lxwm)  # contact person
        data_obj['dz'] = crawlerTool.getRegex('地 址:(.*?)<', lxwm)  # address
        data_obj['yb'] = crawlerTool.getRegex('邮 编:(.*?)<', lxwm)  # postcode
        data_obj['dh'] = crawlerTool.getRegex('电 话:(.*?)<', lxwm)  # phone
        data_obj['sj'] = crawlerTool.getRegex('手 机:(.*?)<', lxwm)  # mobile