Example #1
    def parse1(self, response):
        base_url = get_base_url(response)

        response_content = response.body  # handle garbled encoding; pages like https://www.chemicalbook.com/ShowSupplierProductsList6187/51100.htm can have 90,000+ rows
        cat_name = response.meta.get('cat_name')
        segs = crawlerTool.getXpath('//div[@class="product_list_left_in"]//li', response_content)
        for seg in segs:
            ChemicalName, CASNumber, MolFormula, SearchImg, Synonyms, url = ['' for i in range(6)]
            SearchImg = crawlerTool.getXpath1('//div[@class="leftSearchImg"]/a/img/@src', seg)
            SearchImg = 'https://www.trc-canada.com' + SearchImg
            contents = crawlerTool.getXpath('//div[@class="ContentDesc"]', seg)
            for content in contents:
                content = content.replace('\r', '').replace('\n', '')
                if 'Chemical Name:' in content:
                    ChemicalName = crawlerTool.getRegex('</label>(.*?)<',content).strip()
                elif 'CAS number:' in content:
                    CASNumber = crawlerTool.getRegex('</label>(.*?)<', content).strip()
                elif 'Mol. Formula:' in content:
                    MolFormula = crawlerTool.getRegex('</label>(.*?)<', content).strip()
                elif 'Synonyms' in content:
                    Synonyms = crawlerTool.getRegex('</label>(.*?)<', content).strip()

            # primaryVendorId = crawlerTool.getXpath1('//str[@name="primaryVendorId"]/text()', seg)
            data_obj = Trc_Item()
            data_obj['ChemicalName'] = ChemicalName
            data_obj['CASNumber'] = CASNumber
            data_obj['MolFormula'] = MolFormula
            data_obj['SearchImg'] = SearchImg
            data_obj['Synonyms'] = Synonyms
            data_obj['api_name'] = cat_name
            data_obj['url'] = SearchImg  # note: duplicates the image field; likely meant to be the product page URL
            yield data_obj
Example #2
    def parse(self, response):
        base_url = get_base_url(response)
        content = response.body  # handle garbled encoding
        for i in range(100):
            try:
                new_content = unicode(content, 'utf8')
                break
            except Exception as e:
                # UnicodeDecodeError messages name the bad byte offsets, e.g.
                # "can't decode bytes in position 10-12"; cut them out and retry.
                if 'position' in str(e):
                    print(str(e))
                    error_str = crawlerTool.getRegex(r'position\s+(\d+-\d+)', str(e))
                    if '-' in error_str:  # a byte range was reported
                        start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                        content = content[:start_index] + content[end_index:]
                    else:  # a single offending byte, e.g. "position 10"
                        start_index = int(crawlerTool.getRegex(r'position (\d+)', str(e)))
                        content = content[:start_index] + content[start_index + 1:]
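
The strip-and-retry loop above removes bad bytes a few at a time; Python's codec error handlers can do the same in one call. A minimal sketch of the equivalent using only the standard library (the byte string is a made-up stand-in for response.body):

raw = b'ok \xff ok'                    # stand-in for response.body
text = raw.decode('utf8', 'ignore')    # drops undecodable bytes in one pass
# use 'replace' instead of 'ignore' to keep a U+FFFD marker per bad byte
print(text)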
Example #3
def keyword_search(keyword):
    keywords = urllib.quote(keyword)
    url = 'https://www.youtube.com/results?search_query=' + keywords
    page = ct.get(url)
    imgurl0 = ct.getXpath('//div[@id="img-preload"]/img/@src', page)[0]
    vid = ct.getRegex('i.ytimg.com/vi/(.*?)/', imgurl0)
    video_url = 'https://www.youtube.com/watch?v=' + vid
    print(video_url)
    return video_url, imgurl0
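
`urllib.quote` in Example #3 exists only in Python 2; under Python 3 the same function lives in urllib.parse. A minimal, version-agnostic sketch of building the search URL (the keyword is a placeholder):

try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote        # Python 2
search_url = 'https://www.youtube.com/results?search_query=' + quote('daft punk')
print(search_url)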
Example #4
    def parse(self, response):
        base_url = get_base_url(response)
        response_content = response.body  # handle garbled encoding; pages like https://www.chemicalbook.com/ShowSupplierProductsList6187/51100.htm can have 90,000+ rows
        cat_name = response.meta.get('cat_name')
        segs = crawlerTool.getXpath('//table[@id="product-list"]/tbody/tr',
                                    response_content)

        for seg in segs:
            name, MolecularFormula, MolecularWeight, image, cas, url = [
                '' for i in range(6)
            ]
            SearchImg = crawlerTool.getXpath1(
                '//img[@class="dg-picture-zoom  acc_img_container acc_zoomer"]/@src',
                seg)
            contents = crawlerTool.getXpath('//table//tr', seg)
            for content in contents:
                content = content.replace('\r', '').replace('\n', '')
                if 'Name' in content:
                    name = crawlerTool.getXpath1('//td[2]', content)
                    name = crawlerTool.getRegex('>(.*?)<', name).strip()
                elif 'CAS No' in content:
                    cas = crawlerTool.getXpath1('//td[2]', content)
                    cas = crawlerTool.getRegex('>(.*?)<', cas).strip()
                elif 'Molecular Formula' in content:
                    MolecularFormula = crawlerTool.getXpath1(
                        '//td[2]', content)
                    MolecularFormula = re.sub('<.*?>', '',
                                              MolecularFormula).strip()
                elif 'Molecular Weight' in content:
                    MolecularWeight = crawlerTool.getXpath1('//td[2]', content)
                    MolecularWeight = crawlerTool.getRegex(
                        '>(.*?)<', MolecularWeight).strip()

            # primaryVendorId = crawlerTool.getXpath1('//str[@name="primaryVendorId"]/text()', seg)
            data_obj = acccorporation_Item()
            data_obj['url'] = name  # note: duplicates the name field; likely meant to be the product page URL
            data_obj['name'] = name
            data_obj['MolecularFormula'] = MolecularFormula
            data_obj['MolecularWeight'] = MolecularWeight
            data_obj['image'] = SearchImg
            data_obj['cas'] = cas
            yield data_obj
Example #5
    def parse(self, response):
        base_url = get_base_url(response)
        response_content = response.body  # handle garbled encoding
        url = response.url
        '''
        url = scrapy.Field()
        IUPACname = scrapy.Field()
        CAS = scrapy.Field()
        Chemspaceid = scrapy.Field()
        Molformula = scrapy.Field()
        Molweight = scrapy.Field()
        '''
        data_obj = ChemspaceItem()
        data_obj['url'] = url
        data_obj['IUPACname'] = crawlerTool.getXpath1('//div[@class="iupac-name"]//text()', response_content)
        data_obj['CAS'] = crawlerTool.getRegex(r'<dt>CAS</dt>[^<]?<dd>([\d-]+)</dd>', response_content)
        data_obj['Molformula'] = crawlerTool.getRegex(r'<dt>Mol formula</dt>[^<]?<dd>([\d\w]+)</dd>', response_content.replace('</sub>', '').replace('<sub>', ''))
        data_obj['Molweight'] = crawlerTool.getRegex(r'<dt>Mol weight</dt>[^<]?<dd>([\d\.]+)</dd>', response_content)
        print(data_obj)
        yield data_obj
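
The commented-out field list in Example #5 mirrors a Scrapy item definition; a minimal sketch of the ChemspaceItem class it implies (reconstructed from that docstring, not copied from the author's items.py):

import scrapy

class ChemspaceItem(scrapy.Item):
    url = scrapy.Field()
    IUPACname = scrapy.Field()
    CAS = scrapy.Field()
    Chemspaceid = scrapy.Field()
    Molformula = scrapy.Field()
    Molweight = scrapy.Field()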
Example #6
    def start_requests(self):
        sdf_dir = 'compounds'
        sdf_files = os.listdir(sdf_dir)
        for sdf_file in sdf_files:
            with open('compounds/' + sdf_file, 'r') as fin:
                print('sdf_file ' + sdf_file)
                for line in fin:
                    url = crawlerTool.getRegex(r'(https://chem-space.com/\w+)', line)
                    # skip URLs already stored; without this check the process
                    # gets killed for running out of memory!
                    if url and not self.db_connect.get_by_unique_value(url):
                        yield scrapy.Request(url, callback=self.parse)
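
Scrapy's scheduler already drops duplicate requests within a single run; the db_connect check above additionally skips URLs collected on earlier runs. A minimal in-memory sketch of the same deduplication idea (seen_urls and should_crawl are hypothetical names, not from the original project):

seen_urls = set()

def should_crawl(url):
    # True only the first time a given URL is seen
    if url in seen_urls:
        return False
    seen_urls.add(url)
    return True

print(should_crawl('https://chem-space.com/abc'))  # True
print(should_crawl('https://chem-space.com/abc'))  # False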
Example #7
    def parser_sub(self, response):
        content = response.body  # handle garbled encoding
        for i in range(100):
            try:
                new_content = unicode(content, 'gbk')
                break
            except Exception as e:
                # cut out the byte range named in the decode error and retry
                if 'position' in str(e):
                    error_str = crawlerTool.getRegex(r'position\s+(\d+-\d+)', str(e))
                    start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                    content = content[:start_index] + content[end_index:]
Example #8
def extractor_info(video_url):
    page = ct.get(video_url)
    # the metadata labels appear in Chinese or English depending on locale:
    # 歌手 = "singer", 艺术家 = "artist"
    artist = ct.getRegex('歌手.*?[tT]ext":"(.*?)"', page)
    if not artist:
        artist = ct.getRegex('艺术家.*?[tT]ext":"(.*?)"', page)
    if not artist:
        artist = ct.getRegex('"Artist".*?[tT]ext":"(.*?)"', page)

    album = ct.getRegex('专辑.*?[tT]ext":"(.*?)"', page)  # 专辑 = "album"
    # 由以下相关方许可给 / 獲以下人士授權 = "Licensed to YouTube by" (simplified / traditional)
    label = ct.getRegex('由以下相关方许可给.*?[tT]ext":"(.*?)"', page)
    if not label:
        label = ct.getRegex('獲以下人士授權.*?[tT]ext":"(.*?)"', page)
    if not label:
        label = ct.getRegex('Licensed to YouTube.*?[tT]ext":"(.*?)"', page)
    song = ct.getRegex('"Song".*?[tT]ext":"(.*?)"', page)
    if not song:
        song = ct.getRegex('"歌曲".*?[tT]ext":"(.*?)"', page)  # 歌曲 = "song"

    title = ct.getRegex(',"title":"(.*?)"', page).replace('\\u0026', '&')
    title = re.sub(u"([/\\\\:*?<>|])", "", title)  # strip characters that are illegal in file names
    print(title)
    return title, artist, album, label, song
Example #9
    def parser_detail(self, response):
        content = response.body
        url = response.url
        data_obj = ParkersItem()
        data_obj['title'] = crawlerTool.getXpath('//title/text()', content)[0]
        data_obj['url'] = url  # name and model are extracted from the url below

        urlsplit = url.split('/')
        if len(urlsplit) > 4:
            data_obj['name'] = urlsplit[3]
            data_obj['model'] = urlsplit[4]
        data_obj['power'] = crawlerTool.getRegex('Power</th><td>(.*?)</td>',
                                                 content)
        data_obj['TopSpeed'] = crawlerTool.getRegex(
            'Top Speed</th><td>(.*?)</td>', content)
        data_obj['zerotosixty'] = crawlerTool.getRegex(
            '<th>0-60 mph</th><td>(.*?)</td>', content)
        data_obj['Torque'] = crawlerTool.getRegex(
            '<th>Torque</th><td>(.*?)</td>', content)
        data_obj['co2Emissions'] = crawlerTool.getRegex(
            '<th>CO<sub>2</sub> Emissions</th><td>(.*?)</td>', content)
        data_obj['EuroEmissionsStandard'] = crawlerTool.getRegex(
            '<th>Euro Emissions Standard</th><td>(.*?)</td>', content)
        data_obj['Fuelconsumption'] = crawlerTool.getRegex(
            '<tr><th>Fuel consumption</th><td>(.*?)</td>', content)

        data_obj['Length'] = crawlerTool.getRegex(
            '<tr><th>Length</th><td>(.*?)</td>', content)

        data_obj['Width'] = crawlerTool.getRegex(
            '<tr><th>Width</th><td>(.*?)</td>', content)
        data_obj['Height'] = crawlerTool.getRegex(
            '<tr><th>Height</th><td>(.*?)</td>', content)
        data_obj['EngineSize'] = crawlerTool.getRegex(
            '<tr><th>Engine Size</th><td>(.*?)</td>', content)
        data_obj['Cylinders'] = crawlerTool.getRegex(
            '<tr><th>Cylinders</th><td>(.*?)</td>', content)
        data_obj['FuelType'] = crawlerTool.getRegex(
            '<tr><th>Fuel Type</th><td>(.*?)</td>', content)
        data_obj['Transmission'] = crawlerTool.getRegex(
            '<tr><th>Transmission</th><td>(.*?)</td>', content)
        data_obj['Doors'] = crawlerTool.getRegex(
            '<tr><th>Doors</th><td>(.*?)</td>', content)
        data_obj['Seats'] = crawlerTool.getRegex(
            '<tr><th>Seats</th><td>(.*?)</td>', content)
        data_obj['taxcostBasic'] = crawlerTool.getRegex(
            r'<tr><th>Monthly company car tax cost \(Basic Rate\)</th><td>(.*?)</td>',
            content).replace('&#163;', '£')  # decode the HTML entity for the pound sign

        # print lxr,dz,yb,dh,sj
        yield data_obj
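
Every spec lookup in parser_detail matches the same `<tr><th>label</th><td>value</td>` pattern, so the long run of getRegex calls could be table-driven. A minimal sketch (SPEC_FIELDS and get_spec are hypothetical helpers, not part of the original spider):

import re

SPEC_FIELDS = {'Length': 'Length', 'Width': 'Width', 'Height': 'Height',
               'EngineSize': 'Engine Size', 'Cylinders': 'Cylinders'}

def get_spec(html, label):
    m = re.search(r'<tr><th>%s</th><td>(.*?)</td>' % re.escape(label), html)
    return m.group(1) if m else ''

html = '<tr><th>Length</th><td>4,500mm</td></tr>'
print(dict((k, get_spec(html, v)) for k, v in SPEC_FIELDS.items()))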
Example #10
    sheet.write(row, 4, u'audio_link')
    sheet.write(row, 5, u'label')
    sheet.write(row, 6, u'keyword')

    with open('title_list.txt', 'r') as f:
        for line in f:
            row += 1
            keyword = line.strip()
            # keyword = keyword.replace(' ','+')
            if not keyword:
                continue
            try:
                video_url, imgurl0 = keyword_search(keyword)
                title, artist, album, label, song = extractor_info(video_url)
                img_path = 'img/' + title + '.jpg'
                imgurl0 = ct.getRegex(r'(http.*?)\?', imgurl0)  # drop the query string from the thumbnail URL
                img_dl(imgurl0, img_path)
            except Exception as e:
                print(e)
                video_url, imgurl0, title, artist, album, label, song = '', '', '', '', '', '', keyword
            print(video_url, imgurl0, title, artist, album, label)
            sheet.write(row, 0, song)
            sheet.write(row, 1, artist)
            sheet.write(row, 2, title)
            sheet.write(row, 3, title + '.mp3')
            sheet.write(row, 4, video_url)
            sheet.write(row, 5, label)
            sheet.write(row, 6, keyword)
    wbk.save(filename)

# TODO: ads still need to be filtered out
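
Example #10 assumes an xlwt workbook was created earlier in the script; a minimal sketch of the missing setup (the sheet name, file name, and header labels for columns 0-3 are guesses inferred from the rows written later):

import xlwt

wbk = xlwt.Workbook(encoding='utf-8')
sheet = wbk.add_sheet('results')
filename = 'youtube_results.xls'
row = 0
sheet.write(row, 0, u'song')
sheet.write(row, 1, u'artist')
sheet.write(row, 2, u'title')
sheet.write(row, 3, u'mp3_file')
# ...the snippet above continues with columns 4-6 and the data rows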
Example #11
                    # (fragment: continues the gbk decode-and-retry loop shown in Example #7)
                    start_index, end_index = int(error_str.split('-')[0]), int(error_str.split('-')[1]) + 1
                    content = content[:start_index] + content[end_index:]
        response_content = new_content
        print(response.url)
        url = response.url
        # "About Us" text (关于我们)
        gywm = crawlerTool.getXpath("//td[@class='goscill22']/table[2]//p/text()", response_content)
        gywm = ''.join(gywm).replace('\n', '').replace('\r', '')
        # response_content = unicode(response_content, 'gbk')  # fails on http://www.hxchem.net/companydetaildesenborn.html !
        # "Contact Us" table (联系我们)
        lxwm = crawlerTool.getXpath("//td[@class='goscill22']/table[4]", response_content)
        lxwm = lxwm[0]
        # lxwm = HTMLParser().unescape(lxwm)
        # lxwm = lxwm.encode('utf8')
        data_obj = HxchemItem()
        data_obj['url'] = url
        data_obj['gywm'] = gywm
        data_obj['name'] = crawlerTool.getXpath("//h1/text()", response_content)[0]
        data_obj['lxr'] = crawlerTool.getRegex('联系人:(.*?)<', lxwm)  # contact person
        data_obj['dz'] = crawlerTool.getRegex('地 址:(.*?)<', lxwm)  # address
        data_obj['yb'] = crawlerTool.getRegex('邮 编:(.*?)<', lxwm)  # postal code
        data_obj['dh'] = crawlerTool.getRegex('电 话:(.*?)<', lxwm)  # telephone
        data_obj['sj'] = crawlerTool.getRegex('手 机:(.*?)<', lxwm)  # mobile phone
        data_obj['wz'] = crawlerTool.getRegex('网 址:<.*?>(.*?)<', lxwm)  # website
        data_obj['dzyj'] = crawlerTool.getRegex('电子邮件:<.*?>(.*?)<', lxwm)  # e-mail
        # print lxr, dz, yb, dh, sj
        yield data_obj