def parse_splash(self, response):
    for li in Selector(text=response.body).xpath(
            '//ol[@class = "playlist-videos-list yt-uix-scroller yt-viewport"]/li').extract():
        img_url = Selector(text=li).xpath('//img/@data-thumb').extract_first()
        if img_url:
            img_src = img_url.split('&rs')[0] + '&rs' + img_url.split('&rs')[1]
        else:
            img_url = Selector(text=li).xpath('//img/@src').extract_first()
            img_src = img_url.split('&rs')[0] + '&rs' + img_url.split('&rs')[1]
        href = 'https://youtube.com' + Selector(text=li).xpath('//a/@href').extract_first()
        title = Selector(text=li).xpath(
            '//h4[@class = "yt-ui-ellipsis yt-ui-ellipsis-2"]/text()').extract_first().strip()
        try:
            v = Video.get(title=title)
        except Exception as e:
            print(e)
            exit(1)
        if not v:
            try:
                v = Video(title=title, url=href, img_src=img_src, img_thumb=img_url)
                v.save(self.download_path)
                if self.vDownload:
                    v.download(self.vPath)
            except Exception as e:
                print(e)
                exit(1)
        send_message('/tmp/url_pipe', href)
        yield {
            'title': title,
            'url': href,
            'img_src': img_src,
            'img_thumb': img_url,
        }
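The snippet above calls a send_message helper that is not shown. A minimal sketch of what it might look like, assuming it writes one newline-terminated message to a named pipe (the body here is an assumption, not the project's actual code):

import os

def send_message(pipe_path, message):
    # Assumed helper: create the FIFO if needed, then write one line to it.
    # Note that opening a FIFO for writing blocks until a reader attaches.
    if not os.path.exists(pipe_path):
        os.mkfifo(pipe_path)
    with open(pipe_path, 'w') as pipe:
        pipe.write(message + '\n')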
def parse(self, response):
    for vid in Selector(text=response.body).xpath(
            '//ytd-grid-video-renderer[@class="style-scope ytd-grid-renderer use-ellipsis"]').extract():
        duration = Selector(text=vid).xpath(
            '//span[@class="style-scope ytd-thumbnail-overlay-time-status-renderer"]/text()'
        ).extract_first().strip()
        href = 'https://youtube.com' + Selector(text=vid).xpath(
            '//a[@id="thumbnail"]/@href').extract_first()
        title = Selector(text=vid).xpath('//a[@id="video-title"]/text()').extract_first()
        views = int(Selector(text=vid).xpath('//a[@id="video-title"]/@aria-label')
                    .extract_first().split()[-2].replace(',', ''))
        img_thumb = Selector(text=vid).xpath('//img/@src').extract_first()
        img_src = None
        if img_thumb:
            img_src = img_thumb.split('?sqp')[0] + '?amp;sqp' + img_thumb.split('?sqp')[1]
        v = Video.get(title=title)
        if v:
            v.update(views=views, duration=duration)
        else:
            v = Video(title, href, img_src, img_thumb, views, duration)
            v.save(self.download_path)
            if self.vDownload:
                v.download(self.vPath)
def select_one(req, select, comments=False):
    try:
        _sel = Selector(response=req).css(select).get()
        if _sel is not None and comments:
            # the value is wrapped in an HTML comment; strip the markers
            _sel = _sel.split('-->')[1]
            _sel = _sel.split('<!--')[0]
        return _sel
    except Exception:
        log_error(req.url, f'{select} selector', True)
        return None
def get_bet_ratio(source):
    ratios = {}
    for i in [1, 2, 3]:
        try:
            extracted_text = Selector(text=source) \
                .xpath('//*[@id="highcharts-0"]/svg/g[2]/g/g/g[{}]/text/tspan/text()'.format(str(i))) \
                .extract()[0]
            team_name = extracted_text.split()[:-1][0]
            ratio = extracted_text.split()[-1].strip('(|)')
            ratios[team_name] = ratio
        except Exception:
            continue
    return ratios
def parse(self, response):
    """ Parse through Whistler's weather page """
    # Extract the current weather report
    daily_rept = Selector(response=response).xpath('//script/text()').extract()[8]
    snowfall_rept = json.loads(daily_rept.split("=")[2].split(";")[0])
    # Extract the forecast weather report
    forcast_rept = Selector(response=response).xpath('//script/text()').extract()[11]
    forcast = json.loads(forcast_rept.split("=")[2].split(";")[0])
    # Calculate and convert snow to inches
    daily_snow_inches = snowfall_rept['TwentyFourHourSnowfall']['Inches']
    daily_snow_centimeters = snowfall_rept['TwentyFourHourSnowfall']['Centimeters']
    daily_snowfall = (int(daily_snow_centimeters) * 0.39) + int(daily_snow_inches)
    # Extract overnight snowfall information
    overnight_snowfall_inches = snowfall_rept['OvernightSnowfall']['Inches']
    overnight_snowfall_centimeters = snowfall_rept['OvernightSnowfall']['Centimeters']
    overnight_snowfall = (int(overnight_snowfall_centimeters) * 0.39) + int(overnight_snowfall_inches)
    if overnight_snowfall < 1:
        overnight_snowfall = 0
    # Extract forecast information
    long_forcast = forcast[0]['ForecastData'][0]['WeatherLongDescription']
    wind_forcast = forcast[0]['Wind']
    forcast_icon = forcast[0]['WeatherIconStatus']
    # Query the db for the weather obj
    weather = Weather.query.filter(Weather.weather_id == 1).first()
    # Update weather conditions
    weather.wind_forcast = wind_forcast
    weather.daily_snowfall = daily_snowfall
    weather.overnight_snowfall = overnight_snowfall
    weather.snow_forcast = long_forcast
    weather.forcast_icon = forcast_icon
    db.session.commit()
    print("woo hoo - committed to db\n\n")
def parse(self, response):
    item = Meilele_Item()
    item['site'] = response.url
    title = Selector(response).xpath("//span[@id='jsImgName']/text()").extract()
    if len(title) > 0:
        item['title'] = title[0]
    else:
        return
    tags = Selector(response).xpath("//div[@class='content']/a/text()").extract()
    if len(tags) > 0:
        item['tag'] = ','.join(tags)
    else:
        item['tag'] = ''
    url = Selector(response).xpath("//div[@id='jsGalleryStageIn']/img/@src").extract()[0]
    item['origin_url'] = url
    item['new_url'] = ''
    item['mb'] = ''
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse_item(self, response):
    item = Fang_Item()
    item['site'] = response.url
    item['title'] = Selector(response).xpath("//div[@class='info']/h1/text()").extract()[0]
    item['shortDescription'] = "|".join(
        Selector(response).xpath("//div[@class='info']/p/text()").extract())
    item['category'] = ''
    item['style'] = ''
    category = Selector(response).xpath("//div[@class='info']/ul/li[1]/a/text()").extract()
    if category:
        item['category'] = category[0]
    style = Selector(response).xpath("//div[@class='info']/ul/li[2]/a/text()").extract()
    if style:
        item['style'] = style[0]
    tags = Selector(response).xpath("//div[@class='tag']/ul/a/text()").extract()
    item['tag'] = ','.join(tags)
    url = Selector(response).xpath("//img[@id='LeftImg']/@src").extract()[0]
    item['origin_url'] = url
    item['new_url'] = ''
    item['mb'] = ''
    # extracted but currently unused
    widthPx = Selector(response).xpath("//img[@id='LeftImg']/@width").extract()
    heightPx = Selector(response).xpath("//img[@id='LeftImg']/@height").extract()
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse2(self, response):
    item = CrawlerItem()
    try:
        page = Selector(response=response).xpath('//ul[@class="pagination"]')
        author = Selector(response=response).xpath(
            '//h1[@class="title J_title"]/text()').get().strip()
        author = ' '.join(author.split())
        print(author)
        if len(page) == 0:
            print('Only one page of comments')
            comments = self.comms(response)
            for comment in comments:
                # the source compared against two space-like literals;
                # the second was presumably a non-breaking space
                if comment in (" ", "\u00a0"):
                    pass
                else:
                    item['author'] = author
                    item['comment'] = comment
                    yield item
        else:
            print('This item has multiple pages of comments')
            page_num = page[0].xpath('./li/a/text()').getall()
            print(page_num)
            num = int(page_num[-2])
            print(num)
            for n in range(1, num + 1):
                print(f'Fetching page {n}')
                if n == 1:
                    url = response.request.url + '/#comments'
                else:
                    url = response.request.url + f'/p{n}/#comments'
                yield scrapy.Request(url=url, callback=self.parse3, dont_filter=False)
    except Exception as e:
        print(e)
        print('Failed to scrape the phone detail page link')
def parse(self, response):
    item = Aiuw_Item()
    item['site'] = response.url
    title = Selector(response).xpath("//span[@id='imgExplain']/text()").extract()
    if len(title) > 0:
        item['title'] = title[0]
    else:
        return
    tags = Selector(response).xpath("//div[@class='tag']/a/text()").extract()
    if len(tags) > 0:
        item['tag'] = ','.join(tags)
    else:
        item['tag'] = ''
    url = Selector(response).xpath(
        "//div[@class='img_boxlist up userSelectNone']/img/@src").extract()[0]
    item['origin_url'] = url.replace("zip@q80", "zip@w400")
    item['new_url'] = ''
    item['mb'] = ''
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse_info_page(self, response):
    item = AnimePeopleItem()
    people_fav = None
    more_info = None
    anime_people = response.xpath('//*[@class="h1"]/text()').extract()[0]
    infolist = response.xpath('//*[@id="content"]/table/tr/td/div').extract()
    for i in range(len(infolist)):
        if Selector(text=infolist[i]).xpath('//span/text()').extract() == [u'Member Favorites:']:
            people_fav = Selector(text=infolist[i]).xpath('//text()').extract()[1].strip()
        if Selector(text=infolist[i]).xpath('//span/text()').extract() == [u'More:']:
            more_info = Selector(text=infolist[i + 1]).xpath('//text()').extract()
            more_info = ''.join(more_info)  # equivalent to the original reduce()
            more_info = ' '.join(more_info.split('\r\n'))
    item['anime_people'] = anime_people
    item['anime_people_fav'] = people_fav
    item['anime_people_info'] = more_info
    yield item
def parse2(self, response):
    """Parse the detail page: movie rating level, site rank, favorite count, synopsis."""
    item = response.meta['item']
    try:
        movieLevel = Selector(response=response).xpath(
            '//div[@class="level-item"]/img[@src]').extract_first().strip()
        p = re.compile('([a-z])-big-1.png')
        movieLevel = p.findall(movieLevel)[0]
    except Exception:
        movieLevel = None
    movieScore = Selector(response=response).xpath(
        '//p[@class="f4"]/text()').extract_first().strip('本站排名:').strip()
    movieFav = Selector(response=response).xpath(
        '//label[@id="resource_views"]/../../div[2]/text()').extract_first(
        ).strip('收藏次数:').strip()
    # movieCon = Selector(response=response).xpath('//div[@class="con"]/span/descendant-or-self::text()').extract_first().strip()
    movieCon = Selector(response=response).xpath('//div[@class="con"][2]')
    movieCon = movieCon.xpath('string(.)').extract_first().strip().replace('\r\n', '')
    movieCon = ''.join(movieCon.split()).replace(',', ',')  # normalize full-width commas
    movieViwsLink = 'http://www.rrys2019.com/resource/index_json/rid/' + item['rid'] + '/channel/tv'
    print(movieLevel, movieScore, movieViwsLink, movieFav, movieCon)
    item['movieLevel'] = movieLevel
    item['movieScore'] = movieScore
    item['movieFav'] = movieFav
    item['movieCon'] = movieCon
    yield scrapy.Request(url=movieViwsLink, meta={'item': item}, callback=self.parse3)
def parse_build_basic_contents(self, response):
    '''Project overview page, e.g. ifrm_BuildBasic.pub?blid=102699'''
    content_list = response.xpath("//table[@id='info']")
    if len(content_list) == 1:
        table_content = content_list[0].extract()
        # the left-hand cells hold the values, in a fixed order
        value = Selector(text=table_content).xpath("//td[@align='left']//text()")
        item = ProjectBasicItem()
        # get 102699 from ifrm_BuildBasic.pub?blid=102699
        project_id = response.url.split("=")[1]
        # get ProvInfo.pub?prid=100498
        provinfo_href = Selector(text=table_content).xpath(
            "//td[@align='left']/a[@href]//@href").extract()[0]
        developer_id = provinfo_href.split("=")[1]
        item['project_id'] = project_id
        item['developer_id'] = developer_id
        item['project_name'] = value[0].extract()           # confirmed project name
        item['project_temp_name'] = value[1].extract()      # tentative project name
        item['licence_id'] = value[2].extract()             # (pre)sale permit number
        item['approving_authority'] = value[3].extract()    # (pre)sale approving authority
        item['developer'] = value[4].extract().strip()      # developer
        item['partner'] = strip_null(value[5].extract())    # partner
        item['location'] = value[6].extract()               # location
        item['district'] = value[7].extract()               # administrative district
        item['zone'] = value[8].extract()                   # zone
        item['total_building_area'] = float(value[9].extract().split()[0])  # total building area
        item['approval'] = value[10].extract()              # project approval document
        item['planning_id'] = value[11].extract()           # planning permit number
        item['land_id'] = value[12].extract()               # land certificate number
        item['builder_licence'] = value[13].extract()       # construction permit number
        item['land_licence'] = value[14].extract()          # land-use permit number
        item['total_land'] = float(value[15].extract().split()[0])         # total land
        item['current_used_land'] = float(value[16].extract().split()[0])  # current-phase land
        item['start_date'] = value[17].extract()            # start date
        item['planning_end_date'] = value[18].extract()     # expected completion date
        item['invest'] = float(value[19].extract().split()[0])             # project investment
        item['presell_total_area'] = float(value[20].extract().split()[0]) # total presale area
        # public facility area
        public_area = value[21].extract().split()[0]
        item['public_area'] = float(public_area)
        item['total_units'] = int(value[22].extract())      # total units
        item['plot_rate'] = float(value[23].extract())      # plot ratio
        item['green_rate'] = float(strip_null(value[24].extract()).split()[0])  # green ratio
        item['sale_agent'] = strip_null(value[25].extract())        # sales agent
        item['phone_number'] = strip_null(value[26].extract())      # phone
        item['sale_location'] = strip_null(value[27].extract())     # project sales location
        item['sale_phone_number'] = strip_null(value[28].extract()) # sales phone
        item['property_management_company'] = strip_null(value[29].extract())  # property management company
        # property management fee, given as a 'from-to' range
        fee_str = format_property_fee(value[30].extract())
        fee = fee_str.split("-")
        item['property_fee_from'] = float(fee[0])
        item['property_fee_to'] = float(fee[1])
        yield item
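parse_build_basic_contents leans on two helpers that are not shown, strip_null and format_property_fee. A minimal sketch of what they might look like, assuming strip_null collapses empty or placeholder cells to an empty string and format_property_fee normalizes a fee cell into a 'low-high' string (both bodies are assumptions, not the project's actual code):

import re

def strip_null(text):
    # Assumed helper: map None or whitespace-only cells to ''.
    return text.strip() if text and text.strip() else ''

def format_property_fee(text):
    # Assumed helper: pull the numbers out of a fee cell and return 'low-high';
    # a single number is duplicated so the caller can always split on '-'.
    numbers = re.findall(r'\d+(?:\.\d+)?', text)
    if not numbers:
        return '0-0'
    if len(numbers) == 1:
        return '{0}-{0}'.format(numbers[0])
    return '{}-{}'.format(numbers[0], numbers[1])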
def download_font_file(self, data):
    # Download the font file referenced by the page's CSS.
    # Requires: requests, and urlretrieve from urllib.request.
    # (the original omitted `self`, yet read self.temp below)
    rules = Rules.css_font
    css_url = Selector(text=data).xpath(rules['css']).extract_first()
    response = requests.get(css_url).text
    font_url = Selector(text=response).re_first(rules['font'])
    filename = '{}/{}'.format(self.temp, font_url.split('/')[-1])
    urlretrieve(font_url, filename)
def get_rating_descriptors(self, desc_line=None):
    """
    get_rating_descriptors queries BotSentinel for the descriptions that
    correspond to a description key; both are correlated to the integer
    rating each user is assigned.

    :param desc_line: Default argument, not required.
    :type desc_line: str
    :return: A list of tuples of the ranges and rating descriptions
    :rtype: list
    """
    descriptors = []
    # Check if we have the descriptors cached; return the cache
    if len(self.descriptors) > 0:
        return [True, self.descriptors]
    # If the descriptors aren't in cache and aren't provided, get them
    if len(self.descriptors) == 0 and not desc_line:
        page = self.query_site(self.url, 'get')
        desc_line = Selector(text=page.text).xpath(self.js_path).extract()
    # Parse the <script> text and grab the descriptors
    for i in desc_line:
        desc_matched = self.desc_regex.search(i)
        if not desc_matched:
            continue
        if desc_matched:
            desc_line = desc_matched.group(0)
            break
    # Clean up the returned descriptors
    desc_line = re.sub(self.tag_clean, '', desc_line)
    desc_line = desc_line.split('[')
    desc_line = [desc_line[2], desc_line[3], desc_line[4], desc_line[5]]
    # More cleanup of the returned descriptors. This could probably be improved
    for i in desc_line:
        i = re.sub("|".join(self.extra_chars), '', i)
        prelist = []
        for x in i.split(','):
            if x != '':
                # The parsed integers are considered strings; convert them if
                # they're integers, and catch the exception if it's a string
                try:
                    x = int(x)
                except ValueError:
                    pass
                prelist.append(x)
        descriptors.append(tuple(prelist))
    # Return a list of tuples, each tuple containing info and ranges for
    # each descriptor
    return descriptors
def Vacancy(link):
    print("request sent for Vacancy successfully")
    url = link
    page = requests.get(url)
    # Category
    try:
        category = Selector(response=page).xpath(
            '//*[@id="MainContentPlaceHolder_jobContainer"]/div[5]/div[1]/text()').get()
    except Exception:
        category = ""
    # Ends
    try:
        ends = Selector(response=page).xpath(
            '//*[@id="MainContentPlaceHolder_jobContainer"]/div[5]/div[3]/text()').get()
        ends = ends.split("/")
        deadline_day = int(ends[0])
        deadline_month = int(ends[1])
        deadline_year = int(ends[2])
    except Exception:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0
    # Description
    try:
        description = Selector(response=page).xpath(
            '//*[@id="MainContentPlaceHolder_jobContainer"]').get()
        description = remove_tags(description)
    except Exception:
        description = ""
    # Email
    try:
        email = re.findall(r'[\w\.-]+@[\w\.-]+', description)[0]
        email = [email]
    except Exception:
        email = []
    data = {
        "category": category,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        'description': description,
        "email": email,
    }
    print("Vacancy data is scraped")
    return data

# Vacancy('https://www.myjob.am/Announcement.aspx?jobId=57479')
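The remove_tags call above (and in the other job scrapers below) matches the signature of w3lib's HTML utility; assuming that is the import in use, this is how it behaves:

# Presumed import for the remove_tags calls in these scrapers:
from w3lib.html import remove_tags

print(remove_tags('<div><p>Senior <b>engineer</b></p></div>'))  # 'Senior engineer'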
def parse(self, response):
    """ Parse through Whistler's weather page """
    # Extract the current weather report
    daily_rept = Selector(response=response).xpath('//script/text()').extract()[8]
    snowfall_rept = json.loads(daily_rept.split("=")[2].split(";")[0])
    # Extract the forecast weather report
    forcast_rept = Selector(response=response).xpath('//script/text()').extract()[11]
    forcast = json.loads(forcast_rept.split("=")[2].split(";")[0])
    # Calculate and convert snow to inches
    daily_snow_inches = snowfall_rept['TwentyFourHourSnowfall']['Inches']
    daily_snow_centimeters = snowfall_rept['TwentyFourHourSnowfall']['Centimeters']
    daily_snowfall = (int(daily_snow_centimeters) * 0.39) + int(daily_snow_inches)
    # Extract overnight snowfall information
    overnight_snowfall_inches = snowfall_rept['OvernightSnowfall']['Inches']
    overnight_snowfall_centimeters = snowfall_rept['OvernightSnowfall']['Centimeters']
    overnight_snowfall = (int(overnight_snowfall_centimeters) * 0.39) + int(overnight_snowfall_inches)
    # Extract forecast information
    long_forcast = forcast[0]['ForecastData'][0]['WeatherLongDescription']
    wind_forcast = forcast[0]['Wind']
    forcast_icon = forcast[0]['WeatherIconStatus']
    # Instantiate a weather object
    weather = Weather(wind_forcast=wind_forcast,
                      daily_snowfall=daily_snowfall,
                      overnight_snowfall=overnight_snowfall,
                      snow_forcast=long_forcast,
                      forcast_icon=forcast_icon)
    db.session.add(weather)
    db.session.commit()
def get_questions(category, level):
    data = {
        "play": True,
        "kat[]": [category],
        "level[]": [level],
        "anzahl": 300,
    }
    response = requests.post("https://www.fragespiel.com/quiz/training.html", data=data)
    js_questions = Selector(text=response.text).xpath('//script[3]/text()').get()
    json_questions = js_questions.split("json' : '")[1].split("',")[0]
    return json.loads(json_questions)
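A quick usage sketch for the function above. The category and level IDs here are made up for illustration; the real numeric codes come from the site's own category and level lists:

# Hypothetical IDs; substitute codes from fragespiel.com's category/level lists.
questions = get_questions(category=1, level=2)
print(len(questions), "questions loaded")
print(questions[0])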
def _get_pages_total(self, response):
    """
    Calculate total page count

    :param response: scrapy response object
    :return: calculated page count|int
    """
    el = Selector(response).xpath(
        '//td[@class="pagination"]//span[@class="fielddata"]/text()'
    ).extract_first()
    # Round up: a partially filled last page still counts as a page
    # (the original truncated with int(); requires `import math`)
    total = math.ceil(int(el.split('of')[1].strip()) / self.items_per_page)
    self.logger.debug("Total pages: %s", total)
    return total
def handle(self, *args, **options):
    for bus_station in BusStation.objects.filter(code__isnull=True).all():
        print(bus_station.name, bus_station.address, bus_station.link)
        try:
            response = requests.get(bus_station.link)
            print('\t', response.status_code)
            if response.status_code == 200:
                response_text = response.text
                res = Selector(text=response_text).css(
                    '#zonaBloqueContent h3::text').extract_first()
                if not res:
                    continue
                code, name = res.split('-')
                code = code.replace('Paradero', '').strip()
                print(code)
                if not bus_station.code:
                    bus_station.code = code
                    bus_station.save()
                for other_station in Selector(text=response_text).css(
                        '.moduloParaderoMultiple a'):
                    other_station_code = other_station.css(
                        '.codigoParadero::text').extract_first()
                    station = BusStation.objects.filter(code=other_station_code).first()
                    if station and station.id != bus_station.id:
                        if station not in bus_station.related_stations.all():
                            bus_station.related_stations.add(station)
                        if bus_station not in station.related_stations.all():
                            station.related_stations.add(bus_station)
                        if (station.latitude and station.longitude
                                and not bus_station.latitude):
                            bus_station.latitude = station.latitude
                            bus_station.longitude = station.longitude
                            bus_station.save()
                            continue
                        if (bus_station.latitude and bus_station.longitude
                                and not station.latitude):
                            station.latitude = bus_station.latitude
                            station.longitude = bus_station.longitude
                            station.save()
                            continue
                if not res:
                    continue
        except Exception as e:
            print(bus_station.id, e)
        print('...Next station')
def __init__(self, symbol, **kwargs):
    self.allowed_domains = ['guba.eastmoney.com']
    self.base_url_prefix = 'http://guba.eastmoney.com'
    self.base_url = 'http://guba.eastmoney.com/list,%s,1,f.html' % symbol
    self.start_urls = [self.base_url]
    # obtain the number of pages
    subpage_url = 'http://guba.eastmoney.com/list,%s,1,f_{}.html' % symbol
    pageresponse = requests.post(self.base_url)
    pageresponse = Selector(text=pageresponse.text).xpath(
        '//span[@class="pagernums"]').extract_first()
    numpage = math.ceil(
        int(pageresponse.split('|')[-3]) / int(pageresponse.split('|')[-2]))
    # record the right number of news
    with open('log/web_%s.txt' % symbol, 'w') as f:
        f.write('%d' % int(pageresponse.split('|')[-3]))
    # add all the urls of pages
    for i in range(1, numpage + 1):
        self.start_urls.append(subpage_url.format(i))
    super().__init__(**kwargs)
def parse(self, response):
    item = Tuba_Item()
    item['site'] = response.url
    item['title'] = Selector(response).xpath("//div[@id='apphigh']/img/@title").extract()[0]
    url = Selector(response).xpath("//div[@id='apphigh']/img/@src").extract()[0]
    item['origin_url'] = url
    item['new_url'] = ''
    item['mb'] = ''
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse3(self, response):
    item = CrawlerItem()
    try:
        author = Selector(response=response).xpath(
            '//h1[@class="title J_title"]/text()').get().strip()
        author = ' '.join(author.split())
        comments = self.comms(response)
        for comment in comments:
            # the source compared against two space-like literals;
            # the second was presumably a non-breaking space
            if comment in (" ", "\u00a0"):
                pass
            else:
                item['author'] = author
                item['comment'] = comment
                yield item
    except Exception as e:
        print(e)
        print('Failed to scrape the phone detail page link')
def parse(self, response):
    html_list = response.xpath(
        "//div[@id='toplist']/div[@class='g-mn3']/div[@class='g-mn3c']/div[@class='g-wrap12']"
        "/div[@id='song-list-pre-cache']/div/div[@class='j-flag']/table/tbody/tr"
    ).extract()
    for item in html_list:
        music_item = NeteaseMusicItem()
        hot_num = Selector(text=item).xpath(
            "//tr/td[1]/div[@class='hd']/span/text()").extract_first() or ''
        hot_rk = Selector(text=item).xpath(
            "//tr/td[1]/div[@class='hd']/div/span[@class='ico u-icn u-icn-73 s-fc9']/text()"
        ).extract_first() or ''
        song_pic = Selector(text=item).xpath(
            "//tr/td[2]/div[@class='f-cb']/div[@class='tt']/a/img/@src"
        ).extract_first() or ''
        song_name = Selector(text=item).xpath(
            "//tr/td[2]/div[@class='f-cb']/div[@class='tt']/div[@class='ttc']/span/a/b/@title"
        ).extract_first() or ''
        song_href = Selector(text=item).xpath(
            "//tr/td[2]/div[@class='f-cb']/div[@class='tt']/div[@class='ttc']/span/a/@href"
        ).extract_first()
        song_id = ''
        if song_href:
            song_id = song_href.split('=')[-1]
        song_time = Selector(text=item).xpath(
            "//tr/td[3]/span/text()").extract_first() or ''
        singer_name = Selector(text=item).xpath(
            "//tr/td[4]/div[@class='text']/@title").extract_first() or ''
        music_item['hot_num'] = hot_num
        music_item['hot_rk'] = hot_rk
        music_item['song_pic'] = song_pic
        music_item['song_name'] = song_name
        music_item['song_id'] = song_id
        music_item['song_time'] = song_time
        music_item['singer_name'] = singer_name
        yield music_item
def parse_item(self, response):
    item = Qijia_Item()
    item['site'] = response.url
    item['title'] = Selector(response).xpath("//div[@class='crumb']/i/text()").extract()[0]
    item['key'] = Selector(response).xpath("//meta[@name='keywords']").xpath('@content').extract()[0]
    item['tag'] = Selector(response).xpath("//p[@class='pic_desp']/span/i/text()").extract()[0]
    url = Selector(response).xpath("//img[@class='lazyload']/@_src").extract()[0]
    item['origin_url'] = url
    item['new_url'] = ''
    item['mb'] = ''
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse(self, response):
    item = Shejiben_Item()
    item['site'] = response.url
    title = Selector(response).xpath("//div[@class='pageTag']/a/text()").extract()
    if len(title) == 3:
        item['title'] = title[-1]
    else:
        return
    url = Selector(response).xpath("//li[@class='nowPic']/img/@src").extract()[0]
    item['origin_url'] = url
    item['new_url'] = ''
    item['mb'] = ''
    item['size'] = ''
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse(self, response):
    """ Parsing through our data returned from web scraping """
    skiruns_str = Selector(response=response).xpath('//script/text()').extract()[11]
    skiruns = json.loads(skiruns_str.split("=")[1].split(";")[0])
    lifts = skiruns['Lifts']
    # update lift status information in the db
    for lift_dict in lifts:
        liftname = lift_dict['Name']
        liftstatus = lift_dict['Status']
        lift = Lift.query.filter(Lift.name == liftname).first()
        lift.status = liftstatus
        db.session.commit()
    runs_list = skiruns['GroomingAreas']
    for lifts_dict in runs_list:
        # Separate the data from the web scraper
        skirun_list = lifts_dict['Runs']
        lift_names = lifts_dict['Name']
        lift_names = lift_names.split(" - ")
        # Loop over and update the run information
        for skirun in skirun_list:
            skirun_name = skirun['Name']
            skirun_status = skirun['IsOpen']
            skirun_groomed = skirun['IsGroomed']
            skirun2 = Skirun.query.filter(Skirun.name == skirun_name).first()
            skirun2.status = skirun_status
            skirun2.groomed = skirun_groomed
            db.session.commit()
def __init__(self, symbol, **kwargs):
    self.allowed_domains = ['guba.eastmoney.com']
    self.base_url_prefix = 'http://guba.eastmoney.com'
    self.base_url = 'http://guba.eastmoney.com/list,%s,f_1.html' % symbol
    self.start_urls = []
    # obtain the number of pages
    subpage_url = 'http://guba.eastmoney.com/list,%s,f_{}.html' % symbol
    pageresponse = requests.post(self.base_url)
    pageresponse = Selector(text=pageresponse.text).xpath(
        '//span[@class="pagernums"]').extract_first()
    numpage = math.ceil(
        int(pageresponse.split('|')[-3]) / int(pageresponse.split('|')[-2]))
    # record the right number of news
    end_page = 0
    if os.path.exists('log/check_out.txt'):
        with open('log/check_out.txt', 'r') as f:
            end_page = int(f.readlines()[-1].strip())
    if numpage - end_page < 1:
        return
    if numpage - end_page < 10:
        task_page = numpage - end_page
    else:
        task_page = 10
    with open('log/check_out.txt', 'a+') as f:
        f.write('%d\n' % (end_page + task_page))
    # add all the urls of pages
    # job = int(job)
    # for i in range(job*10, (job+1)*10):
    for i in range(numpage - end_page, numpage - end_page - task_page, -1):
        self.start_urls.append(subpage_url.format(i))
    super().__init__(**kwargs)
def Vacancy(link):
    print("request sent for Vacancy successfully")
    url = link
    print(url)
    # headers = {"Accept-Language": "en-US,en;q=0.5"}
    page = requests.get(url)  # headers=headers)
    # Company
    try:
        company = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lnkCompany"]/text()').get()
    except Exception:
        company = ""
    # Website
    try:
        website = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lnkCompany"]/@href').get()
        website = [website]
    except Exception:
        website = []
    # Position
    try:
        position = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobPostTitle"]/text()').get()
    except Exception:
        position = ""
    # Logo
    try:
        logo = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_imgCompanyLogoLink"]/@src').get()
        logo = "http://jobfinder.am/" + logo
    except Exception:
        logo = ''
    # Job type
    try:
        job_type = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblPositionType"]/text()').get()
    except Exception:
        job_type = ""
    # Category
    try:
        category = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblCategory"]/text()').get()
    except Exception:
        category = ""
    # Experience
    try:
        experience = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblExperience"]/text()').get()
    except Exception:
        experience = ""
    # Education
    try:
        education = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblEducation"]/text()').get()
    except Exception:
        education = ""
    # Location
    try:
        location = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblLocation"]/text()').get()
    except Exception:
        location = ""
    # Published
    try:
        published = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblDate"]/text()').get()
        published = published.split(" ")
        published = published[0].split("-")
        publish_day = int(published[0])
        publish_month = int(published[1])
        publish_year = int("20" + published[2])
    except Exception:
        publish_day = 0
        publish_month = 0
        publish_year = 0
    if yesterday_day != publish_day or yesterday_month != publish_month:
        print("Not published yesterday")
        return
    # Ends
    try:
        ends = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblDate"]/text()').get()
        ends = ends.split(" ")
        ends = ends[0].split("-")
        deadline_day = int(ends[0])
        deadline_month = int(ends[1])
        deadline_year = int("20" + ends[2])
    except Exception:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0
    # Salary
    try:
        salary = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblSalary"]/text()').get()
        salary = int(salary)
    except Exception:
        salary = 0
    # Age
    try:
        age = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblAge"]/text()').get()
        if "--------" in age:
            age = ""
    except Exception:
        age = ""
    # Gender
    try:
        gender = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblGender"]/text()').get()
        if "--------" in gender:
            gender = ""
    except Exception:
        gender = ""
    # Job Description
    try:
        j_description = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobDescription"]/text()').get()
    except Exception:
        j_description = ""
    # Job Responsibilities
    try:
        j_responsibilities = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobResponsibilities"]/text()').get()
    except Exception:
        j_responsibilities = ""
    # Required Qualifications
    try:
        r_qualifications = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblRequiredQualifications"]').get()
        r_qualifications = remove_tags(r_qualifications)
    except Exception:
        r_qualifications = ""
    # Application Procedure
    try:
        a_procedure = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblApplicationProcedure"]').get()
        a_procedure = remove_tags(a_procedure)
    except Exception:
        # the original re-used a_procedure here, which would raise NameError
        a_procedure = ""
    v_description = j_description + "\n" + j_responsibilities + "\n" + r_qualifications + "\n" + a_procedure
    try:
        if detect(v_description) == "et":
            try:
                v_description_en = Translate(v_description)
            except Exception:
                v_description_en = ""
            v_description_am = v_description
        else:
            v_description_en = v_description
            v_description_am = ""
    except Exception:
        v_description_en = ""
        v_description_am = ""
    # About Company
    try:
        c_description = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblAboutCompany"]').get()
        c_description = remove_tags(c_description)
    except Exception:
        c_description = ""
    try:
        if detect(c_description) == "et":
            try:
                c_description_en = Translate(c_description)
            except Exception:
                c_description_en = ""
            c_description_am = c_description
        else:
            c_description_en = c_description
            c_description_am = ""
    except Exception:
        c_description_en = ""
        c_description_am = ""
    # Email
    try:
        email = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblApplicationProcedure"]/a/text()').get()
        email = email.strip()
        email = [email]
    except Exception:
        email = []
    # Phone
    try:
        phone = re.search(r"\d{9}", v_description_en).group()
        phone = [{"country_code": "374", "number": phone}]
    except Exception:
        phone = []
    data = {
        "company": company,
        "position": position,
        "website": website,
        "logo": logo,
        "job_type": job_type,
        "category": category,
        "experience": experience,
        "education": education,
        "location": location,
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        "salary": salary,
        "age": age,
        "gender": gender,
        "v_description_am": v_description_am,
        "v_description_en": v_description_en,
        "c_description_am": c_description_am,
        "c_description_en": c_description_en,
        "email": email,
        "phone": phone,
    }
    # print(data)
    return data

# Vacancy('http://jobfinder.am/ViewJob.aspx?JobPostingID=49217')
def parse_book(self, response):
    # Python 2 code: note the str.decode() calls on the literal labels below.
    self.randomSleep(sleeptime)
    item = BookItem()
    selector = Selector(response)
    # get the cover image url
    item['cover'] = (selector.xpath('//div[@id="mainpic"]/a/@href').extract())[0]
    # item['cover'] = (selector.xpath('//a[@class = "nbg"]/@href').extract())[0].replace('mpic', 'lpic')
    # 'mpic' is the small image, 'lpic' the large one
    item['book_id'] = response.url.split('/')[-2]
    logger.info('-----process book ' + item['book_id'] + ' -----')
    # Douban gives no labels for the book info block, so XPath alone cannot
    # extract the fields; split the raw HTML on <br> instead.
    info_block = (selector.xpath('//div[@id="info"]').extract()[0]
                  .encode('utf-8').split('<br>'))[:-1]
    # the last item of 'info_block' is empty and must be dropped, otherwise
    # the loop below would raise an out-of-range error
    item['author'] = ''
    item['subtitle'] = ''
    item['publisher'] = ''
    item['price'] = ''
    item['total_page'] = ''
    item['pub_year'] = ''
    item['isbn'] = ''
    for info_item in info_block:
        info_text = Selector(text=info_item).xpath('string(.)').extract()[0]
        info_text = ''.join(info_text.split())
        # the labels below are the literal field names on the page:
        # author, subtitle, publisher, price, pages, publication year, ISBN
        if '作者'.decode('utf-8') in info_text:
            item['author'] = info_text[3:]
        elif '副标题'.decode('utf-8') in info_text:
            item['subtitle'] = info_text[4:]
        elif '出版社'.decode('utf-8') in info_text:
            item['publisher'] = info_text[4:]
        elif '定价'.decode('utf-8') in info_text:
            item['price'] = info_text[3:]
        elif '页数'.decode('utf-8') in info_text:
            item['total_page'] = info_text[3:]
        elif '出版年'.decode('utf-8') in info_text:
            item['pub_year'] = info_text[4:]
        elif 'ISBN' in info_text:
            item['isbn'] = info_text[5:]
    item['title'] = selector.xpath('//span[@property="v:itemreviewed"]/text()').extract()[0]
    item['grade'] = selector.xpath('//strong[@property="v:average"]/text()').extract()[0].encode('utf-8')
    item['gradecount'] = selector.xpath('//a[@class="rating_people"]/span/text()').extract()[0].encode('utf-8')
    read_block = selector.xpath('//div[@id="collector"]')
    item['reading_num'] = (read_block.xpath('.//a[contains(@href, "doings")]/text()').extract()[0])[:-3]
    item['readed_num'] = (read_block.xpath('.//a[contains(@href, "collections")]/text()').extract()[0])[:-3]
    item['preread_num'] = (read_block.xpath('.//a[contains(@href, "wishes")]/text()').extract()[0])[:-3]
    # TODO: reform item to the database
    yield item
    # Request the cover image for the book
    yield scrapy.Request(
        url=item['cover'],
        headers=self.getRandomHds(),
        meta={'proxy': proxy, 'cookiejar': 1},
        callback=(lambda response, book_id=item['book_id']:
                  self.parse_cover(response, book_id)),
        errback=self.parse_err,
    )
def Vacancy(link):
    print("request sent for Vacancy successfully")
    url = link
    print(url)
    # headers = {"Accept-Language": "en-US,en;q=0.5"}
    page = requests.get(url)  # headers=headers)
    # Published
    try:
        published = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[2]/span/text()[2]').get()
        published = published.strip().split(" ")
        publish_day = int(published[0].split("/")[0])
        publish_month = int(published[0].split("/")[1])
        publish_year = int(published[0].split("/")[2])
    except Exception:
        publish_day = 0
        publish_month = 0
        publish_year = 0
    if yesterday_day != publish_day or yesterday_month != publish_month:
        print("Not published yesterday")
        return
    # Location
    try:
        location = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[1]/text()').get()
        location = location.strip()
        location_id = []
        location = {"city": f"{location}", "id": f"{Geonames(location)}"}
        location_id.append(location)
    except Exception:
        location_id = [{'city': 'Yerevan', 'id': '616052'}]
    # Posted by
    try:
        posted_by = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[1]/text()').get()
        posted_by = posted_by.strip()
    except Exception:
        posted_by = ""
    # Email
    try:
        email = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[2]/text()').get()
        email = email.strip()
        if email == "":
            email = []
        else:
            email = [email]
    except Exception:
        email = []
    # Workspace
    try:
        workspace = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[2]/div[2]/div[2]/p/text()').get()
        workspace = workspace.strip()
    except Exception:
        workspace = ""
    # Job type
    try:
        job_type = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[3]/div[2]/div[2]/p/text()').get()
        job_type = job_type.strip()
    except Exception:
        job_type = ""
    # Salary
    try:
        salary = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[4]/div[2]/div[2]/p/text()').get()
        salary = salary.strip().replace("Until ", "")
        if "-" in salary:
            salary = salary.split("-")
            min_salary = int(salary[0].strip())
            max_salary = int(salary[1].strip())
        elif "-" not in salary and salary != '':
            min_salary = int(salary)
            max_salary = int(salary)
        else:
            min_salary = 0
            max_salary = 0
    except Exception:
        min_salary = 0
        max_salary = 0
    # Education
    try:
        education = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[5]/div[2]/div[2]/p/text()').get()
        education = education.strip()
    except Exception:
        education = ""
    # Experience
    try:
        experience = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[6]/div[2]/div[2]/p/text()').get()
        experience = experience.strip()
    except Exception:
        experience = ""
    # Gender
    try:
        gender = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[7]/div[2]/div[2]/p/i/@class').get()
        if "female" in gender:
            gender = "female"
        elif "male" in gender:
            gender = "male"
        else:
            gender = ''
    except Exception:
        gender = ""
    # Age
    try:
        age = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[8]/div[2]/div[2]/p/text()').get()
        age = age.strip()
    except Exception:
        age = ""
    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[10]/div[2]/div/p/text()').get()
        description = description.strip()
    except Exception:
        description = ""
    description_en = ""
    description_am = ""
    try:
        if detect(description) == "et":
            try:
                description_en = Translate(description)
            except Exception:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""
    except Exception:
        description_en = ""
        description_am = ""
    # Phone
    # (the original's except branch set `phone = []` while the data dict read
    # `phones`, which would raise NameError; phones is now defined up front)
    phones = []
    try:
        phone_blocks = Selector(response=page).css(
            '#sidebar-border > div.detailed-info-block.form-inline.clearfix > div.clearfix > div > div.user-details'
        ).extract()
        for phone in phone_blocks:
            phone = remove_tags(phone).strip()
            area_code = "374"
            number = phone.replace(" ", "")
            number = number.replace("-", "")
            number = number.replace("(", "")
            number = number.replace(")", "")
            phones.append({'country_code': area_code, "number": number})
    except Exception:
        phones = []
    # Username
    try:
        username = Selector(response=page).xpath(
            '//*[@id="sidebar-border"]/div[1]/div[1]/div/div[1]/div[2]/div[1]/div[2]/h6/a/text()').get()
        username = username.strip()
    except Exception:
        username = ""
    data = {
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "location_id": location_id,
        "posted_by": posted_by,
        "email": email,
        "workspace": workspace,
        "job_type": job_type,
        "min_salary": min_salary,
        "max_salary": max_salary,
        "education": education,
        "experience": experience,
        "gender": gender,
        "age": age,
        "description_am": description_am,
        "description_en": description_en,
        "phone": phones,
        "username": username,
    }
    print(data)
    return data

# Vacancy("https://full.am/en/job/public/view/1163")
# https://full.am/en/job/public/view/12067
# https://full.am/en/job/public/view/1163
            # ... (the start of this try block is truncated in the snippet)
                ).strip()
            link = "https://jobs.ge" + link
        except Exception:
            link = ""
        try:
            position = Selector(response=page).xpath(
                f'//*[@id="job_list_table"]/tr[{tr}]/td[2]/a/text()').get().strip()
        except Exception:
            position = ""
        try:
            published = Selector(response=page).xpath(
                f'//*[@id="job_list_table"]/tr[{tr}]/td[5]/text()').get()
            publish_day = int(published.split(" ")[0])
            publish_month = int(months[f"{published.split(' ')[1]}"])
            publish_year = year
        except Exception:
            publish_day = 0
            publish_month = 0
            publish_year = 0
        if yesterday_day != publish_day:
            print("Not published yesterday")
            continue
        try:
            ends = Selector(response=page).xpath(
                f'//*[@id="job_list_table"]/tr[{tr}]/td[6]/text()').get()
            ends = ends.split(" ")
            deadline_day = int(ends[0])
def parse(self, response):
    detailed_review_object_list = []
    review_selector_list = response.xpath(
        '//div[@id="reviews-container"]//div[@class="js-paginator-data"]'
    ).xpath('//div[@class="rvw js-rvw"]')
    for _review_selector in review_selector_list:
        _current_review_selector_body = _review_selector.get()
        # e.g. '5.0'
        _review_rating = Selector(text=_current_review_selector_body).xpath(
            '//div[@class="rvw__hdr-stat"]//img/@data-rating').get()
        # e.g. 'Julie of Ceres,, CA'
        _author_info = Selector(text=_current_review_selector_body).xpath(
            '//div[@class="rvw-aut__inf"]/strong/text()').get()
        _author_state: str = _author_info.split(',')[-1]  # 'CA'
        # e.g. 'Original review: March 18, 2019'
        _review_date_text = Selector(text=_current_review_selector_body).xpath(
            '//div[@class="rvw-bd ca-txt-bd-2"]/span/text()').get().split(':')[-1]
        # Remove whitespace to make it easier to convert to a datetime object
        _review_date_text = _review_date_text.replace(' ', '')  # 'March18,2019'
        _review_date_text = _review_date_text[-4:]
        # _date_pattern = '%b.%d,%Y'  # 'Oct.21,2019'
        _date_pattern = '%Y'  # '2019'
        _struct_time_format = time.strptime(_review_date_text, _date_pattern)
        _date_time_format = datetime.datetime(*_struct_time_format[:6])
        eastern = pytz.timezone('US/Eastern')
        utc = pytz.utc
        aware_date_time = eastern.localize(_date_time_format)
        utc_review_date_time = aware_date_time.astimezone(utc).timestamp()
        # The list of all paragraphs found in a review that we will process.
        _review_description_paragraph_list: list = Selector(
            text=_current_review_selector_body).xpath(
            '//div[@class="rvw-bd ca-txt-bd-2"]/p').getall()
        _clean_review_description_list: list = []
        # Check if there is a collapsed div that we need to process.
        if Selector(text=_current_review_selector_body).xpath(
                '//div[@class="rvw-bd ca-txt-bd-2"]/div[@class="js-collapsed"]').get() is not None:
            # Get all the paragraphs in the collapsed div that we found
            _collapsed_paragraph_list = Selector(
                text=_current_review_selector_body).xpath(
                '//div[@class="rvw-bd ca-txt-bd-2"]/div[@class="js-collapsed"]/p').getall()
            # Add these new paragraphs to our original list for processing
            _review_description_paragraph_list.extend(_collapsed_paragraph_list)
        for para in _review_description_paragraph_list:
            # If the paragraph is not empty
            if Selector(text=para).xpath('//p/text()').get() is not None:
                _clean_review_description_list.append(
                    Selector(text=para).xpath('//p/text()').get())
        _clean_review_description = ''.join(_clean_review_description_list)
        _num_found_useful_text: str = Selector(
            text=_current_review_selector_body).xpath(
            '//div[@class="rvw-foot"]/span[@class="rvw-foot__helpful-count js-helpful-count ca-txt--clr-gray"]/strong/text()'
        ).get()
        # Extract the number from text like '97 people'
        _num_found_useful: str = _num_found_useful_text.split(' ')[0]
        detailed_review_object = {
            'ratings': _review_rating,
            'reviewer_location': _author_state,
            'review_time_utc': str(utc_review_date_time),
            'review_description': _clean_review_description,
            'num_found_useful': _num_found_useful,
        }
        detailed_review_object_list.append(detailed_review_object)
    _return_data = {'reviews': detailed_review_object_list}
    return _return_data
def parse(self, response):
    # Python 2 code (print statements, str.encode cleanup).
    description = response.xpath(
        "//table[@class='itemlist']/tr[not(re:test(@class, "
        "'(spacer)'))]").extract()
    row = self.get_default_row_dict()
    for i, v in enumerate(description):
        index = i
        if not row['rank']:
            value = Selector(text=v).xpath(
                '//td[1]/span[@class="rank"]/text()').extract_first()
            row['rank'] = int(value.replace('.', '')) if value else 0
        if not row['story_text']:
            value = Selector(text=v).xpath(
                '//td[3]/a[@class="storylink"]/text()').extract_first()
            row['story_text'] = value.encode("utf8") if value else ''
        if not row['link_href']:
            value = Selector(text=v).xpath(
                '//td[3]/a[@class="storylink"]/@href').extract_first()
            row['link_href'] = value if value else ''
        if not row['hn_user']:
            value = Selector(text=v).xpath(
                '//a[@class="hnuser"]/text()').extract_first()
            row['hn_user'] = value.encode("utf8") if value else ''
        if not row['age']:
            value = Selector(text=v).xpath(
                '//span[@class="age"]/a/text()').extract_first()
            row['age'] = int(value.split(' ')[0]) if value else 0
        if not row['total_comments']:
            value = Selector(text=v).xpath(
                '//td[@class="subtext"]/a[contains(@href, "item?id=")]/text()'
            ).extract_first()
            if value:
                value = value.encode('ascii', 'ignore').replace('comments', '') if value else ''
                value = value.encode('ascii', 'ignore').replace('comment', '') if value else ''
                row['total_comments'] = int(value) if represents_int(value) else 0
        if not row['score']:
            value = Selector(text=v).xpath(
                '//span[@class="score"]/text()').extract_first()
            row['score'] = int(value.split(' ')[0]) if value else 0
        if not row['hn_id_code']:
            value = Selector(text=v).xpath('//tr[@class="athing"]/@id').extract_first()
            row['hn_id_code'] = int(value) if represents_int(value) else 0
        # save once every field has been filled in (no value is still None)
        if all(val is not None for val in row.values()):
            print 'Go for save >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>'
            data = row.copy()
            row = self.get_default_row_dict()
            self.comment_url.append('https://news.ycombinator.com/item?id=15318440')
            news_id = data['hn_id_code']
            item = NewsBotItem(data)
            yield item
            request = scrapy.Request(
                url='https://news.ycombinator.com/item?id=' + str(news_id),
                callback=self.parse_comment)
            request.meta['item'] = item
            request.meta['news_id'] = int(news_id)
            yield request
        if index % 2:
            row = self.get_default_row_dict()
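This spider calls a represents_int helper that is not shown above. A minimal sketch of what it presumably does (the body is an assumption):

def represents_int(value):
    # Assumed helper: True when value can be parsed as a base-10 integer.
    try:
        int(value)
        return True
    except (TypeError, ValueError):
        return False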
def captureDataAndWrite(self, response):
    # Python 2 code (print statements, dict.iteritems).
    data = {}
    # get all the data through XPATH
    univ_name = Selector(response).xpath(
        '/html/body/div[1]/div/div/div[2]/div[3]/div/div[1]/div[2]/h1/text()').extract()
    if univ_name is not None:
        univ_name = univ_name[0].strip()
        if '--' in univ_name:
            univ_name = univ_name.split("--")[0]
    else:
        univ_name = ""
    print univ_name
    # each crime category lives in a fixed row of the on-campus crime table
    crime_rows = [
        ("Murder/Manslaughter", 1),
        ("Negligence Manslaughter", 2),
        ("Rape", 3),
        ("Incest", 4),
        ("Statutory Rape", 5),
        ("Fondling", 6),
        ("Robbery", 9),
        ("Aggravated Assault", 10),
        ("Burglary", 11),
        ("Motor Vehicle Theft", 12),
        ("Arson", 13),
    ]
    for label, tr_index in crime_rows:
        value = Selector(response).xpath(
            '//*[@data-field-id="gCrimOnCampus"]/table/tbody/tr[%d]/td[4]/div[1]/text()' % tr_index
        ).extract()[0].strip()
        print value
        data[label] = value
    for key, value in data.iteritems():
        self.writeData(univ_name, key, value)
    # Email
    try:
        email = re.findall(r'[\w\.-]+@[\w\.-]+', description)[0]
    except Exception as e:
        email = []
    # Publication stuff
    v_page = requests.get(v_link)
    try:
        published = Selector(response=v_page).xpath(
            '//*[@id="ContentplaceholderMain_T7553F19B005_Col00"]/div[2]/div[2]/div[1]/div[1]/text()').get()
        published = published.strip()
        published = published.split(" ")
        publish_day = published[1].replace(",", "")
        publish_day = int(publish_day)
        publish_month = int(months[f"{published[0]}"])
        publish_year = int(published[2])
    except Exception:
        # the original set `published = 0` here, leaving publish_day unbound
        publish_day = 0
        publish_month = 0
        publish_year = 0
    if publish_day != yesterday_day:
        print("Not published Yesterday")
        continue
    data = {
        "company": company,
        "position": position,
def __init__(self, symbol, **kwargs):
    self.allowed_domains = ['gb.eastmoney.com']
    self.base_url_prefix = 'http://gb.eastmoney.com'
    self.base_url = 'http://gb.eastmoney.com/list,%s,1,f.html' % symbol[:6]
    self.start_urls = [self.base_url]
    self.symbol = symbol
    t1 = time()
    # obtain the number of pages
    subpage_url = 'http://gb.eastmoney.com/list,%s,1,f_{}.html' % symbol[:6]
    pageresponse = requests.post(self.base_url)
    self.tot_msg_num = 0
    self.num_per_page = 80
    if Selector(text=pageresponse.text).xpath(
            '//div[@class="noarticle"]').extract_first() is not None:
        numpage = 0
    else:
        pageresponse_text = Selector(text=pageresponse.text).xpath(
            '//span[@class="pagernums"]').extract_first()
        self.tot_msg_num = int(pageresponse_text.split('|')[-3])
        self.num_per_page = int(pageresponse_text.split('|')[-2])
        numpage = math.ceil(self.tot_msg_num / self.num_per_page)
    stockname = Selector(text=pageresponse.text).xpath(
        '//*[@id="stockname"]/a/@href').extract_first()
    if stockname is not None:
        stockname = stockname.split(',')[-1].split('.')[0]
        logging.warning(stockname)
        if stockname != symbol[:6]:
            raise  # abort: the page does not match the requested symbol
    else:
        self.start_urls = []
        return
    t2 = time()
    # obtain the record count and the last news item's time
    mysql_conn1 = create_engine(
        'mysql://*****:*****@10.24.224.249/webdata?charset=utf8')
    sql = ('select S_INFO_WINDCODE, URL from EastMoney '
           'where S_INFO_WINDCODE=\'' + symbol + '\'')
    df_record = pd.read_sql(sql, mysql_conn1)
    if len(df_record) == 0:
        self.last_URL = -1
    else:
        self.last_URL = max(int(url.split(',')[-1]) for url in df_record['URL'])
    t3 = time()
    # set a proxy to avoid being blocked
    proxys = pd.read_sql('select ip from Proxy where score>0', mysql_conn1)['ip'].values
    sel_proxy = random.choice(proxys)
    if sel_proxy[:3] == '127':
        self.proxy = None
    else:
        self.proxy = 'https://*****:*****@%s' % sel_proxy
    t4 = time()
    today = datetime.datetime.now()
    date_begin = (today + datetime.timedelta(days=-90)).strftime('%Y%m%d')
    date_end = (today + datetime.timedelta(days=30)).strftime('%Y%m%d')
    mysql_conn2 = create_engine(
        'mysql://*****:*****@10.24.224.249/wind?charset=utf8')
    trade_days = pd.read_sql(
        'select TRADE_DAYS from MyAShareCalendar where S_INFO_EXCHMARKET="SSE" order by TRADE_DAYS',
        mysql_conn2).rename(columns={'TRADE_DAYS': 'TRADE_DT'})
    trade_days['date'] = trade_days['TRADE_DT']
    self.all_date = pd.DataFrame({
        'date': [str(d)[:10].replace('-', '')
                 for d in pd.date_range(date_begin, date_end)]
    })
    self.all_date = self.all_date.merge(trade_days[['date', 'TRADE_DT']], how='left')
    self.all_date['TRADE_DT'] = self.all_date['TRADE_DT'].bfill()
    # self.all_date['next_date'] = self.all_date['TRADE_DT'].shift(-1)
    self.all_date['next_date'] = self.all_date['date'].shift(-1)
    self.all_date = self.all_date.set_index('date')
    self.record_num = len(df_record)
    # the site displays 80 news items per page
    crawled_pages = self.record_num // self.num_per_page
    start_page = max(crawled_pages, 1)
    t5 = time()
    # print(t2-t1, t3-t2, t4-t3, t5-t4)
    for i in range(numpage - start_page + 1, 0, -1):
        self.start_urls.append(subpage_url.format(i))
    super().__init__(**kwargs)
def parse(self, response):
    # os.system('dropdb whistler')
    # os.system('createdb whistler')
    # db.create_all()
    # r = requests.get('https://www.whistlerblackcomb.com/the-mountain/mountain-conditions/terrain-and-lift-status.aspx')
    skiruns_str = Selector(response=response).xpath('//script/text()').extract()[8]
    skiruns = json.loads(skiruns_str.split("=")[1].split(";")[0])
    lifts = skiruns['Lifts']
    # add all lifts to the DB
    for lift_dict in lifts:
        liftname = lift_dict['Name']
        liftstatus = lift_dict['Status']
        mountain = lift_dict['Mountain']
        new_lift = Lift(name=liftname, status=liftstatus, mountain=mountain)
        db.session.add(new_lift)
        db.session.commit()
    # a list of dictionaries with all the ski-run data, separated by lifts
    runs_list = skiruns['GroomingAreas']
    # lifts_dict is a dictionary where the key 'Runs' holds a list of all ski
    # runs that belong to the lift, and the key 'Name' holds the lifts that
    # service those runs, separated by " - "
    for lifts_dict in runs_list:
        # add all the runs to the DB
        skirun_list = lifts_dict['Runs']
        # list of lift names separated by '-'
        lift_names = lifts_dict['Name']
        if lift_names == 'The Peak - T-Bar':
            lift_names = ['The Peak', 'T-Bars']
        else:
            lift_names = lift_names.split(" - ")
        # each ski run list is a list of runs that belong to one lift
        for skirun in skirun_list:
            skirun_name = skirun['Name']
            if '/' in skirun_name:
                skirun_name = skirun_name.replace('/', '-')
            skirun_status = skirun['IsOpen']
            skirun_groomed = skirun['IsGroomed']
            level = skirun['Type']
            new_run = Skirun(name=skirun_name,
                             groomed=skirun_groomed,
                             status=skirun_status,
                             level=level)
            db.session.add(new_run)
            db.session.commit()
        # make the connections
        for lift_name in lift_names:
            # map scraped names to the lift names used in the DB
            if lift_name == 'Crystal Zone':
                lift_name = 'Crystal Ridge Express'
            if lift_name == 'Freestyle Half-pipes':
                lift_name = 'Catskinner Chair'
            if lift_name == 'Symphony Amphitheatre':
                lift_name = 'Symphony Express'
            if lift_name == 'The Peak':
                lift_name = 'Peak Express'
            if lift_name == 'Glacier':
                lift_name = 'Showcase T-Bar'
            if lift_name == 'Habitat Terrain Park':
                lift_name = 'Emerald Express'
            lift_obj = Lift.query.filter(Lift.name.contains(lift_name)).first()
            # adding relationship
            for run in skirun_list:
                skirun_name = run['Name']
                if '/' in skirun_name:
                    skirun_name = skirun_name.replace('/', '-')
                run_obj = Skirun.query.filter(Skirun.name == skirun_name).first()
                lift_obj.skiruns.append(run_obj)
            db.session.commit()
    categorieslst = ['tree', 'groomer', 'park', 'bowl']
    for category in categorieslst:
        add_category = Category(cat=category)
        db.session.add(add_category)
    db.session.commit()
    levels = ['green', 'blue', 'black']
    for level in levels:
        add_level = SkillLevel(level=level)
        db.session.add(add_level)
    db.session.commit()
    # add a category to each run
    skiruns = Skirun.query.all()
    categories = {category.cat: category for category in Category.query.all()}
    for skirun in skiruns:
        parks = ['Habitat Terrain Park', 'Big Easy Terrain Garden, Sz S',
                 'Nintendo Terrain Park, sz. M,L', 'Highest Level Park, Sz XL']
        bowls = ['Jersey Cream Bowl', 'Rhapsody Bowl', 'Ego Bowl - Lower',
                 'Ego Bowl - Upper']
        trees = ['7th Heaven', 'Enchanted Forest', 'Rock & Roll',
                 "Franz's - Upper", "Franz's - Lower"]
        # skirun.category relationship
        if skirun.name in parks:
            skirun.category = categories['park']
        elif skirun.name in bowls:
            skirun.category = categories['bowl']
        elif skirun.name in trees:
            skirun.category = categories['tree']
        else:
            skirun.category = categories['groomer']
    db.session.commit()
    # Add users to our db
    users = open("../../../static/json/users.json").read()
    users = json.loads(users)
    for user in users:
        fname = user['fname']
        lname = user['lname']
        email = user['email']
        zipcode = user['zipcode']
        # check to see user selected categories
        if user.get('category'):
            category = user['category']
        # level for fake data
        rand_level = random.choice(levels)
        level = SkillLevel.query.filter(SkillLevel.level == rand_level).first()
        clients = User(fname=fname, lname=lname, email=email, zipcode=zipcode,
                       level_id=level.level_id, password='******')
        db.session.add(clients)
        # make the connections
        for cat in category:
            user_obj = User.query.filter(User.email == email).first()
            catusr = Category.query.filter(Category.cat == cat).first()
            catusr.users.append(user_obj)
        db.session.commit()
    ratings = open("../../../static/json/rating.txt").read()
    ratings = ratings.strip()
    ratings = ratings.split('|')
    # loop through the list of comments
    for comment in ratings:
        comment = comment[:140]
        user_id = random.randint(1, 100)
        rating = random.randint(1, 5)
        skirun_id = random.randint(1, 142)
        comments = Rating(user_id=user_id, rating=rating,
                          skirun_id=skirun_id, comment=comment)
        db.session.add(comments)
    # commit work to the db
    db.session.commit()
    restaurants = open("../../../static/json/food.txt")
    for restaurant in restaurants:
        restaurant = restaurant.strip()
        restaurant_data = restaurant.split('|')
        name = restaurant_data[0].title()
        description = restaurant_data[1][:200]
        location = restaurant_data[2]
        lift_id = int(restaurant_data[4])
        yelp_id = restaurant_data[5]
        lift_obj = Lift.query.filter(Lift.lift_id == lift_id).first()
        new_restaurant = Food(name=name, description=description,
                              location=location, yelp_id=yelp_id)
        db.session.add(new_restaurant)
        # adding relationship
        new_restaurant.lifts.append(lift_obj)
        db.session.commit()