def parse_splash(self, response):
    for li in Selector(text=response.body).xpath(
            '//ol[@class = "playlist-videos-list yt-uix-scroller yt-viewport"]/li').extract():
        img_url = Selector(text=li).xpath('//img/@data-thumb').extract_first()
        if img_url:
            img_src = img_url.split('&rs')[0] + '&rs' + img_url.split('&rs')[1]
        else:
            img_url = Selector(text=li).xpath('//img/@src').extract_first()
            img_src = img_url.split('&rs')[0] + '&rs' + img_url.split('&rs')[1]
        href = 'https://youtube.com' + Selector(text=li).xpath('//a/@href').extract_first()
        title = Selector(text=li).xpath(
            '//h4[@class = "yt-ui-ellipsis yt-ui-ellipsis-2"]/text()').extract_first().strip()
        try:
            v = Video.get(title=title)
        except Exception as e:
            print(e)
            exit(1)
        if not v:
            try:
                v = Video(title=title, url=href, img_src=img_src, img_thumb=img_url)
                v.save(self.download_path)
                if self.vDownload:
                    v.download(self.vPath)
            except Exception as e:
                print(e)
                exit(1)
        send_message('/tmp/url_pipe', href)
        yield {
            'title': title,
            'url': href,
            'img_src': img_src,
            'img_thumb': img_url,
        }
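The snippet above calls a send_message helper that is not shown. A minimal sketch of what it might look like, assuming it writes one newline-terminated message to a named pipe (the body here is an assumption, not the project's actual code):

import os

def send_message(pipe_path, message):
    # Assumed helper: create the FIFO if needed, then write one line to it.
    # Note that opening a FIFO for writing blocks until a reader attaches.
    if not os.path.exists(pipe_path):
        os.mkfifo(pipe_path)
    with open(pipe_path, 'w') as pipe:
        pipe.write(message + '\n')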
def parse(self, response):
    for vid in Selector(text=response.body).xpath(
            '//ytd-grid-video-renderer[@class="style-scope ytd-grid-renderer use-ellipsis"]').extract():
        duration = Selector(text=vid).xpath(
            '//span[@class="style-scope ytd-thumbnail-overlay-time-status-renderer"]/text()'
        ).extract_first().strip()
        href = 'https://youtube.com' + Selector(text=vid).xpath(
            '//a[@id="thumbnail"]/@href').extract_first()
        title = Selector(text=vid).xpath('//a[@id="video-title"]/text()').extract_first()
        views = int(Selector(text=vid).xpath('//a[@id="video-title"]/@aria-label')
                    .extract_first().split()[-2].replace(',', ''))
        img_thumb = Selector(text=vid).xpath('//img/@src').extract_first()
        img_src = None
        if img_thumb:
            img_src = img_thumb.split('?sqp')[0] + '?amp;sqp' + img_thumb.split('?sqp')[1]
        v = Video.get(title=title)
        if v:
            v.update(views=views, duration=duration)
        else:
            v = Video(title, href, img_src, img_thumb, views, duration)
            v.save(self.download_path)
            if self.vDownload:
                v.download(self.vPath)
def select_one(req, select, comments=False):
    try:
        _sel = Selector(response=req).css(select).get()
        if _sel is not None and comments:
            # the value is wrapped in an HTML comment; strip the markers
            _sel = _sel.split('-->')[1]
            _sel = _sel.split('<!--')[0]
        return _sel
    except Exception:
        log_error(req.url, f'{select} selector', True)
        return None
def get_bet_ratio(source):
    ratios = {}
    for i in [1, 2, 3]:
        try:
            extracted_text = Selector(text=source) \
                .xpath('//*[@id="highcharts-0"]/svg/g[2]/g/g/g[{}]/text/tspan/text()'.format(str(i))) \
                .extract()[0]
            team_name = extracted_text.split()[:-1][0]
            ratio = extracted_text.split()[-1].strip('(|)')
            ratios[team_name] = ratio
        except Exception:
            continue
    return ratios
def parse(self, response):
    """ Parse through Whistler's weather page """
    # Extract the current weather report
    daily_rept = Selector(response=response).xpath('//script/text()').extract()[8]
    snowfall_rept = json.loads(daily_rept.split("=")[2].split(";")[0])
    # Extract the forecast weather report
    forcast_rept = Selector(response=response).xpath('//script/text()').extract()[11]
    forcast = json.loads(forcast_rept.split("=")[2].split(";")[0])
    # Calculate and convert snow to inches
    daily_snow_inches = snowfall_rept['TwentyFourHourSnowfall']['Inches']
    daily_snow_centimeters = snowfall_rept['TwentyFourHourSnowfall']['Centimeters']
    daily_snowfall = (int(daily_snow_centimeters) * 0.39) + int(daily_snow_inches)
    # Extract overnight snowfall information
    overnight_snowfall_inches = snowfall_rept['OvernightSnowfall']['Inches']
    overnight_snowfall_centimeters = snowfall_rept['OvernightSnowfall']['Centimeters']
    overnight_snowfall = (int(overnight_snowfall_centimeters) * 0.39) + int(overnight_snowfall_inches)
    if overnight_snowfall < 1:
        overnight_snowfall = 0
    # Extract forecast information
    long_forcast = forcast[0]['ForecastData'][0]['WeatherLongDescription']
    wind_forcast = forcast[0]['Wind']
    forcast_icon = forcast[0]['WeatherIconStatus']
    # Query the db for the weather obj
    weather = Weather.query.filter(Weather.weather_id == 1).first()
    # Update weather conditions
    weather.wind_forcast = wind_forcast
    weather.daily_snowfall = daily_snowfall
    weather.overnight_snowfall = overnight_snowfall
    weather.snow_forcast = long_forcast
    weather.forcast_icon = forcast_icon
    db.session.commit()
    print("woo hoo - committed to db\n\n")
def parse(self, response):
    item = Meilele_Item()
    item['site'] = response.url
    title = Selector(response).xpath("//span[@id='jsImgName']/text()").extract()
    if len(title) > 0:
        item['title'] = title[0]
    else:
        return
    tags = Selector(response).xpath("//div[@class='content']/a/text()").extract()
    if len(tags) > 0:
        item['tag'] = ','.join(tags)
    else:
        item['tag'] = ''
    url = Selector(response).xpath("//div[@id='jsGalleryStageIn']/img/@src").extract()[0]
    item['origin_url'] = url
    item['new_url'] = ''
    item['mb'] = ''
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse_item(self, response):
    item = Fang_Item()
    item['site'] = response.url
    item['title'] = Selector(response).xpath("//div[@class='info']/h1/text()").extract()[0]
    item['shortDescription'] = "|".join(
        Selector(response).xpath("//div[@class='info']/p/text()").extract())
    item['category'] = ''
    item['style'] = ''
    category = Selector(response).xpath("//div[@class='info']/ul/li[1]/a/text()").extract()
    if category:
        item['category'] = category[0]
    style = Selector(response).xpath("//div[@class='info']/ul/li[2]/a/text()").extract()
    if style:
        item['style'] = style[0]
    tags = Selector(response).xpath("//div[@class='tag']/ul/a/text()").extract()
    item['tag'] = ','.join(tags)
    url = Selector(response).xpath("//img[@id='LeftImg']/@src").extract()[0]
    item['origin_url'] = url
    item['new_url'] = ''
    item['mb'] = ''
    # extracted but currently unused
    widthPx = Selector(response).xpath("//img[@id='LeftImg']/@width").extract()
    heightPx = Selector(response).xpath("//img[@id='LeftImg']/@height").extract()
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse2(self, response):
    item = CrawlerItem()
    try:
        page = Selector(response=response).xpath('//ul[@class="pagination"]')
        author = Selector(response=response).xpath(
            '//h1[@class="title J_title"]/text()').get().strip()
        author = ' '.join(author.split())
        print(author)
        if len(page) == 0:
            print('Only one page of comments')
            comments = self.comms(response)
            for comment in comments:
                # the source compared against two space-like literals;
                # the second was presumably a non-breaking space
                if comment in (" ", "\u00a0"):
                    pass
                else:
                    item['author'] = author
                    item['comment'] = comment
                    yield item
        else:
            print('This item has multiple pages of comments')
            page_num = page[0].xpath('./li/a/text()').getall()
            print(page_num)
            num = int(page_num[-2])
            print(num)
            for n in range(1, num + 1):
                print(f'Fetching page {n}')
                if n == 1:
                    url = response.request.url + '/#comments'
                else:
                    url = response.request.url + f'/p{n}/#comments'
                yield scrapy.Request(url=url, callback=self.parse3, dont_filter=False)
    except Exception as e:
        print(e)
        print('Failed to scrape the phone detail page link')
def parse(self, response):
    item = Aiuw_Item()
    item['site'] = response.url
    title = Selector(response).xpath("//span[@id='imgExplain']/text()").extract()
    if len(title) > 0:
        item['title'] = title[0]
    else:
        return
    tags = Selector(response).xpath("//div[@class='tag']/a/text()").extract()
    if len(tags) > 0:
        item['tag'] = ','.join(tags)
    else:
        item['tag'] = ''
    url = Selector(response).xpath(
        "//div[@class='img_boxlist up userSelectNone']/img/@src").extract()[0]
    item['origin_url'] = url.replace("zip@q80", "zip@w400")
    item['new_url'] = ''
    item['mb'] = ''
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse_info_page(self, response):
    item = AnimePeopleItem()
    people_fav = None
    more_info = None
    anime_people = response.xpath('//*[@class="h1"]/text()').extract()[0]
    infolist = response.xpath('//*[@id="content"]/table/tr/td/div').extract()
    for i in range(len(infolist)):
        if Selector(text=infolist[i]).xpath('//span/text()').extract() == [u'Member Favorites:']:
            people_fav = Selector(text=infolist[i]).xpath('//text()').extract()[1].strip()
        if Selector(text=infolist[i]).xpath('//span/text()').extract() == [u'More:']:
            more_info = Selector(text=infolist[i + 1]).xpath('//text()').extract()
            more_info = ''.join(more_info)  # equivalent to the original reduce()
            more_info = ' '.join(more_info.split('\r\n'))
    item['anime_people'] = anime_people
    item['anime_people_fav'] = people_fav
    item['anime_people_info'] = more_info
    yield item
def parse2(self, response):
    """Parse the detail page: movie rating level, site rank, favorite count, synopsis."""
    item = response.meta['item']
    try:
        movieLevel = Selector(response=response).xpath(
            '//div[@class="level-item"]/img[@src]').extract_first().strip()
        p = re.compile('([a-z])-big-1.png')
        movieLevel = p.findall(movieLevel)[0]
    except Exception:
        movieLevel = None
    movieScore = Selector(response=response).xpath(
        '//p[@class="f4"]/text()').extract_first().strip('本站排名:').strip()
    movieFav = Selector(response=response).xpath(
        '//label[@id="resource_views"]/../../div[2]/text()').extract_first(
        ).strip('收藏次数:').strip()
    # movieCon = Selector(response=response).xpath('//div[@class="con"]/span/descendant-or-self::text()').extract_first().strip()
    movieCon = Selector(response=response).xpath('//div[@class="con"][2]')
    movieCon = movieCon.xpath('string(.)').extract_first().strip().replace('\r\n', '')
    movieCon = ''.join(movieCon.split()).replace(',', ',')  # normalize full-width commas
    movieViwsLink = 'http://www.rrys2019.com/resource/index_json/rid/' + item['rid'] + '/channel/tv'
    print(movieLevel, movieScore, movieViwsLink, movieFav, movieCon)
    item['movieLevel'] = movieLevel
    item['movieScore'] = movieScore
    item['movieFav'] = movieFav
    item['movieCon'] = movieCon
    yield scrapy.Request(url=movieViwsLink, meta={'item': item}, callback=self.parse3)
def parse_build_basic_contents(self, response):
    '''Project overview page, e.g. ifrm_BuildBasic.pub?blid=102699'''
    content_list = response.xpath("//table[@id='info']")
    if len(content_list) == 1:
        table_content = content_list[0].extract()
        # the left-hand cells hold the values, in a fixed order
        value = Selector(text=table_content).xpath("//td[@align='left']//text()")
        item = ProjectBasicItem()
        # get 102699 from ifrm_BuildBasic.pub?blid=102699
        project_id = response.url.split("=")[1]
        # get ProvInfo.pub?prid=100498
        provinfo_href = Selector(text=table_content).xpath(
            "//td[@align='left']/a[@href]//@href").extract()[0]
        developer_id = provinfo_href.split("=")[1]
        item['project_id'] = project_id
        item['developer_id'] = developer_id
        item['project_name'] = value[0].extract()           # confirmed project name
        item['project_temp_name'] = value[1].extract()      # tentative project name
        item['licence_id'] = value[2].extract()             # (pre)sale permit number
        item['approving_authority'] = value[3].extract()    # (pre)sale approving authority
        item['developer'] = value[4].extract().strip()      # developer
        item['partner'] = strip_null(value[5].extract())    # partner
        item['location'] = value[6].extract()               # location
        item['district'] = value[7].extract()               # administrative district
        item['zone'] = value[8].extract()                   # zone
        item['total_building_area'] = float(value[9].extract().split()[0])  # total building area
        item['approval'] = value[10].extract()              # project approval document
        item['planning_id'] = value[11].extract()           # planning permit number
        item['land_id'] = value[12].extract()               # land certificate number
        item['builder_licence'] = value[13].extract()       # construction permit number
        item['land_licence'] = value[14].extract()          # land-use permit number
        item['total_land'] = float(value[15].extract().split()[0])         # total land
        item['current_used_land'] = float(value[16].extract().split()[0])  # current-phase land
        item['start_date'] = value[17].extract()            # start date
        item['planning_end_date'] = value[18].extract()     # expected completion date
        item['invest'] = float(value[19].extract().split()[0])             # project investment
        item['presell_total_area'] = float(value[20].extract().split()[0]) # total presale area
        # public facility area
        public_area = value[21].extract().split()[0]
        item['public_area'] = float(public_area)
        item['total_units'] = int(value[22].extract())      # total units
        item['plot_rate'] = float(value[23].extract())      # plot ratio
        item['green_rate'] = float(strip_null(value[24].extract()).split()[0])  # green ratio
        item['sale_agent'] = strip_null(value[25].extract())        # sales agent
        item['phone_number'] = strip_null(value[26].extract())      # phone
        item['sale_location'] = strip_null(value[27].extract())     # project sales location
        item['sale_phone_number'] = strip_null(value[28].extract()) # sales phone
        item['property_management_company'] = strip_null(value[29].extract())  # property management company
        # property management fee, given as a 'from-to' range
        fee_str = format_property_fee(value[30].extract())
        fee = fee_str.split("-")
        item['property_fee_from'] = float(fee[0])
        item['property_fee_to'] = float(fee[1])
        yield item
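parse_build_basic_contents leans on two helpers that are not shown, strip_null and format_property_fee. A minimal sketch of what they might look like, assuming strip_null collapses empty or placeholder cells to an empty string and format_property_fee normalizes a fee cell into a 'low-high' string (both bodies are assumptions, not the project's actual code):

import re

def strip_null(text):
    # Assumed helper: map None or whitespace-only cells to ''.
    return text.strip() if text and text.strip() else ''

def format_property_fee(text):
    # Assumed helper: pull the numbers out of a fee cell and return 'low-high';
    # a single number is duplicated so the caller can always split on '-'.
    numbers = re.findall(r'\d+(?:\.\d+)?', text)
    if not numbers:
        return '0-0'
    if len(numbers) == 1:
        return '{0}-{0}'.format(numbers[0])
    return '{}-{}'.format(numbers[0], numbers[1])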
def download_font_file(self, data):
    # Download the font file referenced by the page's CSS.
    # Requires: requests, and urlretrieve from urllib.request.
    # (the original omitted `self`, yet read self.temp below)
    rules = Rules.css_font
    css_url = Selector(text=data).xpath(rules['css']).extract_first()
    response = requests.get(css_url).text
    font_url = Selector(text=response).re_first(rules['font'])
    filename = '{}/{}'.format(self.temp, font_url.split('/')[-1])
    urlretrieve(font_url, filename)
def get_rating_descriptors(self, desc_line=None):
    """
    get_rating_descriptors queries BotSentinel for the descriptions that
    correspond to a description key; both are correlated to the integer
    rating each user is assigned.

    :param desc_line: Default argument, not required.
    :type desc_line: str
    :return: A list of tuples of the ranges and rating descriptions
    :rtype: list
    """
    descriptors = []
    # Check if we have the descriptors cached; return the cache
    if len(self.descriptors) > 0:
        return [True, self.descriptors]
    # If the descriptors aren't in cache and aren't provided, get them
    if len(self.descriptors) == 0 and not desc_line:
        page = self.query_site(self.url, 'get')
        desc_line = Selector(text=page.text).xpath(self.js_path).extract()
    # Parse the <script> text and grab the descriptors
    for i in desc_line:
        desc_matched = self.desc_regex.search(i)
        if not desc_matched:
            continue
        if desc_matched:
            desc_line = desc_matched.group(0)
            break
    # Clean up the returned descriptors
    desc_line = re.sub(self.tag_clean, '', desc_line)
    desc_line = desc_line.split('[')
    desc_line = [desc_line[2], desc_line[3], desc_line[4], desc_line[5]]
    # More cleanup of the returned descriptors. This could probably be improved
    for i in desc_line:
        i = re.sub("|".join(self.extra_chars), '', i)
        prelist = []
        for x in i.split(','):
            if x != '':
                # The parsed integers are considered strings; convert them if
                # they're integers, and catch the exception if it's a string
                try:
                    x = int(x)
                except ValueError:
                    pass
                prelist.append(x)
        descriptors.append(tuple(prelist))
    # Return a list of tuples, each tuple containing info and ranges for
    # each descriptor
    return descriptors
def Vacancy(link):
    print("request sent for Vacancy successfully")
    url = link
    page = requests.get(url)
    # Category
    try:
        category = Selector(response=page).xpath(
            '//*[@id="MainContentPlaceHolder_jobContainer"]/div[5]/div[1]/text()').get()
    except Exception:
        category = ""
    # Ends
    try:
        ends = Selector(response=page).xpath(
            '//*[@id="MainContentPlaceHolder_jobContainer"]/div[5]/div[3]/text()').get()
        ends = ends.split("/")
        deadline_day = int(ends[0])
        deadline_month = int(ends[1])
        deadline_year = int(ends[2])
    except Exception:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0
    # Description
    try:
        description = Selector(response=page).xpath(
            '//*[@id="MainContentPlaceHolder_jobContainer"]').get()
        description = remove_tags(description)
    except Exception:
        description = ""
    # Email
    try:
        email = re.findall(r'[\w\.-]+@[\w\.-]+', description)[0]
        email = [email]
    except Exception:
        email = []
    data = {
        "category": category,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        'description': description,
        "email": email,
    }
    print("Vacancy data is scraped")
    return data

# Vacancy('https://www.myjob.am/Announcement.aspx?jobId=57479')
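The remove_tags call above (and in the other job scrapers below) matches the signature of w3lib's HTML utility; assuming that is the import in use, this is how it behaves:

# Presumed import for the remove_tags calls in these scrapers:
from w3lib.html import remove_tags

print(remove_tags('<div><p>Senior <b>engineer</b></p></div>'))  # 'Senior engineer'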
def parse(self, response):
    """ Parse through Whistler's weather page """
    # Extract the current weather report
    daily_rept = Selector(response=response).xpath('//script/text()').extract()[8]
    snowfall_rept = json.loads(daily_rept.split("=")[2].split(";")[0])
    # Extract the forecast weather report
    forcast_rept = Selector(response=response).xpath('//script/text()').extract()[11]
    forcast = json.loads(forcast_rept.split("=")[2].split(";")[0])
    # Calculate and convert snow to inches
    daily_snow_inches = snowfall_rept['TwentyFourHourSnowfall']['Inches']
    daily_snow_centimeters = snowfall_rept['TwentyFourHourSnowfall']['Centimeters']
    daily_snowfall = (int(daily_snow_centimeters) * 0.39) + int(daily_snow_inches)
    # Extract overnight snowfall information
    overnight_snowfall_inches = snowfall_rept['OvernightSnowfall']['Inches']
    overnight_snowfall_centimeters = snowfall_rept['OvernightSnowfall']['Centimeters']
    overnight_snowfall = (int(overnight_snowfall_centimeters) * 0.39) + int(overnight_snowfall_inches)
    # Extract forecast information
    long_forcast = forcast[0]['ForecastData'][0]['WeatherLongDescription']
    wind_forcast = forcast[0]['Wind']
    forcast_icon = forcast[0]['WeatherIconStatus']
    # Instantiate a weather object
    weather = Weather(wind_forcast=wind_forcast,
                      daily_snowfall=daily_snowfall,
                      overnight_snowfall=overnight_snowfall,
                      snow_forcast=long_forcast,
                      forcast_icon=forcast_icon)
    db.session.add(weather)
    db.session.commit()
def get_questions(category, level):
    data = {
        "play": True,
        "kat[]": [category],
        "level[]": [level],
        "anzahl": 300,
    }
    response = requests.post("https://www.fragespiel.com/quiz/training.html", data=data)
    js_questions = Selector(text=response.text).xpath('//script[3]/text()').get()
    json_questions = js_questions.split("json' : '")[1].split("',")[0]
    return json.loads(json_questions)
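A quick usage sketch for the function above. The category and level IDs here are made up for illustration; the real numeric codes come from the site's own category and level lists:

# Hypothetical IDs; substitute codes from fragespiel.com's category/level lists.
questions = get_questions(category=1, level=2)
print(len(questions), "questions loaded")
print(questions[0])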
def _get_pages_total(self, response):
    """
    Calculate total page count

    :param response: scrapy response object
    :return: calculated page count|int
    """
    el = Selector(response).xpath(
        '//td[@class="pagination"]//span[@class="fielddata"]/text()'
    ).extract_first()
    # Round up: a partially filled last page still counts as a page
    # (the original truncated with int(); requires `import math`)
    total = math.ceil(int(el.split('of')[1].strip()) / self.items_per_page)
    self.logger.debug("Total pages: %s", total)
    return total
def handle(self, *args, **options):
    for bus_station in BusStation.objects.filter(code__isnull=True).all():
        print(bus_station.name, bus_station.address, bus_station.link)
        try:
            response = requests.get(bus_station.link)
            print('\t', response.status_code)
            if response.status_code == 200:
                response_text = response.text
                res = Selector(text=response_text).css(
                    '#zonaBloqueContent h3::text').extract_first()
                if not res:
                    continue
                code, name = res.split('-')
                code = code.replace('Paradero', '').strip()
                print(code)
                if not bus_station.code:
                    bus_station.code = code
                    bus_station.save()
                for other_station in Selector(text=response_text).css(
                        '.moduloParaderoMultiple a'):
                    other_station_code = other_station.css(
                        '.codigoParadero::text').extract_first()
                    station = BusStation.objects.filter(code=other_station_code).first()
                    if station and station.id != bus_station.id:
                        if station not in bus_station.related_stations.all():
                            bus_station.related_stations.add(station)
                        if bus_station not in station.related_stations.all():
                            station.related_stations.add(bus_station)
                        if (station.latitude and station.longitude
                                and not bus_station.latitude):
                            bus_station.latitude = station.latitude
                            bus_station.longitude = station.longitude
                            bus_station.save()
                            continue
                        if (bus_station.latitude and bus_station.longitude
                                and not station.latitude):
                            station.latitude = bus_station.latitude
                            station.longitude = bus_station.longitude
                            station.save()
                            continue
                if not res:
                    continue
        except Exception as e:
            print(bus_station.id, e)
        print('...Next station')
def __init__(self, symbol, **kwargs):
    self.allowed_domains = ['guba.eastmoney.com']
    self.base_url_prefix = 'http://guba.eastmoney.com'
    self.base_url = 'http://guba.eastmoney.com/list,%s,1,f.html' % symbol
    self.start_urls = [self.base_url]
    # obtain the number of pages
    subpage_url = 'http://guba.eastmoney.com/list,%s,1,f_{}.html' % symbol
    pageresponse = requests.post(self.base_url)
    pageresponse = Selector(text=pageresponse.text).xpath(
        '//span[@class="pagernums"]').extract_first()
    numpage = math.ceil(
        int(pageresponse.split('|')[-3]) / int(pageresponse.split('|')[-2]))
    # record the right number of news
    with open('log/web_%s.txt' % symbol, 'w') as f:
        f.write('%d' % int(pageresponse.split('|')[-3]))
    # add all the urls of pages
    for i in range(1, numpage + 1):
        self.start_urls.append(subpage_url.format(i))
    super().__init__(**kwargs)
def parse(self, response):
    item = Tuba_Item()
    item['site'] = response.url
    item['title'] = Selector(response).xpath("//div[@id='apphigh']/img/@title").extract()[0]
    url = Selector(response).xpath("//div[@id='apphigh']/img/@src").extract()[0]
    item['origin_url'] = url
    item['new_url'] = ''
    item['mb'] = ''
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse3(self, response):
    item = CrawlerItem()
    try:
        author = Selector(response=response).xpath(
            '//h1[@class="title J_title"]/text()').get().strip()
        author = ' '.join(author.split())
        comments = self.comms(response)
        for comment in comments:
            # the source compared against two space-like literals;
            # the second was presumably a non-breaking space
            if comment in (" ", "\u00a0"):
                pass
            else:
                item['author'] = author
                item['comment'] = comment
                yield item
    except Exception as e:
        print(e)
        print('Failed to scrape the phone detail page link')
def parse(self, response):
    html_list = response.xpath(
        "//div[@id='toplist']/div[@class='g-mn3']/div[@class='g-mn3c']/div[@class='g-wrap12']"
        "/div[@id='song-list-pre-cache']/div/div[@class='j-flag']/table/tbody/tr"
    ).extract()
    for item in html_list:
        music_item = NeteaseMusicItem()
        hot_num = Selector(text=item).xpath(
            "//tr/td[1]/div[@class='hd']/span/text()").extract_first() or ''
        hot_rk = Selector(text=item).xpath(
            "//tr/td[1]/div[@class='hd']/div/span[@class='ico u-icn u-icn-73 s-fc9']/text()"
        ).extract_first() or ''
        song_pic = Selector(text=item).xpath(
            "//tr/td[2]/div[@class='f-cb']/div[@class='tt']/a/img/@src"
        ).extract_first() or ''
        song_name = Selector(text=item).xpath(
            "//tr/td[2]/div[@class='f-cb']/div[@class='tt']/div[@class='ttc']/span/a/b/@title"
        ).extract_first() or ''
        song_href = Selector(text=item).xpath(
            "//tr/td[2]/div[@class='f-cb']/div[@class='tt']/div[@class='ttc']/span/a/@href"
        ).extract_first()
        song_id = ''
        if song_href:
            song_id = song_href.split('=')[-1]
        song_time = Selector(text=item).xpath(
            "//tr/td[3]/span/text()").extract_first() or ''
        singer_name = Selector(text=item).xpath(
            "//tr/td[4]/div[@class='text']/@title").extract_first() or ''
        music_item['hot_num'] = hot_num
        music_item['hot_rk'] = hot_rk
        music_item['song_pic'] = song_pic
        music_item['song_name'] = song_name
        music_item['song_id'] = song_id
        music_item['song_time'] = song_time
        music_item['singer_name'] = singer_name
        yield music_item
def parse_item(self, response):
    item = Qijia_Item()
    item['site'] = response.url
    item['title'] = Selector(response).xpath("//div[@class='crumb']/i/text()").extract()[0]
    item['key'] = Selector(response).xpath("//meta[@name='keywords']").xpath('@content').extract()[0]
    item['tag'] = Selector(response).xpath("//p[@class='pic_desp']/span/i/text()").extract()[0]
    url = Selector(response).xpath("//img[@class='lazyload']/@_src").extract()[0]
    item['origin_url'] = url
    item['new_url'] = ''
    item['mb'] = ''
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse(self, response):
    item = Shejiben_Item()
    item['site'] = response.url
    title = Selector(response).xpath("//div[@class='pageTag']/a/text()").extract()
    if len(title) == 3:
        item['title'] = title[-1]
    else:
        return
    url = Selector(response).xpath("//li[@class='nowPic']/img/@src").extract()[0]
    item['origin_url'] = url
    item['new_url'] = ''
    item['mb'] = ''
    item['size'] = ''
    item['pixel'] = ''
    item['format'] = url.split("/")[-1].split('.')[-1]
    return item
def parse(self, response):
    """ Parsing through our data returned from web scraping """
    skiruns_str = Selector(response=response).xpath('//script/text()').extract()[11]
    skiruns = json.loads(skiruns_str.split("=")[1].split(";")[0])
    lifts = skiruns['Lifts']
    # update lift status information in the db
    for lift_dict in lifts:
        liftname = lift_dict['Name']
        liftstatus = lift_dict['Status']
        lift = Lift.query.filter(Lift.name == liftname).first()
        lift.status = liftstatus
        db.session.commit()
    runs_list = skiruns['GroomingAreas']
    for lifts_dict in runs_list:
        # Separate the data from the web scraper
        skirun_list = lifts_dict['Runs']
        lift_names = lifts_dict['Name']
        lift_names = lift_names.split(" - ")
        # Loop over and update the run information
        for skirun in skirun_list:
            skirun_name = skirun['Name']
            skirun_status = skirun['IsOpen']
            skirun_groomed = skirun['IsGroomed']
            skirun2 = Skirun.query.filter(Skirun.name == skirun_name).first()
            skirun2.status = skirun_status
            skirun2.groomed = skirun_groomed
            db.session.commit()
def __init__(self, symbol, **kwargs):
    self.allowed_domains = ['guba.eastmoney.com']
    self.base_url_prefix = 'http://guba.eastmoney.com'
    self.base_url = 'http://guba.eastmoney.com/list,%s,f_1.html' % symbol
    self.start_urls = []
    # obtain the number of pages
    subpage_url = 'http://guba.eastmoney.com/list,%s,f_{}.html' % symbol
    pageresponse = requests.post(self.base_url)
    pageresponse = Selector(text=pageresponse.text).xpath(
        '//span[@class="pagernums"]').extract_first()
    numpage = math.ceil(
        int(pageresponse.split('|')[-3]) / int(pageresponse.split('|')[-2]))
    # record the right number of news
    end_page = 0
    if os.path.exists('log/check_out.txt'):
        with open('log/check_out.txt', 'r') as f:
            end_page = int(f.readlines()[-1].strip())
    if numpage - end_page < 1:
        return
    if numpage - end_page < 10:
        task_page = numpage - end_page
    else:
        task_page = 10
    with open('log/check_out.txt', 'a+') as f:
        f.write('%d\n' % (end_page + task_page))
    # add all the urls of pages
    # job = int(job)
    # for i in range(job*10, (job+1)*10):
    for i in range(numpage - end_page, numpage - end_page - task_page, -1):
        self.start_urls.append(subpage_url.format(i))
    super().__init__(**kwargs)
def Vacancy(link):
    print("request sent for Vacancy successfully")
    url = link
    print(url)
    # headers = {"Accept-Language": "en-US,en;q=0.5"}
    page = requests.get(url)  # headers=headers)
    # Company
    try:
        company = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lnkCompany"]/text()').get()
    except Exception:
        company = ""
    # Website
    try:
        website = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lnkCompany"]/@href').get()
        website = [website]
    except Exception:
        website = []
    # Position
    try:
        position = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobPostTitle"]/text()').get()
    except Exception:
        position = ""
    # Logo
    try:
        logo = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_imgCompanyLogoLink"]/@src').get()
        logo = "http://jobfinder.am/" + logo
    except Exception:
        logo = ''
    # Job type
    try:
        job_type = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblPositionType"]/text()').get()
    except Exception:
        job_type = ""
    # Category
    try:
        category = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblCategory"]/text()').get()
    except Exception:
        category = ""
    # Experience
    try:
        experience = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblExperience"]/text()').get()
    except Exception:
        experience = ""
    # Education
    try:
        education = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblEducation"]/text()').get()
    except Exception:
        education = ""
    # Location
    try:
        location = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblLocation"]/text()').get()
    except Exception:
        location = ""
    # Published
    try:
        published = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblDate"]/text()').get()
        published = published.split(" ")
        published = published[0].split("-")
        publish_day = int(published[0])
        publish_month = int(published[1])
        publish_year = int("20" + published[2])
    except Exception:
        publish_day = 0
        publish_month = 0
        publish_year = 0
    if yesterday_day != publish_day or yesterday_month != publish_month:
        print("Not published yesterday")
        return
    # Ends
    try:
        ends = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblDate"]/text()').get()
        ends = ends.split(" ")
        ends = ends[0].split("-")
        deadline_day = int(ends[0])
        deadline_month = int(ends[1])
        deadline_year = int("20" + ends[2])
    except Exception:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0
    # Salary
    try:
        salary = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblSalary"]/text()').get()
        salary = int(salary)
    except Exception:
        salary = 0
    # Age
    try:
        age = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblAge"]/text()').get()
        if "--------" in age:
            age = ""
    except Exception:
        age = ""
    # Gender
    try:
        gender = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblGender"]/text()').get()
        if "--------" in gender:
            gender = ""
    except Exception:
        gender = ""
    # Job Description
    try:
        j_description = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobDescription"]/text()').get()
    except Exception:
        j_description = ""
    # Job Responsibilities
    try:
        j_responsibilities = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobResponsibilities"]/text()').get()
    except Exception:
        j_responsibilities = ""
    # Required Qualifications
    try:
        r_qualifications = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblRequiredQualifications"]').get()
        r_qualifications = remove_tags(r_qualifications)
    except Exception:
        r_qualifications = ""
    # Application Procedure
    try:
        a_procedure = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblApplicationProcedure"]').get()
        a_procedure = remove_tags(a_procedure)
    except Exception:
        # the original re-used a_procedure here, which would raise NameError
        a_procedure = ""
    v_description = j_description + "\n" + j_responsibilities + "\n" + r_qualifications + "\n" + a_procedure
    try:
        if detect(v_description) == "et":
            try:
                v_description_en = Translate(v_description)
            except Exception:
                v_description_en = ""
            v_description_am = v_description
        else:
            v_description_en = v_description
            v_description_am = ""
    except Exception:
        v_description_en = ""
        v_description_am = ""
    # About Company
    try:
        c_description = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblAboutCompany"]').get()
        c_description = remove_tags(c_description)
    except Exception:
        c_description = ""
    try:
        if detect(c_description) == "et":
            try:
                c_description_en = Translate(c_description)
            except Exception:
                c_description_en = ""
            c_description_am = c_description
        else:
            c_description_en = c_description
            c_description_am = ""
    except Exception:
        c_description_en = ""
        c_description_am = ""
    # Email
    try:
        email = Selector(response=page).xpath(
            '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblApplicationProcedure"]/a/text()').get()
        email = email.strip()
        email = [email]
    except Exception:
        email = []
    # Phone
    try:
        phone = re.search(r"\d{9}", v_description_en).group()
        phone = [{"country_code": "374", "number": phone}]
    except Exception:
        phone = []
    data = {
        "company": company,
        "position": position,
        "website": website,
        "logo": logo,
        "job_type": job_type,
        "category": category,
        "experience": experience,
        "education": education,
        "location": location,
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        "salary": salary,
        "age": age,
        "gender": gender,
        "v_description_am": v_description_am,
        "v_description_en": v_description_en,
        "c_description_am": c_description_am,
        "c_description_en": c_description_en,
        "email": email,
        "phone": phone,
    }
    # print(data)
    return data

# Vacancy('http://jobfinder.am/ViewJob.aspx?JobPostingID=49217')
def parse_book(self, response):
    # Python 2 code: note the str.decode() calls on the literal labels below.
    self.randomSleep(sleeptime)
    item = BookItem()
    selector = Selector(response)
    # get the cover image url
    item['cover'] = (selector.xpath('//div[@id="mainpic"]/a/@href').extract())[0]
    # item['cover'] = (selector.xpath('//a[@class = "nbg"]/@href').extract())[0].replace('mpic', 'lpic')
    # 'mpic' is the small image, 'lpic' the large one
    item['book_id'] = response.url.split('/')[-2]
    logger.info('-----process book ' + item['book_id'] + ' -----')
    # Douban gives no labels for the book info block, so XPath alone cannot
    # extract the fields; split the raw HTML on <br> instead.
    info_block = (selector.xpath('//div[@id="info"]').extract()[0]
                  .encode('utf-8').split('<br>'))[:-1]
    # the last item of 'info_block' is empty and must be dropped, otherwise
    # the loop below would raise an out-of-range error
    item['author'] = ''
    item['subtitle'] = ''
    item['publisher'] = ''
    item['price'] = ''
    item['total_page'] = ''
    item['pub_year'] = ''
    item['isbn'] = ''
    for info_item in info_block:
        info_text = Selector(text=info_item).xpath('string(.)').extract()[0]
        info_text = ''.join(info_text.split())
        # the labels below are the literal field names on the page:
        # author, subtitle, publisher, price, pages, publication year, ISBN
        if '作者'.decode('utf-8') in info_text:
            item['author'] = info_text[3:]
        elif '副标题'.decode('utf-8') in info_text:
            item['subtitle'] = info_text[4:]
        elif '出版社'.decode('utf-8') in info_text:
            item['publisher'] = info_text[4:]
        elif '定价'.decode('utf-8') in info_text:
            item['price'] = info_text[3:]
        elif '页数'.decode('utf-8') in info_text:
            item['total_page'] = info_text[3:]
        elif '出版年'.decode('utf-8') in info_text:
            item['pub_year'] = info_text[4:]
        elif 'ISBN' in info_text:
            item['isbn'] = info_text[5:]
    item['title'] = selector.xpath('//span[@property="v:itemreviewed"]/text()').extract()[0]
    item['grade'] = selector.xpath('//strong[@property="v:average"]/text()').extract()[0].encode('utf-8')
    item['gradecount'] = selector.xpath('//a[@class="rating_people"]/span/text()').extract()[0].encode('utf-8')
    read_block = selector.xpath('//div[@id="collector"]')
    item['reading_num'] = (read_block.xpath('.//a[contains(@href, "doings")]/text()').extract()[0])[:-3]
    item['readed_num'] = (read_block.xpath('.//a[contains(@href, "collections")]/text()').extract()[0])[:-3]
    item['preread_num'] = (read_block.xpath('.//a[contains(@href, "wishes")]/text()').extract()[0])[:-3]
    # TODO: reform item to the database
    yield item
    # Request the cover image for the book
    yield scrapy.Request(
        url=item['cover'],
        headers=self.getRandomHds(),
        meta={'proxy': proxy, 'cookiejar': 1},
        callback=(lambda response, book_id=item['book_id']:
                  self.parse_cover(response, book_id)),
        errback=self.parse_err,
    )
def Vacancy(link):
    print("request sent for Vacancy successfully")
    url = link
    print(url)
    # headers = {"Accept-Language": "en-US,en;q=0.5"}
    page = requests.get(url)  # headers=headers)
    # Published
    try:
        published = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[2]/span/text()[2]').get()
        published = published.strip().split(" ")
        publish_day = int(published[0].split("/")[0])
        publish_month = int(published[0].split("/")[1])
        publish_year = int(published[0].split("/")[2])
    except Exception:
        publish_day = 0
        publish_month = 0
        publish_year = 0
    if yesterday_day != publish_day or yesterday_month != publish_month:
        print("Not published yesterday")
        return
    # Location
    try:
        location = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[1]/text()').get()
        location = location.strip()
        location_id = []
        location = {"city": f"{location}", "id": f"{Geonames(location)}"}
        location_id.append(location)
    except Exception:
        location_id = [{'city': 'Yerevan', 'id': '616052'}]
    # Posted by
    try:
        posted_by = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[1]/text()').get()
        posted_by = posted_by.strip()
    except Exception:
        posted_by = ""
    # Email
    try:
        email = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[2]/text()').get()
        email = email.strip()
        if email == "":
            email = []
        else:
            email = [email]
    except Exception:
        email = []
    # Workspace
    try:
        workspace = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[2]/div[2]/div[2]/p/text()').get()
        workspace = workspace.strip()
    except Exception:
        workspace = ""
    # Job type
    try:
        job_type = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[3]/div[2]/div[2]/p/text()').get()
        job_type = job_type.strip()
    except Exception:
        job_type = ""
    # Salary
    try:
        salary = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[4]/div[2]/div[2]/p/text()').get()
        salary = salary.strip().replace("Until ", "")
        if "-" in salary:
            salary = salary.split("-")
            min_salary = int(salary[0].strip())
            max_salary = int(salary[1].strip())
        elif "-" not in salary and salary != '':
            min_salary = int(salary)
            max_salary = int(salary)
        else:
            min_salary = 0
            max_salary = 0
    except Exception:
        min_salary = 0
        max_salary = 0
    # Education
    try:
        education = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[5]/div[2]/div[2]/p/text()').get()
        education = education.strip()
    except Exception:
        education = ""
    # Experience
    try:
        experience = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[6]/div[2]/div[2]/p/text()').get()
        experience = experience.strip()
    except Exception:
        experience = ""
    # Gender
    try:
        gender = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[7]/div[2]/div[2]/p/i/@class').get()
        if "female" in gender:
            gender = "female"
        elif "male" in gender:
            gender = "male"
        else:
            gender = ''
    except Exception:
        gender = ""
    # Age
    try:
        age = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[8]/div[2]/div[2]/p/text()').get()
        age = age.strip()
    except Exception:
        age = ""
    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[10]/div[2]/div/p/text()').get()
        description = description.strip()
    except Exception:
        description = ""
    description_en = ""
    description_am = ""
    try:
        if detect(description) == "et":
            try:
                description_en = Translate(description)
            except Exception:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""
    except Exception:
        description_en = ""
        description_am = ""
    # Phone
    # (the original's except branch set `phone = []` while the data dict read
    # `phones`, which would raise NameError; phones is now defined up front)
    phones = []
    try:
        phone_blocks = Selector(response=page).css(
            '#sidebar-border > div.detailed-info-block.form-inline.clearfix > div.clearfix > div > div.user-details'
        ).extract()
        for phone in phone_blocks:
            phone = remove_tags(phone).strip()
            area_code = "374"
            number = phone.replace(" ", "")
            number = number.replace("-", "")
            number = number.replace("(", "")
            number = number.replace(")", "")
            phones.append({'country_code': area_code, "number": number})
    except Exception:
        phones = []
    # Username
    try:
        username = Selector(response=page).xpath(
            '//*[@id="sidebar-border"]/div[1]/div[1]/div/div[1]/div[2]/div[1]/div[2]/h6/a/text()').get()
        username = username.strip()
    except Exception:
        username = ""
    data = {
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "location_id": location_id,
        "posted_by": posted_by,
        "email": email,
        "workspace": workspace,
        "job_type": job_type,
        "min_salary": min_salary,
        "max_salary": max_salary,
        "education": education,
        "experience": experience,
        "gender": gender,
        "age": age,
        "description_am": description_am,
        "description_en": description_en,
        "phone": phones,
        "username": username,
    }
    print(data)
    return data

# Vacancy("https://full.am/en/job/public/view/1163")
# https://full.am/en/job/public/view/12067
# https://full.am/en/job/public/view/1163
            # ... (the start of this try block is truncated in the snippet)
                ).strip()
            link = "https://jobs.ge" + link
        except Exception:
            link = ""
        try:
            position = Selector(response=page).xpath(
                f'//*[@id="job_list_table"]/tr[{tr}]/td[2]/a/text()').get().strip()
        except Exception:
            position = ""
        try:
            published = Selector(response=page).xpath(
                f'//*[@id="job_list_table"]/tr[{tr}]/td[5]/text()').get()
            publish_day = int(published.split(" ")[0])
            publish_month = int(months[f"{published.split(' ')[1]}"])
            publish_year = year
        except Exception:
            publish_day = 0
            publish_month = 0
            publish_year = 0
        if yesterday_day != publish_day:
            print("Not published yesterday")
            continue
        try:
            ends = Selector(response=page).xpath(
                f'//*[@id="job_list_table"]/tr[{tr}]/td[6]/text()').get()
            ends = ends.split(" ")
            deadline_day = int(ends[0])
def parse(self, response):
    detailed_review_object_list = []
    review_selector_list = response.xpath(
        '//div[@id="reviews-container"]//div[@class="js-paginator-data"]'
    ).xpath('//div[@class="rvw js-rvw"]')
    for _review_selector in review_selector_list:
        _current_review_selector_body = _review_selector.get()
        # e.g. '5.0'
        _review_rating = Selector(text=_current_review_selector_body).xpath(
            '//div[@class="rvw__hdr-stat"]//img/@data-rating').get()
        # e.g. 'Julie of Ceres,, CA'
        _author_info = Selector(text=_current_review_selector_body).xpath(
            '//div[@class="rvw-aut__inf"]/strong/text()').get()
        _author_state: str = _author_info.split(',')[-1]  # 'CA'
        # e.g. 'Original review: March 18, 2019'
        _review_date_text = Selector(text=_current_review_selector_body).xpath(
            '//div[@class="rvw-bd ca-txt-bd-2"]/span/text()').get().split(':')[-1]
        # Remove whitespace to make it easier to convert to a datetime object
        _review_date_text = _review_date_text.replace(' ', '')  # 'March18,2019'
        _review_date_text = _review_date_text[-4:]
        # _date_pattern = '%b.%d,%Y'  # 'Oct.21,2019'
        _date_pattern = '%Y'  # '2019'
        _struct_time_format = time.strptime(_review_date_text, _date_pattern)
        _date_time_format = datetime.datetime(*_struct_time_format[:6])
        eastern = pytz.timezone('US/Eastern')
        utc = pytz.utc
        aware_date_time = eastern.localize(_date_time_format)
        utc_review_date_time = aware_date_time.astimezone(utc).timestamp()
        # The list of all paragraphs found in a review that we will process.
        _review_description_paragraph_list: list = Selector(
            text=_current_review_selector_body).xpath(
            '//div[@class="rvw-bd ca-txt-bd-2"]/p').getall()
        _clean_review_description_list: list = []
        # Check if there is a collapsed div that we need to process.
        if Selector(text=_current_review_selector_body).xpath(
                '//div[@class="rvw-bd ca-txt-bd-2"]/div[@class="js-collapsed"]').get() is not None:
            # Get all the paragraphs in the collapsed div that we found
            _collapsed_paragraph_list = Selector(
                text=_current_review_selector_body).xpath(
                '//div[@class="rvw-bd ca-txt-bd-2"]/div[@class="js-collapsed"]/p').getall()
            # Add these new paragraphs to our original list for processing
            _review_description_paragraph_list.extend(_collapsed_paragraph_list)
        for para in _review_description_paragraph_list:
            # If the paragraph is not empty
            if Selector(text=para).xpath('//p/text()').get() is not None:
                _clean_review_description_list.append(
                    Selector(text=para).xpath('//p/text()').get())
        _clean_review_description = ''.join(_clean_review_description_list)
        _num_found_useful_text: str = Selector(
            text=_current_review_selector_body).xpath(
            '//div[@class="rvw-foot"]/span[@class="rvw-foot__helpful-count js-helpful-count ca-txt--clr-gray"]/strong/text()'
        ).get()
        # Extract the number from text like '97 people'
        _num_found_useful: str = _num_found_useful_text.split(' ')[0]
        detailed_review_object = {
            'ratings': _review_rating,
            'reviewer_location': _author_state,
            'review_time_utc': str(utc_review_date_time),
            'review_description': _clean_review_description,
            'num_found_useful': _num_found_useful,
        }
        detailed_review_object_list.append(detailed_review_object)
    _return_data = {'reviews': detailed_review_object_list}
    return _return_data
def parse(self, response):
    # Python 2 code (print statements, str.encode cleanup).
    description = response.xpath(
        "//table[@class='itemlist']/tr[not(re:test(@class, "
        "'(spacer)'))]").extract()
    row = self.get_default_row_dict()
    for i, v in enumerate(description):
        index = i
        if not row['rank']:
            value = Selector(text=v).xpath(
                '//td[1]/span[@class="rank"]/text()').extract_first()
            row['rank'] = int(value.replace('.', '')) if value else 0
        if not row['story_text']:
            value = Selector(text=v).xpath(
                '//td[3]/a[@class="storylink"]/text()').extract_first()
            row['story_text'] = value.encode("utf8") if value else ''
        if not row['link_href']:
            value = Selector(text=v).xpath(
                '//td[3]/a[@class="storylink"]/@href').extract_first()
            row['link_href'] = value if value else ''
        if not row['hn_user']:
            value = Selector(text=v).xpath(
                '//a[@class="hnuser"]/text()').extract_first()
            row['hn_user'] = value.encode("utf8") if value else ''
        if not row['age']:
            value = Selector(text=v).xpath(
                '//span[@class="age"]/a/text()').extract_first()
            row['age'] = int(value.split(' ')[0]) if value else 0
        if not row['total_comments']:
            value = Selector(text=v).xpath(
                '//td[@class="subtext"]/a[contains(@href, "item?id=")]/text()'
            ).extract_first()
            if value:
                value = value.encode('ascii', 'ignore').replace('comments', '') if value else ''
                value = value.encode('ascii', 'ignore').replace('comment', '') if value else ''
                row['total_comments'] = int(value) if represents_int(value) else 0
        if not row['score']:
            value = Selector(text=v).xpath(
                '//span[@class="score"]/text()').extract_first()
            row['score'] = int(value.split(' ')[0]) if value else 0
        if not row['hn_id_code']:
            value = Selector(text=v).xpath('//tr[@class="athing"]/@id').extract_first()
            row['hn_id_code'] = int(value) if represents_int(value) else 0
        # save once every field has been filled in (no value is still None)
        if all(val is not None for val in row.values()):
            print 'Go for save >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>'
            data = row.copy()
            row = self.get_default_row_dict()
            self.comment_url.append('https://news.ycombinator.com/item?id=15318440')
            news_id = data['hn_id_code']
            item = NewsBotItem(data)
            yield item
            request = scrapy.Request(
                url='https://news.ycombinator.com/item?id=' + str(news_id),
                callback=self.parse_comment)
            request.meta['item'] = item
            request.meta['news_id'] = int(news_id)
            yield request
        if index % 2:
            row = self.get_default_row_dict()
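This spider calls a represents_int helper that is not shown above. A minimal sketch of what it presumably does (the body is an assumption):

def represents_int(value):
    # Assumed helper: True when value can be parsed as a base-10 integer.
    try:
        int(value)
        return True
    except (TypeError, ValueError):
        return False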
def captureDataAndWrite(self, response):
    # Python 2 code (print statements, dict.iteritems).
    data = {}
    # get all the data through XPATH
    univ_name = Selector(response).xpath(
        '/html/body/div[1]/div/div/div[2]/div[3]/div/div[1]/div[2]/h1/text()').extract()
    if univ_name is not None:
        univ_name = univ_name[0].strip()
        if '--' in univ_name:
            univ_name = univ_name.split("--")[0]
    else:
        univ_name = ""
    print univ_name
    # each crime category lives in a fixed row of the on-campus crime table
    crime_rows = [
        ("Murder/Manslaughter", 1),
        ("Negligence Manslaughter", 2),
        ("Rape", 3),
        ("Incest", 4),
        ("Statutory Rape", 5),
        ("Fondling", 6),
        ("Robbery", 9),
        ("Aggravated Assault", 10),
        ("Burglary", 11),
        ("Motor Vehicle Theft", 12),
        ("Arson", 13),
    ]
    for label, tr_index in crime_rows:
        value = Selector(response).xpath(
            '//*[@data-field-id="gCrimOnCampus"]/table/tbody/tr[%d]/td[4]/div[1]/text()' % tr_index
        ).extract()[0].strip()
        print value
        data[label] = value
    for key, value in data.iteritems():
        self.writeData(univ_name, key, value)
    # Email
    try:
        email = re.findall(r'[\w\.-]+@[\w\.-]+', description)[0]
    except Exception as e:
        email = []
    # Publication stuff
    v_page = requests.get(v_link)
    try:
        published = Selector(response=v_page).xpath(
            '//*[@id="ContentplaceholderMain_T7553F19B005_Col00"]/div[2]/div[2]/div[1]/div[1]/text()').get()
        published = published.strip()
        published = published.split(" ")
        publish_day = published[1].replace(",", "")
        publish_day = int(publish_day)
        publish_month = int(months[f"{published[0]}"])
        publish_year = int(published[2])
    except Exception:
        # the original set `published = 0` here, leaving publish_day unbound
        publish_day = 0
        publish_month = 0
        publish_year = 0
    if publish_day != yesterday_day:
        print("Not published Yesterday")
        continue
    data = {
        "company": company,
        "position": position,
def __init__(self, symbol, **kwargs):
    self.allowed_domains = ['gb.eastmoney.com']
    self.base_url_prefix = 'http://gb.eastmoney.com'
    self.base_url = 'http://gb.eastmoney.com/list,%s,1,f.html' % symbol[:6]
    self.start_urls = [self.base_url]
    self.symbol = symbol
    t1 = time()
    # obtain the number of pages
    subpage_url = 'http://gb.eastmoney.com/list,%s,1,f_{}.html' % symbol[:6]
    pageresponse = requests.post(self.base_url)
    self.tot_msg_num = 0
    self.num_per_page = 80
    if Selector(text=pageresponse.text).xpath(
            '//div[@class="noarticle"]').extract_first() is not None:
        numpage = 0
    else:
        pageresponse_text = Selector(text=pageresponse.text).xpath(
            '//span[@class="pagernums"]').extract_first()
        self.tot_msg_num = int(pageresponse_text.split('|')[-3])
        self.num_per_page = int(pageresponse_text.split('|')[-2])
        numpage = math.ceil(self.tot_msg_num / self.num_per_page)
    stockname = Selector(text=pageresponse.text).xpath(
        '//*[@id="stockname"]/a/@href').extract_first()
    if stockname is not None:
        stockname = stockname.split(',')[-1].split('.')[0]
        logging.warning(stockname)
        if stockname != symbol[:6]:
            raise  # abort: the page does not match the requested symbol
    else:
        self.start_urls = []
        return
    t2 = time()
    # obtain the record count and the last news item's time
    mysql_conn1 = create_engine(
        'mysql://*****:*****@10.24.224.249/webdata?charset=utf8')
    sql = ('select S_INFO_WINDCODE, URL from EastMoney '
           'where S_INFO_WINDCODE=\'' + symbol + '\'')
    df_record = pd.read_sql(sql, mysql_conn1)
    if len(df_record) == 0:
        self.last_URL = -1
    else:
        self.last_URL = max(int(url.split(',')[-1]) for url in df_record['URL'])
    t3 = time()
    # set a proxy to avoid being blocked
    proxys = pd.read_sql('select ip from Proxy where score>0', mysql_conn1)['ip'].values
    sel_proxy = random.choice(proxys)
    if sel_proxy[:3] == '127':
        self.proxy = None
    else:
        self.proxy = 'https://*****:*****@%s' % sel_proxy
    t4 = time()
    today = datetime.datetime.now()
    date_begin = (today + datetime.timedelta(days=-90)).strftime('%Y%m%d')
    date_end = (today + datetime.timedelta(days=30)).strftime('%Y%m%d')
    mysql_conn2 = create_engine(
        'mysql://*****:*****@10.24.224.249/wind?charset=utf8')
    trade_days = pd.read_sql(
        'select TRADE_DAYS from MyAShareCalendar where S_INFO_EXCHMARKET="SSE" order by TRADE_DAYS',
        mysql_conn2).rename(columns={'TRADE_DAYS': 'TRADE_DT'})
    trade_days['date'] = trade_days['TRADE_DT']
    self.all_date = pd.DataFrame({
        'date': [str(d)[:10].replace('-', '')
                 for d in pd.date_range(date_begin, date_end)]
    })
    self.all_date = self.all_date.merge(trade_days[['date', 'TRADE_DT']], how='left')
    self.all_date['TRADE_DT'] = self.all_date['TRADE_DT'].bfill()
    # self.all_date['next_date'] = self.all_date['TRADE_DT'].shift(-1)
    self.all_date['next_date'] = self.all_date['date'].shift(-1)
    self.all_date = self.all_date.set_index('date')
    self.record_num = len(df_record)
    # the site displays 80 news items per page
    crawled_pages = self.record_num // self.num_per_page
    start_page = max(crawled_pages, 1)
    t5 = time()
    # print(t2-t1, t3-t2, t4-t3, t5-t4)
    for i in range(numpage - start_page + 1, 0, -1):
        self.start_urls.append(subpage_url.format(i))
    super().__init__(**kwargs)
def parse(self, response):
    # os.system('dropdb whistler')
    # os.system('createdb whistler')
    # db.create_all()
    # r = requests.get('https://www.whistlerblackcomb.com/the-mountain/mountain-conditions/terrain-and-lift-status.aspx')
    skiruns_str = Selector(response=response).xpath('//script/text()').extract()[8]
    skiruns = json.loads(skiruns_str.split("=")[1].split(";")[0])
    lifts = skiruns['Lifts']
    # add all lifts to the DB
    for lift_dict in lifts:
        liftname = lift_dict['Name']
        liftstatus = lift_dict['Status']
        mountain = lift_dict['Mountain']
        new_lift = Lift(name=liftname, status=liftstatus, mountain=mountain)
        db.session.add(new_lift)
        db.session.commit()
    # a list of dictionaries with all the ski-run data, separated by lifts
    runs_list = skiruns['GroomingAreas']
    # lifts_dict is a dictionary where the key 'Runs' holds a list of all ski
    # runs that belong to the lift, and the key 'Name' holds the lifts that
    # service those runs, separated by " - "
    for lifts_dict in runs_list:
        # add all the runs to the DB
        skirun_list = lifts_dict['Runs']
        # list of lift names separated by '-'
        lift_names = lifts_dict['Name']
        if lift_names == 'The Peak - T-Bar':
            lift_names = ['The Peak', 'T-Bars']
        else:
            lift_names = lift_names.split(" - ")
        # each ski run list is a list of runs that belong to one lift
        for skirun in skirun_list:
            skirun_name = skirun['Name']
            if '/' in skirun_name:
                skirun_name = skirun_name.replace('/', '-')
            skirun_status = skirun['IsOpen']
            skirun_groomed = skirun['IsGroomed']
            level = skirun['Type']
            new_run = Skirun(name=skirun_name,
                             groomed=skirun_groomed,
                             status=skirun_status,
                             level=level)
            db.session.add(new_run)
            db.session.commit()
        # make the connections
        for lift_name in lift_names:
            # map scraped names to the lift names used in the DB
            if lift_name == 'Crystal Zone':
                lift_name = 'Crystal Ridge Express'
            if lift_name == 'Freestyle Half-pipes':
                lift_name = 'Catskinner Chair'
            if lift_name == 'Symphony Amphitheatre':
                lift_name = 'Symphony Express'
            if lift_name == 'The Peak':
                lift_name = 'Peak Express'
            if lift_name == 'Glacier':
                lift_name = 'Showcase T-Bar'
            if lift_name == 'Habitat Terrain Park':
                lift_name = 'Emerald Express'
            lift_obj = Lift.query.filter(Lift.name.contains(lift_name)).first()
            # adding relationship
            for run in skirun_list:
                skirun_name = run['Name']
                if '/' in skirun_name:
                    skirun_name = skirun_name.replace('/', '-')
                run_obj = Skirun.query.filter(Skirun.name == skirun_name).first()
                lift_obj.skiruns.append(run_obj)
            db.session.commit()
    categorieslst = ['tree', 'groomer', 'park', 'bowl']
    for category in categorieslst:
        add_category = Category(cat=category)
        db.session.add(add_category)
    db.session.commit()
    levels = ['green', 'blue', 'black']
    for level in levels:
        add_level = SkillLevel(level=level)
        db.session.add(add_level)
    db.session.commit()
    # add a category to each run
    skiruns = Skirun.query.all()
    categories = {category.cat: category for category in Category.query.all()}
    for skirun in skiruns:
        parks = ['Habitat Terrain Park', 'Big Easy Terrain Garden, Sz S',
                 'Nintendo Terrain Park, sz. M,L', 'Highest Level Park, Sz XL']
        bowls = ['Jersey Cream Bowl', 'Rhapsody Bowl', 'Ego Bowl - Lower',
                 'Ego Bowl - Upper']
        trees = ['7th Heaven', 'Enchanted Forest', 'Rock & Roll',
                 "Franz's - Upper", "Franz's - Lower"]
        # skirun.category relationship
        if skirun.name in parks:
            skirun.category = categories['park']
        elif skirun.name in bowls:
            skirun.category = categories['bowl']
        elif skirun.name in trees:
            skirun.category = categories['tree']
        else:
            skirun.category = categories['groomer']
    db.session.commit()
    # Add users to our db
    users = open("../../../static/json/users.json").read()
    users = json.loads(users)
    for user in users:
        fname = user['fname']
        lname = user['lname']
        email = user['email']
        zipcode = user['zipcode']
        # check to see user selected categories
        if user.get('category'):
            category = user['category']
        # level for fake data
        rand_level = random.choice(levels)
        level = SkillLevel.query.filter(SkillLevel.level == rand_level).first()
        clients = User(fname=fname, lname=lname, email=email, zipcode=zipcode,
                       level_id=level.level_id, password='******')
        db.session.add(clients)
        # make the connections
        for cat in category:
            user_obj = User.query.filter(User.email == email).first()
            catusr = Category.query.filter(Category.cat == cat).first()
            catusr.users.append(user_obj)
        db.session.commit()
    ratings = open("../../../static/json/rating.txt").read()
    ratings = ratings.strip()
    ratings = ratings.split('|')
    # loop through the list of comments
    for comment in ratings:
        comment = comment[:140]
        user_id = random.randint(1, 100)
        rating = random.randint(1, 5)
        skirun_id = random.randint(1, 142)
        comments = Rating(user_id=user_id, rating=rating,
                          skirun_id=skirun_id, comment=comment)
        db.session.add(comments)
    # commit work to the db
    db.session.commit()
    restaurants = open("../../../static/json/food.txt")
    for restaurant in restaurants:
        restaurant = restaurant.strip()
        restaurant_data = restaurant.split('|')
        name = restaurant_data[0].title()
        description = restaurant_data[1][:200]
        location = restaurant_data[2]
        lift_id = int(restaurant_data[4])
        yelp_id = restaurant_data[5]
        lift_obj = Lift.query.filter(Lift.lift_id == lift_id).first()
        new_restaurant = Food(name=name, description=description,
                              location=location, yelp_id=yelp_id)
        db.session.add(new_restaurant)
        # adding relationship
        new_restaurant.lifts.append(lift_obj)
        db.session.commit()