# Shared imports assumed by the spider methods below; the item classes
# (CsharpItem, FileItem, SlcoinItem, TestscrapytemplateItem, WeiboItem,
# ForprogrammerItem) and helpers such as Queries and slugify come from the
# individual spider projects.
import csv
import os
import re
import time
import urllib.request

import scrapy
from scrapy import Request


def parse(self, response):
    # the rules how to deal with the pages you get
    # learn more about 'xpath' grammar
    conference = response.xpath('//td/a[1]').re(r'<a href="([^"]*)"')
    t = response.xpath('//a[1]/font[@style="line-height:130%;"]/text()').extract()
    csvfile = open('output/A-12.csv', 'a', newline='', errors='ignore', encoding='gbk')
    writer = csv.writer(csvfile)
    for i in range(len(conference)):
        print(conference[i])
        if re.findall(r'(\S*)\.pdf', conference[i]) or re.findall(r'(\S*)\.doc', conference[i]):
            name = conference[i].split('/')
            if not os.path.exists('file/A-12/'):
                os.mkdir('file/A-12/')
            print('file/A-12/' + str(name[len(name) - 1]) + ' ' + conference[i])
            try:
                urllib.request.urlretrieve(conference[i], 'file/A-12/' + str(name[len(name) - 1]))
                writer.writerow([t[i], conference[i]])
            except Exception:
                writer.writerow([t[i], ''])
                print("Error!")
                continue
        else:
            print("This is an html page.")
            # yield Request(conference[i], meta={'url': conference[i]}, callback=self.parsecontent_temps)
    # close the file
    csvfile.flush()
    csvfile.close()
def parse_item(self, response):
    item = CsharpItem()
    sno = response.xpath(
        '//div[@class = "submission-full"]/div[@class = "header"]/h3[@class = "title"]/text()'
    ).extract()
    name = response.xpath(
        '//div[@class = "author"]/div[@class = "fullname"]/a/text()'
    ).extract()
    info = response.xpath(
        '//div[@class = "attachments"]/ul/li/a/@href').extract()
    item['name'] = name[0].replace(' ', '')
    item['sno'] = sno[0]
    item['info'] = info[0]
    yield item

    # url = response.urljoin(info[0].replace('?forcedownload=1', ''))
    url = response.urljoin(info[0])
    zipper = FileItem()
    zipper['file_urls'] = [url]
    zipper['name'] = name[0].replace(' ', '')
    zipper['sno'] = sno[0]
    zipper['info'] = info[0]
    yield zipper
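# FileItem above fills the standard 'file_urls' field, so the actual download is
# left to Scrapy's built-in FilesPipeline. A minimal sketch of the settings that
# would enable it (the FILES_STORE path is an assumption, not taken from the
# original project):
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = 'downloads'  # hypothetical directory for the fetched attachments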
def parse(self, response):
    self.flag = True
    # walk every user's profile link on the current page
    cxy = response.xpath("//div[@class='user-avatar ui tiny image']//@href").extract()
    for i in cxy:
        yield scrapy.Request(url=i, callback=self.zh)
    # crawl the whole site
    while self.flag:
        no = int(response.xpath("//div[@class='ui pagination menu']//a[@class='item active']//text()").extract()[0])
        if no > 9:
            if (no + 10) % no == 0:
                # pages that are an exact multiple of 10
                next_page = response.xpath("//div[@class='ui pagination menu']//a//@href").extract()[2]
            else:
                # pages between two multiples of 10
                next_page = response.xpath("//div[@class='ui pagination menu']//a//@href").extract()[int(no % 10 + 2)]
        else:
            # pages below 10
            next_page = response.xpath("//div[@class='ui pagination menu']//a//@href").extract()[int(no)]
        # check whether a next-page link exists
        if next_page is not None:
            # if it does, response.urljoin(next_page) turns the relative path
            # (e.g. page/1) into an absolute one by prepending the site domain,
            # e.g. http://lab.scrapyd.cn/page/1
            next_page = response.urljoin(next_page)
            self.flag = False
            yield scrapy.Request(next_page, callback=self.parse)
def parsecontent_temps(self, response):
    # parameters can be got by response.meta['key']
    # url = response.meta['url']
    t = response.xpath('//div[@id="cke_pastebin"]/p[1]').extract()
    if not t:
        t = response.xpath('//td[@class="STYLE3"]/p[1]').extract()
    # content_temp = response.xpath('//td[@style="FONT-SIZE: 16pt; FONT-FAMILY: 宋体; mso-ascii-font-family: \'Times New Roman\'; mso-hansi-font-family: \'Times New Roman\'"]')[0].extract()
    # t = response.xpath('//p[@class="MsoNormal"]/span').re('style="FONT-SIZE: 22pt; COLOR: red; FONT-FAMILY: 宋体; mso-ascii-font-family: \'Times New Roman\'; mso-hansi-font-family: \'Times New Roman\'">(\S*)</span>')
    c = response.xpath('//div[@id="cke_pastebin"]/p[position()>1]').extract()
    if not c:
        c = response.xpath('//td[@class="STYLE3"]/p[position()>1]').extract()
    title = ''
    for i in t:
        title += i
    title = re.sub('<[^>]+>', '', title)
    title = re.sub('[\t\n\r]', '', title)
    title = re.sub(' +', ' ', title)
    content_temp = ''
    for i in c:
        content_temp += i
    # use regular expression to resolve the pages you get, especially html elements
    content_temp = re.sub('<[^>]+>', '', content_temp)
    content_temp = re.sub('[\t\n\r]', '', content_temp)
    content_temp = re.sub(' +', ' ', content_temp)
    # open the file and write and close it
    csvfile = open('output/A-11.csv', 'a', newline='', errors='ignore', encoding='gbk')
    writer = csv.writer(csvfile)
    if title == '':
        title = response.url
    writer.writerow([title, content_temp])
    csvfile.flush()
    csvfile.close()
    relative_urls = response.xpath('..//a[@href]').re(r'<a href="([^"]*)"')
    if relative_urls:
        for i in range(len(relative_urls)):
            relative_url = relative_urls[i]
            relative_url = re.sub('<[^>]+>', '', relative_url)
            relative_url = re.sub('[\t\n\r]', '', relative_url)
            relative_url = re.sub(' +', ' ', relative_url)
            absolute_url = 'http://www.gsc.org.cn/' + relative_url
            if re.findall(r'(\S*)\.pdf|(\S*)\.doc', absolute_url):
                name = absolute_url.split('/')
                if not os.path.exists('file/A-11/'):
                    os.mkdir('file/A-11/')
                print('file/A-11/' + str(name[len(name) - 1]) + ' ' + absolute_url)
                urllib.request.urlretrieve(absolute_url, 'file/A-11/' + str(name[len(name) - 1]))
def parsecontent_temps(self, response):
    # parameters can be got by response.meta['key']
    # url = response.meta['url']
    t = response.xpath(
        '//td[@style="width:700px;text-align:center;font-size:24px;font-weight:bold;line-height:40px;color:#000000;"]/text()'
    )[0].extract()
    # content_temp = response.xpath('//td[@style="FONT-SIZE: 16pt; FONT-FAMILY: 宋体; mso-ascii-font-family: \'Times New Roman\'; mso-hansi-font-family: \'Times New Roman\'"]')[0].extract()
    # t = response.xpath('//p[@class="MsoNormal"]/span').re('style="FONT-SIZE: 22pt; COLOR: red; FONT-FAMILY: 宋体; mso-ascii-font-family: \'Times New Roman\'; mso-hansi-font-family: \'Times New Roman\'">(\S*)</span>')
    c1 = response.xpath('//p[@align="left"]').extract()
    c2 = response.xpath('//p[@class ="MsoNormal"]').extract()
    c3 = response.xpath(
        '//p[@style="width:730px;text-align:left;line-height:28px;"]'
    ).extract()
    # c2 = response.xpath('//p[@style="text-align:left;text-indent:2em;font-family:SimSun;font-size:18px;"]').re("\">(\S*)</p>")
    title = t
    content_temp = ''
    for i in c1:
        content_temp += i
    for i in c2:
        content_temp += i
    for i in c3:
        content_temp += i
    # if re.findall("<a target=\"_blank\" href=\"(\S*)\.pdf\"", content_temp):
    #     file_url = re.findall("<a target=\"_blank\" href=\"[^\"]*\"", content_temp)
    #     file_name = re.findall("<a target=\"_blank\" href=\"[^\"]*\">(\S*)</a>", content_temp)
    #     file_url[0] = re.sub("<a target=\"_blank\" href=\"", "", file_url[0])
    #     file_url[0] = re.sub("\"", "", file_url[0])
    #     name = file_name[0].split('/')
    #     if not os.path.exists('file/A-08/'):
    #         os.mkdir('file/A-08/')
    #     print('file/A-08/' + str(name[len(name) - 1]) + ' ' + "http://www.cms1924.org" + file_url[0])
    #     urllib.request.urlretrieve("http://www.cms1924.org" + file_url[0], 'file/A-08/' + str(name[len(name) - 1]))
    # use regular expression to resolve the pages you get, especially html elements
    content_temp = re.sub('<[^>]+>', '', content_temp)
    content_temp = re.sub('[\t\n\r]', '', content_temp)
    content_temp = re.sub(' +', ' ', content_temp)
    # open the file and write and close it
    csvfile = open('output/A-10.csv', 'a', newline='', errors='ignore', encoding='gbk')
    writer = csv.writer(csvfile)
    writer.writerow([title, content_temp])
    csvfile.flush()
    csvfile.close()
def parse(self, response):
    item = SlcoinItem()
    list = response.xpath('//*[@id="table"]/tbody')
    coin_type = list.xpath('//tr/td[2]/a/img/@alt').extract()
    coin_money = list.xpath('//tr/td[4]/a/@data-usd').extract()
    coin_upDown = list.xpath('//tr/td[7]/span/text()').extract()
    for i in range(0, 100):
        if coin_type[i] == 'BTC-比特币' or coin_type[i] == 'ETH-以太坊' or \
                coin_type[i] == 'EOS-柚子' or coin_type[i] == 'LTC-莱特币' or \
                coin_type[i] == 'XMR-门罗币' or coin_type[i] == 'NEO-小蚁' or \
                coin_type[i] == 'ETC-以太经典' or coin_type[i] == 'OMG-嫩模币' or \
                coin_type[i] == 'BNB-币安币' or coin_type[i] == 'HT-火币积分' or \
                coin_type[i] == 'ZEC-大零币' or coin_type[i] == 'DCR' or \
                coin_type[i] == 'MKR' or coin_type[i] == 'REP' or \
                coin_type[i] == 'GXS-公信宝' or coin_type[i] == 'EMC-崛起币' or \
                coin_type[i] == 'ZEN' or coin_type[i] == 'VERI' or \
                coin_type[i] == 'XZC-小零币' or coin_type[i] == 'FCT-公证通':
            item = SlcoinItem()
            item['coin_type'] = re.sub("[\u4e00-\u9fa5]|-", "", coin_type[i])
            item['coin_money'] = coin_money[i]
            item["time"] = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime(time.time()))
            item["coin_upDown"] = coin_upDown[i]
            yield item
        else:
            print("I do not want it")
    next_page = ['list_2.html#USD']
    next_page = ','.join(next_page)
    if next_page is not None:
        next_page = response.urljoin(next_page)
        yield scrapy.Request(next_page, callback=self.parse)
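# The long chain of equality checks above can be written as one membership test
# against a set. A behavior-equivalent sketch (the wanted_coins name is
# illustrative and not part of the original spider):
wanted_coins = {
    'BTC-比特币', 'ETH-以太坊', 'EOS-柚子', 'LTC-莱特币', 'XMR-门罗币',
    'NEO-小蚁', 'ETC-以太经典', 'OMG-嫩模币', 'BNB-币安币', 'HT-火币积分',
    'ZEC-大零币', 'DCR', 'MKR', 'REP', 'GXS-公信宝', 'EMC-崛起币',
    'ZEN', 'VERI', 'XZC-小零币', 'FCT-公证通',
}
# ...then inside the loop:
#     if coin_type[i] in wanted_coins: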
def parse_url(self, response):
    urls = response.xpath(
        '//tr/td[@class = "submission cell c1"]/a[@class = "title"]/@href'
    ).extract()
    for url in urls:
        yield scrapy.Request(url=url,
                             headers=self.headers,
                             cookies=self.cookies,
                             callback=self.parse_item)
def parseAppDetail(self, response):
    print('-------11111, meta ' + str(response.meta))
    appRank = response.meta['rank']
    print('-------2222, rank ' + str(appRank))
    appName = response.xpath(
        '//*[@id="J_DetDataContainer"]/div/div[1]/div[2]/div[1]/div[1]/text()'
    ).extract()[0]
    appDownload = response.xpath(
        '//*[@id="J_DetDataContainer"]/div/div[1]/div[2]/div[3]/div[1]/text()'
    ).extract()[0]
    appType = response.xpath('//*[@id="J_DetCate"]/text()').extract()[0]
    item = TestscrapytemplateItem()
    item['appName'] = appName
    item['appDownloadCount'] = appDownload
    item['appType'] = appType
    item['appLink'] = response.url
    item['appRank'] = appRank
    print('-------222, appName ' + appName + ' type = ' + appType +
          ' download = ' + appDownload)
    yield item
def parsecontent_temps(self, response):
    # parameters can be got by response.meta['key']
    # url = response.meta['url']
    title = response.xpath(
        '//td[@class ="contentTitle"]/text()')[0].extract()
    content_temp = response.xpath(
        '//td[@class="contentDetail"]')[0].extract()
    if re.findall(r'<a target="_blank" href="(\S*)\.pdf"', content_temp):
        file_url = re.findall(r'<a target="_blank" href="[^"]*"', content_temp)
        file_name = re.findall(r'<a target="_blank" href="[^"]*">(\S*)</a>', content_temp)
        file_url[0] = re.sub('<a target="_blank" href="', '', file_url[0])
        file_url[0] = re.sub('"', '', file_url[0])
        name = file_name[0].split('/')
        if not os.path.exists('file/A-08/'):
            os.mkdir('file/A-08/')
        print('file/A-08/' + str(name[len(name) - 1]) + ' ' + "http://www.cms1924.org" + file_url[0])
        urllib.request.urlretrieve("http://www.cms1924.org" + file_url[0],
                                   'file/A-08/' + str(name[len(name) - 1]))
    # use regular expression to resolve the pages you get, especially html elements
    content_temp = re.sub('<[^>]+>', '', content_temp)
    content_temp = re.sub('[\t\n\r]', '', content_temp)
    content_temp = re.sub(' +', ' ', content_temp)
    # open the file and write and close it
    csvfile = open('output/A-08.csv', 'a', newline='', errors='ignore', encoding='gbk')
    writer = csv.writer(csvfile)
    writer.writerow([title, content_temp])
    csvfile.flush()
    csvfile.close()
def parse_detail(self, response):
    id = re.search(r'comment/(.*?)\?', response.url).group(1)
    url = response.url
    content = ''.join(
        response.xpath(
            '//div[@id="M_"]//span[@class="ctt"]//text()').extract())
    print(id, url, content)
    comment_count = response.xpath(
        '//span[@class="pms"]//text()').re_first(r'评论\[(.*)]')
    forward_count = response.xpath(
        '//a[contains(.,"转发[")]//text()').re_first(r'转发\[(.*)]')
    like_count = response.xpath('//a[contains(.,"赞[")]').re_first(r'赞\[(.*)]')
    print(comment_count, forward_count, like_count)
    posted_at = response.xpath(
        '//div[@id="M_"]//span[@class="ct"]//text()').extract_first(default=None)
    user = response.xpath('//div[@id="M_"]/div[1]/a/text()').extract_first(default=None)
    keyword = response.meta['keyword']
    weibo_item = WeiboItem()
    # every local variable whose name matches a WeiboItem field is copied into the item
    for field in weibo_item.fields:
        try:
            weibo_item[field] = eval(field)
        except NameError:
            self.logger.debug('Field is not defined: ' + field)
    yield weibo_item
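# eval(field) above looks up each field by evaluating its name in the local
# scope. A sketch of the same idea without eval(); the helper name is
# illustrative and not part of the original spider:
def fill_item_fields(item, values):
    """Copy values into the item for every declared field present in values."""
    for field in item.fields:
        if field in values:
            item[field] = values[field]
    return item

# usage inside parse_detail(), assuming the same local variable names:
#     fill_item_fields(weibo_item, {'id': id, 'url': url, 'content': content,
#                                   'comment_count': comment_count,
#                                   'forward_count': forward_count,
#                                   'like_count': like_count,
#                                   'posted_at': posted_at, 'user': user,
#                                   'keyword': keyword})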
def parse(self, response):
    nodesAll = response.xpath(
        '//*[@id = "J_RankTabBody"]/li[2]/ul/child::*').extract()
    rankNum = len(nodesAll)
    for i in range(rankNum):
        appName = response.xpath('//*[@id="J_RankTabBody"]/li[2]/ul/li[' +
                                 str(i + 1) + ']/div[1]/a/text()').extract()[0]
        appLink = response.xpath('//*[@id="J_RankTabBody"]/li[2]/ul/li[' +
                                 str(i + 1) + ']/a/@href').extract()[0]
        appRank = i
        appDownloadCount = response.xpath(
            '//*[@id="J_RankTabBody"]/li[2]/ul/li[' + str(i + 1) +
            ']/div[1]/div[1]/span/text()').extract()[0]
        print('parse.....rank = ' + str(appRank) + ', link = ' + str(appLink) +
              ', downloadCount = ' + str(appDownloadCount) + ' appName = ' +
              str(appName))
        detailLink = str(appLink).replace('..', 'https://sj.qq.com')
        print(detailLink)
        yield scrapy.Request(detailLink,
                             self.parseAppDetail,
                             meta={'rank': appRank},
                             flags=[appRank, 'sdfsdf'])
def parse_index(self, response):
    # print(response.text)
    weibos = response.xpath('//div[@class="c" and contains(@id, "M_")]')
    print(weibos)
    for weibo in weibos:
        is_forward = bool(
            weibo.xpath('.//span[@class="cmt"]').extract_first())
        if is_forward:
            detail_url = weibo.xpath(
                './/a[contains(., "原文评论[")]//@href').extract_first()
        else:
            detail_url = weibo.xpath(
                './/a[contains(., "评论[")]//@href').extract_first()
        print(detail_url)
        yield Request(detail_url, callback=self.parse_detail)
def parse(self, response):
    # the rules how to deal with the pages you get
    # learn more about 'xpath' grammar
    conference = response.xpath('//td[@height="24"]/a').re(r'<a href="([^"]*)"')
    # if the page you want is a secondary page and you can only get their urls,
    # you can collect the urls and then use the Request() function
    urls = []
    for i in range(len(conference)):
        url = 'http://www.gsc.org.cn' + conference[i]  # conference[i].xpath('//a/@href').extract()[0]
        print(url)
        urls.append(url)
    for url in urls:
        # parameters can be passed by meta={'key': value}
        yield Request(url, meta={'url': url}, callback=self.parsecontent_temps)
def parse(self, response):
    # the rules how to deal with the pages you get
    # learn more about 'xpath' grammar
    conference = response.xpath(
        '//td[@style="text-align:left;line-height:28px;border-bottom:1px #DBDBDB dashed;"]/a'
    ).re(r'<a href="([^"]*)"')
    # if the page you want is a secondary page and you can only get their urls,
    # you can collect the urls and then use the Request() function
    urls = []
    for i in range(len(conference)):
        conference[i] = re.sub('amp;', '', conference[i])
        url = 'http://www.geosociety.org.cn/' + conference[i]  # conference[i].xpath('//a/@href').extract()[0]
        print(url)
        urls.append(url)
    for url in urls:
        # parameters can be passed by meta={'key': value}
        yield Request(url, meta={'url': url}, callback=self.parsecontent_temps)
def parse_subpage(self, response):
    # print("subpage")
    try:
        global video_url, url, duration, video_keywords, title, description, image, modified_time, category, lang, modified_video_keywords
        url = response.xpath(
            "//meta[@property = 'og:url']/@content").extract_first()
        langcheck = response.meta['langcheck']
        try:
            if langcheck == "tamil":
                url = response.xpath(
                    '//script[contains(.,"__html5playerdata")]').re(
                        '"media":"(.+)"')[0]
            else:
                url = response.xpath(
                    '//script[contains(., "__html5playerdata")]/text()').re(
                        '"media_mp4":"(.+)"')[0]
            video = url.split('","')[0]
            video_url = video.replace('\\', '')
            # print(video_url)
            url = response.xpath(
                "//meta[@property = 'og:url']/@content").extract_first()
            # print(url)
            title = response.xpath(
                "//meta[@property = 'og:title']/@content").extract_first()
            # print(title)
            description = response.xpath(
                "//meta[@property = 'og:description']/@content").extract_first()
            # print(description)
            image = response.xpath(
                "//meta[@property = 'og:image']/@content").extract_first()
            # print(image)
            video_keywords = response.xpath(
                "//meta[@name = 'keywords']/@content").extract_first()
            # print(video_keywords)
            duration = response.xpath(
                '//script[@type = "application/ld+json"]/text()').re(
                    '"duration": +"(.+)"')[0]
            # print(duration)
            duration = duration.replace('PT', '0:').replace('M', ':').replace('S', '')
            # print(duration)
            # yield {
            #     'food': response.xpath("//meta[@property = 'og:title']/@content").extract_first()
            # }
            category = response.meta['category']
            lang = response.meta['lang']
            if langcheck == 'tamil':
                lang = "tamil"
            else:
                try:
                    title.encode(encoding='utf-8').decode('ascii')
                except UnicodeDecodeError:
                    lang = "hindi"
                else:
                    lang = "english"
        except:
            print("EXCEPTION HIT")
        try:
            # zero-pad each part of the H:M:S duration string
            time = duration.split(':')
            time1 = time[0] if int(time[0]) > 9 else '0' + time[0]
            time2 = time[1] if int(time[1]) > 9 else '0' + time[1]
            time3 = time[2] if int(time[2]) > 9 else '0' + time[2]
            modified_time = str(time1 + ":" + time2 + ":" + time3)
        except:
            print("TIME ERROR")
        # print(video_url, duration, title, description, image, video_keywords)
        try:
            modified_video_keywords = [x.strip() for x in video_keywords.split()]
        except:
            print("SPLIT ERROR")
        keywords = Queries.insert_keywords(self, modified_video_keywords)
        insertObject = {
            "video_title": title,
            "video_slug": slugify(title),
            "video_link": video_url,
            "video_description": description,
            "broadcaster": "5d3e9dde3b6d5e43e2ef58ec",  # Not yet changed
            "videoformat": "5ce4f7eda5c038104cb76648",  # Not yet changed
            "video_image": image,
            "videokeywords": keywords,
            "page_url": response.url,
            "duration": modified_time,
            "category": category,
            "language": lang,
            "keywords": ' | '.join(map(str, modified_video_keywords)),
        }
        if ((len(video_url) > 0 and len(image) > 0 and len(modified_time) == 8)
                and (video_url.endswith('.m3u8') or video_url.endswith('.mp4'))):
            # print("aaa------------------aaaa")
            # print(title)
            # print(video_url)
            # print(modified_time)
            # print(image)
            # print(category)
            # print(lang)
            #
            # print(insertObject)
            # print("bbb------------------bbb")
            result = Queries.insert_api(self, insertObject)
            if result.status_code == 200:
                print("INSERTED")
            else:
                print(result.status_code)
                print(result)
    except:
        print("URL MISSING")
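# The duration handling above turns an ISO-8601 value such as "PT5M30S" into a
# zero-padded "00:05:30" string in two passes. A compact sketch of the same idea
# as a single helper (the function name is illustrative; unlike the original it
# also accepts an optional hours part):
def iso_duration_to_hms(value):
    """Convert e.g. 'PT5M30S' or 'PT1H2M3S' to an 'HH:MM:SS' string."""
    match = re.fullmatch(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', value)
    if not match:
        return None
    hours, minutes, seconds = (int(part or 0) for part in match.groups())
    return '%02d:%02d:%02d' % (hours, minutes, seconds)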
def parse(self, response):
    # walk every user's profile link on the current page
    cxy = response.xpath("//div[@class='user-avatar ui tiny image']//@href").extract()
    for i in cxy:
        # each fetched profile page is handed to the zh() method for processing
        yield scrapy.Request(url=i, callback=self.zh, dont_filter=True)
def zh(self, response):
    item = ForprogrammerItem()
    '''
    introduction = response.xpath("string(//div[@class='introduction'])").extract()[0]  # personal introduction
    item['introduction'] = introduction.replace('\n', '').replace(' ', '').replace('，', '-')
    workAndStudyExperience = response.xpath("//ul[@class='J_Works']//text()").extract()  # work + education experience
    if len(workAndStudyExperience) > 0:
        item['workAndStudyExperience'] = ','.join(workAndStudyExperience).replace('\n', '').replace(' ', '').replace('，', '-')
    else:
        item['workAndStudyExperience'] = '无'
    skill = response.xpath("//div[@class='skill-list']//text()").extract()  # skills
    if len(skill) > 0:
        item['skill'] = ','.join(skill).replace('\n', '').replace(' ', '').replace('，', '-')
    else:
        item['skill'] = '无'
    moneyAndWork = response.xpath("//div[@class='hire-info']//p//text()").extract()  # salary and job characteristics
    if len(moneyAndWork) > 0:
        item['moneyAndWork'] = ','.join(moneyAndWork).replace('\n', '').replace(' ', '').replace('，', '-')
    else:
        item['moneyAndWork'] = '无'
    zuopin = response.xpath(  # portfolio works
        "//div[@class='work-list']//ul//li//a[@class='media']//div[@class='info']//p//text()").extract()
    item['zuopin'] = ','.join(zuopin).replace(' ', '')
    '''
    # ------------------ the fields below were added later to match the database schema ------------------
    # city
    user_city = response.xpath("//div[@class='introduction']//text()").extract()
    if len(user_city) == 3:
        user_city = user_city[1]
    elif len(user_city) == 2:
        user_city = '-'
    # current company name
    user_nowcompany = response.xpath("//div[@class='introduction']//text()").extract()
    if len(user_nowcompany) == 2:
        user_nowcompany = user_nowcompany[1].split()[0]
    elif len(user_nowcompany) == 3:
        user_nowcompany = user_nowcompany[2].split()[0]
    # current job title
    user_nowoccupation = response.xpath("//div[@class='introduction']//text()").extract()
    if len(user_nowoccupation) == 2:
        user_nowoccupation = user_nowoccupation[1].split()[1]
    elif len(user_nowoccupation) == 3:
        user_nowoccupation = user_nowoccupation[2].split()[1]
    # personal introduction
    user_introduction = response.xpath("string(//div[@class='overflowhidden editor-style content'])").extract()[0]
    user_introduction = user_introduction.replace('\n', '').replace(' ', '').replace('，', '-')
    # expected daily rate
    user_expectsalary = response.xpath("string(//div[@class='hire-info']//p[@class='work-price'])").extract()[0]
    # graduation school
    user_school = response.xpath("//div[@class='panel proginn-work-history'][last()]//p[@class='title']//span[2]//text()").extract()[0]
    # number of comments
    user_comment_number = response.xpath("//div[@id='proginn_wo_omment']//div[@class='content']//div[@class='content']//a").extract()
    user_comment_number = len(user_comment_number)
    # source link of the crawled page
    source_link = response.xpath("//head//link[@rel='canonical']//@href").extract()[0]
    # daily working hours outside holidays
    # user_noworktime = response.xpath("//div[@class='hire-info']//p[last()]//text()").extract()[0]
    # item['user_noworktime'] = user_noworktime
    # github
    # github = response.xpath("//div[@class='social-list']//a/@href").extract()[0]
    # item['github'] = github
    # ability description
    user_ability_describe = response.xpath("//div[@class='verify']//text()").extract()
    user_ability_describe = ','.join(user_ability_describe).lstrip().replace('，', ' ').lstrip()
    # name
    user_name = response.xpath("//a[@class='header']//text()").extract()[0]
    # avatar
    user_picturehead = response.xpath("//div[@class='four wide column side-profile']//a//img//@src").extract()[0]
    # id taken from the user detail url
    details_id = response.xpath("//head//link[@rel='canonical']//@href").extract()[0]
    details_id = details_id[27:]
    item = ForprogrammerItem(
        user_city=user_city,
        user_nowcompany=user_nowcompany,
        user_nowoccupation=user_nowoccupation,
        user_introduction=user_introduction,
        user_expectsalary=user_expectsalary,
        user_school=user_school,
        user_comment_number=user_comment_number,
        source_link=source_link,
        user_ability_describe=user_ability_describe,
        user_name=user_name,
        user_picturehead=user_picturehead,
        details_id=details_id,
    )
    yield item
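# The zh() method above instantiates ForprogrammerItem with keyword arguments,
# so the item class needs a matching Field for each name. A minimal sketch of
# such a declaration (an assumed layout for the project's items module, not the
# original source):
class ForprogrammerItem(scrapy.Item):
    user_city = scrapy.Field()
    user_nowcompany = scrapy.Field()
    user_nowoccupation = scrapy.Field()
    user_introduction = scrapy.Field()
    user_expectsalary = scrapy.Field()
    user_school = scrapy.Field()
    user_comment_number = scrapy.Field()
    source_link = scrapy.Field()
    user_ability_describe = scrapy.Field()
    user_name = scrapy.Field()
    user_picturehead = scrapy.Field()
    details_id = scrapy.Field()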