def parse(self, response):
    """Turn a topic-listing response into requests for the individual topics.

    Reads ``topic_kws`` and ``table_name`` from ``response.meta``, passes the
    body through ``unicode_normal.main_get``, and cuts the first 13 and last
    3 characters off the stripped result (presumably a wrapper around the
    JSON payload -- TODO confirm). Every substring matched by
    ``self.dict_pa`` is decoded as JSON and mapped onto a ``Topic_Item``.

    The collected items are handed to ``self.sqldb.get_newest_time`` and one
    ``scrapy.Request`` (callback ``self.parse_torrent``) is yielded for each
    item it returns, carrying the item in ``meta['topic_item']``.

    Fragments that are not valid JSON are printed and skipped.
    """
    topic_kws = response.meta['topic_kws']
    table_name = response.meta['table_name']

    # NOTE(review): calling .encode() on response.body relies on Python 2's
    # implicit ASCII decode of byte strings -- confirm non-ASCII pages survive.
    all_content = unicode_normal.main_get(
        response.body.encode('utf-8', 'ignore'))
    dicts_str = all_content.strip()[13:-3]

    item_list = []
    for raw_entry in re.findall(self.dict_pa, dicts_str):
        try:
            entry = json.loads(raw_entry)
        except (TypeError, ValueError):
            # Malformed fragment: report it and move on. The previous
            # version paused on raw_input() here, which stalls the crawl
            # until someone presses Enter.
            print(raw_entry)
            continue

        post_time = time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(float(entry['time'])))
        print(post_time)

        topic_item = Topic_Item()
        topic_item['topic_url'] = entry['linkurl']
        topic_item['topic_title'] = entry['title']
        topic_item['topic_post_time'] = post_time
        topic_item['table_name'] = table_name
        topic_item['topic_db_message'] = topic_kws
        item_list.append(topic_item)

    # presumably keeps only items newer than what is stored -- verify
    # against the sqldb helper.
    for item in self.sqldb.get_newest_time(item_list):
        yield scrapy.Request(
            item['topic_url'],
            callback=self.parse_torrent,
            meta={'topic_item': item})
# Fetch every comment page of a weibo post with a Selenium-driven IE browser,
# dump the normalised page source to page.txt and run parse_comm.py on it.
# `comm_page` (number of pages) and `post_id` (string) come from the
# enclosing scope, which is not visible here.
#
# NOTE(review): the original layout was collapsed onto one line; the parse
# step and the 2 s pause are assumed to run once per page, because page.txt
# is overwritten on every iteration.

# Compiled but not used by the statements below.
image_url_pattern = re.compile(r'src=\"(.*?)\"', re.S)
id_pattern = re.compile(r'\/(\d+)\/', re.S)
poster_pattern = re.compile(r'alt=\"(.*?)\"', re.S)

driver = webdriver.Ie()
print('go in')
# Sample URL kept from the original; not used by the statements below.
base_url = 'http://weibo.com/aj/comment/big?_wv=5&id=3716227136365799&page=3'

try:
    for page in range(comm_page):
        # Pages are 1-based in the URL.
        url = ('http://weibo.com/aj/comment/big?_wv=5&id=' + post_id
               + '&page=' + str(page + 1))
        driver.get(url)
        print(driver.title)
        print(driver.current_url)  # URL of the page currently loaded
        print(driver.name)

        # presumably gives the page time to finish rendering -- confirm
        time.sleep(5)
        con = main_get(driver.page_source.lower())

        # `with` closes the file even if the write fails.
        with open(r'C:\Users\MINUS\Desktop\work\weibo_reaseach\page.txt',
                  'wb') as f:
            f.write(con)

        os.system('python C:\\Users\\MINUS\\Desktop\\work\\weibo_reaseach\\parse_comm.py')
        time.sleep(2)
finally:
    # Always shut the browser down, even when a page fails.
    driver.quit()
# Fetch every comment page of a weibo post with a Selenium-driven IE browser,
# dump the normalised page source to page.txt and run parse_comm.py on it.
# `comm_page` (number of pages) and `post_id` (string) come from the
# enclosing scope, which is not visible here.
#
# NOTE(review): the original layout was collapsed onto one line; the parse
# step and the 2 s pause are assumed to run once per page, because page.txt
# is overwritten on every iteration.

# Compiled but not used by the statements below.
poster_pattern = re.compile(r'alt=\"(.*?)\"', re.S)

driver = webdriver.Ie()
print('go in')
# Sample URL kept from the original; not used by the statements below.
base_url = 'http://weibo.com/aj/comment/big?_wv=5&id=3716227136365799&page=3'

try:
    for page in range(comm_page):
        # Pages are 1-based in the URL.
        url = ('http://weibo.com/aj/comment/big?_wv=5&id=' + post_id
               + '&page=' + str(page + 1))
        driver.get(url)
        print(driver.title)
        print(driver.current_url)  # URL of the page currently loaded
        print(driver.name)

        # presumably gives the page time to finish rendering -- confirm
        time.sleep(5)
        con = main_get(driver.page_source.lower())

        # `with` closes the file even if the write fails.
        with open(r'C:\Users\MINUS\Desktop\work\weibo_reaseach\page.txt',
                  'wb') as f:
            f.write(con)

        os.system(
            'python C:\\Users\\MINUS\\Desktop\\work\\weibo_reaseach\\parse_comm.py'
        )
        time.sleep(2)
finally:
    # Always shut the browser down, even when a page fails.
    driver.quit()