Exemple #1
0
    def parse(self, response):
        """Build topic items from a listing response and schedule follow-up requests.

        Reads ``topic_kws`` and ``table_name`` from ``response.meta``, extracts
        every fragment matched by ``self.dict_pa`` from the normalised body and
        turns each one that parses as JSON into a ``Topic_Item``. The collected
        items are handed to ``self.sqldb.get_newest_time``; for every item it
        returns, a ``scrapy.Request`` to the item's ``topic_url`` is yielded
        with ``self.parse_torrent`` as callback and the item in ``meta``.

        Fragments that cannot be decoded as JSON are printed and skipped.
        """
        topic_kws = response.meta['topic_kws']
        table_name = response.meta['table_name']

        # NOTE(review): calling .encode() on the raw body relies on Python 2's
        # implicit str -> unicode coercion; confirm it is safe for non-ASCII pages.
        all_content = unicode_normal.main_get(response.body.encode('utf-8', 'ignore'))

        # Drop the first 13 and the last 3 characters of the stripped payload --
        # presumably a wrapper around the JSON data; verify against the feed.
        dicts_str = all_content.strip()[13:-3]
        raw_entries = re.findall(self.dict_pa, dicts_str)

        item_list = []
        for raw_entry in raw_entries:
            try:
                entry = json.loads(raw_entry)
            except (TypeError, ValueError):
                # Malformed fragment: report it and move on. Do not wait for
                # keyboard input here, that would stall the whole crawl.
                print(raw_entry)
                continue

            # Format the post time once and reuse it for the item and the trace.
            post_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(float(entry['time'])))
            print(post_time)

            topic_item = Topic_Item()
            topic_item['topic_url'] = entry['linkurl']
            topic_item['topic_title'] = entry['title']
            topic_item['topic_post_time'] = post_time
            topic_item['table_name'] = table_name
            topic_item['topic_db_message'] = topic_kws
            item_list.append(topic_item)

        res_items = self.sqldb.get_newest_time(item_list)
        for item in res_items:
            yield scrapy.Request(item['topic_url'],
                                 callback=self.parse_torrent,
                                 meta={'topic_item': item})
Exemple #2
0
# Regexes matching the value of src="..." attributes, a run of digits between
# two slashes, and the value of alt="..." attributes.
image_url_pattern = re.compile(r'src=\"(.*?)\"', re.S)
id_pattern = re.compile(r'\/(\d+)\/', re.S)
poster_pattern = re.compile(r'alt=\"(.*?)\"', re.S)

# Drive Internet Explorer through Selenium.
driver = webdriver.Ie()
print('go in')
base_url = 'http://weibo.com/aj/comment/big?_wv=5&id=3716227136365799&page=3'

# Fetch comment pages 1..comm_page for post_id; each page is normalised with
# main_get(), written to page.txt and then parse_comm.py is run.
try:
    for page in range(comm_page):
        url = ('http://weibo.com/aj/comment/big?_wv=5&id=' + post_id +
               '&page=' + str(page + 1))
        driver.get(url)
        print(driver.title)
        print(driver.current_url)  # URL of the page currently loaded
        print(driver.name)

        # Fixed wait before reading the page source (presumably to let the
        # page finish loading).
        time.sleep(5)
        con = driver.page_source
        con = con.lower()
        con = main_get(con)

        # The context manager closes the file even if the write fails.
        with open(r'C:\Users\MINUS\Desktop\work\weibo_reaseach\page.txt', 'wb') as f:
            f.write(con)

        # Run the external parser script (presumably it consumes page.txt).
        os.system('python C:\\Users\\MINUS\\Desktop\\work\\weibo_reaseach\\parse_comm.py')
        time.sleep(2)
finally:
    # Always shut the browser down, even when a page fetch or write fails.
    driver.quit()
Exemple #3
0
# Regex matching the value of alt="..." attributes.
poster_pattern = re.compile(r'alt=\"(.*?)\"', re.S)

# Drive Internet Explorer through Selenium.
driver = webdriver.Ie()
print('go in')
base_url = 'http://weibo.com/aj/comment/big?_wv=5&id=3716227136365799&page=3'

# Fetch comment pages 1..comm_page for post_id; each page is normalised with
# main_get(), written to page.txt and then parse_comm.py is run.
try:
    for page in range(comm_page):
        url = 'http://weibo.com/aj/comment/big?_wv=5&id=' + post_id + '&page=' + str(
            page + 1)
        driver.get(url)
        print(driver.title)
        print(driver.current_url)  # URL of the page currently loaded
        print(driver.name)

        # Fixed wait before reading the page source (presumably to let the
        # page finish loading).
        time.sleep(5)
        con = driver.page_source
        con = con.lower()
        con = main_get(con)

        # The context manager closes the file even if the write fails.
        with open(r'C:\Users\MINUS\Desktop\work\weibo_reaseach\page.txt', 'wb') as f:
            f.write(con)

        # Run the external parser script (presumably it consumes page.txt).
        os.system(
            'python C:\\Users\\MINUS\\Desktop\\work\\weibo_reaseach\\parse_comm.py'
        )
        time.sleep(2)
finally:
    # Always shut the browser down, even when a page fetch or write fails.
    driver.quit()