class Share():
    """Scrape the re-shares listed on the first share page of an account.

    ``Launcher`` supplies the logged-in browser driver and the list of share
    page URLs; ``Es_fb`` is the storage helper used by :meth:`save`
    (presumably an Elasticsearch client -- confirm against its definition).
    """

    # Locators, relative to one re-share node unless they start with '//'.
    _CONTAINER_XPATH = '//div[@id="repost_view_permalink"]/div/div[1]/div'
    _POST_XPATH = './div'
    _AUTHOR_XPATH = './div/div[2]/div[1]/div[2]/div[1]/div/div/div[2]/div/div/div[2]/h5/span/span/span/a'
    _PHOTO_XPATH = './div/div[2]/div/div[2]/div/div/a/div/img'
    _CONTENT_XPATH = './div/div[2]/div/div[2]/div[2]//p'
    _TIME_XPATH = './div/div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[3]/span/a/abbr'

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    _USER_ID_RE = re.compile(r'id=(\d+)')

    def __init__(self, username, password):
        """Log in through ``Launcher`` and fetch the list of share URLs."""
        self.launcher = Launcher(username, password)
        self.driver = self.launcher.login()
        self.es = Es_fb()
        self.list = []
        self.share_list = self.launcher.get_share_list()

    def get_share(self):
        """Scrape every re-share shown on the first URL of ``share_list``.

        Returns:
            ``self.list`` with one dict appended per re-share, holding the
            keys ``nick_name``, ``uid`` (list of id strings captured from the
            author's hovercard link), ``photo_url``, ``text`` (the string
            ``'None'`` when the post has no paragraph) and ``timestamp``
            (int taken from ``data-utime``).  The list is returned unchanged
            when there is no share URL to visit.
        """
        if not self.share_list:
            # Nothing to visit; the original indexed [0] and raised IndexError.
            return self.list

        self.driver.get(self.share_list[0])
        for container in self.driver.find_elements_by_xpath(self._CONTAINER_XPATH):
            for post in container.find_elements_by_xpath(self._POST_XPATH):
                self.list.append(self._parse_post(post))
        return self.list

    def _parse_post(self, post):
        """Build the item dict for a single re-share node."""
        # Name and id both come from the same link, so locate it once.
        author_link = post.find_element_by_xpath(self._AUTHOR_XPATH)
        author_name = author_link.text
        author_id = self._USER_ID_RE.findall(
            author_link.get_attribute('data-hovercard'))
        pic_url = post.find_element_by_xpath(
            self._PHOTO_XPATH).get_attribute('src')
        try:
            content = post.find_element_by_xpath(self._CONTENT_XPATH).text
        except Exception:
            # A re-share without any comment text has no <p>; keep the
            # 'None' placeholder that downstream storage already receives.
            content = 'None'
        timestamp = int(
            post.find_element_by_xpath(
                self._TIME_XPATH).get_attribute('data-utime'))
        return {
            'nick_name': author_name,
            'uid': author_id,
            'photo_url': pic_url,
            'text': content,
            'timestamp': timestamp,
        }

    def save(self, indexName, typeName, list):
        """Pass ``list`` to ``self.es.executeES`` for the given index/type."""
        self.es.executeES(indexName, typeName, list)
class Share():
    """Scrape the re-shares found on every share page of an account.

    ``Launcher`` supplies the logged-in browser driver and the share page
    URLs; ``Es_fb`` is the storage helper used by :meth:`save` (presumably an
    Elasticsearch client -- confirm against its definition).
    """

    # Seconds to wait after each navigation before reading the page.
    PAGE_LOAD_DELAY = 1

    # Placeholder stored for any field that could not be extracted.
    _MISSING = 'None'

    _POPUP_XPATH = '//div[@class="_n8 _3qx uiLayer _3qw"]'
    _FEED_XPATH = '//div[@role="feed"]/div'
    _POST_XPATH = './div'
    _AUTHOR_XPATH = './div[2]/div[1]/div[2]/div[1]/div/div/div[2]/div/div/div[2]/h5/span/span/span/a'
    _PHOTO_XPATH = './div[2]/div/div[2]/div/div/a/div/img'
    _CONTENT_XPATH = './div[2]/div/div[2]/div[2]'
    # The <abbr> carrying data-utime sits in span[3] or, failing that, span[2].
    _TIME_XPATHS = (
        './div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[3]/span/a/abbr',
        './div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[2]/span/a/abbr',
    )
    _PERMALINK_XPATH = './div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[3]/span/a'
    _ROOT_LINK_XPATH = './div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/h5/span/span/a'

    # Compiled once; raw strings avoid the invalid '\d' escape sequence.
    _USER_ID_RE = re.compile(r'id=(\d+)')
    _MID_RE = re.compile(r'/(\d+)')
    _ROOT_MID_RE = re.compile(r'story_fbid=(\d+)')

    def __init__(self, username, password):
        """Log in through ``Launcher`` and fetch the list of share URLs."""
        self.launcher = Launcher(username, password)
        self.driver = self.launcher.login()
        self.es = Es_fb()
        self.list = []
        self.share_list = self.launcher.get_share_list()
        self.update_time = int(time.time())

    def get_share(self):
        """Visit every URL in ``share_list`` and collect one dict per re-share.

        The driver window is closed when scraping ends, whether it finished
        or raised.

        Returns:
            ``self.list`` with one dict appended per post node, holding the
            keys ``uid``, ``photo_url``, ``nick_name``, ``mid``,
            ``timestamp``, ``text``, ``update_time``, ``root_text`` and
            ``root_mid``.  A field that cannot be read is the string
            ``'None'``.
        """
        try:
            for url in self.share_list:
                self.driver.get(url)
                time.sleep(self.PAGE_LOAD_DELAY)
                self._dismiss_popup()
                for feed_entry in self.driver.find_elements_by_xpath(self._FEED_XPATH):
                    for post in feed_entry.find_elements_by_xpath(self._POST_XPATH):
                        self.list.append(self._parse_post(post))
        finally:
            self.driver.close()
        return self.list

    def _dismiss_popup(self):
        """Click away the notification popup so the page itself is reachable."""
        try:
            self.driver.find_element_by_xpath(self._POPUP_XPATH).click()
        except Exception:
            # The popup is not always shown; its absence is not an error.
            pass

    def _text(self, post, xpath):
        """Return the text of the node at ``xpath`` or the placeholder."""
        try:
            return post.find_element_by_xpath(xpath).text
        except Exception:
            return self._MISSING

    def _attribute(self, post, xpath, name):
        """Return attribute ``name`` of the node at ``xpath`` or the placeholder."""
        try:
            return post.find_element_by_xpath(xpath).get_attribute(name)
        except Exception:
            return self._MISSING

    def _digits(self, post, xpath, name, pattern):
        """Join every ``pattern`` capture found in attribute ``name``."""
        try:
            value = post.find_element_by_xpath(xpath).get_attribute(name)
            return ''.join(pattern.findall(value))
        except Exception:
            return self._MISSING

    def _timestamp(self, post):
        """Return ``data-utime`` as int from the first locator that works."""
        for xpath in self._TIME_XPATHS:
            try:
                return int(
                    post.find_element_by_xpath(xpath).get_attribute('data-utime'))
            except Exception:
                continue
        return self._MISSING

    def _parse_post(self, post):
        """Build the item dict for a single post node."""
        content = self._text(post, self._CONTENT_XPATH)
        return {
            'uid': self._digits(
                post, self._AUTHOR_XPATH, 'data-hovercard', self._USER_ID_RE),
            'photo_url': self._attribute(post, self._PHOTO_XPATH, 'src'),
            'nick_name': self._text(post, self._AUTHOR_XPATH),
            # NOTE(review): every '/<digits>' run in the href is concatenated;
            # confirm the permalink only ever contains one such segment.
            'mid': self._digits(
                post, self._PERMALINK_XPATH, 'href', self._MID_RE),
            'timestamp': self._timestamp(post),
            'text': content,
            'update_time': self.update_time,
            # NOTE(review): root_text mirrors the post's own text, as before;
            # verify whether the original post's text was intended here.
            'root_text': content,
            'root_mid': self._digits(
                post, self._ROOT_LINK_XPATH, 'href', self._ROOT_MID_RE),
        }

    def save(self, indexName, typeName, list):
        """Pass ``list`` to ``self.es.executeES`` for the given index/type."""
        self.es.executeES(indexName, typeName, list)
# NOTE(review): the statements below read `each`, which is not defined anywhere
# in this fragment; they presumably belong to a per-post loop body whose
# enclosing method is not visible here -- confirm against the full file.

# All 'id=<digits>' captures from the author link's data-hovercard attribute
# (re.findall returns a list, so author_id is a list of strings).
author_id = re.findall(
    re.compile('id=(\d+)'),
    each.find_element_by_xpath(
        './div/div[2]/div[1]/div[2]/div[1]/div/div/div[2]/div/div/div[2]/h5/span/span/span/a'
    ).get_attribute('data-hovercard'))
# 'src' attribute of the image node under the post.
pic_url = each.find_element_by_xpath(
    './div/div[2]/div/div[2]/div/div/a/div/img').get_attribute(
        'src')
try:
    content = each.find_element_by_xpath(
        './div/div[2]/div/div[2]/div[2]//p').text
except Exception as e:
    # Any lookup failure falls back to the literal string 'None'.
    content = 'None'
# NOTE(review): this binds the name `time`; if the `time` module is imported
# in the same scope it would be shadowed -- verify.
# The value is the raw 'data-utime' attribute (not converted to int here).
time = each.find_element_by_xpath(
    './div/div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/div/span[3]/span/a/abbr'
).get_attribute('data-utime')
# Prefixes the link's href with the Facebook origin.
# NOTE(review): assumes the href is a relative path -- TODO confirm.
root_url = 'https://www.facebook.com/' + each.find_element_by_xpath(
    './div/div[2]/div/div[2]/div/div/div/div[2]/div/div/div[2]/h5/span/span/a'
).get_attribute('href')


# NOTE(review): takes `self`, so this is presumably a method of a class whose
# header is not visible in this fragment.
def save(self, indexName, typeName, item):
    """Pass ``item`` to ``executeES`` on the module-level ``es`` object."""
    # Uses the global `es` (assigned in the __main__ block below), not self.
    es.executeES(indexName, typeName, item)


if __name__ == '__main__':
    # NOTE(review): account credentials are hard-coded in source here; they
    # should come from configuration or the environment.
    fb = Launcher('18538728360', 'zyxing,0513')
    es = es_twitter()
    share_list = fb.get_share_list()
    # NOTE(review): Share() is called without arguments and `share_list` is
    # never passed on -- confirm this matches the Share constructor in use.
    share = Share()
    share.get_share()
class Share():
    """Scrape one record per share URL from the single-post page layout.

    ``Launcher.get_share_list()`` supplies both the share URLs and the browser
    driver; ``Es_fb`` is the storage helper used by :meth:`save` (presumably
    an Elasticsearch client -- confirm against its definition).
    """

    # Seconds to wait after each navigation before reading the page.
    PAGE_LOAD_DELAY = 120

    _POPUP_XPATH = '//div[@class="_n8 _3qx uiLayer _3qw"]'
    _AUTHOR_XPATH = '//table[@role="presentation"]/tbody/tr/td[2]/div/h3/strong/a'
    _CONTENT_XPATH = '/html/body/div/div/div[2]/div/div[1]/div[1]/div/div[1]/div[2]'
    _ROOT_TEXT_XPATH = '/html/body/div/div/div[2]/div/div[1]/div[1]/div/div[1]/div[3]/div[2]/div/div/div[2]'

    # Compiled once; raw strings avoid the invalid '\d' escape sequence.
    _AUTHOR_ID_RE = re.compile(r'id%3D(\d+)&')
    _MID_RE = re.compile(r'fbid%3D(\d+)%')
    _PUBLISH_TIME_RE = re.compile(r'"publish_time":(\d+),')
    _ROOT_MID_RE = re.compile(r'"original_content_id":"(\d+)"')

    def __init__(self, username, password):
        """Create the launcher and fetch the share URLs and the driver."""
        self.launcher = Launcher(username, password)
        self.es = Es_fb()
        self.list = []
        self.share_list, self.driver = self.launcher.get_share_list()
        self.update_time = int(time.time())

    def get_share(self):
        """Visit every URL in ``share_list`` and collect one dict per URL.

        The driver is quit when scraping ends, whether it finished or raised,
        so a failing page no longer leaves the browser running.

        Returns:
            ``self.list`` with one dict appended per URL, holding the keys
            ``uid``, ``nick_name``, ``mid``, ``timestamp``, ``text``,
            ``update_time``, ``root_text`` and ``root_mid``.  A field that
            cannot be read is the empty string.
        """
        try:
            for url in self.share_list:
                self.list.append(self._scrape(url))
        finally:
            self.driver.quit()
        return self.list

    def _scrape(self, url):
        """Load ``url`` and build its item dict, printing fields as it goes."""
        self.driver.get(url)
        time.sleep(self.PAGE_LOAD_DELAY)
        self._dismiss_popup()
        page = self.driver.page_source
        self.driver.save_screenshot('get_share000.png')

        author_name = self._text_at(self._AUTHOR_XPATH)
        print(author_name)
        # Author id and post id are URL-encoded inside the share URL itself.
        author_id = self._first_group(self._AUTHOR_ID_RE, url)
        print(author_id)
        content = self._text_at(self._CONTENT_XPATH)
        # Whitespace is removed first so the pattern matches however the
        # embedded JSON happens to be formatted.
        publish_time = self._first_group(
            self._PUBLISH_TIME_RE, self._squash(page or ''))
        timestamp = int(publish_time) if publish_time else ''
        print(timestamp)
        mid = self._first_group(self._MID_RE, url)
        print(mid)
        root_mid = self._first_group(self._ROOT_MID_RE, page)
        print(root_mid)
        root_text = self._squash(self._text_at(self._ROOT_TEXT_XPATH))
        print(root_text)

        return {
            'uid': author_id,
            'nick_name': author_name,
            'mid': mid,
            'timestamp': timestamp,
            'text': content,
            'update_time': self.update_time,
            'root_text': root_text,
            'root_mid': root_mid,
        }

    def _dismiss_popup(self):
        """Click away the notification popup so the page itself is reachable."""
        try:
            self.driver.find_element_by_xpath(self._POPUP_XPATH).click()
        except Exception:
            # The popup is not always shown; its absence is not an error.
            pass

    def _text_at(self, xpath):
        """Return the text of the node at ``xpath``, or '' if unavailable."""
        try:
            return self.driver.find_element_by_xpath(xpath).text
        except Exception:
            return ''

    @staticmethod
    def _first_group(pattern, text):
        """Return the first capture of ``pattern`` in ``text``, or ''."""
        try:
            match = pattern.search(text)
        except TypeError:
            # ``text`` was not a string (e.g. None).
            return ''
        return match.group(1) if match else ''

    @staticmethod
    def _squash(text):
        """Remove every space, newline and tab from ``text``."""
        return text.replace(' ', '').replace('\n', '').replace('\t', '')

    def save(self, indexName, typeName, list):
        """Pass ``list`` to ``self.es.executeES`` for the given index/type."""
        self.es.executeES(indexName, typeName, list)