def __init__(self, arguments):
    """Build per-query tweet/user save paths and create them for TweetScraper.

    arguments: dict carrying 'query_name' and 'spider_name'.
    """
    query = arguments['query_name']
    self.saveTweetPath = SETTINGS['SAVE_TWEET_PATH'] + query
    self.saveUserPath = SETTINGS['SAVE_USER_PATH'] + query
    # Directories are only created when this pipeline serves the TweetScraper spider.
    if arguments['spider_name'] == 'TweetScraper':
        mkdirs(self.saveTweetPath)
        mkdirs(self.saveUserPath)
def __init__(self):
    """Resolve the tweet/user save paths from the Scrapy project settings
    and make sure both directories exist."""
    cfg = get_project_settings()
    self.saveTweetPath = cfg["SAVE_TWEET_PATH"]
    self.saveUserPath = cfg["SAVE_USER_PATH"]
    for path in (self.saveTweetPath, self.saveUserPath):
        mkdirs(path)  # ensure the directory exists
def process_item(self, item, spider):
    """Persist a scraped item to disk.

    Tweets are written under saveTweetPath/<location>/<category>/ keyed by a
    trimmed filename; an existing tweet file is appended to. Users are written
    under saveUserPath keyed by ID and skipped when already present.
    Unrecognized item types are only logged.

    Fixes vs. original: the tweet "exists" branch carried a leftover
    "simply skip existing items" comment although it appends; and the item
    is now returned, as Scrapy requires, so later pipelines receive it.
    """
    # NOTE(review): the [:-4] / [:-6] slices assume fixed-length suffixes on
    # item['category'] and item['filename'] — confirm against the spider.
    newsaveTweetPath = self.saveTweetPath + item['location'] + "/" + item['category'][:-4] + "/"
    mkdirs(newsaveTweetPath)
    filename = item['filename'][:-6]
    if isinstance(item, Tweet):
        self.savePath = os.path.join(newsaveTweetPath, filename)
        if os.path.isfile(self.savePath):
            # File already exists: append this tweet to it.
            self.append_to_file(item, self.savePath)
        else:
            self.save_to_file(item, self.savePath)
            logger.debug("Add tweet:%s" % item['url'])
    elif isinstance(item, User):
        self.savePath = os.path.join(self.saveUserPath, item['ID'])
        if os.path.isfile(self.savePath):
            pass  # simply skip existing users
            ### or rewrite instead of skipping:
            # self.save_to_file(item, self.savePath)
            # logger.info("Update user:%s"%dbItem['screen_name'])
        else:
            self.save_to_file(item, self.savePath)
            logger.debug("Add user:%s" % item['screen_name'])
    else:
        logger.info("Item type is not recognized! type = %s" % type(item))
    # Scrapy pipelines must return the item for downstream pipelines.
    return item
def __init__(self):
    """Initialize file handles to None and create the configured save dirs."""
    self.tweets_file = None
    self.users_file = None
    self.saveTweetPath = settings['SAVE_TWEET_PATH']
    self.saveUserPath = settings['SAVE_USER_PATH']
    for directory in (self.saveTweetPath, self.saveUserPath):
        mkdirs(directory)  # ensure the path exists
def scrap_following(self):
    """Log in to Twitter via PhantomJS, scroll the query's /following page to
    the bottom, append every @handle found to following.txt, and record the
    queried account in scrapped_users.txt.

    SECURITY(review): the login email/password and the PhantomJS binary path
    are hard-coded below — move them to settings or environment variables.
    Unused locals (an ActionChains instance and a Selector) were removed.
    """
    browser = webdriver.PhantomJS(
        executable_path=
        'C:/Users/ShuaibReeyaz/Downloads/phantomjs-2.1.1-windows/bin/phantomjs'
    )
    # self.query is an @handle; strip the leading '@' for the profile URL.
    browser.get("https://twitter.com/" + self.query[1:] + "/following")
    time.sleep(2)
    # Fill in and submit the login form.
    username = browser.find_element_by_css_selector(
        '.js-username-field.email-input.js-initial-focus')
    username.send_keys('*****@*****.**')
    password = browser.find_element_by_css_selector('.js-password-field')
    password.send_keys('ilias2019!')
    form = browser.find_element_by_css_selector(
        '.submit.EdgeButton.EdgeButton--primary.EdgeButtom--medium')
    form.submit()
    SCROLL_PAUSE_TIME = 0.5
    # Scroll until the document height stops growing (all followings loaded).
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)  # wait for the next chunk to load
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    element = browser.find_element_by_xpath("//body")
    info = (element.text).split()
    self.saveUserPath = settings['SAVE_USER_FOLLOWERS_PATH']
    mkdirs(self.saveUserPath)
    savePath = os.path.join(self.saveUserPath, "following" + ".txt")
    with open(savePath, 'a+') as f:
        # Keep only @handles, skipping a bare '@' and the queried user itself.
        for i in info:
            if i.startswith('@') and len(i) > 1 and i != self.query:
                f.write(i)
                f.write("\n")
    savePath = os.path.join(self.saveUserPath, "scrapped_users" + ".txt")
    with open(savePath, 'a+') as f:
        f.write(self.query)
        f.write("\n")
def check(self, stringToMatch):
    """Return True if *stringToMatch* is already recorded in scrapped_users.txt.

    Fixes vs. original:
    - opening with 'a+' leaves the stream position at EOF, so the read loop
      never saw a single line; seek(0) before reading.
    - each stored line ends with a newline, so raw equality never matched;
      strip the trailing newline before comparing.
    """
    saveUserPath = settings['SAVE_USER_FOLLOWERS_PATH']
    mkdirs(saveUserPath)
    savePath = os.path.join(saveUserPath, "scrapped_users" + ".txt")
    # 'a+' still creates the file when it is missing, without truncating it.
    with open(savePath, 'a+') as file:
        file.seek(0)
        for line in file:
            if stringToMatch == line.rstrip('\n'):
                return True
    return False
def __init__(self):
    """Read the tweet, user, and followers save paths from settings and
    create each directory."""
    self.saveTweetPath = settings['SAVE_TWEET_PATH']
    self.saveUserPath = settings['SAVE_USER_PATH']
    self.savefollowersPath = settings['SAVE_USER_FOLLOWERS_PATH']
    for directory in (self.saveTweetPath, self.saveUserPath, self.savefollowersPath):
        mkdirs(directory)  # ensure the path exists
def __init__(self):
    """Read the tweet, user, and error save paths from settings and
    create each directory."""
    self.saveTweetPath = settings['SAVE_TWEET_PATH']
    self.saveUserPath = settings['SAVE_USER_PATH']
    self.saveErrorPath = settings['SAVE_ERROR_PATH']
    for directory in (self.saveTweetPath, self.saveUserPath, self.saveErrorPath):
        mkdirs(directory)  # ensure the path exists
def __init__(self):
    """Pull the tweet/user save paths out of SETTINGS and create both dirs."""
    self.saveTweetPath = SETTINGS['SAVE_TWEET_PATH']
    self.saveUserPath = SETTINGS['SAVE_USER_PATH']
    for directory in (self.saveTweetPath, self.saveUserPath):
        mkdirs(directory)  # ensure the path exists
def __init__(self):
    """Load the tweet/user save paths from settings and create both dirs."""
    self.saveTweetPath = settings['SAVE_TWEET_PATH']
    self.saveUserPath = settings['SAVE_USER_PATH']
    mkdirs(self.saveTweetPath)  # make sure the path exists
    mkdirs(self.saveUserPath)
def open_spider(self, spider):
    """On spider open, resolve save paths from the spider's settings and
    create the directories."""
    cfg = spider.settings
    self.saveTweetPath = cfg['SAVE_TWEET_PATH']
    self.saveUserPath = cfg['SAVE_USER_PATH']
    for directory in (self.saveTweetPath, self.saveUserPath):
        mkdirs(directory)  # ensure the path exists
def open_spider(self, spider):
    """On spider open, derive per-spider save directories (keyed by
    spider.id) and create them."""
    tweet_dir = SETTINGS['SAVE_TWEET_PATH'] + spider.id + '/'
    user_dir = SETTINGS['SAVE_USER_PATH'] + spider.id + '/'
    self.saveTweetPath = tweet_dir
    self.saveUserPath = user_dir
    mkdirs(tweet_dir)  # ensure the directories exist
    mkdirs(user_dir)
def __init__(self):
    """Resolve tweet/user save paths from settings; guarantee both exist."""
    self.saveTweetPath = settings['SAVE_TWEET_PATH']
    mkdirs(self.saveTweetPath)  # ensure the path exists
    self.saveUserPath = settings['SAVE_USER_PATH']
    mkdirs(self.saveUserPath)
def __init__(self):
    """Read the tweet save directory from settings and make sure it exists.

    Only tweets are persisted by this pipeline; the user save path is
    intentionally not configured here.
    """
    self.saveTweetPath = settings.SAVE_TWEET_PATH
    mkdirs(self.saveTweetPath)  # ensure the path exists
def __init__(self):
    """Set up the tweet save directory and a running tweet counter."""
    self.saveTweetPath = settings['SAVE_TWEET_PATH']
    self.tweet_counts = 0  # number of tweets processed so far
    mkdirs(self.saveTweetPath)  # ensure the path exists
def __init__(self):
    """Load the tweet/user save paths from the project settings and ensure
    the directories exist.

    Fix: `scrapy.conf` is deprecated and was removed from Scrapy; use
    `scrapy.utils.project.get_project_settings` instead (matches the other
    pipeline variants in this file).
    """
    from scrapy.utils.project import get_project_settings
    settings = get_project_settings()
    self.saveTweetPath = settings['SAVE_TWEET_PATH']
    self.saveUserPath = settings['SAVE_USER_PATH']
    mkdirs(self.saveTweetPath)  # ensure the path exists
    mkdirs(self.saveUserPath)