def check(self): if not path.exists(self.settings['data_dir'] + "/" + 'training/features.csv'): raise RunningError("Training score was not calculated. Do it and then run this command again") if not path.exists(self.settings['data_dir'] + "/" + 'organized'): makedirs(self.settings['data_dir'] + "/" + 'organized') if not path.exists(self.settings['data_dir'] + "/" + 'organized/hash'): makedirs(self.settings['data_dir'] + "/" + 'organized/hash') if not path.exists(self.settings['data_dir'] + "/" + 'organized/plain'): makedirs(self.settings['data_dir'] + "/" + 'organized/plain') if not path.exists(self.settings['data_dir'] + "/" + 'organized/trash'): makedirs(self.settings['data_dir'] + "/" + 'organized/trash')
def check(self):
    # Requires organized dumps; creates the processed/{hash,plain} output
    # folders on demand.
    if not os.path.exists('data/organized'):
        raise RunningError(colorama.Fore.RED + "There aren't any organized dump files to process. Organize them before continuing.")

    if not os.path.exists('data/processed'):
        os.makedirs('data/processed')

    if not os.path.exists('data/processed/hash'):
        os.makedirs('data/processed/hash')

    if not os.path.exists('data/processed/plain'):
        os.makedirs('data/processed/plain')

def check(self): if not path.exists(self.settings['data_dir'] + "/" + 'organized'): raise RunningError( "There aren't any organized dump files to process. Organize them before continuing." ) if not path.exists(self.settings['data_dir'] + "/" + 'processed'): makedirs(self.settings['data_dir'] + "/" + 'processed') if not path.exists(self.settings['data_dir'] + "/" + 'processed/hash'): makedirs(self.settings['data_dir'] + "/" + 'processed/hash') if not path.exists(self.settings['data_dir'] + "/" + 'processed/plain'): makedirs(self.settings['data_dir'] + "/" + 'processed/plain')
def check(self): if not path.exists(self.settings['data_dir'] + "/" + 'raw'): raise RunningError( "There aren't any dump files to process. Scrape them before continuing." ) if not path.exists(self.settings['data_dir'] + "/" + 'training'): makedirs(self.settings['data_dir'] + "/" + 'training') if not path.exists(self.settings['data_dir'] + "/" + 'training/hash'): makedirs(self.settings['data_dir'] + "/" + 'training/hash') if not path.exists(self.settings['data_dir'] + "/" + 'training/plain'): makedirs(self.settings['data_dir'] + "/" + 'training/plain') if not path.exists(self.settings['data_dir'] + "/" + 'training/trash'): makedirs(self.settings['data_dir'] + "/" + 'training/trash')
def check(self):
    if not os.path.exists('data/raw'):
        raise RunningError(colorama.Fore.RED + "There aren't any dump files to process. Scrape them before continuing.")

    if not os.path.exists('data/training'):
        os.makedirs('data/training')

    if not os.path.exists('data/training/hash'):
        os.makedirs('data/training/hash')

    if not os.path.exists('data/training/plain'):
        os.makedirs('data/training/plain')

    if not os.path.exists('data/training/trash'):
        os.makedirs('data/training/trash')

def check(self):
    if not os.path.exists('data/training/features.csv'):
        raise RunningError(colorama.Fore.RED + "Training score was not calculated. Do it and then run this command again")

    if not os.path.exists('data/organized'):
        os.makedirs('data/organized')

    if not os.path.exists('data/organized/hash'):
        os.makedirs('data/organized/hash')

    if not os.path.exists('data/organized/plain'):
        os.makedirs('data/organized/plain')

    if not os.path.exists('data/organized/trash'):
        os.makedirs('data/organized/trash')

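# --- Hedged sketch, not part of the original code: the check() variants above all
# repeat the same "create the folder if it is missing" pattern. A hypothetical
# helper such as ensure_dirs() could factor that out; the function name and the
# base_dir/subdirs parameters are illustrative assumptions, not project API.
import os


def ensure_dirs(base_dir, subdirs):
    # Create each missing subfolder under base_dir, mirroring the checks above.
    for subdir in subdirs:
        target = os.path.join(base_dir, subdir)
        if not os.path.exists(target):
            os.makedirs(target)

# Hypothetical usage, mirroring the organize-step check above:
# ensure_dirs(self.settings['data_dir'], ['organized', 'organized/hash', 'organized/plain', 'organized/trash'])
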
def update(self):
    """update(self) - Fill Queue with new Pastebin IDs"""
    new_pastes = []
    raw = None

    while not raw:
        try:
            raw = requests_get('http://pastebin.com/archive').content

            if "Pastebin.com has blocked your IP" in raw:
                getLogger('dumpscraper').critical("Pastebin blocked your IP. Wait a couple of hours and try again")
                raise RunningError()
        except ConnectionError:
            getLogger('dumpscraper').warn('Connection error, trying again in 5 seconds')
            raw = None
            sleep(5)

    results = BeautifulSoup(raw, "lxml").findAll(
        lambda tag: tag.name == 'td' and tag.a and '/archive/' not in tag.a['href'] and tag.a['href'][1:])

    for entry in results:
        paste = PastebinPaste(entry.a['href'][1:])

        # Check to see if we found our last checked URL
        if paste.id == self.ref_id:
            break

        new_pastes.append(paste)

    # Don't cry if we don't have any results
    try:
        # Let's save the starting id, so I can skip already processed pastes
        self.ref_id = results[0].a['href'][1:]
    except IndexError:
        dump_logger = getLogger('dumpscraper')
        dump_logger.info("\tArchive links not found")
        dump_logger.debug('\t\tFetched page:')

        for row in results:
            dump_logger.debug('\t\t\t' + row)

    for entry in new_pastes[::-1]:
        self.put(entry)

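# --- Hedged sketch: update() above assumes a PastebinPaste class whose constructor
# takes the paste id and which exposes an `id` attribute (compared against
# self.ref_id). The minimal stand-in below is consistent with that usage only;
# any further attributes of the real class are unknown here.
class PastebinPaste(object):
    def __init__(self, paste_id):
        # Only the `id` attribute is required by the snippet above.
        self.id = paste_id
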
def run(self):
    prev_day = '1970-05-01'
    since_id = None if not self.settings['last_id'] else self.settings['last_id']
    max_id = None if not self.settings['max_id'] else self.settings['max_id']
    processed = 0

    connection = twitter.Api(consumer_key=self.settings['app_key'],
                             consumer_secret=self.settings['app_secret'],
                             access_token_key=self.settings['token'],
                             access_token_secret=self.settings['token_secret'])

    # Let's check if we really have some valid credentials
    try:
        connection.VerifyCredentials()
    except twitter.error.TwitterError as error:
        raise RunningError(colorama.Fore.RED + 'Twitter error: ' + error.message[0]['message'])

    dump_logger = getLogger('dumpscraper')

    while processed <= self.settings['processing_limit']:
        tweets = connection.GetUserTimeline(screen_name='dumpmon',
                                            max_id=max_id,
                                            exclude_replies=True,
                                            include_rts=False,
                                            count=self.settings['limit'],
                                            since_id=since_id)

        if not len(tweets):
            break

        removed = 0
        processed += len(tweets)

        for tweet in tweets:
            max_id = tweet.id if not max_id else min(max_id, tweet.id)
            max_id -= 1
            self.settings['last_id'] = max(since_id, tweet.id)

            try:
                link = tweet.urls[0].expanded_url
            except KeyError:
                continue

            dObject = datetime.datetime.fromtimestamp(tweet.created_at_in_seconds)
            day = dObject.strftime('%Y-%m-%d')

            if day != prev_day:
                prev_day = day
                dump_logger.info("Processing day: " + day)

            # Let's create the folder name using year/month/(full-date) structure
            folder = dObject.strftime('%Y') + '/' + dObject.strftime('%m') + '/' + dObject.strftime('%d')
            target_dir = os.path.realpath(self.settings['data_dir'] + "/raw/" + folder)

            # If I already have the file, let's skip it
            if os.path.isfile(target_dir + '/' + str(tweet.id) + '.txt'):
                continue

            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

            sleep(self.settings['delay'])
            data = requests.get(link)

            if not data.text:
                continue

            if "Pastebin.com has blocked your IP" in data.text:
                self.settings['last_id'] = since_id
                raise RunningError(colorama.Fore.RED +
                                   "Pastebin blocked your IP. Wait a couple of hours and try again, raising the delay between tweets")

            if "has been removed" in data.text:
                removed += 1
                continue

            with open(target_dir + "/" + str(tweet.id) + ".txt", 'w+') as dump_file:
                dump_file.write(data.text.encode('utf-8'))

        dump_logger.info("Processed " + str(processed) + " tweets")
        dump_logger.info("Found " + str(removed) + " removed tweets in this batch")

    dump_logger.info("Total processed tweets: " + str(processed))

def run(self):
    prev_day = '1970-05-01'
    since_id = None if not self.settings['last_id'] else self.settings['last_id']
    max_id = None if not self.settings['max_id'] else self.settings['max_id']
    processed = 0

    connection = twitter.Api(consumer_key=self.settings['app_key'],
                             consumer_secret=self.settings['app_secret'],
                             access_token_key=self.settings['token'],
                             access_token_secret=self.settings['token_secret'])

    # Let's check if we really have some valid credentials
    try:
        connection.VerifyCredentials()
    except twitter.error.TwitterError as error:
        raise RunningError(colorama.Fore.RED + 'Twitter error: ' + error.message[0]['message'])

    while processed <= self.settings['processing_limit']:
        tweets = connection.GetUserTimeline(screen_name='dumpmon',
                                            max_id=max_id,
                                            exclude_replies=True,
                                            include_rts=False,
                                            count=self.settings['limit'],
                                            since_id=since_id)

        if not len(tweets):
            break

        removed = 0
        processed += len(tweets)

        for tweet in tweets:
            max_id = tweet.id if not max_id else min(max_id, tweet.id)
            max_id -= 1
            self.settings['last_id'] = max(since_id, tweet.id)

            try:
                link = tweet.urls[0].expanded_url
            except KeyError:
                continue

            day = datetime.datetime.fromtimestamp(tweet.created_at_in_seconds).strftime('%Y-%m-%d')

            if day != prev_day:
                prev_day = day
                print("")
                print("Processing day: " + day)

            folder = day

            if not os.path.exists(os.path.realpath("data/raw/" + folder)):
                os.makedirs(os.path.realpath("data/raw/" + folder))

            sleep(self.settings['delay'])
            data = requests.get(link)

            if not data.text:
                continue

            if "Pastebin.com has blocked your IP" in data.text:
                self.settings['last_id'] = since_id
                raise RunningError(colorama.Fore.RED +
                                   "Pastebin blocked your IP. Wait a couple of hours and try again, raising the delay between tweets")

            if "has been removed" in data.text:
                removed += 1
                sys.stdout.write('x')
                sys.stdout.flush()
                continue

            sys.stdout.write('.')
            sys.stdout.flush()

            with open(os.path.realpath("data/raw/" + folder + "/" + str(tweet.id) + ".txt"), 'w+') as dump_file:
                dump_file.write(data.text.encode('utf-8'))

        print("")
        print("\tprocessed " + str(processed) + " tweets")
        print("\tFound " + str(removed) + " removed tweets in this batch")

    print("")
    print("Total processed tweets: " + str(processed))

def run(self):
    base_url = 'https://twitter.com/i/search/timeline?f=realtime&q='
    base_query = 'from:dumpmon since:%s until:%s'
    prev_day = '1970-05-01'
    processed = 0
    origurl = base_url + urllib.quote(base_query % (self.parentArgs.since, self.parentArgs.until))
    processing = True
    url = origurl

    # We have to pass a user agent, otherwise Twitter will return empty content
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'
    }

    dump_logger = getLogger('dumpscraper')

    while processing:
        r = requests.get(url, headers=headers)
        json_data = json.loads(r.content)
        raw_html = json_data['items_html'].strip()

        if not raw_html:
            processing = False
            continue

        html = LH.fromstring(raw_html)
        removed = 0
        tweets = html.cssselect('.original-tweet')

        if not tweets:
            processing = False

        for tweet in tweets:
            link = tweet.cssselect('.twitter-timeline-link')

            if not link:
                continue

            link = link[0]
            processed += 1

            paste_link = link.get('data-expanded-url')
            timestamp = tweet.cssselect('.js-short-timestamp')[0].get('data-time')
            tweetid = tweet.get('data-tweet-id')

            if not paste_link:
                continue

            dObject = datetime.datetime.fromtimestamp(float(timestamp))
            day = dObject.strftime('%Y-%m-%d')

            if day != prev_day:
                prev_day = day
                dump_logger.info("Processing day: " + day)

            # Let's create the folder name using year/month/(full-date) structure
            folder = dObject.strftime('%Y') + '/' + dObject.strftime('%m') + '/' + dObject.strftime('%d')
            target_dir = os.path.realpath(self.settings['data_dir'] + "/raw/" + folder)

            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

            sleep(self.settings['delay'])

            # Sometimes we download a virus and the AV drops the connection
            try:
                data = requests.get(paste_link)
            except requests.exceptions.ConnectionError:
                continue

            if not data.text:
                continue

            if "Pastebin.com has blocked your IP" in data.text:
                raise RunningError(colorama.Fore.RED +
                                   "Pastebin blocked your IP. Wait a couple of hours and try again, raising the delay between tweets")

            if "has been removed" in data.text:
                removed += 1
                continue

            with open(os.path.realpath(target_dir + "/" + str(tweetid) + ".txt"), 'w+') as dump_file:
                dump_file.write(data.text.encode('utf-8'))

        # Let's setup the url for the next iteration
        url = origurl + '&scroll_cursor=' + json_data['scroll_cursor']

    dump_logger.info("Total processed tweets: " + str(processed))

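# --- Hedged illustration: the run() variants above read a number of keys from
# self.settings. The dictionary below lists only the keys actually referenced in
# that code; every value is a placeholder, not a recommended or real configuration.
example_settings = {
    'app_key': '<consumer key>',          # Twitter API credentials
    'app_secret': '<consumer secret>',
    'token': '<access token>',
    'token_secret': '<access token secret>',
    'last_id': None,                      # resume point (used as since_id)
    'max_id': None,                       # upper bound passed to GetUserTimeline
    'limit': 200,                         # tweets requested per call (placeholder)
    'processing_limit': 1000,             # stop after this many tweets (placeholder)
    'delay': 2,                           # seconds to sleep between downloads (placeholder)
    'data_dir': 'data',                   # base folder holding raw/ and friends
}
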
def check(self):
    # Raw dumps must have been scraped before this step can run.
    if not os.path.exists('data/raw'):
        raise RunningError(colorama.Fore.RED + "There aren't any dump files to process. Scrape them before continuing.")

def check(self):
    # Organized dumps must exist before this step can run.
    if not os_path.exists(self.settings['data_dir'] + '/organized'):
        raise RunningError("There aren't any organized dump files to process. Organize them before continuing.")

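# --- Hedged sketch: every check() above aborts the run by raising RunningError
# with a human-readable message. If the real exception class is not available,
# a minimal stand-in would simply subclass Exception.
class RunningError(Exception):
    pass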