def move_warcs(self):
    """Continuously move completed WARC files into ``settings.dir_ready``.

    Each pass scans every sub-directory of the current working directory
    (``settings.dir_ready`` itself is skipped) for ``*.warc.gz`` files:

    * If a ``-meta.warc.gz`` file is present the grab is treated as
      finished: every WARC is moved to ``settings.dir_ready`` and the
      directory is removed.
    * Otherwise a WARC is moved only when the file carrying the next
      five-digit sequence number already exists in the same directory
      (presumably meaning the writer has rolled over to a new file).

    After each pass ``self.upload()`` is called and the loop sleeps for ten
    seconds.  This method never returns.

    Raises:
        ValueError: if a WARC in an unfinished grab has no five-digit
            sequence number directly before ``.warc.gz``.
    """
    suffix = '.warc.gz'
    while True:
        subdirs = [entry for entry in os.listdir('.') if os.path.isdir(entry)]
        for dir_ in subdirs:
            if dir_ == settings.dir_ready:
                continue
            # Hold off while uploading is switched off.
            while not settings.upload_running:
                time.sleep(1)
            warcs = [
                name for name in os.listdir(dir_)
                if os.path.isfile(os.path.join(dir_, name))
                and name.endswith(suffix)
            ]
            # any() instead of len(filter(...)): on Python 3 filter()
            # returns an iterator, which has no len().
            grab_finished = any(
                name.endswith('-meta.warc.gz') for name in warcs)
            if grab_finished:
                for name in warcs:
                    os.rename(os.path.join(dir_, name),
                              os.path.join(settings.dir_ready, name))
                shutil.rmtree(dir_)
            else:
                present = set(warcs)
                for name in warcs:
                    # name[-13:-8] holds the five digits before '.warc.gz'.
                    next_num = str(int(name[-13:-8]) + 1).zfill(5)
                    if name[:-13] + next_num + suffix in present:
                        os.rename(os.path.join(dir_, name),
                                  os.path.join(settings.dir_ready, name))
        self.upload()
        time.sleep(10)
def upload(self):
    """Start a background upload thread for every pending WARC file.

    A file in ``settings.dir_ready`` is pending when its name ends in
    ``.warc.gz`` and no ``<file>.upload`` marker exists beside it.  The list
    of pending files is taken once, when the method is entered.  For each
    file the method pauses one second, waits for the number of running
    uploads to fall back to the configured limit, and then starts a daemon
    thread running ``self.upload_single(file)``.  The thread object is
    stored in ``self.uploads[file]``.
    """
    pending = [
        name for name in os.listdir(settings.dir_ready)
        if name.endswith('.warc.gz')
        and not os.path.isfile(
            os.path.join(settings.dir_ready, name + '.upload'))
    ]
    for name in pending:
        time.sleep(1)
        # Wait (instead of pausing once) until the limit is respected; a
        # single sleep never actually capped the number of uploads.
        # NOTE(review): assumes self.concurrent_uploads is maintained by
        # upload_single or another thread -- confirm.
        while self.concurrent_uploads > settings.max_concurrent_uploads:
            time.sleep(1)
        self.uploads[name] = threading.Thread(
            target=self.upload_single, args=(name,))
        self.uploads[name].daemon = True
        self.uploads[name].start()
def upload(self):
    """Poll ``settings.dir_ready`` forever and upload every pending WARC.

    A file is pending when its name ends in ``.warc.gz`` and no
    ``<file>.upload`` marker exists beside it.  For each pending file this
    method:

    1. waits until ``settings.upload_running`` is true, then until the
       number of running uploads is within
       ``settings.max_concurrent_uploads`` and ``self.upload_allowed()``
       returns true;
    2. claims an upload slot and creates the ``.upload`` marker;
    3. records the file's date and size in ``self.uploads`` and assigns it
       to an item named ``<YYYYMMDD><NNNN>``; a new item number is started
       once the size recorded for that date exceeds ``self.max_item_size``;
    4. starts a thread that runs ``self.upload_single(name, file, ia_args)``
       and, once that call returns or raises, releases the slot, removes the
       marker and reports on IRC if the WARC is still present in
       ``settings.dir_ready``.

    This method never returns.

    Raises:
        AttributeError: if a pending file name contains no ``YYYY-MM-DD``
            date.
    """

    def run_upload(name, file, ia_args):
        # Finalisation has to happen after the upload itself; doing it right
        # after Thread.start() would free the slot and drop the marker while
        # the upload is still in progress.
        # NOTE(review): assumes upload_single blocks until the upload is
        # done and removes the WARC on success -- confirm.
        try:
            self.upload_single(name, file, ia_args)
        finally:
            # NOTE(review): the counter is changed from several threads
            # without a lock.
            self.concurrent_uploads -= 1
            os.remove(os.path.join(settings.dir_ready, file + '.upload'))
            if os.path.isfile(os.path.join(settings.dir_ready, file)):
                settings.irc_bot.send(
                    'PRIVMSG',
                    '{name} uploaded unsuccessful.'.format(name=file),
                    settings.irc_channel_bot)

    while True:
        pending = [
            file for file in os.listdir(settings.dir_ready)
            if file.endswith('.warc.gz')
            and not os.path.isfile(
                os.path.join(settings.dir_ready, file + '.upload'))
        ]
        for file in pending:
            while not settings.upload_running:
                time.sleep(1)
            time.sleep(1)
            while (self.concurrent_uploads > settings.max_concurrent_uploads
                   or not self.upload_allowed()):
                time.sleep(10)
            path = os.path.join(settings.dir_ready, file)
            # Parse the date before claiming a slot so that a malformed
            # name cannot leak a slot or leave a stale marker behind.
            date = re.search(r'([0-9]{4}-[0-9]{2}-[0-9]{2})', file).group(1)
            self.concurrent_uploads += 1
            open(path + '.upload', 'a').close()
            if file not in self.uploads:
                self.uploads[file] = {}
                self.uploads[file]['date'] = date.replace('-', '')
                # The file lives in dir_ready, not in the working directory.
                self.uploads[file]['size'] = os.path.getsize(path)
            info = self.uploads[file]
            day = info['date']
            if day not in self.items:
                self.items[day] = {}
                self.items[day]['item_num'] = 0
                self.items[day]['item_size'] = 0
            elif self.items[day]['item_size'] > self.max_item_size:
                # Current item is full: continue in the next numbered item.
                self.items[day]['item_num'] += 1
                self.items[day]['item_size'] = 0
            self.items[day]['item_size'] += info['size']
            info['item_num'] = self.items[day]['item_num']
            info['item_size'] = self.items[day]['item_size']
            name = day + str(info['item_num']).zfill(4)
            ia_args = {
                'title': 'Archive Team Newsgrab: {name}'.format(name=name),
                'mediatype': 'web',
                'description': 'A collection of news articles grabbed from a wide variety of sources around the world automatically by Archive Team scripts.',
                'collection': 'archiveteam_newssites',
                'date': date
            }
            threading.Thread(target=run_upload,
                             args=(name, file, ia_args)).start()
        # Avoid spinning on os.listdir() while nothing is pending.
        time.sleep(1)
def start_services(self):
    """Flag all registered services as stopped and start the assigned ones.

    Every object in ``settings.services`` gets ``running = False``.  Then,
    for each ``web__*.py`` file in the ``services`` directory whose name
    (without the extension) is listed in ``self.assigned_services``, the
    name is appended to ``self.services`` and a new ``Service`` is created,
    stored in ``settings.services``, marked as daemon and started.
    """
    # .values() works on Python 2 and 3; iteritems() exists only on 2.
    for service in settings.services.values():
        service.running = False
    for file in os.listdir('services'):
        if not (file.startswith('web__') and file.endswith('.py')):
            continue
        # Strip only the extension: str.replace('.py', '') would also remove
        # any '.py' occurring inside the name itself.
        service_name = os.path.splitext(file)[0]
        if service_name not in self.assigned_services:
            continue
        self.services.append(service_name)
        settings.services[service_name] = Service(service_name)
        settings.services[service_name].daemon = True
        settings.services[service_name].start()
def upload(self):
    """Launch one daemon upload thread per pending WARC file.

    Pending files are the ``*.warc.gz`` entries of ``settings.dir_ready``
    that have no ``<file>.upload`` marker beside them; the list is built
    once on entry.  Before each thread is started the method waits for
    ``settings.upload_running`` to be true, pauses one second, and then
    waits while ``self.concurrent_uploads`` exceeds
    ``self.max_concurrent_uploads``.  Each thread runs
    ``self.upload_single(file)`` and is kept in ``self.uploads[file]``.
    """
    pending = []
    for entry in os.listdir(settings.dir_ready):
        if not entry.endswith('.warc.gz'):
            continue
        marker = os.path.join(settings.dir_ready, entry + '.upload')
        if os.path.isfile(marker):
            continue
        pending.append(entry)

    for entry in pending:
        # Paused uploads: wait until they are switched back on.
        while not settings.upload_running:
            time.sleep(1)
        time.sleep(1)
        # Too many uploads in flight: wait for the count to drop.
        while self.concurrent_uploads > self.max_concurrent_uploads:
            time.sleep(1)
        worker = threading.Thread(target=self.upload_single, args=(entry, ))
        self.uploads[entry] = worker
        worker.daemon = True
        worker.start()
def start_services(self):
    """Start every not-yet-known service and announce how many were found.

    For each ``web__*.py`` file in the ``services`` directory whose name
    (without the extension) is not yet in ``self.services``, the name is
    recorded, a ``Service`` is created and stored in ``settings.services``,
    marked as daemon and started, and its ``read_urls()`` and ``get_data()``
    methods are called.  The number of new services is then sent to
    ``settings.irc_channel_bot`` and the counter ``self.new_services`` is
    reset.  If ``settings.get_urls`` is falsy, a daemon ``Urls`` thread is
    created and started.  Finally ``self.distribute_services()`` is called.
    """
    for file in os.listdir('services'):
        if not (file.startswith('web__') and file.endswith('.py')):
            continue
        # Strip only the extension: str.replace('.py', '') would also remove
        # any '.py' occurring inside the name itself.
        service_name = os.path.splitext(file)[0]
        if service_name in self.services:
            continue
        self.new_services += 1
        self.services.append(service_name)
        settings.services[service_name] = Service(service_name)
        settings.services[service_name].daemon = True
        settings.services[service_name].start()
        settings.services[service_name].read_urls()
        settings.services[service_name].get_data()
    settings.irc_bot.send(
        'PRIVMSG',
        'Found {new_services} new services'.format(
            new_services=self.new_services),
        settings.irc_channel_bot)
    self.new_services = 0
    if not settings.get_urls:
        settings.get_urls = Urls()
        settings.get_urls.daemon = True
        settings.get_urls.start()
    self.distribute_services()
def move_warcs(self):
    """Continuously move completed WARC files into ``settings.dir_ready``.

    Each pass scans every sub-directory of the current working directory
    (``settings.dir_ready`` itself is skipped) for ``*.warc.gz`` files:

    * If a ``-meta.warc.gz`` file is present the grab is treated as
      finished: every WARC is moved to ``settings.dir_ready`` and the
      directory is removed.
    * Otherwise a WARC is moved only when the file carrying the next
      five-digit sequence number already exists in the same directory
      (presumably meaning the writer has rolled over to a new file).

    After each pass ``self.upload()`` is called and the loop sleeps for ten
    seconds.  This method never returns.

    Raises:
        ValueError: if a WARC in an unfinished grab has no five-digit
            sequence number directly before ``.warc.gz``.
    """
    while True:
        for dir_ in [d for d in os.listdir('.') if os.path.isdir(d)]:
            if dir_ == settings.dir_ready:
                continue
            # Really wait while uploading is switched off; a single
            # one-second pause let the mover carry on regardless.
            while not settings.upload_running:
                time.sleep(1)
            files = [
                name for name in os.listdir(dir_)
                if os.path.isfile(os.path.join(dir_, name))
                and name.endswith('.warc.gz')
            ]
            # any() instead of len(filter(...)): on Python 3 filter()
            # returns an iterator, which has no len().
            grab_finished = any(
                name.endswith('-meta.warc.gz') for name in files)
            if grab_finished:
                for name in files:
                    os.rename(os.path.join(dir_, name),
                              os.path.join(settings.dir_ready, name))
                shutil.rmtree(dir_)
            else:
                for name in files:
                    # name[-13:-8] holds the five digits before '.warc.gz'.
                    warc_num = int(name[-13:-8])
                    warc_num_second = str(warc_num + 1).zfill(5)
                    # Look the successor up in the directory listing; the
                    # previous test against the file's own name was a
                    # substring check that could never succeed.
                    if name[:-13] + warc_num_second + '.warc.gz' in files:
                        os.rename(os.path.join(dir_, name),
                                  os.path.join(settings.dir_ready, name))
        self.upload()
        # Pause between passes so the loop does not spin on os.listdir().
        time.sleep(10)
def upload(self):
    """Poll ``settings.dir_ready`` forever and upload every pending WARC.

    A file is pending when its name ends in ``.warc.gz`` and no
    ``<file>.upload`` marker exists beside it.  For each pending file this
    method waits until ``settings.upload_running`` is true, then until
    ``self.concurrent_uploads`` is within
    ``settings.max_concurrent_uploads`` and ``self.upload_allowed()``
    returns true.  It then claims an upload slot, creates the ``.upload``
    marker, records the file's date and size in ``self.uploads`` and
    assigns it to an item named ``<YYYYMMDD><NNNN>`` (the item number
    advances once the size recorded for that date exceeds
    ``self.max_item_size``).

    The upload itself runs in a thread that calls
    ``self.upload_single(name, file, ia_args)`` and, once that call returns
    or raises, releases the slot, removes the marker and reports on IRC if
    the WARC is still present in ``settings.dir_ready``.

    This method never returns.

    Raises:
        AttributeError: if a pending file name contains no ``YYYY-MM-DD``
            date.
    """

    def upload_then_cleanup(name, file, ia_args):
        # Clean-up belongs after the upload; doing it directly after
        # Thread.start() would free the slot and delete the marker while the
        # upload is still in progress.
        # NOTE(review): assumes upload_single blocks until the upload is
        # done and removes the WARC on success -- confirm.
        marker = os.path.join(settings.dir_ready, file + '.upload')
        try:
            self.upload_single(name, file, ia_args)
        finally:
            # NOTE(review): the counter is changed from several threads
            # without a lock.
            self.concurrent_uploads -= 1
            os.remove(marker)
            if os.path.isfile(os.path.join(settings.dir_ready, file)):
                settings.irc_bot.send(
                    'PRIVMSG',
                    '{name} uploaded unsuccessful.'.format(name=file),
                    settings.irc_channel_bot)

    date_pattern = re.compile(r'([0-9]{4}-[0-9]{2}-[0-9]{2})')
    while True:
        candidates = [
            file for file in os.listdir(settings.dir_ready)
            if file.endswith('.warc.gz')
            and not os.path.isfile(
                os.path.join(settings.dir_ready, file + '.upload'))
        ]
        for file in candidates:
            while not settings.upload_running:
                time.sleep(1)
            time.sleep(1)
            # The counter is an instance attribute (a bare local name raised
            # UnboundLocalError), and the wait must repeat until a slot is
            # actually free.
            while (self.concurrent_uploads > settings.max_concurrent_uploads
                   or not self.upload_allowed()):
                time.sleep(10)
            full_path = os.path.join(settings.dir_ready, file)
            # Parse the date before claiming a slot so that a malformed
            # name cannot leak a slot or leave a stale marker behind.
            date = date_pattern.search(file).group(1)
            self.concurrent_uploads += 1
            open(full_path + '.upload', 'a').close()
            if file not in self.uploads:
                self.uploads[file] = {}
                self.uploads[file]['date'] = date.replace('-', '')
                # The file lives in dir_ready, not in the working directory.
                self.uploads[file]['size'] = os.path.getsize(full_path)
            entry = self.uploads[file]
            day = entry['date']
            if day not in self.items:
                self.items[day] = {}
                self.items[day]['item_num'] = 0
                self.items[day]['item_size'] = 0
            elif self.items[day]['item_size'] > self.max_item_size:
                # Current item is full: continue in the next numbered item.
                self.items[day]['item_num'] += 1
                self.items[day]['item_size'] = 0
            self.items[day]['item_size'] += entry['size']
            entry['item_num'] = self.items[day]['item_num']
            entry['item_size'] = self.items[day]['item_size']
            name = day + str(entry['item_num']).zfill(4)
            ia_args = {'title': 'Archive Team Newsgrab: {name}'.format(name=name),
                       'mediatype': 'web',
                       'description': 'A collection of news articles grabbed from a wide variety of sources around the world automatically by Archive Team scripts.',
                       'collection': 'archiveteam_newssites',
                       'date': date}
            threading.Thread(target=upload_then_cleanup,
                             args=(name, file, ia_args)).start()
        # Avoid spinning on os.listdir() while nothing is pending.
        time.sleep(1)