def move_warcs(self):
     """Move finished WARC files into ``settings.dir_ready``, forever.

     Each pass scans every subdirectory of the current working directory
     except ``settings.dir_ready`` and collects its ``.warc.gz`` files:

     * If any file ends with ``-meta.warc.gz`` the grab is considered
       finished: every WARC is moved to ``settings.dir_ready`` and the
       source directory is deleted.
     * Otherwise a WARC is moved only when the file carrying the next
       sequence number (the five characters before ``.warc.gz``, plus one)
       is already present in the same directory.

     While ``settings.upload_running`` is false the scan blocks. After each
     pass ``self.upload()`` is called and the loop sleeps for 10 seconds.
     This method never returns.
     """
     while True:
         for dir_ in [
                 dir_ for dir_ in os.listdir('.') if os.path.isdir(dir_)
         ]:
             if dir_ == settings.dir_ready:
                 continue
             # Block here while uploads are paused.
             while not settings.upload_running:
                 time.sleep(1)
             files = [
                 file for file in os.listdir(dir_)
                 if os.path.isfile(os.path.join(dir_, file))
                 and file.endswith('.warc.gz')
             ]
             # any() works on both Python 2 and 3; len(filter(...)) breaks
             # on Python 3 where filter() returns an iterator.
             grab_finished = any(
                 file.endswith('-meta.warc.gz') for file in files)
             if grab_finished:
                 for file in files:
                     os.rename(os.path.join(dir_, file),
                               os.path.join(settings.dir_ready, file))
                 shutil.rmtree(dir_)
             else:
                 present = set(files)
                 for file in files:
                     # int() raises ValueError if these five characters
                     # are not digits.
                     warc_num = int(file[-13:-8])
                     warc_num_second = str(warc_num + 1).zfill(5)
                     # Only move a WARC once its successor exists.
                     if file[:-13] + warc_num_second + '.warc.gz' in present:
                         os.rename(os.path.join(dir_, file),
                                   os.path.join(settings.dir_ready, file))
         self.upload()
         time.sleep(10)
Example #2
0
 def upload(self):
     """Start one daemon upload thread per pending WARC in the ready dir.

     A file in ``settings.dir_ready`` is pending when its name ends with
     ``.warc.gz`` and no ``<name>.upload`` file exists next to it. Each
     started thread runs ``self.upload_single(file)`` and is stored in
     ``self.uploads`` under the file name.
     """
     pending = [
         file for file in os.listdir(settings.dir_ready)
         if file.endswith('.warc.gz') and not os.path.isfile(
             os.path.join(settings.dir_ready, file + '.upload'))
     ]
     for file in pending:
         time.sleep(1)
         # Keep waiting until the count drops; a single `if` + sleep would
         # carry on after one second whether or not a slot was free.
         while self.concurrent_uploads > settings.max_concurrent_uploads:
             time.sleep(1)
         worker = threading.Thread(target=self.upload_single, args=(file,))
         worker.daemon = True
         self.uploads[file] = worker
         worker.start()
Example #3
0
 def upload(self):
     """Dispatch pending WARC files for upload, forever.

     A file in ``settings.dir_ready`` is pending when its name ends with
     ``.warc.gz`` and no ``<name>.upload`` marker exists next to it. For
     each pending file the method:

     1. waits while ``settings.upload_running`` is false, while
        ``self.concurrent_uploads`` exceeds
        ``settings.max_concurrent_uploads`` or while
        ``self.upload_allowed()`` is false;
     2. creates the ``.upload`` marker and extracts a ``YYYY-MM-DD`` date
        from the file name;
     3. on first sight of the file, assigns it to an item number for that
        date, moving on to the next item number once the current item's
        accumulated size exceeds ``self.max_item_size``;
     4. starts a thread running ``self.upload_single(name, file, ia_args)``.

     Raises:
         AttributeError: if the file name contains no ``YYYY-MM-DD`` date.
     """
     while True:
         for file in [
                 file for file in os.listdir(settings.dir_ready)
                 if file.endswith('.warc.gz') and not os.path.isfile(
                     os.path.join(settings.dir_ready, file + '.upload'))
         ]:
             while not settings.upload_running:
                 time.sleep(1)
             time.sleep(1)
             while (self.concurrent_uploads > settings.max_concurrent_uploads
                    or not self.upload_allowed()):
                 time.sleep(10)
             self.concurrent_uploads += 1
             marker = os.path.join(settings.dir_ready, file + '.upload')
             open(marker, 'a').close()
             date = re.search(r'([0-9]{4}-[0-9]{2}-[0-9]{2})',
                              file).group(1)
             if file not in self.uploads:
                 date_key = date.replace('-', '')
                 # The file lives in dir_ready, not the working directory.
                 size = os.path.getsize(
                     os.path.join(settings.dir_ready, file))
                 if date_key not in self.items:
                     self.items[date_key] = {'item_num': 0, 'item_size': 0}
                 elif self.items[date_key]['item_size'] > self.max_item_size:
                     # Current item is full: continue in the next item.
                     self.items[date_key]['item_num'] += 1
                     self.items[date_key]['item_size'] = 0
                 self.items[date_key]['item_size'] += size
                 # Register the file only once every field is known, so a
                 # failure above cannot leave a half-filled entry behind.
                 self.uploads[file] = {
                     'date': date_key,
                     'size': size,
                     'item_num': self.items[date_key]['item_num'],
                     'item_size': self.items[date_key]['item_size'],
                 }
             name = self.uploads[file]['date'] + str(
                 self.uploads[file]['item_num']).zfill(4)
             ia_args = {
                 'title': 'Archive Team Newsgrab: {name}'.format(name=name),
                 'mediatype': 'web',
                 'description':
                 'A collection of news articles grabbed from a wide variety of sources around the world automatically by Archive Team scripts.',
                 'collection': 'archiveteam_newssites',
                 'date': date
             }
             # The class is threading.Thread; threading.thread does not
             # exist and raised AttributeError.
             threading.Thread(target=self.upload_single,
                              args=(name, file, ia_args)).start()
             # NOTE(review): the statements below run right after the
             # thread is started, not after it finishes, so the counter and
             # marker are released while upload_single may still be running
             # -- confirm whether this bookkeeping belongs in upload_single.
             self.concurrent_uploads -= 1
             os.remove(marker)
             if os.path.isfile(os.path.join(settings.dir_ready, file)):
                 settings.irc_bot.send(
                     'PRIVMSG',
                     '{name} uploaded unsuccessful.'.format(name=file),
                     settings.irc_channel_bot)
Example #4
0
 def start_services(self):
     """Flag existing services as not running and start the assigned ones.

     Every object already in ``settings.services`` gets ``running = False``.
     Then, for each ``web__*.py`` file in the ``services`` directory whose
     name without the ``.py`` suffix is in ``self.assigned_services``, a new
     ``Service`` is created, stored in ``settings.services`` under that
     name, marked as a daemon and started. The name is also appended to
     ``self.services``.
     """
     # values() exists on both Python 2 and 3 dicts; iteritems() does not.
     for service in settings.services.values():
         service.running = False
     for file in os.listdir('services'):
         if not (file.startswith('web__') and file.endswith('.py')):
             continue
         # Strip only the trailing extension; str.replace would also remove
         # any '.py' occurring in the middle of the name.
         service_name = os.path.splitext(file)[0]
         if service_name not in self.assigned_services:
             continue
         self.services.append(service_name)
         settings.services[service_name] = Service(service_name)
         settings.services[service_name].daemon = True
         settings.services[service_name].start()
Example #5
0
 def start_services(self):
     """Mark known services as not running, then launch assigned services.

     Sets ``running = False`` on every value of ``settings.services``.
     Afterwards each ``web__*.py`` file found in the ``services`` directory
     is considered: if its name minus the ``.py`` suffix appears in
     ``self.assigned_services``, the name is appended to ``self.services``
     and a fresh daemon ``Service`` is stored in ``settings.services`` and
     started.
     """
     # Iterate the values directly: the key was only used to look the value
     # up again, and iteritems() is unavailable on Python 3.
     for existing in settings.services.values():
         existing.running = False
     service_files = [
         file for file in os.listdir('services')
         if file.startswith('web__') and file.endswith('.py')
     ]
     for file in service_files:
         # Drop just the '.py' suffix; replace() would strip every '.py'.
         service_name = os.path.splitext(file)[0]
         if service_name not in self.assigned_services:
             continue
         self.services.append(service_name)
         settings.services[service_name] = Service(service_name)
         settings.services[service_name].daemon = True
         settings.services[service_name].start()
 def upload(self):
     """Launch a daemon upload thread for each pending WARC.

     Pending means: located in ``settings.dir_ready``, named ``*.warc.gz``
     and without a sibling ``<name>.upload`` file. Before each launch the
     method waits for ``settings.upload_running`` to be true and for
     ``self.concurrent_uploads`` to be no greater than
     ``self.max_concurrent_uploads``. Threads run
     ``self.upload_single(file)`` and are kept in ``self.uploads``.
     """
     pending = []
     for candidate in os.listdir(settings.dir_ready):
         if not candidate.endswith('.warc.gz'):
             continue
         marker = os.path.join(settings.dir_ready, candidate + '.upload')
         if os.path.isfile(marker):
             continue
         pending.append(candidate)

     for warc in pending:
         # Hold off while uploads are paused.
         while not settings.upload_running:
             time.sleep(1)
         time.sleep(1)
         # Hold off while too many uploads are in flight.
         while self.concurrent_uploads > self.max_concurrent_uploads:
             time.sleep(1)
         self.uploads[warc] = threading.Thread(
             target=self.upload_single, args=(warc, ))
         self.uploads[warc].daemon = True
         self.uploads[warc].start()
Example #7
0
 def start_services(self):
     """Start newly found services, refresh all, then distribute them.

     For each ``web__*.py`` file in the ``services`` directory:

     * if its name without ``.py`` is not yet in ``self.services``, a
       ``Service`` is created, stored in ``settings.services``, started as
       a daemon and asked to ``read_urls()``; ``self.new_services`` is
       incremented;
     * ``get_data()`` is then called on the service, new or not.

     The number of new services is sent to ``settings.irc_channel_bot`` and
     the counter is reset. If ``settings.get_urls`` is falsy a daemon
     ``Urls`` instance is created and started. Finally
     ``self.distribute_services()`` is called.
     """
     for file in os.listdir('services'):
         if not (file.startswith('web__') and file.endswith('.py')):
             continue
         # Strip only the trailing extension; str.replace would also remove
         # a '.py' appearing elsewhere in the file name.
         service_name = os.path.splitext(file)[0]
         if service_name not in self.services:
             self.new_services += 1
             self.services.append(service_name)
             settings.services[service_name] = Service(service_name)
             settings.services[service_name].daemon = True
             settings.services[service_name].start()
             settings.services[service_name].read_urls()
         settings.services[service_name].get_data()
     settings.irc_bot.send('PRIVMSG', 'Found {new_services} new services'.format(
         new_services=self.new_services), settings.irc_channel_bot)
     self.new_services = 0
     if not settings.get_urls:
         settings.get_urls = Urls()
         settings.get_urls.daemon = True
         settings.get_urls.start()
     self.distribute_services()
Example #8
0
 def start_services(self):
     """Discover ``web__*`` service files, start new ones and refresh data.

     Service names are the file names in the ``services`` directory with
     the ``.py`` suffix removed. Names not yet in ``self.services`` get a
     new daemon ``Service`` (stored in ``settings.services``, started, then
     ``read_urls()``) and bump ``self.new_services``. ``get_data()`` is
     called for every discovered service. Afterwards the count of new
     services is announced on ``settings.irc_channel_bot``, the counter is
     zeroed, a daemon ``Urls`` is started when ``settings.get_urls`` is
     falsy, and ``self.distribute_services()`` runs.
     """
     service_files = [
         file for file in os.listdir('services')
         if file.startswith('web__') and file.endswith('.py')
     ]
     for file in service_files:
         # Remove just the extension: replace('.py', '') would also delete
         # '.py' from the middle of a name.
         service_name = os.path.splitext(file)[0]
         if service_name not in self.services:
             self.new_services += 1
             self.services.append(service_name)
             settings.services[service_name] = Service(service_name)
             settings.services[service_name].daemon = True
             settings.services[service_name].start()
             settings.services[service_name].read_urls()
         settings.services[service_name].get_data()
     settings.irc_bot.send('PRIVMSG', 'Found {new_services} new services'.format(
             new_services=self.new_services), settings.irc_channel_bot)
     self.new_services = 0
     if not settings.get_urls:
         settings.get_urls = Urls()
         settings.get_urls.daemon = True
         settings.get_urls.start()
     self.distribute_services()
Example #9
0
 def move_warcs(self):
     """Move finished WARC files into ``settings.dir_ready``, forever.

     Every subdirectory of the working directory other than
     ``settings.dir_ready`` is scanned for ``.warc.gz`` files. When one of
     them ends with ``-meta.warc.gz`` the grab is considered finished: all
     WARCs are moved and the directory is removed. Otherwise a WARC is
     moved only if the file with the next five-digit sequence number is
     already present in the same directory. ``self.upload()`` is called
     after each full scan. This method never returns.
     """
     while True:
         for dir_ in [dir_ for dir_ in os.listdir('.') if os.path.isdir(dir_)]:
             if dir_ == settings.dir_ready:
                 continue
             # Wait for as long as uploads are paused; a single `if` only
             # delayed by one second and then carried on regardless.
             while not settings.upload_running:
                 time.sleep(1)
             files = [
                 file for file in os.listdir(dir_)
                 if os.path.isfile(os.path.join(dir_, file))
                 and file.endswith('.warc.gz')
             ]
             # any() instead of len(filter(...)), which fails on Python 3.
             grab_finished = any(
                 file.endswith('-meta.warc.gz') for file in files)
             if grab_finished:
                 for file in files:
                     os.rename(os.path.join(dir_, file),
                               os.path.join(settings.dir_ready, file))
                 shutil.rmtree(dir_)
             else:
                 for file in files:
                     # int() raises ValueError if these five characters
                     # are not digits.
                     warc_num = int(file[-13:-8])
                     warc_num_second = str(warc_num + 1).zfill(5)
                     # Look the successor up in the directory listing. The
                     # previous check against the file's own name could
                     # never match, so unfinished grabs were never moved.
                     if file[:-13] + warc_num_second + '.warc.gz' in files:
                         os.rename(os.path.join(dir_, file),
                                   os.path.join(settings.dir_ready, file))
         # NOTE(review): there is no sleep between passes here, so the loop
         # spins unless self.upload() blocks -- confirm that is intended.
         self.upload()
Example #10
0
 def upload(self):
     """Dispatch pending WARC files for upload, forever.

     Pending files are the ``*.warc.gz`` entries of ``settings.dir_ready``
     that have no ``<name>.upload`` marker. For each one the method waits
     until uploads are running, the concurrency limit is respected and
     ``self.upload_allowed()`` is true. It then creates the marker, derives
     the item name from the ``YYYY-MM-DD`` date in the file name plus a
     four-digit item number, and starts a thread running
     ``self.upload_single(name, file, ia_args)``. A new item number is used
     for a date once the accumulated size of the current item exceeds
     ``self.max_item_size``.

     Raises:
         AttributeError: if the file name contains no ``YYYY-MM-DD`` date.
     """
     while True:
         for file in [
                 file for file in os.listdir(settings.dir_ready) if file.endswith('.warc.gz')
                     and not os.path.isfile(os.path.join(settings.dir_ready, file+'.upload'))]:
             while not settings.upload_running:
                 time.sleep(1)
             time.sleep(1)
             # The counter is instance state: the bare name was a local
             # (because of the += below) and raised UnboundLocalError.
             # Loop, not `if`, so the limit is really enforced.
             while (self.concurrent_uploads > settings.max_concurrent_uploads
                    or not self.upload_allowed()):
                 time.sleep(10)
             self.concurrent_uploads += 1
             marker_path = os.path.join(settings.dir_ready, file+'.upload')
             open(marker_path, 'a').close()
             date = re.search(r'([0-9]{4}-[0-9]{2}-[0-9]{2})', file).group(1)
             if file not in self.uploads:
                 day = date.replace('-', '')
                 # Size must be read from dir_ready, where the file is.
                 size = os.path.getsize(os.path.join(settings.dir_ready, file))
                 if day not in self.items:
                     self.items[day] = {'item_num': 0, 'item_size': 0}
                 elif self.items[day]['item_size'] > self.max_item_size:
                     # Current item is full: continue in the next item.
                     self.items[day]['item_num'] += 1
                     self.items[day]['item_size'] = 0
                 self.items[day]['item_size'] += size
                 # Store the entry only when complete so a failure above
                 # cannot leave a partial record behind.
                 self.uploads[file] = {'date': day,
                                       'size': size,
                                       'item_num': self.items[day]['item_num'],
                                       'item_size': self.items[day]['item_size']}
             name = self.uploads[file]['date']+str(self.uploads[file]['item_num']).zfill(4)
             ia_args = {'title': 'Archive Team Newsgrab: {name}'.format(name=name),
                        'mediatype': 'web',
                        'description': 'A collection of news articles grabbed from a wide variety of sources around the world automatically by Archive Team scripts.',
                        'collection': 'archiveteam_newssites',
                        'date': date}
             # threading.Thread is the class; threading.thread does not
             # exist.
             threading.Thread(target=self.upload_single, args=(name, file, ia_args)).start()
             # NOTE(review): everything below executes immediately after
             # the thread starts, not when the upload completes -- confirm
             # whether it should live in upload_single instead.
             self.concurrent_uploads -= 1
             os.remove(marker_path)
             if os.path.isfile(os.path.join(settings.dir_ready, file)):
                 settings.irc_bot.send('PRIVMSG', '{name} uploaded unsuccessful.'.format(
                         name=file), settings.irc_channel_bot)