def add_key(alg, key, nm):
    try:
        name, machine = nm.split('@')
    except ValueError:
        return False
    d = './' + keydir + '/' + machine
    kf = d + '/' + prepend + name + '.pub'
    if not mre.match(machine) or not nre.match(name) or not are.match(alg):
        return False
    if os.path.exists(kf):
        print 'ignoring duplicate key for:', kf
        return True  # we do this so that we don't leak info
    f = NamedTemporaryFile(delete=False)
    f.file.write('%s %s %s@%s\n' % (alg, key, name, machine))
    f.close()
    p = Popen(['ssh-vulnkey', f.name], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    p.stdin.close()
    if len(p.stderr.read()) > 1:
        f.unlink(f.name)
        return False
    if not os.path.exists(d):
        os.makedirs(d)
    shutil.move(f.name, kf)
    print "Imported", kf
    return True
def test_dump(self):
    cbd1 = CitiBikeData(source_url=self.test_data_url)
    self.assert_data_loaded(cbd1)
    js = StringIO()
    cbd1.dump(js)
    self.assert_data_loaded(cbd1)
    self.assertGreater(len(js.getvalue()), 0)
    js.reset()
    cbd2 = CitiBikeData(load_on_init=False)
    self.assert_data_not_loaded(cbd2)
    cbd2.load(js)
    self.assert_data_loaded(cbd2)
    self.assertDictEqual(cbd1.json, cbd2.json)
    ntf = NamedTemporaryFile(delete=False)
    cbd1.dump(ntf)
    self.assert_data_loaded(cbd1)
    self.assertGreater(len(js.getvalue()), 0)
    ntf.close()
    cbd3 = CitiBikeData(source_url="file:" + ntf.name)
    self.assert_data_loaded(cbd3)
    self.assertDictEqual(cbd1.json, cbd3.json)
    ntf.unlink(ntf.name)  # delete file
def atomic_write(content, target):
    t = NamedTemporaryFile(dir="/tmp", delete=False)
    t.file.write(content)
    t.file.flush()
    t.close()
    copy(t.name, target)
    t.unlink(t.name)
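A minimal usage sketch for the helper above, assuming `copy` is `shutil.copy` and `NamedTemporaryFile` comes from `tempfile`; the target path and payload are hypothetical, and `content` should be bytes because the temporary file is opened in binary mode by default.

from shutil import copy  # `copy` as used by atomic_write above
from tempfile import NamedTemporaryFile

# Hypothetical example: stage the new config in /tmp, then copy it over the target.
atomic_write(b"db_host=localhost\n", "/etc/myapp/app.conf")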
def export(self, out_f=None, format='mp3'):
    out_f = _fd_or_path_or_tempfile(out_f, 'wb+')
    out_f.seek(0)

    data = NamedTemporaryFile(mode="wb", delete=False)

    wave_data = wave.open(data)
    wave_data.setnchannels(self.channels)
    wave_data.setsampwidth(self.sample_width)
    wave_data.setframerate(self.frame_rate)
    wave_data.setnframes(self.frame_count())
    wave_data.writeframesraw(self._data)
    wave_data.close()

    output = NamedTemporaryFile(mode="w+")

    # read stdin / write stdout
    subprocess.call(
        ['ffmpeg',
         '-y',  # always overwrite existing files
         "-f", "wav", "-i", data.name,  # input options (filename last)
         "-f", format, output.name,  # output options (filename last)
         ],
        # make ffmpeg shut up
        stderr=open(os.devnull))

    output.seek(0)
    out_f.write(output.read())

    data.unlink(data.name)

    out_f.seek(0)
    return out_f
def _save_samba_share(conf):
    temp = NamedTemporaryFile('w', delete=False)
    conf.write(temp)
    temp.close()

    bkp_date = datetime.datetime.now().strftime('%Y-%m-%d_%Hh%Mm%Ss')
    process = run_as_root('cp "/etc/samba/smb.conf" '
                          '"/etc/samba/smb.conf-{0}.bkp"'.format(bkp_date))
    process.expect(pexpect.EOF)
    process = run_as_root('cp "{0}" "/etc/samba/smb.conf"'.format(temp.name))
    process.expect(pexpect.EOF)
    process = run_as_root('chmod 644 /etc/samba/smb.conf')
    process.expect(pexpect.EOF)
    process = run_as_root('chown root:root /etc/samba/smb.conf')
    process.expect(pexpect.EOF)

    temp.unlink(temp.name)
def launcher_hadoop_job(self, data_type, input, output, result_companyId,
                        map_tasks=8, red_tasks=8):
    """Runs the Hadoop job uploading task configuration"""
    # create report to save on completion or error
    report = {
        'started_at': datetime.now(),
        'state': 'launched',
        'input': input
    }

    # Create temporary file to upload with json extension to identify it in HDFS
    job_extra_config = self.config.copy()
    job_extra_config.update({'companyId': result_companyId})
    f = NamedTemporaryFile(delete=False, suffix='.json')
    f.write(json.dumps(job_extra_config))
    f.close()
    self.logger.debug(
        'Created temporary config file to upload into hadoop and read from job: %s' % f.name)

    # create hadoop job instance adding file location to be uploaded
    if data_type == "billing":
        mr_job = MRJob_clean_billing_data(args=[
            '-r', 'hadoop', 'hdfs://' + input,
            '--file', f.name,
            '-c', 'module_edinet/edinet_clean_daily_data_etl/mrjob.conf',
            '--output-dir', 'hdfs://' + output,
            '--jobconf', 'mapred.job.name=edinet_clean_daily_data_etl_billing',
            '--jobconf', 'mapred.reduce.tasks={}'.format(self.num_reducers)
        ])
    elif data_type == "metering":
        mr_job = MRJob_clean_metering_data(args=[
            '-r', 'hadoop', 'hdfs://' + input,
            '--file', f.name,
            '-c', 'module_edinet/edinet_clean_daily_data_etl/mrjob.conf',
            '--output-dir', 'hdfs://' + output,
            '--jobconf', 'mapred.job.name=edinet_clean_daily_data_etl_metering',
            '--jobconf', 'mapred.reduce.tasks={}'.format(self.num_reducers)
        ])
    else:
        raise Exception("The job with data type {} can not be treated".format(data_type))

    with mr_job.make_runner() as runner:
        try:
            runner.run()
        except Exception as e:
            f.unlink(f.name)
            raise Exception('Error running MRJob process using hadoop: {}'.format(e))

    f.unlink(f.name)
    self.logger.debug('Temporary config file uploaded has been deleted from FileSystem')

    report['finished_at'] = datetime.now()
    report['state'] = 'finished'
    return report
def test_pipehandler(self):
    tmp = NamedTemporaryFile(delete=False)
    tmp.close()
    Handler = HandlerFactory("pipe", {"path": tmp.name})
    h = Handler("test")
    h([])
    with open(tmp.name, "r") as f:
        self.assertEqual(f.read(), "test")
    tmp.unlink(tmp.name)
def launcher_hadoop_job(self, type, input, company=None, devices=None,
                        stations=None, map_tasks=8, red_tasks=8):
    """Runs the Hadoop job uploading task configuration"""
    # create report to save on completion or error
    report = {
        'started_at': datetime.now(),
        'state': 'launched',
        'input': input
    }

    # Create temporary file to upload with json extension to identify it in HDFS
    job_extra_config = self.config.copy()
    job_extra_config.update({
        'devices': devices,
        'company': company,
        'stations': stations,
        'task_id': self.task_UUID
    })
    f = NamedTemporaryFile(delete=False, suffix='.json')
    f.write(json.dumps(job_extra_config))
    f.close()
    report['config_temp_file'] = f.name
    self.logger.debug(
        'Created temporary config file to upload into hadoop and read from job: %s' % f.name)

    # create hadoop job instance adding file location to be uploaded
    mr_job = MRJob_align(args=[
        '-r', 'hadoop', 'hdfs://' + input,
        '--file', f.name,
        '-c', 'module_edinet/edinet_baseline_hourly_module/mrjob.conf',
        '--jobconf', 'mapred.job.name=edinet_baseline_hourly_module',
        '--jobconf', 'mapred.reduce.tasks={}'.format(self.num_reducers)
    ])
    # mr_job = MRJob_align(args=['-r', 'hadoop', 'hdfs://'+input, '--file', f.name, '--output-dir', '/tmp/prova_dani', '--python-archive', path.dirname(lib.__file__)])  # debugger

    with mr_job.make_runner() as runner:
        try:
            runner.run()
        except Exception as e:
            f.unlink(f.name)
            raise Exception('Error running MRJob process using hadoop: {}'.format(e))

    f.unlink(f.name)
    self.logger.debug('Temporary config file uploaded has been deleted from FileSystem')

    report['finished_at'] = datetime.now()
    report['state'] = 'finished'
    return report
def test_clbhandler(self):
    tmp = NamedTemporaryFile(delete=False)
    tmp.file.write("def raise_(): raise FutureWarning\n")
    tmp.file.write("def raise__(smt): raise smt")
    tmp.close()
    Handler = HandlerFactory("callback", {"path": tmp.name})
    self.assertRaises(FutureWarning, Handler("raise_()"), [])
    self.assertRaises(FutureWarning, Handler("raise__({0})"), [FutureWarning])
    tmp.unlink(tmp.name)
def export(self, out_f=None, format='mp3', codec=None):
    out_f = _fd_or_path_or_tempfile(out_f, 'wb+')
    out_f.seek(0)

    # for wav output we can just write the data directly to out_f
    if format == "wav":
        data = out_f
    else:
        data = NamedTemporaryFile(mode="wb", delete=False)

    wave_data = wave.open(data, 'wb')
    wave_data.setnchannels(self.channels)
    wave_data.setsampwidth(self.sample_width)
    wave_data.setframerate(self.frame_rate)
    wave_data.setnframes(self.frame_count())
    wave_data.writeframesraw(self._data)
    wave_data.close()

    # for wav files, we're done (wav data is written directly to out_f)
    if format == 'wav':
        return out_f

    output = NamedTemporaryFile(mode="w+")

    # build call args
    args = [
        self.ffmpeg,
        '-y',  # always overwrite existing files
        "-f", "wav", "-i", data.name,  # input options (filename last)
    ]
    if codec is not None:
        # force audio encoder
        args.extend(["-acodec", codec])
    args.extend([
        "-f", format, output.name,  # output options (filename last)
    ])

    # read stdin / write stdout
    subprocess.call(
        args,
        # make ffmpeg shut up
        stderr=open(os.devnull))

    output.seek(0)
    out_f.write(output.read())

    data.unlink(data.name)

    out_f.seek(0)
    return out_f
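A short usage sketch for the `export` method above, assuming it lives on a pydub-style `AudioSegment` and that ffmpeg is on the PATH; the file names and codec are hypothetical.

# Hypothetical usage: load a WAV file and re-encode it as MP3 with an explicit codec.
song = AudioSegment.from_wav("input.wav")
song.export("output.mp3", format="mp3", codec="libmp3lame")

# Plain WAV export takes the early-return path and never shells out to ffmpeg.
song.export("copy.wav", format="wav")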
def upload(self, src_name, dst_name, compress=True):
    if not compress:
        return self._upload(src_name, dst_name)

    fout = NamedTemporaryFile(suffix='.gz', mode='wb', delete=False)
    try:
        fout.close()
        logging.debug('Compressing file %s...', src_name)
        with \
                open(src_name, 'rb') as fin, \
                closing(gzip.GzipFile(fout.name, mode='wb')) as gzout:
            for chunk in iterchunks(fin):
                gzout.write(chunk)
        return self._upload(fout.name, dst_name + '.gz')
    finally:
        fout.unlink(fout.name)
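Both `upload` variants in this section stream the source file through an `iterchunks` helper that is not shown; a minimal sketch of what such a generator might look like (the 64 KiB chunk size is an assumption).

def iterchunks(fileobj, chunk_size=64 * 1024):
    # Yield the file in fixed-size chunks until EOF (chunk size is assumed).
    while True:
        chunk = fileobj.read(chunk_size)
        if not chunk:
            break
        yield chunk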
def hadoop_job(self, input, companyId):
    """Runs the Hadoop job uploading task configuration"""
    # create report to save on completion or error
    report = {
        'started_at': datetime.now(),
        'state': 'launched',
        'input': input
    }

    # Create temporary file to upload with json extension to identify it in HDFS
    f = NamedTemporaryFile(delete=False, suffix='.json')
    f.write(json.dumps(self.config))
    f.close()
    self.logger.debug(
        'Created temporary config file to upload into hadoop and read from job: {}'.format(f.name))

    # create hadoop job instance adding file location to be uploaded
    dtnow = datetime.now()
    str_dtnow = dtnow.strftime("%Y%m%d%H%M")
    mr_job = Hadoop_ETL(args=[
        '-r', 'hadoop', input,
        '--file', f.name,
        '--output-dir', "{}/{}/{}".format(self.config['error_measures'], str(companyId), str_dtnow),
        '-c', 'module_edinet/edinet_billing_measures_etl/mrjob.conf',
        '--jobconf', 'mapred.job.name=edinet_billing_measures_etl'
    ])
    # mr_job = Hadoop_ETL(args=['-r', 'hadoop', input, '--file', f.name, '--python-archive', path.dirname(lib.__file__)])

    with mr_job.make_runner() as runner:
        try:
            runner.run()
        except Exception as e:
            f.unlink(f.name)
            raise Exception('Error running MRJob ETL process using hadoop: {}'.format(e))

    f.unlink(f.name)
    self.logger.debug('Temporary config file uploaded has been deleted from FileSystem')

    report['finished_at'] = datetime.now()
    report['state'] = 'finished'
    return report
def validate_sparse_support(directory):
    SIZE = 1024 * 1024  # 1 MiB temp file seems reasonable

    if not directory.exists():
        raise FileNotFoundError(f"No such file or directory: '{directory}'")
    elif not directory.is_dir():
        raise ValueError(f"Expected '{directory}' to be a directory")

    # Try to write a small sparse file in the same directory as `filepath` and then read
    # the actual size to check that it's sparse
    temp_file = NamedTemporaryFile(dir=directory, delete=False)
    temp_file.close()
    temp_file = Path(temp_file.name)
    write_sparse_file(temp_file, b"~Testing~", SIZE)

    temp_file_size = size_on_disk(temp_file)
    if temp_file_size is None:
        return False

    is_sparse = temp_file_size < SIZE
    temp_file.unlink()
    return is_sparse
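`validate_sparse_support` relies on `write_sparse_file` and `size_on_disk`, which are not shown; a rough POSIX-oriented sketch of what they could look like (treating `st_blocks` as 512-byte units is a Linux assumption).

import os
from pathlib import Path


def write_sparse_file(path: Path, payload: bytes, total_size: int) -> None:
    # Write a little real data, then extend the file so the rest is a hole.
    with open(path, "wb") as f:
        f.write(payload)
        f.truncate(total_size)


def size_on_disk(path: Path):
    # Bytes actually allocated on disk; None if the platform doesn't report blocks.
    st = path.stat()
    blocks = getattr(st, "st_blocks", None)
    return None if blocks is None else blocks * 512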
def upload(self, src_name, dst_name, compress=True, use_gzip=False):
    if compress:
        fout = NamedTemporaryFile(suffix='.gz', mode='wb', delete=False)
        try:
            if use_gzip:
                logging.debug('Compressing file %s with gzip...', src_name)
                p = subprocess.Popen(["gzip", '-c', src_name], stdout=fout)
                assert p.wait() == 0, 'Gzip compression failed'
                fout.close()
                return self._upload(fout.name, dst_name + '.gz')
            else:
                fout.close()
                logging.debug('Compressing file %s...', src_name)
                with \
                        open(src_name, 'rb') as fin, \
                        closing(gzip.GzipFile(fout.name, mode='wb')) as gzout:
                    for chunk in iterchunks(fin):
                        gzout.write(chunk)
                return self._upload(fout.name, dst_name + '.gz')
        finally:
            fout.unlink(fout.name)
    else:
        self._upload(src_name, dst_name)
def handle(self, *args, **options):
    if not (GOOGLE_DOCS_ACCOUNT and GOOGLE_DOCS_PASSWORD and GOOGLE_DOCS_RESOURCE_ID):
        raise CommandError('You must set GOOGLE_DOCS_ACCOUNT, GOOGLE_DOCS_PASSWORD and GOOGLE_DOCS_RESOURCE_ID in your settings file.')

    verbosity = int(options.get('verbosity', 1))
    output_all = options.get('output_all')
    dry_run = options.get('dry_run')

    fields = ('email', 'first_name', 'last_name', 'phone', 'city', 'state',
              'zipcode', 'is_a', 'broadcasters', 'date_created', 'share_info')

    profile_list = NonUserProfile.objects.order_by('-date_created')
    if not output_all:
        profile_list = profile_list.filter(share_info=True)

    if len(profile_list):
        if verbosity > 1:
            self.stdout.write('{0} signups to record.'.format(len(profile_list)))

        fp = NamedTemporaryFile(delete=False)
        writer = csv.DictWriter(fp, fields)
        writer.writeheader()

        for signup in profile_list:
            output = {
                'email': signup.email,
                'first_name': signup.first_name,
                'last_name': signup.last_name,
                'phone': signup.phone,
                'city': signup.city,
                'state': signup.state,
                'zipcode': signup.zipcode,
                'is_a': signup.is_a,
                'date_created': signup.date_created.strftime('%m/%d/%Y %H:%M:%S'),
                'share_info': signup.share_info
            }
            extra_fields_data = signup.extra_fields
            for extra_field in SIGNUP_EXTRA_FIELDS:
                input_val = None
                if isinstance(extra_fields_data[extra_field], list):
                    input_val = ', '.join(extra_fields_data[extra_field])
                else:
                    input_val = extra_fields_data[extra_field]
                output[extra_field] = input_val

            writer.writerow(output)
            if dry_run:
                self.stdout.write('Row created:\n{0}\n'.format('|'.join([str(output[f]) for f in fields])))

        del writer

        if not dry_run:
            client = gdata.docs.client.DocsClient()
            login_token = client.ClientLogin(GOOGLE_DOCS_ACCOUNT, GOOGLE_DOCS_PASSWORD, 'politicaladsleuth')
            fp.close()
            media = gdata.data.MediaSource(file_path=fp.name, content_type='text/csv')
            try:
                resource = client.get_resource_by_id(GOOGLE_DOCS_RESOURCE_ID)
                updated_resource = client.update_resource(resource, media=media,
                                                          update_metadata=False, new_revision=True)
                self.stdout.write('Data uploaded to "{0}"\n'.format(updated_resource.title.text))
            except gdata.client.RequestError as e:
                self.stderr.write(e.message + '\n')
                self.stdout.write('****Upload may have succeeded despite an InvalidEntryException error****\n')

        fp.close()
        fp.unlink(fp.name)
    else:
        self.stdout.write('No signups for the given parameters\n')
class FacadeTest(TestCase):
    def setUp(self):
        cfg = ConfigParser()
        cfg.add_section('some_section')
        cfg.set('some_section', 'foo', 'bar')
        cfg.set('some_section', 'host', 'foo')
        cfg.set('some_section', 'port', '29192')
        self.config_file = NamedTemporaryFile(mode='w', delete=False)
        cfg.write(self.config_file)
        self.config_file.close()

        self.env_file = NamedTemporaryFile(mode='w', delete=False)
        cfg = ConfigParser()
        cfg.add_section('some_section')
        cfg.set('some_section', 'host', 'SOME_HOST_NAME')
        cfg.set('some_section', 'port', 'SOME_PORT_NAME')
        cfg.add_section('other_section')
        cfg.set('other_section', 'foo', 'bar')
        cfg.write(self.env_file)
        self.env_file.close()

    def tearDown(self):
        self.config_file.unlink(self.config_file.name)
        self.env_file.unlink(self.env_file.name)

    def test_not_existing_config(self):
        configuration = get_configuration('non_existing_section')
        self.assertIsNone(configuration)

    def test_not_existing_variables(self):
        configuration = get_configuration('messaging', config_file='non-existing')
        self.assertIsNone(configuration)

    def test_existing_config(self):
        configuration = get_configuration(section_name='some_section',
                                          config_file=self.config_file.name,
                                          variables=self.env_file.name)
        self.assertTrue('host' in configuration)
        self.assertTrue('port' in configuration)
        self.assertEquals(configuration['host'], 'foo')
        self.assertEquals(configuration['port'], '29192')
        self.assertFalse('foo' in configuration)

    def test_existing_env(self):
        os.environ['SOME_HOST_NAME'] = 'bar'
        os.environ['SOME_PORT_NAME'] = '6661'
        configuration = get_configuration(section_name='some_section',
                                          config_file=self.config_file.name,
                                          variables=self.env_file.name)
        self.assertTrue('host' in configuration)
        self.assertTrue('port' in configuration)
        self.assertEquals(configuration['host'], 'bar')
        self.assertEquals(configuration['port'], '6661')
        self.assertFalse('foo' in configuration)
        os.environ.pop('SOME_HOST_NAME')
        os.environ.pop('SOME_PORT_NAME')

    def test_not_in_config(self):
        configuration = get_configuration(section_name='other_section',
                                          config_file=self.config_file.name,
                                          variables=self.env_file.name)
        self.assertIsNone(configuration)
def store(self, file=None, content=None, ctype=None, **kwd):
    """save a file-like item"""
    if content is None and not hasattr(file, 'read'):
        raise TypeError('invalid file-like object')

    data = content if content is not None else file.read()
    size = len(data)
    ext = guessImageType(data[:32])
    if ext is None:
        raise ValueError('invalid image file')

    hashes = [md5(data).hexdigest()]
    _exists_id = self.exists(hashed=hashes[0])
    if _exists_id:
        id = _exists_id
        filename = _make_filename(id, ext)
        print('id {} or hash {} exists!!'.format(id, hashes[0]))
        #raise DuplicateError('already exists')
        return [True, id, filename]

    ids = [_make_id(hashes[0])]
    if 'id' in kwd and kwd['id'] and kwd['id'] not in ids:
        ids += [kwd['id']]

    from image import SimpImage, MIN_QUALITY

    max_file_size = int(self.get_config('max_file_size'))
    max_jpeg_quality = int(self.get_config('max_jpeg_quality'))
    max_width = int(self.get_config('max_width'))
    max_height = int(self.get_config('max_height'))

    if size > max_file_size:
        max_jpeg_quality -= 1
    if max_jpeg_quality < MIN_QUALITY:
        max_jpeg_quality = MIN_QUALITY

    im = SimpImage(blob=data)
    meta = im.meta
    if meta['width'] > max_width or meta['height'] > max_height:
        if self.get_config('auto_scale') and im.thumbnail(max_width, max_height):
            if im.format == 'JPEG' and im.quality > max_jpeg_quality:
                im.quality = max_jpeg_quality
            data = im.get_blob()
            size = len(data)
            print im.meta
            print 'new scaled size {}'.format(size)
            hashes += [md5(data).hexdigest()]
        else:
            raise ValueError('file: {} dimension {}x{} is too big, max is {}x{}'.format(
                kwd['name'] if 'name' in kwd else '',
                meta['width'], meta['height'], max_width, max_height))

    if im.format == 'JPEG':
        if im.quality > max_jpeg_quality:
            print 'quality {} is too high, hash {}'.format(im.quality, hashes[0])
            from tempfile import NamedTemporaryFile
            _tmp = NamedTemporaryFile('w+b', dir=self.get_config('temp_root'), delete=False)
            _tmp.file.close()
            save_file(_tmp.name, blob=data)
            if jpegoptim(_tmp.name):
                fp = open(_tmp.name)
                data = fp.read()
                size = len(data)
                # print 'new optimized size {}'.format(size)
                fp.close()
                _tmp.unlink(_tmp.name)
                del im
                im = SimpImage(blob=data)
                meta = im.meta
                hashes += [md5(data).hexdigest()]
            else:
                raise EnvironmentError('jpeg quality is too high, or need jpegoptim')
    elif im.format == 'PNG' and self.get_config('force_jpeg'):
        im.format = 'JPEG'
        im.quality = max_jpeg_quality
        data = im.get_blob()
        size = len(data)
        hashes += [md5(data).hexdigest()]
        ext = 'jpg'
        meta = im.meta

    del im

    if size > max_file_size:
        raise ValueError('file: {} size {} is too big, max is {}'.format(
            kwd['name'] if 'name' in kwd else '', size, max_file_size))

    hashed = hashes[len(hashes) - 1]  # md5(data).hexdigest()
    # print ('md5 hash: {}'.format(hashed))

    # TODO: add for support (md5 + size) id
    id = _make_id(hashed)
    # print ('new filename: %r' % filename)

    # TODO: fix for support s3 front browse
    _exists_id = self.exists(id) or self.exists(hashed=hashed)
    if _exists_id:
        id = _exists_id
        filename = _make_filename(id, ext)
        print('id {} or hash {} exists!!'.format(id, hashed))
        #raise DuplicateError('already exists')
        return [True, id, filename]

    filename = _make_filename(id, ext)
    # print ('id: {}'.format(id))

    # if ctype is None or ctype == '':
    from _util import guess_mimetype
    ctype = guess_mimetype(filename)

    # save to mongodb
    spec = {
        '_id': id,
        'filename': filename,
        'hash': hashes,
        'mime': ctype,
        'size': size,
        'meta': meta,
        'ids': ids
    }
    if 'name' in kwd and isinstance(kwd['name'], (str, unicode)):
        spec['name'] = kwd['name']
    for k in ['created', 'app_id']:
        if k in kwd and kwd[k]:
            spec[k] = kwd[k]

    if self._store_exists(id, filename=filename):
        self._save_meta(id, spec)
        return [True, id, filename]

    rr = self._put(data, **spec)
    if rr:
        return [True, rr, filename]
class ConfigManagerTest(unittest.TestCase):
    def setUp(self):
        cfg = ConfigParser.ConfigParser()
        cfg.add_section('some_section')
        cfg.set('some_section', 'host', 'foo')
        cfg.set('some_section', 'port', '29192')
        cfg.set('some_section', 'backend', 'tests.test_config_manager.Backend')
        self.config_file = NamedTemporaryFile(mode='w', delete=False)
        cfg.write(self.config_file)
        self.config_file.close()

        self.env_file = NamedTemporaryFile(mode='w', delete=False)
        cfg = ConfigParser.ConfigParser()
        cfg.add_section('some_section')
        cfg.set('some_section', 'host', 'SOME_HOST_NAME')
        cfg.set('some_section', 'port', 'SOME_PORT_NAME')
        cfg.add_section('other_section')
        cfg.set('other_section', 'foo', 'bar')
        cfg.write(self.env_file)
        self.env_file.close()

    def tearDown(self):
        self.config_file.unlink(self.config_file.name)
        self.env_file.unlink(self.env_file.name)

    def test_not_existing_section(self):
        configuration = get_configuration('not-existing-section',
                                          config_file=self.config_file.name)
        self.assertIsNone(configuration)

    def test_existing_config(self):
        configuration = get_configuration(section_name='some_section',
                                          config_file=self.config_file.name,
                                          variables_file=self.env_file.name)
        self.assertTrue('host' in configuration)
        self.assertTrue('port' in configuration)
        self.assertEquals(configuration['host'], 'foo')
        self.assertEquals(configuration['port'], '29192')
        self.assertFalse('foo' in configuration)

    def test_existing_env(self):
        os.environ['SOME_HOST_NAME'] = 'bar'
        os.environ['SOME_PORT_NAME'] = '6661'
        configuration = get_configuration(section_name='some_section',
                                          config_file=self.config_file.name,
                                          variables_file=self.env_file.name)
        self.assertTrue('host' in configuration)
        self.assertTrue('port' in configuration)
        self.assertEquals(configuration['host'], 'bar')
        self.assertEquals(configuration['port'], '6661')
        self.assertFalse('foo' in configuration)
        os.environ.pop('SOME_HOST_NAME')
        os.environ.pop('SOME_PORT_NAME')

    def test_get_backend_class(self):
        configuration = get_configuration(section_name='some_section',
                                          config_file=self.config_file.name,
                                          variables_file=self.env_file.name)
        backend_class = get_backend_class(configuration)
        self.assertTrue('Backend' in str(backend_class))
def module_task(self, params):
    self.logger.info('Starting Module for edinet baseline...')
    """CHECK INCONSISTENCIES IN params"""
    try:
        result_companyId = params['result_companyId']
        ts_to = params['ts_to']
        ts_from = params['ts_from'] if 'ts_from' in params else date_n_month(ts_to, -24)
        energyTypeList = params['type'] if 'type' in params else []
    except KeyError as e:
        raise Exception('Not enough parameters provided to module: {}'.format(e))

    ############################################################################
    """ GET DATA FROM MONGO TO MAKE QUERYS """
    ############################################################################
    if not energyTypeList:
        energyTypeList = list(set(
            [x['type'] for x in self.mongo['readings'].find({}, {'type': 1})]))

    ############################################################################
    """ LOAD DATA FROM HIVE """
    ############################################################################
    self.logger.info('Extracting data from mongodb')

    # setting variables for readability
    collection = self.config['mongodb']['modelling_units_collection']

    self.logger.debug('Querying for modelling units in MongoDB')
    cursor = self.mongo[collection].find({})

    device_key = {}
    stations = {}
    for item in cursor:
        if len(item['devices']) > 0:  # to avoid empty list of devices
            for dev in item['devices']:
                stations[str(dev['deviceId'].encode('utf-8'))] = \
                    str(item['stationId']) if 'stationId' in item else None
                if str(dev['deviceId'].encode('utf-8')) in device_key.keys():
                    device_key[str(dev['deviceId'].encode('utf-8'))].append(
                        str(item['modellingUnitId']) + '~' + str(item['devices']))
                else:
                    device_key[str(dev['deviceId'].encode('utf-8'))] = [
                        str(item['modellingUnitId']) + '~' + str(item['devices'])
                    ]

    self.logger.info('A mongo query process has loaded {} devices'.format(len(device_key.keys())))

    ############################################################################
    """ HIVE QUERY TO PREPARE DATA FOR MRJOB """
    ############################################################################
    # create a table to link devices with stations
    device_stations_df = pd.DataFrame(
        data={"deviceId": stations.keys(), "stationId": stations.values()},
        columns=["deviceId", "stationId"])
    f = NamedTemporaryFile(delete=False, suffix='.csv')
    device_stations_df.to_csv(f.name, header=None, index=None)
    f.close()
    call(["hadoop", "fs", "-mkdir", "-p", f.name, self.config['paths']['stations']])
    call(["hadoop", "fs", "-copyFromLocal", f.name, self.config['paths']['stations']])
    f.unlink(f.name)

    device_stations = create_hive_module_input_table(
        self.hive, 'edinet_device_stations_table',
        self.config['paths']['stations'],
        [('deviceId', 'string'), ('stationId', 'string')],
        self.task_UUID, sep=",")
    self.context.add_clean_hive_tables(device_stations)

    # create a table with the devices values
    fields = [('deviceId', 'string'), ('ts', 'int'), ('value', 'float'),
              ('energyType', 'string'), ('source', 'string'), ('temperature', 'string')]
    location = self.config['paths']['measures']

    input_table = create_hive_module_input_table(
        self.hive, 'edinet_baseline_input', location, fields, self.task_UUID)

    # add input table to be deleted after execution
    self.context.add_clean_hive_tables(input_table)

    qbr = RawQueryBuilder(self.hive)
    sentence = """
        INSERT OVERWRITE TABLE {input_table}
        SELECT a.deviceId, a.ts, a.value, a.energyType, a.source, c.temperature FROM
            (SELECT ai.deviceid as deviceId, ai.ts as ts, ai.value as value,
                    ai.energyType as energyType, ai.source as source
             FROM edinet_hourly_consumption ai
             WHERE ai.ts >= UNIX_TIMESTAMP("{ts_from}","yyyy-MM-dd HH:mm:ss")
               AND ai.ts <= UNIX_TIMESTAMP("{ts_to}","yyyy-MM-dd HH:mm:ss")) a
        JOIN {device_stations} b on a.deviceId==b.deviceId
        JOIN edinet_meteo c on b.stationId==c.stationId
             and SUBSTR(FROM_UNIXTIME(a.ts), 1, 13) == SUBSTR(FROM_UNIXTIME(c.ts), 1, 13)
        """.format(input_table=input_table, ts_from=ts_from, ts_to=ts_to,
                   device_stations=device_stations)
    self.logger.debug(sentence)
    qbr.execute_query(sentence)

    ############################################################################
    """ SETUP MAP REDUCE JOB """
    ############################################################################
    self.logger.info('Getting')
    try:
        # Launch MapReduce job
        ## Buffered measures to HBase
        self.logger.debug('MRJob Align')
        self.launcher_hadoop_job('align', location, result_companyId, device_key, stations)
    except Exception as e:
        raise Exception('MRJob ALIGN process job has failed: {}'.format(e))

    self.logger.info('Module EDINET_baseline execution finished...')
def _merge(self, cc_dir):
    md5_hashes = {}
    if HASH_FILE.exists():
        for line in [l.strip() for l in HASH_FILE.read_text().split("\n")]:
            if line:
                values = line.rsplit(":", maxsplit=1)
                if len(values) == 2 and values[0] and values[1]:
                    md5_hashes[values[0]] = values[1]

    try:
        p = subprocess.run(
            "git -C \"{cc_dir}\" status --porcelain -uall".format(**locals()),
            shell=True, stdout=subprocess.PIPE, check=True)
        merge_files = []
        status = str(p.stdout, "utf-8").strip()
        for line in [s.strip() for s in status.split("\n")]:
            if line:
                merge_files.append(tuple(line.split()))
    except subprocess.CalledProcessError as err:
        raise nicfit.CommandError(str(err))

    for st, file in merge_files + self.args.extra_merge:
        dst = Path(file)
        src = cc_dir / dst

        hasher = md5()
        try:
            hasher.update(src.read_bytes())
        except FileNotFoundError as notfound:
            perr(notfound)
            continue
        md5sum = hasher.hexdigest()

        merge_file = (self.args.ignore_md5s
                      or file not in md5_hashes
                      or md5sum != md5_hashes[file])
        pout("Comparing {} hash({}): {}".format(
            file, md5sum, Fg.blue("new") if merge_file else Fg.green("merged")))
        md5_hashes[file] = md5sum

        if merge_file:
            tmp_dst = None
            if not dst.exists():
                tmp_dst = NamedTemporaryFile("w", suffix=dst.suffix, delete=False)
                # Write the file to exist on disk for diff and merge
                tmp_dst.close()
                tmp_dst = Path(tmp_dst.name)

            dst_file = str(dst if tmp_dst is None else tmp_dst)
            diffs = subprocess.run(
                "diff '{src}' '{dst_file}' >/dev/null".format(**locals()),
                shell=True).returncode != 0
            pout("Differences: {}".format(diffs))

            if diffs:
                merge_cmd = self.args.merge_cmd
                if merge_cmd is None:
                    for cmd, opts in MERGE_TOOLS.items():
                        if shutil.which(cmd):
                            merge_cmd = " ".join([cmd, opts or ""])
                            break

                if merge_cmd is not None:
                    subprocess.run(
                        "{merge_cmd} '{src}' '{dst_file}'".format(**locals()),
                        shell=True, check=True)
                else:
                    perr("Merge disabled, no merge command found. Install "
                         "a merge tool such as: {tools}.\nOr use "
                         "--merge-cmd to specify your own.".format(
                             tools=", ".join(MERGE_TOOLS.keys())))

            if tmp_dst and tmp_dst.stat().st_size == 0:
                tmp_dst.unlink()
            elif tmp_dst:
                # Move tmp file into place and create parent dirs
                if not dst.parent.exists():
                    dst.parent.mkdir(0o755, parents=True)
                shutil.move(str(tmp_dst), str(dst))

    with HASH_FILE.open("w") as hash_file:
        for f in sorted(md5_hashes.keys()):
            hash_file.write("{}:{}\n".format(f, md5_hashes[f]))