def close(self):
    """ Closes any connection to Vertica """
    if self.connection.opened():
        logger.info(' connection closed.')
        self.connection.close()
def test_schema_versions(self):
    logger.info("Schemas")
    name = 'property'
    self.register = Registry(path_configs)
    r_schema_1 = self.register.get(name, version=1)
    r_schema_2 = self.register.get(name, version=2)
    r_schema_3 = self.register.get(name, version=3)
    r_schema_4 = self.register.get(name, version=4)
    r_schema_5 = self.register.get(name, version=5)
    _file_test_1 = schema.Parse(_avro_test_1)
    _file_test_2 = schema.Parse(_avro_test_2)
    _file_test_3 = schema.Parse(_avro_test_3)
    _file_test_4 = schema.Parse(_avro_test_4)
    _file_test_5 = schema.Parse(_avro_test_5)
    self.assertEqual(r_schema_1, _file_test_1)
    self.assertEqual(r_schema_2, _file_test_2)
    self.assertEqual(r_schema_3, _file_test_3)
    self.assertEqual(r_schema_4, _file_test_4)
    self.assertEqual(r_schema_5, _file_test_5)
    self.assertRaises(SchemaVersionNotFound,
                      lambda: self.register.get(name, version=6))
def get(self, name=None, version=VERSION):
    key = '%s_%s' % (name, str(version))
    _schema = os.path.join(self.path, name)
    if key in self.cache_schemas:
        logger.debug('key : %s ', key)
        return self.cache_schemas[key]
    if os.path.exists(_schema):
        element = os.listdir(_schema)
        avro_file = '%s.avsc' % version
        logger.info("Files -> %s" % sorted(element))
        if avro_file in sorted(element):
            _file = os.path.join(self.path, name, avro_file)
            try:
                with open(_file, 'rb') as f:
                    data = f.read()
                # Parse once, cache the result, and return the same object
                parsed = avro.schema.Parse(data)
                self._cache(key, parsed)
                return parsed
            except IOError as e:
                logger.warning("See exception below; skipping file %s", _file)
                logger.exception(e)
        else:
            raise SchemaVersionNotFound
    else:
        raise SchemaNotFound
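# Hedged usage sketch for Registry.get, mirroring the tests in this module.
# The helper name, the registry path argument, and the 'property' schema name
# are assumptions for illustration; the imports come from the test module below.
from pipeutils.avro import Registry, SchemaVersionNotFound
from pipeutils import logger


def load_property_schema(registry_path, version=1):
    register = Registry(registry_path)
    try:
        return register.get('property', version=version)
    except SchemaVersionNotFound:
        logger.warning("Schema 'property' has no version %s", version)
        return None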
def execute(self, context):
    if self.source is None:
        self.source = self.dag_params['source']
    # start the Postgres client
    client = Postgres()
    # Save the data into Postgres from the csv path
    result = client.copy_expert(self.table, self.source)
    logger.info("The CSV has been saved successfully")
    return result
def execute(self, context):
    if self.source is None:
        self.source = self.dag_params['source']
    logger.info(self.destination)
    s3_client = ClientS3(s3['bucket'])
    result = s3_client.upload_multiple(self.source, self.destination,
                                       self.extension)
    logger.info("The files have been saved successfully")
    return self.source
def test_upload(self):
    files_path = os.path.join(PATH, 'files', 'file1.txt')
    logger.info('Files path %s' % files_path)
    try:
        self.client.upload(files_path, 'file1.txt')
        passed = True
    except Exception as e:
        logger.info(f"No such config file in {str(e)}")
        passed = False
    assert passed
def list(self):
    """ List objects in the bucket """
    objects = []
    bucket = self.clientS3.Bucket(self.bucket)
    for s3_file in bucket.objects.all():
        logger.info('File > %s' % s3_file.key)
        objects.append(s3_file.key)
    return objects
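# Hedged usage sketch for ClientS3.list and ClientS3.upload as exercised by the
# tests in this section. The import path, bucket name, and file paths are
# assumptions for illustration only.
from pipeutils.aws import ClientS3  # import path is an assumption

client = ClientS3('my-bucket')
client.upload('/tmp/file1.txt', 'file1.txt')
for key in client.list():
    print(key)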
def test_download(self):
    path_down = os.path.join(PATH, 'files', 'file1-down.txt')
    passed = False
    try:
        # The download call belongs inside the try block so failures are caught
        self.client.download('file1.txt', path_down)
        passed = True
    except Exception as e:
        passed = False
        logger.info(f"No such config file in {str(e)}")
    assert passed
def test_serialize_avro(self):
    logger.info("testing")
    serializer = AvroSerializer(NAME, version=VERSION)
    data = {
        "name": "TEXT INTO MESSAGE",
        "favorite_color": "111",
        "favorite_number": random.randint(0, 10)
    }
    serialize = serializer.serialize(data)
    self.assertIn(bytes("TEXT INTO MESSAGE", "utf-8"), serialize)
def test_read(self):
    """ Execute the ``read`` function and verify that the file exists. """
    logger.info("Read")
    try:
        _file = read(file_path)
        self.assertIsNotNone(_file)
        passed = True
    except Exception as e:
        passed = 'No such file' in str(e)
    self.assertEqual(passed, True)
def get_folder_id(self, name):
    results = self.service.files().list(
        pageSize=10,
        q=("name = '{0}'".format(name) +
           " and mimeType = 'application/vnd.google-apps.folder'"),
        corpora="user",
        fields="nextPageToken, files(id, name, webContentLink, " +
               "createdTime, modifiedTime)").execute()
    item = results.get('files', [])
    logger.info(item)
    if not item:
        return None
    return item[0]['id']
def test_list(self):
    """ List the bucket and expect success. """
    passed = False
    try:
        list_s3 = self.client.list()
        logger.info('list s3 > %s' % list_s3)
        passed = True
    except Exception as e:
        passed = False
        logger.info(f"No such config file in {str(e)}")
    assert passed
def test_create(self):
    """ Execute the ``create`` function and verify that the file was created. """
    logger.setLevel(logging.DEBUG)
    logger.info("testing")
    url = 'https://www.facebook.com/elvikito'
    params = {}
    _file = create(url, params)
    self.assertIsNotNone(_file)
    response = requests.get(url, params=params)
    content = response.content
    self.assertEqual(_file[1], content[1])
def connect(self):
    """
    Creates a connection or returns the current one.

    Return:
        connection
    """
    if self.connection is not None:
        logger.info(" connection: %s " % (self.connection is not None))
        return self.connection
    try:
        self.connection = DataPostgres.connect(**self.options)
    except Exception as e:
        # `Exception.message` does not exist in Python 3; log the exception itself
        logger.critical("Unable to connect to DB: {0}".format(e))
        raise
    return self.connection
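# Hedged usage sketch: open a Postgres connection, bulk-load a CSV via
# copy_expert (as the operator above does), then close. The import path, table
# name, and CSV path are assumptions for illustration.
from pipeutils.db import Postgres  # import path is an assumption

db = Postgres()
db.connect()
db.copy_expert('jobs.jobs', '/tmp/jobs.csv')
db.close()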
def __init__(self, lookups=None, *args, **kwargs):
    """
    Operator that makes xcom params accessible in `self.dag_params`.

    The `lookups` param maps how the xcom params are going to be stored:
    the keys become param keys in `self.dag_params` and the values are the
    task ids. E.g.::

        t = XComParams(lookups={'var1': 'taskid_1'})

    In the `execute` function the `self.dag_params` var will be populated
    as a dictionary:

    >>> t.dag_params
    {'var1': 'whatever the taskid_1 task returned in its `execute` function'}
    """
    logger.info(kwargs)
    # Avoid a mutable default argument
    self.lookups = lookups if lookups is not None else {}
    BaseOperator.__init__(self, *args, **kwargs)
def download(self, gpath, path):
    items = self.listfiles()
    if items is not None:
        for item in items:
            if gpath in item['name']:
                request = self.service.files().get_media(fileId=item['id'])
                fh = io.BytesIO()
                downloader = MediaIoBaseDownload(fh, request)
                done = False
                while done is False:
                    status, done = downloader.next_chunk()
                    logger.info(int(status.progress() * 100))
                with open(os.path.join(path, item['name']), 'wb') as f:
                    f.write(fh.getvalue())
    else:
        logger.info('No files found.')
def upload_recursive(self, path, s3path, extension=None):
    '''
    Recursively upload all files inside the directory, including files in
    directories nested under the path.

    Args:
        path (str): The directory path.
        s3path (str): The s3 path to put the directory.
        extension (str): Filter by extension of the files to be uploaded.
    '''
    num_files = 0
    for root, dirs, files in os.walk(path):
        d = root.replace(path, '')
        d = d[1:] if d.startswith('/') else d
        s3path_to_load = os.path.join(s3path, d)
        self.upload_multiple(root, s3path_to_load, extension=extension)
        # `num_files` was never defined in the original; counting the files
        # walked in each directory is an assumption about the intended metric.
        num_files += len(files)
    logger.info(f"Files uploaded {num_files}")
    return num_files
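# Hedged usage sketch for upload_recursive: push a local directory tree to S3,
# filtering by extension as the test below does. The import path, bucket name,
# and directory paths are assumptions for illustration.
from pipeutils.aws import ClientS3  # import path is an assumption
from pipeutils import logger

client = ClientS3('my-bucket')
uploaded = client.upload_recursive('/data/exports', 'backups/exports',
                                   extension='csv')
logger.info("Uploaded %s files", uploaded)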
def __init__(self, table, source=None, *args, **kwargs):
    """
    Insert into table `{table}` all csv files found in `source`; `source`
    should be accessible through `self.dag_params` if not passed.

    Args:
        table (str): Postgres table name
        source (str): path to the csv file; when None the class will look
            at `dag_params`
    """
    logger.info("Starting to save the CSV to Postgres")
    super(PopulatePostgres, self).__init__(*args, **kwargs)
    self.source = source
    self.table = table
def test_upload_multiple(self):
    directory_path = os.path.join(PATH, 'files', 'multiple')
    s3_directory_path = os.path.join('test', 'multiple')
    logger.info('Files path %s' % directory_path)
    list_files = []
    try:
        self.client.upload_multiple(directory_path, s3_directory_path,
                                    extension='txt')
        bucket = self.client.clientS3.Bucket(self.client.bucket)
        list_test = bucket.objects.all()
        list_files = [obj.key for obj in list_test]
        passed = True
    except Exception as e:
        logger.error(e)
        passed = False
    assert passed
    assert 'test/multiple/file1.txt' in list_files
    assert 'test/multiple/file2.txt' in list_files
def connect(self):
    """
    Creates a connection or returns the current one.

    Return:
        connection
    """
    if self.connection is not None:
        logger.info(" connection: %s " % (self.connection is not None))
        if not self.connection.opened():
            logger.info("connection is closed")
            return self.reconect()
        return self.connection
    try:
        self.connection = connect(**self.options)
    except Exception as e:
        # `Exception.message` does not exist in Python 3; log the exception itself
        logger.critical("Unable to connect to DB: {0}".format(e))
        raise
    return self.connection
def upload(self, path, gpath):
    mime = MimeTypes()
    file_metadata = {
        'name': os.path.basename(path),
    }
    # Look up the parent folder once and reuse the id
    folder_id = self.get_folder_id(gpath)
    if folder_id is not None:
        file_metadata['parents'] = [folder_id]
    media = MediaFileUpload(path,
                            mimetype=mime.guess_type(os.path.basename(path))[0],
                            resumable=True)
    id_file = []
    try:
        file = self.service.files().create(body=file_metadata,
                                           media_body=media,
                                           fields='id').execute()
        id_file = file.get('id')
    except HttpError:
        logger.info('corrupted file')
    return id_file
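# Hedged usage sketch for the Drive client methods above. The client class name
# is not shown in this section, so the helper takes an already-built instance;
# the folder name and file paths are assumptions for illustration.
import os
from pipeutils import logger


def sync_report(drive, local_csv='/tmp/report.csv', folder='reports'):
    folder_id = drive.get_folder_id(folder)
    if folder_id is None:
        logger.info('Folder %s not found, uploading to the root', folder)
    file_id = drive.upload(local_csv, folder)
    drive.download(os.path.basename(local_csv), '/tmp/downloads')
    return file_id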
def __init__(self, path=None):
    logger.info("++ Registry.init")
    self.cache_schemas = {}
    if path is not None:
        self.path = path
    elif os.environ.get('PIPE_SCHEMA_REGISTRY'):
        self.path = os.environ.get('PIPE_SCHEMA_REGISTRY')
    else:
        self.path = os.path.join(HOME, '.pipeutils', 'registry')
    logger.info(f" PIPE_SCHEMA_REGISTRY: {os.environ.get('PIPE_SCHEMA_REGISTRY')}")
    logger.info(f" path: {self.path}")
def close(self):
    """ Closes any connection to Vertica """
    logger.info(' connection closed.')
    return self.connection.close()
import unittest
import os
import logging
import json

from pipeutils.avro import Registry, SchemaVersionNotFound, SchemaNotFound
from pipeutils import logger
from avro import schema

logger.setLevel(logging.DEBUG)

path = os.path.dirname(os.path.realpath(__file__))
path_configs = os.path.join(path, 'registry')
logger.info(f"path_configs: {path_configs}")

_schema = json.dumps({
    "type": "record",
    "name": "X",
    "fields": [{
        "name": "y",
        "type": {
            "type": "record",
            "name": "Y",
            "fields": [{
                "name": "Z",
                "type": "X"
            }]
        }
    }]
})
def listfiles(self):
    # `results` was undefined in the original; fetching the file list here is
    # an assumption about the intended Drive API call.
    results = self.service.files().list(
        pageSize=100,
        fields="nextPageToken, files(id, name)").execute()
    items = results.get('files', [])
    if not items:
        logger.info('No files found.')
        return None
    else:
        return items
def test_get_schema(self):
    register = Registry(path_configs)
    r_schema = register.get(name, version)
    logger.info(r_schema)
    self.assertIn('name', r_schema.to_json())
    self.assertEqual('test', r_schema.to_json()['name'])