def test_files(self):
    folder = Path(__file__).parent / "temp"
    if os.path.isdir(folder):  # pragma: no cover
        shutil.rmtree(folder)
    os.makedirs(folder, exist_ok=True)
    plan = self.get_default_plan()
    self.kraken_run(kraken_simple_method, plan,
                    cache_to_folder=folder, special_iterations=[3])
    file = Query.folder(folder).single()
    self.assertEqual('3.kraken.pkl', file.name)
    results = self.kraken_run(
        kraken_simple_method, plan,
        cache_to_folder=folder)  # type: List[kraken.IterationResult]
    for index, result in enumerate(results):
        self.assertEqual(
            kraken.IterationStatus.Skipped if index == 3 else kraken.IterationStatus.Success,
            result.status)
        self.assertIsNone(result.condition)
        self.assertIsNone(result.result)
    loaded_results = Query.en(kraken.Kraken.load(folder, None)).order_by(lambda z: z.result).to_list()
    self.assertResult([11, 12, 21, 22], loaded_results)
    shutil.rmtree(folder)

def get_header_tags(self):
    result = (Query
              .dict(self._get_header_tags_base())
              .to_dictionary(
                  lambda z: f'header_{z.key}',
                  lambda z: ' / '.join(
                      Query.dict(z.value).order_by(lambda x: x.key).select(lambda x: x.value))))
    headers = ' / '.join(
        Query.dict(result).order_by(lambda z: z.key).select(lambda z: z.value))
    result['headers'] = headers
    for key, value in self.custom_tags.items():
        result['tag_' + key] = value
    return result

def get_resources(self) -> List[str]:
    """
    Returns list of resources' names
    """
    return Query.folder(self.resources_location).select(lambda z: z.name).to_list()

def _get_current_records(records: List[PartitionedDatasetRecord],
                         from_timestamp: Optional[datetime],
                         to_timestamp: datetime):
    records = (Query
               .en(records)
               .where(lambda z: z.timestamp <= to_timestamp)
               .order_by_descending(lambda z: z.timestamp)
               .to_list())
    first_major = (Query
                   .en(records)
                   .with_indices()
                   .where(lambda z: z.value.is_major)
                   .select(lambda z: z.key)
                   .first_or_default())
    if first_major is None:
        raise ValueError(f"There are no major revisions before {to_timestamp}")
    records = records[:first_major + 1]
    if from_timestamp is not None:
        records = [r for r in records if r.timestamp >= from_timestamp]
    return records

def cache_from(self, src: Queryable, cnt=None) -> None:
    """
    Caches data from a given queryable (for instance, one produced by DataSource::get_data).

    Args:
        src: Queryable to cache from
        cnt: amount of objects to cache
    """
    q = src
    if cnt is not None:
        q = q.take(cnt)
    full_path = str(self.path)
    os.makedirs(Path(full_path).parent.__str__(), exist_ok=True)
    tmp_path = full_path + '.tmp'
    file_agg = self._get_writing_aggregator(tmp_path)
    pipeline = Query.push().split_pipelines(
        file=file_agg,
        cnt=agg.Count()
    )
    result = q.feed(pipeline)
    if os.path.isfile(full_path):
        os.remove(full_path)
    shutil.move(tmp_path, full_path)
    FileIO.write_text(str(result['cnt']), self.length_path)

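# A minimal usage sketch of cache_from, modelled on test_zip_file below; the concrete
# ZippedFileDataSource, path and buffer_size are illustrative assumptions rather than
# part of cache_from itself:
#
#   src = Query.en(range(10))
#   cache = ZippedFileDataSource(Path('test_cache'), buffer_size=4)
#   cache.cache_from(src, 7)            # persists the first 7 items and writes the length file
#   items = cache.get_data().to_list()  # -> [0, 1, 2, 3, 4, 5, 6]
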
def get_splits(self, query_template):
    if self.custom_shards is not None:
        shards = self.custom_shards
    else:
        shards = list(range(self.shard_count))
    return (Query
            .en(shards)
            .select(lambda z: dict(shard=z, shard_count=self.shard_count))
            .select(lambda z: query_template.format(**z))
            .to_list())

def get_data(self):
    splits = list(self.get_splits(self.query_template))
    query = Query.en(splits)
    if self.with_progress_bar:
        query = query.feed(fluq.with_progress_bar())
    query = query.select_many(lambda z: self.downloader_factory(z).get_data())
    return query

def test_extracting_warnings(self):
    df = Query.en(range(10)).select(lambda z: (z, z)).to_dataframe(columns=['x', 'y'])
    pipe = make_pipeline(
        DataFrameTransformer([ContinousTransformer(['x'])]),
        LinearRegression()
    )
    pipe.fit(df[['x']], df.y)
    pipe.predict(pd.DataFrame(dict(x=[None])))
    warnings = TGWarningStorage.get_report()
    self.assertEqual(1, len(warnings))
    TGWarningStorage.clear()

def _get_module_name_and_version(path: Path):
    try:
        file = tarfile.open(path, 'r:gz')
        properties = (Query
                      .en(file.getmembers())
                      .where(lambda z: z.name.endswith('properties.json'))
                      .order_by(lambda z: len(z.name))
                      .first())
        stream = file.extractfile(properties).read()
        props = json.loads(stream)
        return props['full_module_name'], props['version']
    except:
        module_name, version = re.match(r'([^-/]+)-(.+)\.tar\.gz$', path.name).groups()
        return module_name, version

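# A quick standalone check of the filename fallback above; the file name is made up
# for illustration:
#
#   import re
#   name, version = re.match(r'([^-/]+)-(.+)\.tar\.gz$', 'mymodule-0.1.2.tar.gz').groups()
#   assert (name, version) == ('mymodule', '0.1.2')
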
def _read_bundle(path: Path):
    index_frame = pd.read_parquet(path.joinpath('index.parquet'))
    files = (Query
             .folder(path)
             .where(lambda z: z.name != 'index.parquet')
             .where(lambda z: z.name.endswith('.parquet'))
             .to_list())
    data_frames = Query.en(files).to_dictionary(
        lambda z: z.name.split('.')[0],
        lambda z: pd.read_parquet(z))
    return DataBundle(index_frame, data_frames)

def _load_data(self, data: Union[str, Path, pd.DataFrame]):
    if isinstance(data, pd.DataFrame):
        return data
    if isinstance(data, (str, Path)):
        data = str(data)
        if os.path.isfile(data):
            return pd.read_parquet(data)
        elif os.path.isdir(data):
            dfs = Query.folder(data).select(pd.read_parquet).to_list()
            return pd.concat(dfs, sort=False)
        else:
            raise ValueError(f'Data was `{data}`, but there is neither a file nor a folder at this location')
    else:
        raise ValueError(f"Data was `{data}`, but the format is not supported")

def download_folder(bucket: str, s3_path: str, folder: Path, report=None):
    if os.path.exists(folder.__str__()):
        shutil.rmtree(folder.__str__())
    os.makedirs(folder.__str__())
    s3_resource = boto3.resource('s3')
    bucket_obj = s3_resource.Bucket(bucket)
    keys = [z.key for z in bucket_obj.objects.filter(Prefix=s3_path)]
    keys = Query.en(keys)
    if report == 'tqdm':
        keys = keys.feed(fluq.with_progress_bar())
    for key in keys:
        proper_key = key[len(s3_path):]
        if proper_key.startswith('/'):
            proper_key = proper_key[1:]
        filename = folder.joinpath(proper_key)
        S3Handler.download_file(bucket, key, filename)

def upload_folder(bucket_name: str, s3_path: str, folder: Path):
    aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID', None)
    aws_secret_access = os.environ.get('AWS_SECRET_ACCESS_KEY', None)
    if aws_access_key_id is not None and aws_secret_access is not None:
        kwargs = dict(aws_access_key_id=aws_access_key_id,
                      aws_secret_access_key=aws_secret_access)
    else:
        kwargs = {}
    s3 = boto3.resource('s3', **kwargs)
    bucket = s3.Bucket(bucket_name)
    bucket.objects.filter(Prefix=s3_path).delete()
    client = boto3.client('s3', **kwargs)
    for file_path in Query.folder(folder):
        file_path_str = file_path.__str__()
        joint_path = os.path.join(s3_path, file_path.name)
        client.upload_file(file_path_str, bucket_name, joint_path)

def _get_data_iter(self, start_date: datetime.datetime, end_date: datetime.datetime):
    start_date_str = str(start_date)
    end_date_str = str(end_date)
    logger.info(f"Retrieving updated ids from {start_date_str} to {end_date_str}")
    sql = self.id_retrieve_sql_template.format(start_date=start_date_str,
                                               end_date=end_date_str)
    id_src = self.source_factory(sql)
    ids = id_src.get_data().select(lambda z: z['id']).select(str).to_list()
    partitions = Query.en(ids).feed(
        fluq.partition_by_count(self.partition_size)).to_list()
    logger.info(f'Retrieving {len(ids)} records, {len(partitions)} partitions')
    for index, partition in enumerate(partitions):
        id_list = ','.join(partition)
        sql = self.download_sql_template.format(id_list=id_list)
        src = self.source_factory(sql)
        for item in src.get_data():
            yield item
        logger.info(f"Partition {index} is processed")

def send(self):
    aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID', None)
    aws_secret_access = os.environ.get('AWS_SECRET_ACCESS_KEY', None)
    if aws_access_key_id is not None and aws_secret_access is not None:
        kwargs = dict(aws_access_key_id=aws_access_key_id,
                      aws_secret_access_key=aws_secret_access)
    else:
        kwargs = {}
    s3 = boto3.resource('s3', **kwargs)
    bucket = s3.Bucket(self.s3_bucket)
    for name in self.names:
        s3_full_path = os.path.join(self.s3_path, name)
        bucket.objects.filter(Prefix=s3_full_path).delete()
    client = boto3.client('s3', **kwargs)
    for name in self.names:
        for file_path in Query.folder(self.location.joinpath(name)):
            client.upload_file(
                file_path.__str__(),
                self.s3_bucket,
                os.path.join(self.s3_path, name, file_path.name))

def test_zip_file(self):
    src = Query.en(range(10))
    path = Path(__file__).parent.joinpath('test_cache')
    cache = ZippedFileDataSource(path, buffer_size=4)
    self.assertEqual(False, cache.is_available())
    cache.cache_from(src, 7)
    self.assertEqual(True, cache.is_available())
    self.assertEqual(
        "7", FileIO.read_text(path.__str__() + '.pkllines.zip.length'))
    stored = Query.file.zipped_folder(path.__str__() + '.pkllines.zip').to_dictionary()
    self.assertEqual(2, len(stored))
    self.assertListEqual([0, 1, 2, 3], stored['0'])
    self.assertListEqual([4, 5, 6], stored['1'])
    result = cache.get_data().to_list()
    self.assertListEqual(list(range(7)), result)
    os.unlink(path.__str__() + '.pkllines.zip.length')
    os.unlink(path.__str__() + '.pkllines.zip')

def get_splits(self, query_template, **kwargs):
    splits = (Query
              .loop(self.date_from, timedelta(days=1), self.date_to)
              .select(lambda z: dict(day=z.day, month=z.month, year=z.year))
              .select(lambda z: query_template.format(**z))
              .to_list())
    return splits

def read_parquet(filename: Path) -> List['PartitionedDatasetRecord']:
    df = pd.read_parquet(filename)
    return Query.df(df).select(lambda z: PartitionedDatasetRecord(**z)).to_list()

def get_data(self):
    if MockUpdateSource.state == 1:
        return Query.en([2, 1, 0, 3, 4, 5]).select(lambda z: dict(id=z))
    else:
        return Query.en([])

def make_package(
        task: PackagingTask,
        dst_location: Optional[Union[Path, str]] = None) -> PackageInfo:
    """
    Creates the package out of the :class:`PackagingTask`, and returns
    a :class:`PackageInfo` describing this package
    """
    if dst_location is None:
        dst_location = Loc.temp_path.joinpath('release/package')
    elif isinstance(dst_location, str):
        dst_location = Path(dst_location)
    elif not isinstance(dst_location, Path):
        raise ValueError(
            f'dst_location was {dst_location}, while str or Path is expected')
    if not os.path.isdir(dst_location):
        os.makedirs(dst_location, exist_ok=True)

    root = Loc.tg_path  # type: Path
    release = Loc.temp_path.joinpath('release/package_tmp')  # type: Path
    try:
        shutil.rmtree(release.__str__())
    except:
        pass
    os.makedirs(release.__str__())

    full_module_name = _full_module_name(task.name, task.version)
    lib = release.joinpath(full_module_name)
    shutil.copytree(root.__str__(), lib.joinpath(Loc.tg_name).__str__())
    resources = lib.joinpath('resources')  # type: Path
    os.makedirs(resources.__str__())

    props = dict(
        module_name=task.name,
        version=task.version,
        full_module_name=full_module_name,
        dependencies=','.join(f"'{z}'"
                              for dep_list in task.dependencies
                              for z in dep_list.dependencies),
        tg_name=Loc.tg_name,
        full_tg_name=full_module_name + '.' + Loc.tg_name,
    )

    for key, value in task.payload.items():
        FileIO.write_pickle(value, resources.joinpath(key))

    FileIO.write_text(_MANIFEST_TEMPLATE.format(**props), release.joinpath('MANIFEST.in'))
    FileIO.write_text(_SETUP_TEMPLATE.format(**props), release.joinpath('setup.py'))
    FileIO.write_json(props, release.joinpath('properties.json'))
    FileIO.write_text(_INIT_TEMPLATE.format(**props), lib.joinpath('__init__.py'))

    pwd = os.getcwd()
    os.chdir(release.__str__())
    subprocess.call([sys.executable, 'setup.py', 'sdist'])
    os.chdir(pwd)

    file = Query.folder(release.joinpath('dist')).single()
    dst_location = dst_location.joinpath(f'{full_module_name}-{task.version}.tar.gz')
    shutil.copy(file.__str__(), dst_location.__str__())
    shutil.rmtree(release.__str__())
    return PackageInfo(task, full_module_name, dst_location)

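# A hedged usage sketch for make_package; PackagingTask's constructor signature is not
# shown in this file, so the arguments below are assumptions for illustration only:
#
#   task = PackagingTask(name='my_module', version='0.1.0', payload={}, dependencies=[])
#   info = make_package(task, dst_location='/tmp/packages')
#   # info is a PackageInfo carrying the task, the full module name and the path of the built .tar.gz
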
def upload_folder(self, local_path, remote_path):
    for file in Query.folder(local_path, '**/*').where(lambda z: z.is_file()):
        inner = str(file)[len(str(local_path)):]
        if inner.startswith('/'):
            inner = inner[1:]
        self.upload_file(os.path.join(local_path, inner),
                         os.path.join(remote_path, inner))

def get_data(self) -> Queryable:
    return Query.df(self.df)

def parse(self) -> Queryable[CorpusFragment]:
    return Query.en(self._parse_iter())

def _postprocess(self, df: pd.DataFrame) -> pd.DataFrame:
    return df.set_index('a')


class MyFeaturizerFailing(DataframeFeaturizer):
    def __init__(self):
        super(MyFeaturizerFailing, self).__init__()

    def _featurize(self, item: Any) -> List[Any]:
        return [item]

    def _validate(self):
        raise ValueError()


data = Query.en(range(5)).select(lambda z: dict(a=z)).to_dataframe()


class BatchJobTestCase(TestCase):
    def test_simple(self):
        mem = InMemoryJobDestination()
        job = FeaturizationJob(
            'test',
            'test',
            MockDfDataSource(data),
            {
                'def': MyFeaturizerSimple()
            },
            mem,
            None,
            None

def get_data(self):
    return Query.en(self._get_data_iter(self.start_date, self.end_date))

def get_data(self, **kwargs):
    return Query.en([1])