Example #1
    def test_files(self):
        folder = Path(__file__).parent / "temp"
        if os.path.isdir(folder):  # pragma: no cover
            shutil.rmtree(folder)
        os.makedirs(folder, exist_ok=True)

        plan = self.get_default_plan()

        self.kraken_run(kraken_simple_method,
                        plan,
                        cache_to_folder=folder,
                        special_iterations=[3])

        file = Query.folder(folder).single()
        self.assertEqual('3.kraken.pkl', file.name)

        results = self.kraken_run(
            kraken_simple_method, plan,
            cache_to_folder=folder)  # type: List[kraken.IterationResult]

        for index, result in enumerate(results):
            self.assertEqual(
                kraken.IterationStatus.Skipped if index == 3 else
                kraken.IterationStatus.Success, result.status)
            self.assertIsNone(result.condition)
            self.assertIsNone(result.result)

        loaded_results = Query.en(kraken.Kraken.load(
            folder, None)).order_by(lambda z: z.result).to_list()
        self.assertResult([11, 12, 21, 22], loaded_results)

        shutil.rmtree(folder)
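
The test relies on `Query.folder(...).single()` returning the only file in the cache folder. A minimal sketch of that contract, assuming `Query` is the yo_fluq_ds fluent query class these snippets appear to use:

from yo_fluq_ds import Query

# single() returns the sole element of a sequence...
assert Query.en([42]).single() == 42
# ...and raises when there are zero or several elements.
try:
    Query.en([1, 2]).single()
except Exception:  # the exact exception type is library-specific
    print('single() rejects sequences with more than one element')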
Example #2
 def get_header_tags(self):
     result = (Query.dict(self._get_header_tags_base()).to_dictionary(
         lambda z: f'header_{z.key}', lambda z: ' / '.join(
             Query.dict(z.value).order_by(lambda x: x.key).select(
                 lambda x: x.value))))
     headers = ' / '.join(
         Query.dict(result).order_by(lambda z: z.key).select(
             lambda z: z.value))
     result['headers'] = headers
     for key, value in self.custom_tags.items():
         result['tag_' + key] = value
     return result
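
A minimal sketch of the `Query.dict` pattern used above, assuming yo_fluq_ds: dictionary items are exposed as pairs with `.key` and `.value` attributes.

from yo_fluq_ds import Query

# Order the values by key and join them, as get_header_tags does.
tags = dict(b='two', a='one')
joined = ' / '.join(
    Query.dict(tags).order_by(lambda z: z.key).select(lambda z: z.value))
print(joined)  # one / two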
Example #3
    def get_resources(self) -> List[str]:
        """
        Returns list of resources' names

        """
        return Query.folder(
            self.resources_location).select(lambda z: z.name).to_list()
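
For illustration, a self-contained sketch of `Query.folder` (assuming yo_fluq_ds), which enumerates a directory as `Path` objects:

import tempfile
from pathlib import Path
from yo_fluq_ds import Query

with tempfile.TemporaryDirectory() as tmp:
    Path(tmp, 'a.txt').touch()
    Path(tmp, 'b.txt').touch()
    names = Query.folder(tmp).select(lambda z: z.name).order_by(lambda z: z).to_list()
    print(names)  # ['a.txt', 'b.txt']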
Example #4
 def _get_current_records(records: List[PartitionedDatasetRecord],
                          from_timestamp: Optional[datetime],
                          to_timestamp: datetime):
     records = Query.en(records).where(
         lambda z: z.timestamp <= to_timestamp).order_by_descending(
             lambda z: z.timestamp).to_list()
     first_major = Query.en(records).with_indices().where(
         lambda z: z.value.is_major).select(
             lambda z: z.key).first_or_default()
     if first_major is None:
         raise ValueError(
             f"There are no major revisions before {to_timestamp}")
     records = records[:first_major + 1]
     if from_timestamp is not None:
         records = [r for r in records if r.timestamp >= from_timestamp]
     return records
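
The `with_indices()` call above pairs each element with its position. A hedged sketch, assuming yo_fluq_ds semantics where `.key` is the index and `.value` the element:

from yo_fluq_ds import Query

# Find the index of the first even number, mirroring the first-major lookup.
first_even_index = (Query.en([3, 5, 8, 9])
                    .with_indices()
                    .where(lambda z: z.value % 2 == 0)
                    .select(lambda z: z.key)
                    .first_or_default())
print(first_even_index)  # 2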
Example #5
    def cache_from(self, src: Queryable, cnt=None) -> None:
        """
        Caches data from a given queryable (for instance, from one produced by DataSource::get_data).
        Args:
            src: Queryable to cache from
            cnt: amount of objects to cache

        Returns:

        """
        q = src
        if cnt is not None:
            q = q.take(cnt)
        full_path = str(self.path)
        os.makedirs(Path(full_path).parent.__str__(), exist_ok=True)
        tmp_path = full_path + '.tmp'
        file_agg = self._get_writing_aggregator(tmp_path)
        pipeline = Query.push().split_pipelines(
            file=file_agg,
            cnt=agg.Count()
        )
        result = q.feed(pipeline)
        if os.path.isfile(full_path):
            os.remove(full_path)
        shutil.move(tmp_path, full_path)
        FileIO.write_text(str(result['cnt']), self.length_path)
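
The `split_pipelines` call fans the same stream into several sinks at once; `feed` then returns a dict keyed by pipeline name. A reduced sketch, reusing the `Query` and `agg` names from the snippet (assumed here to come from yo_fluq_ds):

from yo_fluq_ds import Query, agg

pipeline = Query.push().split_pipelines(total=agg.Count())
result = Query.en(range(5)).feed(pipeline)
print(result['total'])  # 5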
Example #6
 def get_splits(self, query_template):
     if self.custom_shards is not None:
         shards = self.custom_shards
     else:
         shards = list(range(self.shard_count))
     return (Query.en(shards).select(
         lambda z: dict(shard=z, shard_count=self.shard_count)).select(
             lambda z: query_template.format(**z)).to_list())
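
The shard dicts are spliced into the SQL text with `str.format`. The same effect in plain Python, using a hypothetical template:

query_template = 'SELECT * FROM t WHERE id % {shard_count} = {shard}'
splits = [query_template.format(shard=z, shard_count=3) for z in range(3)]
print(splits[0])  # SELECT * FROM t WHERE id % 3 = 0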
Example #7
 def get_data(self):
     splits = list(self.get_splits(self.query_template))
     query = Query.en(splits)
     if self.with_progress_bar:
         query = query.feed(fluq.with_progress_bar())
     query = query.select_many(
         lambda z: self.downloader_factory(z).get_data())
     return query
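
`select_many` flattens one sub-sequence per split into a single stream, which is how get_data merges the per-split downloads. A minimal sketch (assuming yo_fluq_ds):

from yo_fluq_ds import Query

batches = [[1, 2], [3], [4, 5]]
print(Query.en(batches).select_many(lambda z: z).to_list())  # [1, 2, 3, 4, 5]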
Example #8
 def test_extracting_warnings(self):
     df = Query.en(range(10)).select(lambda z: (z, z)).to_dataframe(columns=['x', 'y'])
     pipe = make_pipeline(
         DataFrameTransformer([ContinousTransformer(['x'])]),
         LinearRegression()
     )
     pipe.fit(df[['x']], df.y)
     pipe.predict(pd.DataFrame(dict(x=[None])))
     warnings = TGWarningStorage.get_report()
     self.assertEqual(1, len(warnings))
     TGWarningStorage.clear()
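
`to_dataframe(columns=...)` materializes tuple rows into a named pandas DataFrame, as the first line of the test shows. A standalone sketch (assuming yo_fluq_ds):

from yo_fluq_ds import Query

df = Query.en(range(3)).select(lambda z: (z, z * z)).to_dataframe(columns=['x', 'y'])
print(df.shape)  # (3, 2)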
Example #9
def _get_module_name_and_version(path: Path):
    try:
        with tarfile.open(path, 'r:gz') as file:
            properties = (Query.en(
                file.getmembers()).where(lambda z: z.name.endswith(
                    'properties.json')).order_by(lambda z: len(z.name)).first())
            stream = file.extractfile(properties).read()
        props = json.loads(stream)
        return props['full_module_name'], props['version']
    except Exception:
        # Fall back to parsing the archive name, e.g. `module-1.0.0.tar.gz`
        module_name, version = re.match(r'([^-/]+)-(.+)\.tar\.gz$',
                                        path.name).groups()
        return module_name, version
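
The fallback branch recovers the module name and version from file names shaped like `<name>-<version>.tar.gz`. With a hypothetical file name:

import re
from pathlib import Path

name = Path('mymodule-1.2.3.tar.gz').name  # hypothetical archive name
module_name, version = re.match(r'([^-/]+)-(.+)\.tar\.gz$', name).groups()
print(module_name, version)  # mymodule 1.2.3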
Example #10
 def _read_bundle(path: Path):
     index_frame = pd.read_parquet(path.joinpath('index.parquet'))
     files = (Query
              .folder(path)
              .where(lambda z: z.name!='index.parquet')
              .where(lambda z: z.name.endswith('.parquet'))
              .to_list()
              )
     data_frames = Query.en(files).to_dictionary(lambda z: z.name.split('.')[0], lambda z: pd.read_parquet(z))
     return DataBundle(
         index_frame,
         data_frames
     )
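
`to_dictionary` builds a dict from key and value selectors; `_read_bundle` keys each loaded frame by the file stem. A reduced sketch with plain strings (assuming yo_fluq_ds):

from yo_fluq_ds import Query

d = Query.en(['train.parquet', 'test.parquet']).to_dictionary(
    lambda z: z.split('.')[0], lambda z: z.upper())
print(d)  # {'train': 'TRAIN.PARQUET', 'test': 'TEST.PARQUET'}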
Example #11
 def _load_data(self, data: Union[str, Path, pd.DataFrame]):
     if isinstance(data, pd.DataFrame):
         return data
     if isinstance(data, (str, Path)):
         data = str(data)
         if os.path.isfile(data):
             return pd.read_parquet(data)
         elif os.path.isdir(data):
             dfs = Query.folder(data).select(pd.read_parquet).to_list()
             return pd.concat(dfs, sort=False)
         else:
             raise ValueError(f'Data was `{data}`, but there is neither a file nor a folder at this location')
     else:
         raise ValueError(f"Data was `{data}`, but the format is not supported")
Example #12
    def download_folder(bucket: str, s3_path: str, folder: Path, report=None):
        if os.path.exists(folder.__str__()):
            shutil.rmtree(folder.__str__())
        os.makedirs(folder.__str__())
        s3_resource = boto3.resource('s3')
        bucket_obj = s3_resource.Bucket(bucket)

        keys = [z.key for z in bucket_obj.objects.filter(Prefix=s3_path)]
        keys = Query.en(keys)
        if report == 'tqdm':
            keys = keys.feed(fluq.with_progress_bar())

        for key in keys:
            proper_key = key[len(s3_path):]
            if proper_key.startswith('/'):
                proper_key = proper_key[1:]
            filename = folder.joinpath(proper_key)
            S3Handler.download_file(bucket, key, filename)
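
The loop turns each S3 key into a path relative to the prefix before downloading. The trimming logic in isolation, with hypothetical values:

s3_path = 'datasets/v1'              # hypothetical prefix
key = 'datasets/v1/part-0.parquet'   # hypothetical key
proper_key = key[len(s3_path):]
if proper_key.startswith('/'):
    proper_key = proper_key[1:]
print(proper_key)  # part-0.parquet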
Example #13
    def upload_folder(bucket_name: str, s3_path: str, folder: Path):

        aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID', None)
        aws_secret_access = os.environ.get('AWS_SECRET_ACCESS_KEY', None)
        if aws_access_key_id is not None and aws_secret_access is not None:
            kwargs = dict(aws_access_key_id=aws_access_key_id,
                          aws_secret_access_key=aws_secret_access)
        else:
            kwargs = {}

        s3 = boto3.resource('s3', **kwargs)
        bucket = s3.Bucket(bucket_name)
        bucket.objects.filter(Prefix=s3_path).delete()

        client = boto3.client('s3', **kwargs)
        for file_path in Query.folder(folder):
            file_path_str = file_path.__str__()
            joint_path = os.path.join(s3_path, file_path.name)
            client.upload_file(file_path_str, bucket_name, joint_path)
Example #14
 def _get_data_iter(self, start_date: datetime.datetime,
                    end_date: datetime.datetime):
     start_date_str = str(start_date)
     end_date_str = str(end_date)
     logger.info(
         f"Retrieving updated ids from {start_date_str} to {end_date_str}")
     sql = self.id_retrieve_sql_template.format(start_date=start_date_str,
                                                end_date=end_date_str)
     id_src = self.source_factory(sql)
     ids = id_src.get_data().select(lambda z: z['id']).select(str).to_list()
     partitions = Query.en(ids).feed(
         fluq.partition_by_count(self.partition_size)).to_list()
     logger.info(
         f'Retrieving {len(ids)} records, {len(partitions)} partitions')
     for index, partition in enumerate(partitions):
         id_list = ','.join(partition)
         sql = self.download_sql_template.format(id_list=id_list)
         src = self.source_factory(sql)
         for item in src.get_data():
             yield item
         logger.info(f"Partition {index} is processed")
Example #15
    def send(self):
        aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID', None)
        aws_secret_access = os.environ.get('AWS_SECRET_ACCESS_KEY', None)
        if aws_access_key_id is not None and aws_secret_access is not None:
            kwargs = dict(aws_access_key_id=aws_access_key_id,
                          aws_secret_access_key=aws_secret_access)
        else:
            kwargs = {}

        s3 = boto3.resource('s3', **kwargs)
        bucket = s3.Bucket(self.s3_bucket)

        for name in self.names:
            s3_full_path = os.path.join(self.s3_path, name)
            bucket.objects.filter(Prefix=s3_full_path).delete()

        client = boto3.client('s3', **kwargs)
        for name in self.names:
            for file_path in Query.folder(self.location.joinpath(name)):
                client.upload_file(
                    file_path.__str__(), self.s3_bucket,
                    os.path.join(self.s3_path, name, file_path.name))
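Example #16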
    def test_zip_file(self):
        src = Query.en(range(10))
        path = Path(__file__).parent.joinpath('test_cache')
        cache = ZippedFileDataSource(path, buffer_size=4)

        self.assertEqual(False, cache.is_available())
        cache.cache_from(src, 7)
        self.assertEqual(True, cache.is_available())

        self.assertEqual(
            "7", FileIO.read_text(path.__str__() + '.pkllines.zip.length'))

        stored = Query.file.zipped_folder(path.__str__() +
                                          '.pkllines.zip').to_dictionary()
        self.assertEqual(2, len(stored))
        self.assertListEqual([0, 1, 2, 3], stored['0'])
        self.assertListEqual([4, 5, 6], stored['1'])

        result = cache.get_data().to_list()
        self.assertListEqual(list(range(7)), result)

        os.unlink(path.__str__() + '.pkllines.zip.length')
        os.unlink(path.__str__() + '.pkllines.zip')
Example #17
 def get_splits(self, query_template, **kwargs):
     splits = (Query.loop(
         self.date_from, timedelta(days=1), self.date_to).select(
             lambda z: dict(day=z.day, month=z.month, year=z.year)).select(
                 lambda z: query_template.format(**z)).to_list())
     return splits
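
`Query.loop(start, step, end)` enumerates values from `start` towards `end` in `step` increments, one per day here. A hedged sketch (assuming yo_fluq_ds; whether the end point is included is not verified here):

from datetime import datetime, timedelta
from yo_fluq_ds import Query

days = (Query.loop(datetime(2020, 1, 1), timedelta(days=1), datetime(2020, 1, 4))
        .select(lambda z: z.day)
        .to_list())
print(days)  # e.g. [1, 2, 3], plus 4 if the end is inclusive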
Example #18
 def read_parquet(filename: Path) -> List['PartitionedDatasetRecord']:
     df = pd.read_parquet(filename)
     return Query.df(df).select(
         lambda z: PartitionedDatasetRecord(**z)).to_list()
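
`Query.df` iterates a DataFrame row by row, so each row can be splatted into a constructor as `PartitionedDatasetRecord(**z)`. A sketch (assuming yo_fluq_ds, where each element behaves like a dict of the row's fields):

import pandas as pd
from yo_fluq_ds import Query

df = pd.DataFrame(dict(name=['a', 'b'], size=[1, 2]))
rows = Query.df(df).to_list()
print(rows[0])  # a dict-like view of the first row, e.g. name='a', size=1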
Example #19
 def get_data(self):
     if MockUpdateSource.state == 1:
         return Query.en([2, 1, 0, 3, 4, 5]).select(lambda z: dict(id=z))
     else:
         return Query.en([])
Example #20
def make_package(
        task: PackagingTask,
        dst_location: Optional[Union[Path, str]] = None) -> PackageInfo:
    """
    Creates the package out of the :class:``PackagingTask``, and returns :class:``PackagingInfo``` describing this package
    """
    if dst_location is None:
        dst_location = Loc.temp_path.joinpath('release/package')
    elif isinstance(dst_location, str):
        dst_location = Path(dst_location)
    elif not isinstance(dst_location, Path):
        raise ValueError(
            f'dst_location was {dst_location}, while str or Path is expected')
    if not os.path.isdir(dst_location):
        os.makedirs(dst_location, exist_ok=True)

    root = Loc.tg_path  # type:Path
    release = Loc.temp_path.joinpath('release/package_tmp')  # type:Path
    try:
        shutil.rmtree(release.__str__())
    except FileNotFoundError:
        pass
    os.makedirs(release.__str__())

    full_module_name = _full_module_name(task.name, task.version)
    lib = release.joinpath(full_module_name)

    shutil.copytree(root.__str__(), lib.joinpath(Loc.tg_name).__str__())

    resources = lib.joinpath('resources')  # type: Path
    os.makedirs(resources.__str__())

    props = dict(
        module_name=task.name,
        version=task.version,
        full_module_name=full_module_name,
        dependencies=','.join(f"'{z}'" for dep_list in task.dependencies
                              for z in dep_list.dependencies),
        tg_name=Loc.tg_name,
        full_tg_name=full_module_name + '.' + Loc.tg_name,
    )

    for key, value in task.payload.items():
        FileIO.write_pickle(value, resources.joinpath(key))

    FileIO.write_text(_MANIFEST_TEMPLATE.format(**props),
                      release.joinpath('MANIFEST.in'))
    FileIO.write_text(_SETUP_TEMPLATE.format(**props),
                      release.joinpath('setup.py'))
    FileIO.write_json(props, release.joinpath('properties.json'))

    FileIO.write_text(_INIT_TEMPLATE.format(**props),
                      lib.joinpath('__init__.py'))

    pwd = os.getcwd()
    os.chdir(release.__str__())

    subprocess.call([sys.executable, 'setup.py', 'sdist'])

    os.chdir(pwd)

    file = Query.folder(release.joinpath('dist')).single()

    dst_location = dst_location.joinpath(
        f'{full_module_name}-{task.version}.tar.gz')

    shutil.copy(file.__str__(), dst_location.__str__())
    shutil.rmtree(release.__str__())
    return PackageInfo(task, full_module_name, dst_location)
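Example #21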
 def upload_folder(self, local_path, remote_path):
     for file in Query.folder(local_path,'**/*').where(lambda z: z.is_file()):
         inner = str(file)[len(str(local_path)):]
         if inner.startswith('/'):
             inner = inner[1:]
         self.upload_file(os.path.join(local_path, inner), os.path.join(remote_path, inner))
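
`Query.folder(path, '**/*')` walks the tree recursively; filtering with `is_file()` drops the intermediate directories, as upload_folder does. A self-contained sketch (assuming yo_fluq_ds):

import tempfile
from pathlib import Path
from yo_fluq_ds import Query

with tempfile.TemporaryDirectory() as tmp:
    Path(tmp, 'sub').mkdir()
    Path(tmp, 'sub', 'f.txt').touch()
    files = Query.folder(tmp, '**/*').where(lambda z: z.is_file()).to_list()
    print([f.name for f in files])  # ['f.txt']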
Example #22
 def get_data(self) -> Queryable:
     return Query.df(self.df)
Example #23
 def parse(self) -> Queryable[CorpusFragment]:
     return Query.en(self._parse_iter())
Example #24
    def _postprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.set_index('a')


class MyFeaturizerFailing(DataframeFeaturizer):
    def __init__(self):
        super(MyFeaturizerFailing, self).__init__()

    def _featurize(self, item: Any) -> List[Any]:
        return [item]

    def _validate(self):
        raise ValueError()


data = Query.en(range(5)).select(lambda z: dict(a=z)).to_dataframe()


class BatchJobTestCase(TestCase):
    def test_simple(self):
        mem = InMemoryJobDestination()
        job = FeaturizationJob(
            'test',
            'test',
            MockDfDataSource(data),
            {
                'def': MyFeaturizerSimple()
            },
            mem,
            None,
            None
Example #25
 def get_data(self):
     return Query.en(self._get_data_iter(self.start_date, self.end_date))
Example #26
 def get_data(self, **kwargs):
     return Query.en([1])