Example #1
 def _get_current_records(records: List[PartitionedDatasetRecord],
                          from_timestamp: Optional[datetime],
                          to_timestamp: datetime):
     records = Query.en(records).where(
         lambda z: z.timestamp <= to_timestamp).order_by_descending(
             lambda z: z.timestamp).to_list()
     first_major = Query.en(records).with_indices().where(
         lambda z: z.value.is_major).select(
             lambda z: z.key).first_or_default()
     if first_major is None:
         raise ValueError(
             f"There are no major revisions before {to_timestamp}")
     records = records[:first_major + 1]
     if from_timestamp is not None:
         records = [r for r in records if r.timestamp >= from_timestamp]
     return records
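A minimal standalone sketch of the same filtering chain on plain integers (the yo_fluq_ds import is an assumption; the snippet above does not show where Query is imported from):

from yo_fluq_ds import Query  # assumed source of Query

numbers = [5, 3, 8, 1, 4]
# where / order_by_descending / to_list, as in _get_current_records
ordered = Query.en(numbers).where(lambda z: z <= 5).order_by_descending(lambda z: z).to_list()
# ordered == [5, 4, 3, 1]

# with_indices pairs each element with its position: z.key is the index, z.value the element
first_even = (Query.en(ordered)
              .with_indices()
              .where(lambda z: z.value % 2 == 0)
              .select(lambda z: z.key)
              .first_or_default())
# first_even == 1 (the position of 4), or None if no element matched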
Example #2
    def test_files(self):
        folder = Path(__file__).parent / "temp"
        if os.path.isdir(folder):  # pragma: no cover
            shutil.rmtree(folder)
        os.makedirs(folder, exist_ok=True)

        plan = self.get_default_plan()

        self.kraken_run(kraken_simple_method,
                        plan,
                        cache_to_folder=folder,
                        special_iterations=[3])

        file = Query.folder(folder).single()
        self.assertEqual('3.kraken.pkl', file.name)

        results = self.kraken_run(
            kraken_simple_method, plan,
            cache_to_folder=folder)  # type: List[kraken.IterationResult]

        for index, result in enumerate(results):
            self.assertEqual(
                kraken.IterationStatus.Skipped if index == 3 else
                kraken.IterationStatus.Success, result.status)
            self.assertIsNone(result.condition)
            self.assertIsNone(result.result)

        loaded_results = Query.en(kraken.Kraken.load(
            folder, None)).order_by(lambda z: z.result).to_list()
        self.assertResult([11, 12, 21, 22], loaded_results)

        shutil.rmtree(folder)
Example #3
 def get_splits(self, query_template):
     if self.custom_shards is not None:
         shards = self.custom_shards
     else:
         shards = list(range(self.shard_count))
     return (Query.en(shards).select(
         lambda z: dict(shard=z, shard_count=self.shard_count)).select(
             lambda z: query_template.format(**z)).to_list())
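The two select calls above first build a parameter dict per shard and then substitute it into the template. A standalone version with a made-up template string (the query_template value below is hypothetical, and yo_fluq_ds is an assumed import):

from yo_fluq_ds import Query  # assumed source of Query

shard_count = 3
query_template = "SELECT * FROM events WHERE MOD(id, {shard_count}) = {shard}"  # hypothetical
queries = (Query.en(range(shard_count))
           .select(lambda z: dict(shard=z, shard_count=shard_count))
           .select(lambda z: query_template.format(**z))
           .to_list())
# queries contains one SQL string per shard, e.g. "... MOD(id, 3) = 0", "... = 1", "... = 2"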
Example #4
 def get_data(self):
     splits = list(self.get_splits(self.query_template))
     query = Query.en(splits)
     if self.with_progress_bar:
         query = query.feed(fluq.with_progress_bar())
     query = query.select_many(
         lambda z: self.downloader_factory(z).get_data())
     return query
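A minimal sketch of the feed / select_many combination used above, on plain lists instead of downloader outputs (yo_fluq_ds is an assumed import; the progress-bar line is optional, exactly as in the snippet):

from yo_fluq_ds import Query, fluq  # assumed source of Query and fluq

splits = [[1, 2], [3], [4, 5, 6]]
query = Query.en(splits)
query = query.feed(fluq.with_progress_bar())   # wraps iteration in a progress bar
flat = query.select_many(lambda z: z).to_list()
# flat == [1, 2, 3, 4, 5, 6]: select_many flattens the per-split results into one stream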
Example #5
 def test_extracting_warnings(self):
     df = Query.en(range(10)).select(lambda z: (z, z)).to_dataframe(columns=['x', 'y'])
     pipe = make_pipeline(
         DataFrameTransformer([ContinousTransformer(['x'])]),
         LinearRegression()
     )
     pipe.fit(df[['x']], df.y)
     pipe.predict(pd.DataFrame(dict(x=[None])))
     warnings = TGWarningStorage.get_report()
     self.assertEqual(1, len(warnings))
     TGWarningStorage.clear()
Example #6
def _get_module_name_and_version(path: Path):
    try:
        with tarfile.open(path, 'r:gz') as file:
            # Pick the shortest-named properties.json, i.e. the one closest to the archive root
            properties = (Query.en(
                file.getmembers()).where(lambda z: z.name.endswith(
                    'properties.json')).order_by(lambda z: len(z.name)).first())
            stream = file.extractfile(properties).read()
            props = json.loads(stream)
        return props['full_module_name'], props['version']
    except Exception:
        # Fall back to parsing the file name, e.g. some_module-1.2.3.tar.gz
        module_name, version = re.match(r'([^-/]+)-(.+)\.tar\.gz$',
                                        path.name).groups()
        return module_name, version
Example #7
 def _read_bundle(path: Path):
     index_frame = pd.read_parquet(path.joinpath('index.parquet'))
     files = (Query
              .folder(path)
              .where(lambda z: z.name!='index.parquet')
              .where(lambda z: z.name.endswith('.parquet'))
              .to_list()
              )
     data_frames = Query.en(files).to_dictionary(lambda z: z.name.split('.')[0], lambda z: pd.read_parquet(z))
     return DataBundle(
         index_frame,
         data_frames
     )
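The to_dictionary call above keys each data frame by the file name without its extension. The same call on plain strings, with the parquet reading stubbed out (yo_fluq_ds is an assumed import):

from yo_fluq_ds import Query  # assumed source of Query

files = ['features.parquet', 'labels.parquet']
frames = Query.en(files).to_dictionary(
    lambda z: z.split('.')[0],    # key selector: file name without extension
    lambda z: z.upper())          # value selector: stands in for pd.read_parquet(z)
# frames == {'features': 'FEATURES.PARQUET', 'labels': 'LABELS.PARQUET'}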
Example #8
    def download_folder(bucket: str, s3_path: str, folder: Path, report=None):
        if os.path.exists(str(folder)):
            shutil.rmtree(str(folder))
        os.makedirs(str(folder))
        s3_resource = boto3.resource('s3')
        bucket_obj = s3_resource.Bucket(bucket)

        keys = [z.key for z in bucket_obj.objects.filter(Prefix=s3_path)]
        keys = Query.en(keys)
        if report == 'tqdm':
            keys = keys.feed(fluq.with_progress_bar())

        for key in keys:
            proper_key = key[len(s3_path):]
            if proper_key.startswith('/'):
                proper_key = proper_key[1:]
            filename = folder.joinpath(proper_key)
            S3Handler.download_file(bucket, key, filename)
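The key-to-filename mapping inside the loop works whether or not the prefix ends with a slash; here it is isolated as plain string handling (the example values are made up):

s3_path = 'datasets/2021'                  # hypothetical prefix
key = 'datasets/2021/part-0001.parquet'    # hypothetical S3 key
proper_key = key[len(s3_path):]            # '/part-0001.parquet'
if proper_key.startswith('/'):
    proper_key = proper_key[1:]            # 'part-0001.parquet'
# folder.joinpath(proper_key) then gives the local target path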
Example #9
 def _get_data_iter(self, start_date: datetime.datetime,
                    end_date: datetime.datetime):
     start_date_str = str(start_date)
     end_date_str = str(end_date)
     logger.info(
         f"Retrieving updated ids from {start_date_str} to {end_date_str}")
     sql = self.id_retrieve_sql_template.format(start_date=start_date_str,
                                                end_date=end_date_str)
     id_src = self.source_factory(sql)
     ids = id_src.get_data().select(lambda z: z['id']).select(str).to_list()
     partitions = Query.en(ids).feed(
         fluq.partition_by_count(self.partition_size)).to_list()
     logger.info(
         f'Retrieving {len(ids)} records, {len(partitions)} partitions')
     for index, partition in enumerate(partitions):
         id_list = ','.join(partition)
         sql = self.download_sql_template.format(id_list=id_list)
         src = self.source_factory(sql)
         for item in src.get_data():
             yield item
         logger.info(f"Partition {index} is processed")
Example #10
    def test_zip_file(self):
        src = Query.en(range(10))
        path = Path(__file__).parent.joinpath('test_cache')
        cache = ZippedFileDataSource(path, buffer_size=4)

        self.assertEqual(False, cache.is_available())
        cache.cache_from(src, 7)
        self.assertEqual(True, cache.is_available())

        self.assertEqual(
            "7", FileIO.read_text(path.__str__() + '.pkllines.zip.length'))

        stored = Query.file.zipped_folder(str(path) + '.pkllines.zip').to_dictionary()
        self.assertEqual(2, len(stored))
        self.assertListEqual([0, 1, 2, 3], stored['0'])
        self.assertListEqual([4, 5, 6], stored['1'])

        result = cache.get_data().to_list()
        self.assertListEqual(list(range(7)), result)

        os.unlink(str(path) + '.pkllines.zip.length')
        os.unlink(str(path) + '.pkllines.zip')
Example #11
    def _postprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.set_index('a')


class MyFeaturizerFailing(DataframeFeaturizer):
    def __init__(self):
        super(MyFeaturizerFailing, self).__init__()

    def _featurize(self, item: Any) -> List[Any]:
        return [item]

    def _validate(self):
        raise ValueError()


data = Query.en(range(5)).select(lambda z: dict(a=z)).to_dataframe()


class BatchJobTestCase(TestCase):
    def test_simple(self):
        mem = InMemoryJobDestination()
        job = FeaturizationJob(
            'test',
            'test',
            MockDfDataSource(data),
            {
                'def': MyFeaturizerSimple()
            },
            mem,
            None,
            None
Example #12
 def get_data(self):
     return Query.en(self._get_data_iter(self.start_date, self.end_date))
Example #13
 def parse(self) -> Queryable[CorpusFragment]:
     return Query.en(self._parse_iter())
Example #14
 def get_data(self):
     if MockUpdateSource.state == 1:
         return Query.en([2, 1, 0, 3, 4, 5]).select(lambda z: dict(id=z))
     else:
         return Query.en([])
Example #15
 def get_data(self, **kwargs):
     return Query.en([1])