def dump_real_series(config):
    """Convert the configured raw input to a delta series and dump it.

    Column ``config.col_name`` of ``config.raw_input`` is loaded, passed
    through ``normalize_series`` and ``normal_to_delta_series``, and written
    with ``series_io.dump_as_np_series``.  The input, the returned dump path
    and the delta series are logged.
    """
    raw = series_io.load_from_df_chunks(config.raw_input, config.col_name)
    normalized = series_transform.normalize_series(config, raw)
    deltas = series_transform.normal_to_delta_series(config, normalized)

    target = tmp_stage_name(config)
    dumped_to = series_io.dump_as_np_series(config, target, deltas)
    logger.info('%r=%r / %r', config.raw_input, dumped_to, deltas)
def dump_as_plain_txt(config):
  """Dump the configured input as plain text at three processing stages.

  The series is dumped as loaded, again after ``normalize_series`` and once
  more after ``normal_to_delta_series``.  Each dump gets its own name from
  ``tmp_stage_name(config)`` and the returned path is logged.
  """
  current = series_io.load_from_df_chunks(config.raw_input, config.col_name)
  # (log format, transformation applied before dumping); None = dump as loaded.
  stages = (
    ('raw series=%r', None),
    ('normal series=%r', series_transform.normalize_series),
    ('delta series=%r', series_transform.normal_to_delta_series),
  )
  for log_format, transform in stages:
    if transform is not None:
      current = transform(config, current)
    written_to = series_io.dump_as_plain_txt(config, tmp_stage_name(config), current)
    logger.info(log_format, written_to)
def calculate_statistics(config, prefix):
  """Compute delta-series statistics for ``prefix + '.gz'`` and dump them.

  The input is loaded (column ``config.col_name``), normalized and converted
  to a delta series.  The delta series is written with
  ``series_io.dump_as_np_series`` and the statistics with
  ``series_io.dump_prob_distribution``; both output names come from
  ``tmp_stage_name`` using the suffixes ``.bin`` and ``.prob``.
  """
  infile = prefix + '.gz'
  # Output names are resolved up front, before any data is loaded.
  series_out = tmp_stage_name(config, prefix + '.bin')
  stats_out = tmp_stage_name(config, prefix + '.prob')

  loaded = series_io.load_from_df_chunks(infile, config.col_name)
  normalized = series_transform.normalize_series(config, loaded)
  deltas = series_transform.normal_to_delta_series(config, normalized)

  stats = series_stats_calc.calc_stats_from_delta_series(config, deltas)
  logger.info('stats=%r', stats)

  series_io.dump_as_np_series(config, series_out, deltas)
  series_io.dump_prob_distribution(config, stats_out, stats)
  logger.info('%s bin=%r, stats=%r', infile, series_out, stats_out)
def dump_as_plain_txt(config):
    """Dump the configured input as plain text at three processing stages.

    The series is dumped as loaded, again after ``normalize_series`` and once
    more after ``normal_to_delta_series``.  Each dump gets its own name from
    ``tmp_stage_name(config)`` and the returned path is logged.
    """
    current = series_io.load_from_df_chunks(config.raw_input, config.col_name)
    # (log format, transformation applied before dumping); None = as loaded.
    stages = (
        ('raw series=%r', None),
        ('normal series=%r', series_transform.normalize_series),
        ('delta series=%r', series_transform.normal_to_delta_series),
    )
    for log_format, transform in stages:
        if transform is not None:
            current = transform(config, current)
        written_to = series_io.dump_as_plain_txt(
            config, tmp_stage_name(config), current)
        logger.info(log_format, written_to)
def calculate_statistics(config, prefix):
    """Compute delta-series statistics for ``prefix + '.gz'`` and dump them.

    The input is loaded (column ``config.col_name``), normalized and
    converted to a delta series.  The delta series is written with
    ``series_io.dump_as_np_series`` and the statistics with
    ``series_io.dump_prob_distribution``; both output names come from
    ``tmp_stage_name`` using the suffixes ``.bin`` and ``.prob``.
    """
    infile = prefix + '.gz'
    # Output names are resolved up front, before any data is loaded.
    series_out = tmp_stage_name(config, prefix + '.bin')
    stats_out = tmp_stage_name(config, prefix + '.prob')

    loaded = series_io.load_from_df_chunks(infile, config.col_name)
    normalized = series_transform.normalize_series(config, loaded)
    deltas = series_transform.normal_to_delta_series(config, normalized)

    stats = series_stats_calc.calc_stats_from_delta_series(config, deltas)
    logger.info('stats=%r', stats)

    series_io.dump_as_np_series(config, series_out, deltas)
    series_io.dump_prob_distribution(config, stats_out, stats)
    logger.info('%s bin=%r, stats=%r', infile, series_out, stats_out)
 def test_stat_calc_on_sample(self):
   """Check the statistics computed from the first sample file.

   The sample is loaded, normalized and converted to a delta series before
   ``calc_stats_from_delta_series`` is applied.  ``stats.prob_dstrb`` entries
   are unpacked as ``(symbol, cumulative, delta)`` triples.
   """
   max_prob = 2 ** self.config.int_len - 1
   loaded = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
   normalized = series_transform.normalize_series(self.config, loaded)
   deltas = series_transform.normal_to_delta_series(self.config, normalized)
   stats = series_stats_calc.calc_stats_from_delta_series(self.config, deltas)
   # Debug aid: logger.info("result=\n%r", stats.prob_dstrb)
   distribution = stats.prob_dstrb
   first_entry, last_entry = distribution[0], distribution[-1]

   # First entry: cumulative value below the maximum and equal to its delta.
   self.assertTrue(first_entry[1] < max_prob)
   self.assertEqual(first_entry[2], first_entry[1])
   # Last entry: cumulative value is exactly the maximum.
   self.assertEqual(last_entry[1], max_prob)

   self.assertTrue(all(0 < cum <= max_prob for _sym, cum, _dlt in distribution))
   self.assertTrue(all(
     0 < dlt <= max_prob and dlt <= cum for _sym, cum, dlt in distribution))

   for histogram in (stats.full_histo, stats.norm_histo):
     self.assertTrue(self.percentiles_ok(histogram.perc))
  def test_normalize_series(self):
    """Check ``normalize_series`` on the first sample file.

    ``sid``, ``count`` and ``start`` of every meta entry must be unchanged,
    every normalized data array must contain a zero and stay at or below
    ``get_lerp_max(config)``, and ``meta.max - meta.min`` must be at least
    ``MIN_SCALE``.
    """
    raw = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    normalized = series_transform.normalize_series(self.config, raw)
    upper_bound = series_transform.get_lerp_max(self.config)

    for before, after in zip(raw.meta, normalized.meta):
      for field in ('sid', 'count', 'start'):
        self.assertEqual(getattr(before, field), getattr(after, field))

    for meta, values in zip(normalized.meta, normalized.data):
      lowest = values.min()
      highest = values.max()
      self.assertTrue(np.any(values == 0))
      self.assertTrue(meta.max - meta.min >= MIN_SCALE)
      self.assertTrue(highest - lowest >= 0, "%r < %r" % (lowest, highest))
      within_bound = np.all(values <= upper_bound)
      self.assertTrue(within_bound, '%r = [%d, %d]' % (meta, lowest, highest))
    def test_dump_as_plain_txt(self):
        """Dump the raw, normalized and delta series as plain text.

        The raw dump is read back and must contain at least ``len(d) + 1``
        lines for every entry ``d`` of ``series.data``.  Each of the three
        dumps must exist on disk afterwards.
        """
        io_config = TestSeriesIO.config
        series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)

        dump_path = series_io.dump_as_plain_txt(
            io_config, tmp_stage_name(io_config), series)
        # Use a unittest assertion instead of a bare ``assert``: the latter
        # is removed under ``python -O`` and would silently stop checking.
        self.assertTrue(os.path.isfile(dump_path), dump_path)
        with my_open(io_config, dump_path, 'r') as fileobj:
            lines = fileobj.readlines()
        expected_lines = sum(len(d) + 1 for d in series.data)
        self.assertGreaterEqual(len(lines), expected_lines)

        # The later stages were previously dumped without any verification;
        # at least make sure each dump produced a file.
        series = series_transform.normalize_series(self.config, series)
        dump_path = series_io.dump_as_plain_txt(
            io_config, tmp_stage_name(io_config), series)
        self.assertTrue(os.path.isfile(dump_path), dump_path)

        series = series_transform.normal_to_delta_series(self.config, series)
        dump_path = series_io.dump_as_plain_txt(
            io_config, tmp_stage_name(io_config), series)
        self.assertTrue(os.path.isfile(dump_path), dump_path)
    def test_normalize_series(self):
        """Check ``normalize_series`` on the first sample file.

        ``sid``, ``count`` and ``start`` of every meta entry must be
        unchanged, every normalized data array must contain a zero and stay
        at or below ``get_lerp_max(config)``, and ``meta.max - meta.min``
        must be at least ``MIN_SCALE``.
        """
        raw = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
        normalized = series_transform.normalize_series(self.config, raw)
        upper_bound = series_transform.get_lerp_max(self.config)

        for before, after in zip(raw.meta, normalized.meta):
            for field in ('sid', 'count', 'start'):
                self.assertEqual(getattr(before, field),
                                 getattr(after, field))

        for meta, values in zip(normalized.meta, normalized.data):
            lowest = values.min()
            highest = values.max()
            self.assertTrue(np.any(values == 0))
            self.assertTrue(meta.max - meta.min >= MIN_SCALE)
            self.assertTrue(highest - lowest >= 0,
                            "%r < %r" % (lowest, highest))
            within_bound = np.all(values <= upper_bound)
            self.assertTrue(within_bound,
                            '%r = [%d, %d]' % (meta, lowest, highest))
  def test_dump_as_plain_txt(self):
    """Dump the raw, normalized and delta series as plain text.

    The raw dump is read back and must contain at least ``len(d) + 1`` lines
    for every entry ``d`` of ``series.data``.  Each of the three dumps must
    exist on disk afterwards.
    """
    io_config = TestSeriesIO.config
    series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)

    dump_path = series_io.dump_as_plain_txt(
      io_config, tmp_stage_name(io_config), series)
    # Use a unittest assertion instead of a bare ``assert``: the latter is
    # removed under ``python -O`` and would silently stop checking.
    self.assertTrue(os.path.isfile(dump_path), dump_path)
    with my_open(io_config, dump_path, 'r') as fileobj:
      lines = fileobj.readlines()
    expected_lines = sum(len(d) + 1 for d in series.data)
    self.assertGreaterEqual(len(lines), expected_lines)

    # The later stages were previously dumped without any verification;
    # at least make sure each dump produced a file.
    series = series_transform.normalize_series(self.config, series)
    dump_path = series_io.dump_as_plain_txt(
      io_config, tmp_stage_name(io_config), series)
    self.assertTrue(os.path.isfile(dump_path), dump_path)

    series = series_transform.normal_to_delta_series(self.config, series)
    dump_path = series_io.dump_as_plain_txt(
      io_config, tmp_stage_name(io_config), series)
    self.assertTrue(os.path.isfile(dump_path), dump_path)
  def test_cycle_transformations(self):
    """Check that raw -> normal -> delta -> raw reproduces the input.

    ``sid``, ``count`` and ``start`` of every meta entry must match, the data
    masks must be equal and the values must agree within ``rtol=1e-3`` and an
    absolute tolerance of ``MIN_SCALE / 2 ** (int_len - 1)``.
    """
    original = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    normalized = series_transform.normalize_series(self.config, original)
    deltas = series_transform.normal_to_delta_series(self.config, normalized)
    restored = series_transform.delta_to_raw_series(self.config, deltas)
    atol = MIN_SCALE / 2 ** (self.config.int_len-1)

    # Debug aid:
    # logger.debug('start : %r\nend : %r', original.data[2], restored.data[2])
    for before, after in zip(original.meta, restored.meta):
      for field in ('sid', 'count', 'start'):
        self.assertEqual(getattr(before, field), getattr(after, field))

    for expected, actual in zip(original.data, restored.data):
      # Relative errors beyond 1e-3 are only collected for the failure message.
      rel_err = ((expected - actual) / actual).compressed()
      outliers = rel_err[(rel_err < -1e-3) | (rel_err > 1e-3)]
      self.assertTrue(np.array_equal(expected.mask, actual.mask))
      close_enough = np.allclose(expected, actual, atol=atol, rtol=1e-3)
      self.assertTrue(close_enough, '\n%r' % outliers)
 def test_stat_calc_on_sample(self):
     """Check the statistics computed from the first sample file.

     The sample is loaded, normalized and converted to a delta series before
     ``calc_stats_from_delta_series`` is applied.  ``stats.prob_dstrb``
     entries are unpacked as ``(symbol, cumulative, delta)`` triples.
     """
     max_prob = 2**self.config.int_len - 1
     loaded = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
     normalized = series_transform.normalize_series(self.config, loaded)
     deltas = series_transform.normal_to_delta_series(self.config, normalized)
     stats = series_stats_calc.calc_stats_from_delta_series(
         self.config, deltas)
     # Debug aid: logger.info("result=\n%r", stats.prob_dstrb)
     distribution = stats.prob_dstrb
     first_entry, last_entry = distribution[0], distribution[-1]

     # First entry: cumulative value below the maximum and equal to its delta.
     self.assertTrue(first_entry[1] < max_prob)
     self.assertEqual(first_entry[2], first_entry[1])
     # Last entry: cumulative value is exactly the maximum.
     self.assertEqual(last_entry[1], max_prob)

     self.assertTrue(
         all(0 < cum <= max_prob for _sym, cum, _dlt in distribution))
     self.assertTrue(
         all(0 < dlt <= max_prob and dlt <= cum
             for _sym, cum, dlt in distribution))

     for histogram in (stats.full_histo, stats.norm_histo):
         self.assertTrue(self.percentiles_ok(histogram.perc))
    def test_cycle_transformations(self):
        """Check that raw -> normal -> delta -> raw reproduces the input.

        ``sid``, ``count`` and ``start`` of every meta entry must match, the
        data masks must be equal and the values must agree within
        ``rtol=1e-3`` and an absolute tolerance of
        ``MIN_SCALE / 2**(int_len - 1)``.
        """
        original = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
        normalized = series_transform.normalize_series(self.config, original)
        deltas = series_transform.normal_to_delta_series(
            self.config, normalized)
        restored = series_transform.delta_to_raw_series(self.config, deltas)
        atol = MIN_SCALE / 2**(self.config.int_len - 1)

        # Debug aid:
        # logger.debug('start : %r\nend : %r',
        #              original.data[2], restored.data[2])
        for before, after in zip(original.meta, restored.meta):
            for field in ('sid', 'count', 'start'):
                self.assertEqual(getattr(before, field),
                                 getattr(after, field))

        for expected, actual in zip(original.data, restored.data):
            # Relative errors beyond 1e-3 are only collected for the
            # failure message.
            rel_err = ((expected - actual) / actual).compressed()
            outliers = rel_err[(rel_err < -1e-3) | (rel_err > 1e-3)]
            self.assertTrue(np.array_equal(expected.mask, actual.mask))
            close_enough = np.allclose(expected, actual,
                                       atol=atol, rtol=1e-3)
            self.assertTrue(close_enough, '\n%r' % outliers)
def dump_real_series(config):
  """Convert the configured raw input to a delta series and dump it.

  Column ``config.col_name`` of ``config.raw_input`` is loaded, passed through
  ``normalize_series`` and ``normal_to_delta_series``, and written with
  ``series_io.dump_as_np_series``.  The input, the returned dump path and the
  delta series are logged.
  """
  raw = series_io.load_from_df_chunks(config.raw_input, config.col_name)
  normalized = series_transform.normalize_series(config, raw)
  deltas = series_transform.normal_to_delta_series(config, normalized)

  target = tmp_stage_name(config)
  dumped_to = series_io.dump_as_np_series(config, target, deltas)
  logger.info('%r=%r / %r', config.raw_input, dumped_to, deltas)