class RandomTable(DataFrameModule):
    def __init__(self, columns, rows=-1, random=np.random.rand,
                 throttle=False, force_valid_ids=False, **kwds):
        super(RandomTable, self).__init__(**kwds)
        self.default_step_size = 1000
        if isinstance(columns, int):
            self.columns = list(range(1, columns+1))
        elif isinstance(columns, (list, np.ndarray)):
            self.columns = columns
        else:
            raise ProgressiveError("Invalid type for columns")
        cols = len(self.columns)
        self.columns.append(self.UPDATE_COLUMN)
        self.rows = rows
        self.random = random
        if throttle and isinstance(throttle, (int, float)):
            self.throttle = throttle
        else:
            self.throttle = False
        self._df = create_dataframe(self.columns,
                                    types=cols*[np.dtype(float)]+[np.dtype(int)])
        if force_valid_ids:
            force_valid_id_columns(self._df)
        self.columns = self._df.columns  # reuse the pandas index structure
        self._buffer = BufferedDataFrame()

    def run_step(self, run_number, step_size, howlong):
        if step_size == 0:  # bug
            logger.error("Received a step_size of 0")
            return self._return_run_step(self.state_ready, steps_run=0, creates=0)
        logger.info("generating %d lines", step_size)
        if self.throttle:
            step_size = np.min([self.throttle, step_size])
        if self.rows >= 0 and (len(self._df)+step_size) > self.rows:
            step_size = self.rows - len(self._df)
            if step_size <= 0:
                raise StopIteration
            logger.info("truncating to %d lines", step_size)
        values = {}
        for c in self.columns[:-1]:
            s = pd.Series(self.random(step_size))
            values[c] = s
        values[self.UPDATE_COLUMN] = pd.Series(step_size*[run_number],
                                               dtype=np.dtype(int))
        df = pd.DataFrame(values, columns=self.columns)
        with self.lock:
            self._buffer.append(df)
            self._df = self._buffer.df()
        next_state = self.state_blocked if self.throttle else self.state_ready
        return self._return_run_step(next_state, steps_run=step_size)
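
# A standalone sketch of the pattern RandomTable implements: each run_step
# call generates at most step_size rows and appends them to a growing buffer.
# Pure pandas/numpy, so the scheduler machinery above is not needed here;
# '_update' is a stand-in for the real UPDATE_COLUMN name.
import numpy as np
import pandas as pd

buf = pd.DataFrame()
rows, step_size, run_number = 10, 4, 0
while len(buf) < rows:
    run_number += 1
    n = min(step_size, rows - len(buf))    # truncate the last chunk
    chunk = pd.DataFrame({1: np.random.rand(n),
                          2: np.random.rand(n),
                          '_update': n*[run_number]})
    buf = pd.concat([buf, chunk], ignore_index=True)
assert len(buf) == rows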
class CSVLoader(DataFrameModule):
    def __init__(self, filepath_or_buffer=None, filter=None, force_valid_ids=False, **kwds):
        """CSVLoader(filepath_or_buffer=None, sep=',', dialect=None,
        compression='infer', doublequote=True, escapechar=None, quotechar='"',
        quoting=0, skipinitialspace=False, lineterminator=None, header='infer',
        index_col=None, names=None, prefix=None, skiprows=None, skipfooter=None,
        skip_footer=0, na_values=None, na_fvalues=None, true_values=None,
        false_values=None, delimiter=None, converters=None, dtype=None,
        usecols=None, engine=None, delim_whitespace=False, as_recarray=False,
        na_filter=True, compact_ints=False, use_unsigned=False, low_memory=True,
        buffer_lines=None, warn_bad_lines=True, error_bad_lines=True,
        keep_default_na=True, thousands=None, comment=None, decimal='.',
        parse_dates=False, keep_date_col=False, dayfirst=False, date_parser=None,
        memory_map=False, float_precision=None, nrows=None, chunksize=None,
        verbose=False, encoding=None, squeeze=False, mangle_dupe_cols=True,
        tupleize_cols=False, infer_datetime_format=False, skip_blank_lines=True,
        force_valid_ids=False, id=None, scheduler=None, tracer=None,
        predictor=None, storage=None, input_descriptors=[], output_descriptors=[])
        """
        self._add_slots(kwds, 'input_descriptors',
                        [SlotDescriptor('filenames', type=pd.DataFrame, required=False)])
        super(CSVLoader, self).__init__(**kwds)
        self.default_step_size = kwds.get('chunksize', 1000)  # initial guess
        kwds.setdefault('chunksize', self.default_step_size)
        # Filter out the module keywords from the csv loader keywords
        csv_kwds = self._filter_kwds(kwds, pd.read_csv)
        # When called with a specified chunksize, read_csv returns a parser
        self.filepath_or_buffer = filepath_or_buffer
        self.force_valid_ids = force_valid_ids
        self.parser = None
        self.csv_kwds = csv_kwds
        self._rows_read = 0
        if filter is not None and not callable(filter):
            raise ProgressiveError('filter parameter should be callable or None')
        self._filter = filter
        self._buffer = BufferedDataFrame()

    def rows_read(self):
        return self._rows_read

    def is_ready(self):
        fn = self.get_input_slot('filenames')
        if fn and fn.has_created():
            return True
        return super(CSVLoader, self).is_ready()

    def validate_parser(self, run_number):
        if self.parser is None:
            if self.filepath_or_buffer is not None:
                try:
                    self.parser = pd.read_csv(self.filepath_or_buffer, **self.csv_kwds)
                except IOError as e:
                    logger.error('Cannot open file %s: %s', self.filepath_or_buffer, e)
                    self.parser = None
                    return self.state_terminated
                self.filepath_or_buffer = None
            else:
                fn_slot = self.get_input_slot('filenames')
                if fn_slot is None or fn_slot.output_module is None:
                    return self.state_terminated
                with fn_slot.lock:
                    fn_slot.update(run_number)
                    if fn_slot.has_deleted() or fn_slot.has_updated():
                        raise ProgressiveError('Cannot handle input file changes')
                    df = fn_slot.data()
                    while self.parser is None:
                        indices = fn_slot.next_created(1)
                        if indices.stop == indices.start:
                            return self.state_blocked
                        filename = df.at[indices.start, 'filename']
                        try:
                            self.parser = pd.read_csv(filename, **self.csv_kwds)
                        except IOError as e:
                            logger.error('Cannot open file %s: %s', filename, e)
                            self.parser = None
                            # fall through and try the next filename
        return self.state_ready

    def run_step(self, run_number, step_size, howlong):
        if step_size == 0:  # bug
            logger.error('Received a step_size of 0')
            return self._return_run_step(self.state_ready, steps_run=0, creates=0)
        status = self.validate_parser(run_number)
        if status == self.state_terminated:
            raise StopIteration('no more filenames')
        elif status == self.state_blocked:
            return self._return_run_step(status, steps_run=0, creates=0)
        elif status != self.state_ready:
            logger.error('Invalid state returned by validate_parser: %d', status)
            raise StopIteration('Unexpected situation')
        logger.info('loading %d lines', step_size)
        try:
            with self.lock:
                df = self.parser.read(step_size)  # raises StopIteration at EOF
        except StopIteration:
            fn_slot = self.get_input_slot('filenames')
            if fn_slot is None or fn_slot.output_module is None:
                raise
            self.parser = None
            return self._return_run_step(self.state_ready, steps_run=0, creates=0)
        creates = len(df)
        if creates == 0:  # should not happen
            logger.error('Received 0 elements')
            raise StopIteration
        if self._filter is not None:
            df = self._filter(df)
        creates = len(df)
        if creates == 0:
            logger.info('frame has been filtered out')
        else:
            self._rows_read += creates
            logger.info('Loaded %d lines', self._rows_read)
            if self.force_valid_ids:
                force_valid_id_columns(df)
            df[self.UPDATE_COLUMN] = run_number
            with self.lock:
                self._buffer.append(df)
                self._df = self._buffer.df()
        return self._return_run_step(self.state_ready, steps_run=creates)
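
# A standalone sketch of the chunked-reading pattern CSVLoader relies on:
# pd.read_csv called with a chunksize returns a parser (TextFileReader)
# whose read(n) yields at most n rows and raises StopIteration at end of
# file, as the comment in run_step above notes.
import io
import pandas as pd

data = io.StringIO("a,b\n" + "\n".join("{0},{1}".format(i, i*i)
                                       for i in range(10)))
parser = pd.read_csv(data, chunksize=4)   # a parser, not a DataFrame
chunks = []
try:
    while True:
        chunks.append(parser.read(4))     # mirrors self.parser.read(step_size)
except StopIteration:
    pass
result = pd.concat(chunks, ignore_index=True)
assert len(result) == 10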
class HDFLoader(DataFrameModule):
    def __init__(self, filepath_or_buffer=None, filter=None, force_valid_ids=False, **kwds):
        """HDFLoader(filepath_or_buffer=None, force_valid_ids=False, id=None,
        scheduler=None, tracer=None, predictor=None, storage=None)
        """
        super(HDFLoader, self).__init__(**kwds)
        assert False, "Not working yet"
        self.default_step_size = kwds.get('chunksize', 1000)  # initial guess
        kwds.setdefault('chunksize', self.default_step_size)
        # Filter out the module keywords from the hdf loader keywords
        hdf_kwds = self._filter_kwds(kwds, pd.read_hdf)
        # When called with a specified chunksize, it returns a parser
        self.filepath_or_buffer = filepath_or_buffer
        self.force_valid_ids = force_valid_ids
        self.hdf_kwds = hdf_kwds
        self.parser = None  # validate_parser expects this attribute
        self._rows_read = 0
        if filter is not None and not callable(filter):
            raise ProgressiveError('filter parameter should be callable or None')
        self._filter = filter
        self._buffer = BufferedDataFrame()

    def rows_read(self):
        return self._rows_read

    def is_ready(self):
        fn = self.get_input_slot('filenames')
        if fn and fn.has_created():
            return True
        return super(HDFLoader, self).is_ready()

    def validate_parser(self, run_number):
        if self.parser is None:
            if self.filepath_or_buffer is not None:
                try:
                    self.parser = pd.read_hdf(self.filepath_or_buffer, **self.hdf_kwds)
                except IOError as e:
                    logger.error('Cannot open file %s: %s', self.filepath_or_buffer, e)
                    self.parser = None
                    return self.state_terminated
                self.filepath_or_buffer = None
        return self.state_ready

    def run_step(self, run_number, step_size, howlong):
        if step_size == 0:  # bug
            logger.error('Received a step_size of 0')
            return self._return_run_step(self.state_ready, steps_run=0, creates=0)
        status = self.validate_parser(run_number)
        if status == self.state_terminated:
            raise StopIteration()
        elif status != self.state_ready:
            logger.error('Invalid state returned by validate_parser: %d', status)
            raise StopIteration('Unexpected situation')
        logger.info('loading %d lines', step_size)
        df = self.parser.read(step_size)  # raises StopIteration at EOF
        creates = len(df)
        if creates == 0:  # should not happen
            logger.error('Received 0 elements')
            raise StopIteration
        if self._filter is not None:
            df = self._filter(df)
        creates = len(df)
        if creates == 0:
            logger.info('frame has been filtered out')
        else:
            self._rows_read += creates
            logger.info('Loaded %d lines', self._rows_read)
            if self.force_valid_ids:
                force_valid_id_columns(df)
            df[self.UPDATE_COLUMN] = run_number
            with self.lock:
                self._buffer.append(df)
                self._df = self._buffer.df()
        return self._return_run_step(self.state_ready, steps_run=creates)
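
# HDFLoader is stubbed out ("Not working yet"): unlike read_csv, pd.read_hdf
# does not hand back a parser with a read(step_size) method. One possible
# progressive-read approach, sketched here and not the module's actual
# implementation, uses the real HDFStore.select(start=, stop=) API on a
# table-format file (requires the `tables` package):
import pandas as pd

pd.DataFrame({'x': range(10)}).to_hdf('demo.h5', key='data', format='table')
with pd.HDFStore('demo.h5', mode='r') as store:
    nrows = store.get_storer('data').nrows
    pos, step_size = 0, 4
    while pos < nrows:
        chunk = store.select('data', start=pos, stop=pos+step_size)
        pos += len(chunk)   # a real loader would buffer each chunk here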
class VECLoader(DataFrameModule):
    pattern = re.compile(r"\(([0-9]+),([-+.0-9]+)\)[ ]*")

    def __init__(self, filename, dtype=np.float64, **kwds):
        self._add_slots(kwds, 'output_descriptors',
                        [SlotDescriptor('array', type=csr_matrix, required=False)])
        super(VECLoader, self).__init__(**kwds)
        self._dtype = dtype
        self.default_step_size = kwds.get('chunksize', 100)  # initial guess
        openf = open
        if filename.endswith('.bz2'):
            openf = BZ2File
        elif filename.endswith('.gz') or filename.endswith('.Z'):
            openf = GzipFile
        self.f = openf(filename)
        # When created with a specified chunksize, it returns the parser
        self._rows_read = 0
        self._csr_matrix = None
        self._buffer = BufferedDataFrame()

    def rows_read(self):
        return self._rows_read

    def toarray(self):
        if self._csr_matrix is None:
            docs = self.df()['document']
            dv = DictVectorizer()
            # TODO: race condition when using threads; cleanup_run can reset
            # the matrix between setting the value here and returning it below
            self._csr_matrix = dv.fit_transform(docs)
        return self._csr_matrix

    def cleanup_run(self, run_number):
        self._csr_matrix = None
        super(VECLoader, self).cleanup_run(run_number)

    def get_data(self, name):
        if name == 'array':
            return self.toarray()
        return super(VECLoader, self).get_data(name)

    def run_step(self, run_number, step_size, howlong):
        if self.f is None:
            raise StopIteration()
        dataset = []
        try:
            while len(dataset) < step_size:
                line = next(self.f)  # was self.f.next(), Python 2 only
                line = line.rstrip('\n\r')
                if len(line) == 0:
                    continue
                doc = {}
                for match in re.finditer(self.pattern, line):
                    termidx = int(match.group(1))
                    termfrx = self._dtype(match.group(2))
                    doc[termidx] = termfrx
                if len(doc) != 0:
                    dataset.append(doc)
        except StopIteration:
            self.f.close()
            self.f = None
        creates = len(dataset)
        if creates == 0:
            raise StopIteration()
        df = pd.DataFrame({'document': dataset,
                           self.UPDATE_COLUMN: run_number})
        self._rows_read += creates
        with self.lock:
            self._buffer.append(df)
            self._df = self._buffer.df()
        return self._return_run_step(self.state_ready,
                                     steps_run=creates, creates=creates)
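
# A runnable sketch of the .vec line format the loader parses: each line is
# a sequence of (term_index,frequency) pairs turned into a dict, which is
# later vectorized into a sparse matrix the same way toarray() does.
import re
import numpy as np
from sklearn.feature_extraction import DictVectorizer

pattern = re.compile(r"\(([0-9]+),([-+.0-9]+)\)[ ]*")
line = "(3,0.5) (17,1.0) (42,-0.25)"
doc = {int(m.group(1)): np.float64(m.group(2))
       for m in re.finditer(pattern, line)}
# doc == {3: 0.5, 17: 1.0, 42: -0.25}
matrix = DictVectorizer().fit_transform([doc])   # 1 x 3 sparse CSR matrix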
class MBKMeans(DataFrameModule):
    """
    Mini-batch k-means using the sklearn implementation.
    """
    def __init__(self, n_clusters, columns=None, batch_size=100, tol=0.0,
                 is_input=True, random_state=None, **kwds):
        self._add_slots(kwds, 'input_descriptors',
                        [SlotDescriptor('df', type=pd.DataFrame, required=True)])
        self._add_slots(kwds, 'output_descriptors',
                        [SlotDescriptor('labels', type=pd.DataFrame, required=False)])
        super(MBKMeans, self).__init__(**kwds)
        self.mbk = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size,
                                   verbose=True, tol=tol, random_state=random_state)
        self.columns = columns
        self.n_clusters = n_clusters
        self.default_step_size = 100
        self._buffer = None
        self._labels = None
        self._is_input = is_input

    def reset(self, init='k-means++'):
        logger.info('Reset, init=%s', init)
        self.mbk = MiniBatchKMeans(n_clusters=self.mbk.n_clusters,
                                   batch_size=self.mbk.batch_size,
                                   init=init,
                                   #tol=self._rel_tol,
                                   random_state=self.mbk.random_state)
        dfslot = self.get_input_slot('df')
        dfslot.reset()
        if self._buffer is not None:
            self._buffer.reset()
        self._df = None
        self._labels = None
        self.set_state(self.state_ready)

    def validate_outputs(self):
        valid = super(MBKMeans, self).validate_outputs()
        if valid:
            opt_slot = self.get_output_slot('labels')
            if opt_slot:
                logger.debug('Maintaining labels')
                self._buffer = BufferedDataFrame()
            else:
                logger.debug('Not maintaining labels')
        return valid

    def labels(self):
        return self._labels

    def get_data(self, name):
        if name == 'labels':
            return self.labels()
        return super(MBKMeans, self).get_data(name)

    def run_step(self, run_number, step_size, howlong):
        dfslot = self.get_input_slot('df')
        dfslot.update(run_number)
        if dfslot.has_deleted() or dfslot.has_updated():
            logger.debug('has deleted or updated, resetting')
            self.reset()
            dfslot.update(run_number)
        logger.debug('dfslot has buffered %d elements', dfslot.created_length())
        if dfslot.created_length() < self.mbk.n_clusters:
            # Should add more than k items per loop
            return self._return_run_step(self.state_blocked, steps_run=0)
        indices = dfslot.next_created(step_size)  # returns a slice
        steps = indices_len(indices)
        if steps == 0:
            return self._return_run_step(self.state_blocked, steps_run=0)
        input_df = dfslot.data()
        X = self.filter_columns(input_df, fix_loc(indices)).values
        batch_size = self.mbk.batch_size or 100
        for batch in gen_batches(steps, batch_size):
            self.mbk.partial_fit(X[batch])
            if self._buffer is not None:
                df = pd.DataFrame({'labels': self.mbk.labels_})
                df[self.UPDATE_COLUMN] = run_number
                self._buffer.append(df)
        with self.lock:
            self._df = pd.DataFrame(self.mbk.cluster_centers_, columns=self.columns)
            self._df[self.UPDATE_COLUMN] = run_number
            if self._buffer is not None:
                logger.debug('Setting the labels')
                self._labels = self._buffer.df()
        return self._return_run_step(dfslot.next_state(), steps_run=steps)

    def is_visualization(self):
        return False

    def to_json(self, short=False):
        json = super(MBKMeans, self).to_json(short)
        if short:
            return json
        return self._centers_to_json(json)

    def _centers_to_json(self, json):
        if self._df is not None:
            json['cluster_centers'] = self._df.to_json()
        return json

    def set_centroid(self, c, values):
        try:
            c = int(c)
        except (ValueError, TypeError):
            pass
        centroids = self._df
        if c not in centroids.index:
            raise ProgressiveError('Unknown centroid: %s', c)
        if len(values) != len(self.columns):
            raise ProgressiveError('Expected %s values, received %s',
                                   len(self.columns), values)
        run_number = self.scheduler().for_input(self)
        centroids.loc[c, self.columns] = values
        centroids.loc[c, self.UPDATE_COLUMN] = run_number
        self.mbk.cluster_centers_[c] = centroids.loc[c, self.columns]
        return values

    def is_input(self):
        return self._is_input

    def from_input(self, msg):
        logger.info('Received message %s', msg)
        for c in msg:
            self.set_centroid(c, msg[c])
        self.reset(init=self.mbk.cluster_centers_)
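
# A standalone sketch of the incremental pattern MBKMeans wraps: feed data
# to MiniBatchKMeans in mini-batches via partial_fit, so clustering keeps
# pace with progressively loaded data instead of waiting for the full set.
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.utils import gen_batches

rng = np.random.RandomState(42)
X = rng.rand(1000, 2)                       # stand-in for one run_step's rows
mbk = MiniBatchKMeans(n_clusters=8, batch_size=100, random_state=42)
for batch in gen_batches(len(X), 100):      # mirrors the run_step loop
    mbk.partial_fit(X[batch])
print(mbk.cluster_centers_.shape)           # (8, 2)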