Example #1
class RandomTable(DataFrameModule):
    def __init__(self, columns, rows=-1, random=np.random.rand, throttle=False, force_valid_ids=False, **kwds):
        super(RandomTable, self).__init__(**kwds)
        self.default_step_size = 1000
        if isinstance(columns, int):
            self.columns = list(range(1, columns + 1))
        elif isinstance(columns, (list, np.ndarray)):
            self.columns = columns
        else:
            raise ProgressiveError("Invalid type for columns")
        cols = len(self.columns)
        self.columns.append(self.UPDATE_COLUMN)
        self.rows = rows
        self.random = random
        if throttle and isinstance(throttle, (int, float)):
            self.throttle = throttle
        else:
            self.throttle = False
        self._df = create_dataframe(self.columns, types=cols * [np.dtype(float)] + [np.dtype(int)])
        if force_valid_ids:
            force_valid_id_columns(self._df)
        self.columns = self._df.columns  # reuse the pandas index structure
        self._buffer = BufferedDataFrame()

    def run_step(self, run_number, step_size, howlong):
        if step_size == 0:  # bug
            logger.error("Received a step_size of 0")
            return self._return_run_step(self.state_ready, steps_run=0, creates=0)
        logger.info("generating %d lines", step_size)
        if self.throttle:
            step_size = np.min([self.throttle, step_size])
        if self.rows >= 0 and (len(self._df) + step_size) > self.rows:
            step_size = self.rows - len(self._df)
            if step_size <= 0:
                raise StopIteration
            logger.info("truncating to %d lines", step_size)

        values = {}
        for c in self.columns[:-1]:
            s = pd.Series(self.random(step_size))
            values[c] = s
        values[self.UPDATE_COLUMN] = pd.Series(step_size * [run_number], dtype=np.dtype(int))
        df = pd.DataFrame(values, columns=self.columns)
        with self.lock:
            self._buffer.append(df)
            self._df = self._buffer.df()
        next_state = self.state_blocked if self.throttle else self.state_ready
        return self._return_run_step(next_state, steps_run=step_size)
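
A minimal standalone sketch of the chunked-generation pattern used in run_step above: build one slice of random float columns plus an integer update column, then append it to a growing frame. The names make_chunk and UPDATE_COLUMN are illustrative, not part of the library; a plain pd.concat stands in for BufferedDataFrame.

import numpy as np
import pandas as pd

UPDATE_COLUMN = '_update'  # placeholder for the module's update-tracking column

def make_chunk(columns, step_size, run_number, random=np.random.rand):
    # one slice: a random float Series per column, plus the run number
    values = {c: pd.Series(random(step_size)) for c in columns}
    values[UPDATE_COLUMN] = pd.Series(step_size * [run_number], dtype=np.dtype(int))
    return pd.DataFrame(values, columns=list(columns) + [UPDATE_COLUMN])

chunks = [make_chunk([1, 2, 3], 1000, run) for run in range(3)]
table = pd.concat(chunks, ignore_index=True)  # plays the role of BufferedDataFrame
print(len(table))  # 3000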
Example #2
 def validate_outputs(self):
     valid = super(MBKMeans, self).validate_inputs()
     if valid:
         opt_slot = self.get_output_slot('labels')
         if opt_slot:
             logger.debug('Maintaining labels')
             self._buffer = BufferedDataFrame()
         else:
             logger.debug('Not maintaining labels')
     return valid
Example #3
 def __init__(self, filename, dtype=np.float64, **kwds):
     self._add_slots(kwds,'output_descriptors',
                     [SlotDescriptor('array', type=csr_matrix, required=False)])
     super(VECLoader, self).__init__(**kwds)
     self._dtype = dtype
     self.default_step_size = kwds.get('chunksize', 100)  # initial guess
     openf=open
     if filename.endswith('.bz2'):
         openf=BZ2File
     elif filename.endswith('.gz') or filename.endswith('.Z'):
         openf=GzipFile
     self.f = openf(filename)
     # When created with a specified chunksize, it returns the parser
     self._rows_read = 0
     self._csr_matrix = None
     self._buffer = BufferedDataFrame()
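
A standalone sketch of the extension-based opener selection above. BZ2File and GzipFile are the standard-library classes the loader imports; the helper name open_vec_file is hypothetical.

from bz2 import BZ2File
from gzip import GzipFile

def open_vec_file(filename):
    # pick a decompressing file object based on the file extension
    if filename.endswith('.bz2'):
        return BZ2File(filename)
    if filename.endswith('.gz') or filename.endswith('.Z'):
        return GzipFile(filename)
    return open(filename)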
Example #4
 def __init__(self, filepath_or_buffer=None, filter=None, force_valid_ids=False, **kwds):
     """HDFLoader(filepath_or_buffer=None, force_valid_ids=False, id=None,scheduler=None,tracer=None,predictor=None,storage=None)
     """
     super(HDFLoader, self).__init__(**kwds)
     assert False, "Not working yet"
     self.default_step_size = kwds.get('chunksize', 1000)  # initial guess
     kwds.setdefault('chunksize', self.default_step_size)
     # Filter out the module keywords from the csv loader keywords
     hdf_kwds = self._filter_kwds(kwds, pd.read_hdf)
     # When called with a specified chunksize, it returns a parser
     self.filepath_or_buffer = filepath_or_buffer
     self.force_valid_ids = force_valid_ids
     self.hdf_kwds = hdf_kwds
     self._rows_read = 0
     if filter is not None and not callable(filter):
         raise ProgressiveError('filter parameter should be callable or None')
     self._filter = filter
     self._buffer = BufferedDataFrame()
Example #5
 def __init__(self, filepath_or_buffer=None, filter=None, force_valid_ids=False, **kwds):
     """CSVLoader(filepath_or_buffer=None, sep=', ', dialect=None, compression='infer', doublequote=True, escapechar=None, quotechar='"', quoting=0, skipinitialspace=False, lineterminator=None, header='infer', index_col=None, names=None, prefix=None, skiprows=None, skipfooter=None, skip_footer=0, na_values=None, na_fvalues=None, true_values=None, false_values=None, delimiter=None, converters=None, dtype=None, usecols=None, engine=None, delim_whitespace=False, as_recarray=False, na_filter=True, compact_ints=False, use_unsigned=False, low_memory=True, buffer_lines=None, warn_bad_lines=True, error_bad_lines=True, keep_default_na=True, thousands=None, comment=None, decimal='.', parse_dates=False, keep_date_col=False, dayfirst=False, date_parser=None, memory_map=False, float_precision=None, nrows=None, chunksize=None, verbose=False, encoding=None, squeeze=False, mangle_dupe_cols=True, tupleize_cols=False, infer_datetime_format=False, skip_blank_lines=True, force_valid_ids=False, id=None,scheduler=None,tracer=None,predictor=None,storage=None,input_descriptors=[],output_descriptors=[])
     """
     self._add_slots(kwds,'input_descriptors',
                     [SlotDescriptor('filenames', type=pd.DataFrame,required=False)])
     super(CSVLoader, self).__init__(**kwds)
     self.default_step_size = kwds.get('chunksize', 1000)  # initial guess
     kwds.setdefault('chunksize', self.default_step_size)
     # Filter out the module keywords from the csv loader keywords
     csv_kwds = self._filter_kwds(kwds, pd.read_csv)
     # When called with a specified chunksize, it returns a parser
     self.filepath_or_buffer = filepath_or_buffer
     self.force_valid_ids = force_valid_ids
     self.parser = None
     self.csv_kwds = csv_kwds
     self._rows_read = 0
     if filter is not None and not callable(filter):
         raise ProgressiveError('filter parameter should be callable or None')
     self._filter = filter
     self._buffer = BufferedDataFrame()
Example #6
 def __init__(self, columns, rows=-1, random=np.random.rand, throttle=False, force_valid_ids=False, **kwds):
     super(RandomTable, self).__init__(**kwds)
     self.default_step_size = 1000
     if isinstance(columns, int):
         self.columns = list(range(1, columns + 1))
     elif isinstance(columns, (list, np.ndarray)):
         self.columns = columns
     else:
         raise ProgressiveError("Invalid type for columns")
     cols = len(self.columns)
     self.columns.append(self.UPDATE_COLUMN)
     self.rows = rows
     self.random = random
     if throttle and isinstance(throttle, (int, float)):
         self.throttle = throttle
     else:
         self.throttle = False
     self._df = create_dataframe(self.columns, types=cols * [np.dtype(float)] + [np.dtype(int)])
     if force_valid_ids:
         force_valid_id_columns(self._df)
     self.columns = self._df.columns  # reuse the pandas index structure
     self._buffer = BufferedDataFrame()
Example #7
class CSVLoader(DataFrameModule):
    def __init__(self, filepath_or_buffer=None, filter=None, force_valid_ids=False, **kwds):
        """CSVLoader(filepath_or_buffer=None, sep=', ', dialect=None, compression='infer', doublequote=True, escapechar=None, quotechar='"', quoting=0, skipinitialspace=False, lineterminator=None, header='infer', index_col=None, names=None, prefix=None, skiprows=None, skipfooter=None, skip_footer=0, na_values=None, na_fvalues=None, true_values=None, false_values=None, delimiter=None, converters=None, dtype=None, usecols=None, engine=None, delim_whitespace=False, as_recarray=False, na_filter=True, compact_ints=False, use_unsigned=False, low_memory=True, buffer_lines=None, warn_bad_lines=True, error_bad_lines=True, keep_default_na=True, thousands=None, comment=None, decimal='.', parse_dates=False, keep_date_col=False, dayfirst=False, date_parser=None, memory_map=False, float_precision=None, nrows=None, chunksize=None, verbose=False, encoding=None, squeeze=False, mangle_dupe_cols=True, tupleize_cols=False, infer_datetime_format=False, skip_blank_lines=True, force_valid_ids=False, id=None,scheduler=None,tracer=None,predictor=None,storage=None,input_descriptors=[],output_descriptors=[])
        """
        self._add_slots(kwds,'input_descriptors',
                        [SlotDescriptor('filenames', type=pd.DataFrame,required=False)])
        super(CSVLoader, self).__init__(**kwds)
        self.default_step_size = kwds.get('chunksize', 1000)  # initial guess
        kwds.setdefault('chunksize', self.default_step_size)
        # Filter out the module keywords from the csv loader keywords
        csv_kwds = self._filter_kwds(kwds, pd.read_csv)
        # When called with a specified chunksize, it returns a parser
        self.filepath_or_buffer = filepath_or_buffer
        self.force_valid_ids = force_valid_ids
        self.parser = None
        self.csv_kwds = csv_kwds
        self._rows_read = 0
        if filter is not None and not callable(filter):
            raise ProgressiveError('filter parameter should be callable or None')
        self._filter = filter
        self._buffer = BufferedDataFrame()

    def rows_read(self):
        return self._rows_read

    def is_ready(self):
        fn = self.get_input_slot('filenames')
        if fn and fn.has_created():
            return True
        return super(CSVLoader, self).is_ready()

    def validate_parser(self, run_number):
        if self.parser is None:
            if self.filepath_or_buffer is not None:
                try:
                    self.parser = pd.read_csv(self.filepath_or_buffer, **self.csv_kwds)
                except IOError as e:
                    logger.error('Cannot open file %s: %s', self.filepath_or_buffer, e)
                    self.parser = None
                    return self.state_terminated
                self.filepath_or_buffer = None
            else:
                fn_slot = self.get_input_slot('filenames')
                if fn_slot is None or fn_slot.output_module is None:
                    return self.state_terminated
                with fn_slot.lock:
                    fn_slot.update(run_number)
                    if fn_slot.has_deleted() or fn_slot.has_updated():
                        raise ProgressiveError('Cannot handle input file changes')
                    df = fn_slot.data()                        
                    while self.parser is None:
                        indices = fn_slot.next_created(1)
                        if indices.stop==indices.start:
                            return self.state_blocked
                        filename = df.at[indices.start, 'filename']
                        try:
                            self.parser = pd.read_csv(filename, **self.csv_kwds)
                        except IOError as e:
                            logger.error('Cannot open file %s: %s', filename, e)
                            self.parser = None
                        # fall through
        return self.state_ready

    def run_step(self,run_number,step_size, howlong):
        if step_size==0: # bug
            logger.error('Received a step_size of 0')
            return self._return_run_step(self.state_ready, steps_run=0, creates=0)
        status = self.validate_parser(run_number)
        if status==self.state_terminated:
            raise StopIteration('no more filenames')
        elif status==self.state_blocked:
            return self._return_run_step(status, steps_run=0, creates=0)
        elif status != self.state_ready:
            logger.error('Invalid state returned by validate_parser: %d', status)
            raise StopIteration('Unexpected situation')
        logger.info('loading %d lines', step_size)
        try:
            with self.lock:
                df = self.parser.read(step_size) # raises StopIteration at EOF
        except StopIteration:
            fn_slot = self.get_input_slot('filenames')
            if fn_slot is None or fn_slot.output_module is None:
                raise
            self.parser = None
            return self._return_run_step(self.state_ready, steps_run=0, creates=0)

        creates = len(df)
        if creates == 0: # should not happen
            logger.error('Received 0 elements')
            raise StopIteration
        if self._filter is not None:
            df = self._filter(df)
        creates = len(df)
        if creates == 0:
            logger.info('frame has been filtered out')
        else:
            self._rows_read += creates
            logger.info('Loaded %d lines', self._rows_read)
            if self.force_valid_ids:
                force_valid_id_columns(df)
            df[self.UPDATE_COLUMN] = run_number
            with self.lock:
                self._buffer.append(df)
                self._df = self._buffer.df()
        return self._return_run_step(self.state_ready, steps_run=creates)
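
A standalone sketch of the chunked-parser pattern CSVLoader relies on: pd.read_csv called with a chunksize returns a TextFileReader whose read(n) yields the next rows, and the loader above expects it to raise StopIteration once the file is exhausted. The file name data.csv is illustrative only.

import pandas as pd

parser = pd.read_csv('data.csv', chunksize=1000)
rows_read = 0
while True:
    try:
        df = parser.read(1000)  # next slice of at most 1000 rows
    except StopIteration:       # end of file, as the loader above assumes
        break
    if len(df) == 0:            # defensive stop in case an empty chunk is returned
        break
    rows_read += len(df)
print('loaded', rows_read, 'rows')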
Example #8
class HDFLoader(DataFrameModule):
    def __init__(self, filepath_or_buffer=None, filter=None, force_valid_ids=False, **kwds):
        """HDFLoader(filepath_or_buffer=None, force_valid_ids=False, id=None,scheduler=None,tracer=None,predictor=None,storage=None)
        """
        super(HDFLoader, self).__init__(**kwds)
        assert False, "Not working yet"
        self.default_step_size = kwds.get('chunksize', 1000)  # initial guess
        kwds.setdefault('chunksize', self.default_step_size)
        # Filter out the module keywords from the csv loader keywords
        hdf_kwds = self._filter_kwds(kwds, pd.read_hdf)
        # When called with a specified chunksize, it returns a parser
        self.filepath_or_buffer = filepath_or_buffer
        self.force_valid_ids = force_valid_ids
        self.hdf_kwds = hdf_kwds
        self._rows_read = 0
        if filter is not None and not callable(filter):
            raise ProgressiveError('filter parameter should be callable or None')
        self._filter = filter
        self._buffer = BufferedDataFrame()

    def rows_read(self):
        return self._rows_read

    def is_ready(self):
        fn = self.get_input_slot('filenames')
        if fn and fn.has_created():
            return True
        return super(HDFLoader, self).is_ready()

    def validate_parser(self, run_number):
        if self.parser is None:
            if self.filepath_or_buffer is not None:
                try:
                    self.parser = pd.read_hdf(self.filepath_or_buffer, **self.hdf_kwds)
                except IOError as e:
                    logger.error('Cannot open file %s: %s', self.filepath_or_buffer, e)
                    self.parser = None
                    return self.state_terminated
                self.filepath_or_buffer = None
        return self.state_ready

    def run_step(self,run_number,step_size, howlong):
        if step_size==0: # bug
            logger.error('Received a step_size of 0')
            return self._return_run_step(self.state_ready, steps_run=0, creates=0)
        status = self.validate_parser(run_number)
        if status==self.state_terminated:
            raise StopIteration()
        elif status != self.state_ready:
            logger.error('Invalid state returned by validate_parser: %d', status)
            raise StopIteration('Unexpected situation')
        logger.info('loading %d lines', step_size)
        df = self.parser.read(step_size) # raises StopIteration at EOF

        creates = len(df)
        if creates == 0: # should not happen
            logger.error('Received 0 elements')
            raise StopIteration
        if self._filter is not None:
            df = self._filter(df)
        creates = len(df)
        if creates == 0:
            logger.info('frame has been filtered out')
        else:
            self._rows_read += creates
            logger.info('Loaded %d lines', self._rows_read)
            if self.force_valid_ids:
                self.force_valid_id_columns(df)
            df[self.UPDATE_COLUMN] = run_number
            with self.lock:
                self._buffer.append(df)
                self._df = self._buffer.df()
        return self._return_run_step(self.state_ready, steps_run=creates)
Example #9
class VECLoader(DataFrameModule):
    pattern = re.compile(r"\(([0-9]+),([-+.0-9]+)\)[ ]*")
    
    def __init__(self, filename, dtype=np.float64, **kwds):
        self._add_slots(kwds,'output_descriptors',
                        [SlotDescriptor('array', type=csr_matrix, required=False)])
        super(VECLoader, self).__init__(**kwds)
        self._dtype = dtype
        self.default_step_size = kwds.get('chunksize', 100)  # initial guess
        openf=open
        if filename.endswith('.bz2'):
            openf=BZ2File
        elif filename.endswith('.gz') or filename.endswith('.Z'):
            openf=GzipFile
        self.f = openf(filename)
        # When created with a specified chunksize, it returns the parser
        self._rows_read = 0
        self._csr_matrix = None
        self._buffer = BufferedDataFrame()

    def rows_read(self):
        return self._rows_read

    def toarray(self):
        if self._csr_matrix is None:
            docs = self.df()['document']
            dv=DictVectorizer()
            #TODO: race condition when using threads, cleanup_run can reset between
            #setting the value here and returning it at the next instruction
            self._csr_matrix = dv.fit_transform(docs)
        return self._csr_matrix

    def cleanup_run(self, run_number):
        self._csr_matrix = None
        super(VECLoader, self).cleanup_run(run_number)

    def get_data(self, name):
        if name=='array':
            return self.toarray()
        return super(VECLoader, self).get_data(name)

    def run_step(self,run_number,step_size, howlong):
        if self.f is None:
            raise StopIteration()
        
        dataset = []
        try:
            while len(dataset) < step_size:
                line = next(self.f)
                line=line.rstrip('\n\r')
                if len(line)==0:
                    continue
                doc = {}
                for match in re.finditer(self.pattern, line):
                    termidx = int(match.group(1))
                    termfrx = self._dtype(match.group(2))
                    doc[termidx] = termfrx
                if len(doc)!=0:
                    dataset.append(doc)
        except StopIteration:
            self.f.close()
            self.f = None

        creates = len(dataset)
        if creates==0:
            raise StopIteration()

        df = pd.DataFrame({'document': dataset,
                           self.UPDATE_COLUMN: run_number})
        
        self._rows_read += creates
        with self.lock:
            self._buffer.append(df)
            self._df = self._buffer.df()
        return self._return_run_step(self.state_ready, steps_run=creates, creates=creates)
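
A small sketch of how the class-level regex above tokenizes one line of a .vec-style file: each "(term_index,frequency)" pair becomes an entry of a sparse document dict, exactly as in run_step. The sample line is made up for illustration.

import re
import numpy as np

pattern = re.compile(r"\(([0-9]+),([-+.0-9]+)\)[ ]*")

line = "(3,0.5) (17,2.0) (42,1.25)"
doc = {int(m.group(1)): np.float64(m.group(2)) for m in pattern.finditer(line)}
print(doc)  # {3: 0.5, 17: 2.0, 42: 1.25}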
Example #10
class MBKMeans(DataFrameModule):
    """
    Mini-batch k-means using the sklearn implementation.
    """
    def __init__(self, n_clusters, columns=None, batch_size=100, tol=0.0, is_input=True, random_state=None,**kwds):
        self._add_slots(kwds, 'input_descriptors',
                        [SlotDescriptor('df', type=pd.DataFrame, required=True)])
        self._add_slots(kwds,'output_descriptors',
                         [SlotDescriptor('labels', type=pd.DataFrame, required=False)])
        super(MBKMeans, self).__init__(**kwds)
        self.mbk = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size,
                                   verbose=True,
                                   tol=tol, random_state=random_state)
        self.columns = columns
        self.n_clusters = n_clusters
        self.default_step_size = 100
        self._buffer = None
        self._labels = None
        self._is_input = is_input

    def reset(self, init='k-means++'):
        print "Reset, init=", init
        self.mbk = MiniBatchKMeans(n_clusters=self.mbk.n_clusters,
                                   batch_size=self.mbk.batch_size,
                                   init=init,
                                   #tol=self._rel_tol,
                                   random_state=self.mbk.random_state)
        dfslot = self.get_input_slot('df')
        dfslot.reset()
        if self._buffer is not None:
            self._buffer.reset()
        self._df = None
        self._labels = None
        self.set_state(self.state_ready)

    def validate_outputs(self):
        valid = super(MBKMeans, self).validate_inputs()
        if valid:
            opt_slot = self.get_output_slot('labels')
            if opt_slot:
                logger.debug('Maintaining labels')
                self._buffer = BufferedDataFrame()
            else:
                logger.debug('Not maintaining labels')
        return valid

    def labels(self):
        return self._labels

    def get_data(self, name):
        if name=='labels':
            return self.labels()
        return super(MBKMeans, self).get_data(name)

    def run_step(self, run_number, step_size, howlong):
        dfslot = self.get_input_slot('df')
        dfslot.update(run_number)

        if dfslot.has_deleted() or dfslot.has_updated():
            logger.debug('has deleted or updated, resetting')
            self.reset()
            dfslot.update(run_number)

        print('dfslot has buffered %d elements'% dfslot.created_length())
        if dfslot.created_length() < self.mbk.n_clusters:
            # Should add more than k items per loop
            return self._return_run_step(self.state_blocked, steps_run=0)
        indices = dfslot.next_created(step_size) # returns a slice
        steps = indices_len(indices)
        if steps==0:
            return self._return_run_step(self.state_blocked, steps_run=0)
        input_df = dfslot.data()
        X = self.filter_columns(input_df, fix_loc(indices)).values
        batch_size = self.mbk.batch_size or 100
        for batch in gen_batches(steps, batch_size):
            self.mbk.partial_fit(X[batch])
            if self._buffer is not None:
                df = pd.DataFrame({'labels': self.mbk.labels_})
                df[self.UPDATE_COLUMN] = run_number
                self._buffer.append(df)

        with self.lock:
            self._df = pd.DataFrame(self.mbk.cluster_centers_, columns=self.columns)
            self._df[self.UPDATE_COLUMN] = run_number
            if self._buffer is not None:
                logger.debug('Setting the labels')
                self._labels = self._buffer.df()
        return self._return_run_step(dfslot.next_state(), steps_run=steps)

    def is_visualization(self):
        return False

    def to_json(self, short=False):
        json = super(MBKMeans, self).to_json(short)
        if short:
            return json
        return self._centers_to_json(json)

    def _centers_to_json(self, json):
        if self._df is not None:
            json['cluster_centers'] = self._df.to_json()
        return json

    def set_centroid(self, c, values):
        try:
            c = int(c)
        except (ValueError, TypeError):
            pass

        centroids = self._df
        if c not in centroids.index:
            raise ProgressiveError('Invalid centroid index %s', c)

        if len(values) != len(self.columns):
            raise ProgressiveError('Expected %s values, received %s', len(self.columns), values)
        run_number = self.scheduler().for_input(self)
        centroids.loc[c, self.columns] = values
        centroids.loc[c, self.UPDATE_COLUMN] = run_number
        self.mbk.cluster_centers_[c] = centroids.loc[c, self.columns]
        return values

    def is_input(self):
        return self._is_input

    def from_input(self, msg):
        logger.info('Received message %s', msg)
        for c in msg:
            self.set_centroid(c, msg[c])
        self.reset(init=self.mbk.cluster_centers_)
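
A minimal standalone sketch of the incremental-clustering pattern used in run_step above: feed MiniBatchKMeans one mini-batch at a time through partial_fit, slicing with sklearn's gen_batches. The synthetic data is illustrative only.

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.utils import gen_batches

X = np.random.rand(1000, 2)
mbk = MiniBatchKMeans(n_clusters=8, batch_size=100, random_state=0)
for batch in gen_batches(len(X), mbk.batch_size):
    mbk.partial_fit(X[batch])       # labels for this batch are then in mbk.labels_
print(mbk.cluster_centers_.shape)   # (8, 2)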