Example #1
0
class DataSet:
    """Abstract dataset base class whose helpers child classes inherit.

    A dataset holds n datapoints described by d attributes. Child classes
    are expected to override ``add_datapoint`` to fill ``labels``,
    ``features_dict`` and ``stars``; the helpers here validate that data and
    turn it into sparse matrices.
    """

    def __init__(self):
        """Create an empty dataset.

        Fields:
            attributes: names of the attributes to look for in the business
                metadata (d entries).
            labels: label rows for the attributes, n x d; starts as None.
            features_dict: one dict per datapoint mapping feature -> value
                (sparse representation), n entries.
            stars: one list of review star ratings per datapoint, n entries.
            vocabulary: feature -> column index, filled while building the
                sparse data matrix.
            datamatrix: sparse feature matrix, None until built.
            isTfIdf: True once the Tf-Idf transformation has succeeded.
        """
        self.attributes = []
        self.labels = None
        self.features_dict = []
        self.stars = []
        self.vocabulary = {}
        self.datamatrix = None
        self.isTfIdf = False

    def make_tfidf_matrix(self, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):
        """Replace ``self.datamatrix`` with its Tf-Idf transformed version.

        All keyword arguments are forwarded unchanged to ``TfidfTransformer``
        (presumably scikit-learn's -- the import is outside this block).

        This is best-effort: on failure a message is printed, the data matrix
        is left as it was and ``isTfIdf`` stays False.
        """
        try:
            transformer = TfidfTransformer(norm=norm,
                                           use_idf=use_idf,
                                           smooth_idf=smooth_idf,
                                           sublinear_tf=sublinear_tf)
            self.datamatrix = transformer.fit_transform(self.datamatrix)
            self.datamatrix.eliminate_zeros()
            self.isTfIdf = True
        except Exception as err:
            # Catch Exception rather than everything so Ctrl-C / SystemExit
            # still propagate, and report the cause instead of hiding it.
            print('Tf-Idf Transformation failed!!! ({0!r})'.format(err))

    def add_attribute(self, attribute):
        """Register another attribute for the dataset builder to look for.

        Refused (with a printed message) once datapoints have been added,
        i.e. when ``features_dict`` is non-empty. Duplicates are ignored.
        """
        if self.features_dict:
            print('Already created dataset, cannot add more attributes!!!')
            return
        if attribute not in self.attributes:
            self.attributes.append(attribute)

    @staticmethod
    def _row_count(rows):
        """Return the number of datapoints held by ``rows``."""
        if rows is None:
            return 0
        # Sparse matrices refuse len(); anything exposing a non-empty shape
        # reports its row count through shape[0].
        shape = getattr(rows, 'shape', None)
        if shape:
            return shape[0]
        return len(rows)

    def check_data(self):
        """Check that labels, features and stars cover the same datapoints.

        Returns:
            True when the three per-datapoint collections have the same
            number of rows, otherwise prints a message and returns False.
        """
        # The number of attributes (d) is deliberately not compared: it is
        # the width of a label row, not the number of datapoints (n).
        n_labels = self._row_count(self.labels)
        n_features = len(self.features_dict)
        n_stars = len(self.stars)
        if not (n_labels == n_features == n_stars):
            print('Data lengths do not match!')
            return False
        return True

    def __str__(self):
        """Render a header line followed by one line per datapoint.

        Each datapoint line has the form ``labels:stars:features``. Returns
        ``'Bad data!'`` when ``check_data`` fails.
        """
        if not self.check_data():
            return 'Bad data!'
        label_rows = self.labels if self.labels is not None else []
        lines = [str(self.attributes) + ':stars:features\n']
        for label_row, star_row, features in zip(label_rows, self.stars, self.features_dict):
            lines.append(str(label_row) + ':' + str(star_row) + ':' + str(features) + '\n')
        return ''.join(lines)

    def add_datapoint(self, business):
        """Read a business object and add it as a datapoint.

        The representation varies between datasets, so child classes must
        override this method.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('Please override in child class!!!')

    def make_sparse_datamtrix(self, mat_maker=sp.csr_matrix):
        """Build ``self.datamatrix`` from ``self.features_dict``.

        Features are assigned column indices in ``self.vocabulary`` in order
        of first appearance (existing entries are reused). The matrix is
        assembled from CSR-style ``(data, indices, indptr)`` triples, one row
        per datapoint.

        Args:
            mat_maker: callable accepting ``(data, indices, indptr)`` and a
                ``dtype`` keyword; defaults to ``sp.csr_matrix``.
        """
        indptr = [0]
        indices = []
        data = []
        for doc in self.features_dict:
            # items() works on both Python 2 and 3 (iteritems() is 2-only).
            for feature, value in doc.items():
                column = self.vocabulary.setdefault(feature, len(self.vocabulary))
                indices.append(column)
                data.append(value)
            # Row boundary: everything appended so far belongs to this doc.
            indptr.append(len(indices))
        sparse_matrix = mat_maker((data, indices, indptr), dtype=float)
        if DEBUG:
            n_rows, n_cols = sparse_matrix.shape[0], sparse_matrix.shape[1]
            n_cells = n_rows * n_cols
            density = float(sparse_matrix.nnz) / n_cells if n_cells else 0.0
            print('Size of the sparse matrix is: {} x {}'.format(n_rows, n_cols))
            print('Sparsity of data matrix is: {}'.format(density))
            if n_cells < 10000:
                print(self.vocabulary)
                print(sparse_matrix.toarray())
            else:
                # Slice before densifying so only the 100 x 100 corner is
                # materialised (assumes mat_maker's result supports slicing,
                # as csr/csc matrices do).
                print(sparse_matrix[0:100, 0:100].toarray())
        self.datamatrix = sparse_matrix

    def make_sparse_attribute_matrix(self, mat_maker=sp.csr_matrix):
        """Convert ``self.labels`` in place using ``mat_maker``.

        Args:
            mat_maker: callable applied to the current labels; defaults to
                ``sp.csr_matrix``.
        """
        self.labels = mat_maker(self.labels)