def getZscore(word, user, ngramTable, schema, distTable = ''):
    """Return the z-score of `word` for group `user` in schema.ngramTable.

    The group's group_norm for the feature is looked up, then standardized
    with the (mean, std) pair supplied by getMeanAndStd().

    Returns 0 when the query string cannot be built (UnicodeEncodeError),
    when the lookup yields nothing, or when the standard deviation is 0.
    """
    (_conn, cursor, _dict_cursor) = mm.dbConnect(schema)

    # Double single quotes so the word can be embedded in a SQL literal.
    # The escaped form is also what gets handed to getMeanAndStd() below.
    word = word.replace('\'', '\'\'')
    try:
        query = 'SELECT group_norm FROM {}.{} where group_id = \'{}\' and feat = \'{}\''.format(schema, ngramTable, user, word)
    except UnicodeEncodeError:
        return 0

    value = mm.executeGetList(schema, cursor, query)
    if not value:
        return 0

    # Peel off up to two tuple layers (rows -> row -> scalar).
    for _ in range(2):
        if isinstance(value, tuple):
            value = value[0]

    (mean, std) = getMeanAndStd(word, ngramTable = ngramTable, schema = schema, distTable = distTable)
    if std == 0:
        return 0
    # Adding 0.0 forces true division even when std is an integer.
    return (value - mean)/(std + 0.0)
def getFeatWithLimit(schema, table, group='', amount=50, orderBy='group_norm', desc=True):
    """Return the first `amount` (feat, group_norm) rows of schema.table.

    group   -- when non-empty, restrict the rows to this group_id.
    amount  -- maximum number of rows; a value <= 0 means no LIMIT clause.
    orderBy -- column used for sorting.
    desc    -- sort descending when true (the default), ascending otherwise.

    The result is whatever mm.executeGetList() returns for the query.
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

    select_group = 'where group_id = \'{}\''.format(group) if group != '' else ''
    limit = ' LIMIT {}'.format(int(amount)) if amount > 0 else ''
    # The direction used to be hard-coded to DESC, which silently ignored
    # the `desc` argument.
    direction = 'DESC' if desc else 'ASC'

    query = 'SELECT feat, group_norm FROM {}.{} {} ORDER BY {} {}{}'.format(
        schema, table, select_group, orderBy, direction, limit)
    return mm.executeGetList(schema, dbCursor, query)
def updateZscore(schema, ngramTable, user = '', use_feat_table = False, distTable = ''):
    """Fill the z column of schema.ngramTable with z-scores.

    user           -- when non-empty, only this group_id is updated;
                      otherwise every group returned by getUsers() is.
    use_feat_table -- when true, getZscore() is called without a distTable,
                      so mean/std are derived from ngramTable itself.
    distTable      -- table of precomputed mean/std, forwarded to
                      getZscore() when use_feat_table is false.

    One UPDATE statement is issued per (group, ngram) pair; every 1000th
    statement is printed as a progress indicator.
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

    if user != '':
        users = [user]
    else:
        users = getUsers(schema, ngramTable)

    # The feature list does not depend on the group (the loop only writes
    # the z column), so fetch it once instead of once per group.
    ngrams = [row[0] for row in getNgrams(ngramTable, schema)]
    # getZscore()'s distTable default is '', so this matches omitting it.
    effective_dist_table = '' if use_feat_table else distTable

    counter = 0
    for group_id in users:
        for ngram in ngrams:
            z = getZscore(ngram, group_id, ngramTable, schema, distTable = effective_dist_table)
            # Double single quotes so the ngram can sit inside a SQL literal.
            escaped = ngram.replace('\'', '\'\'')
            try:
                query = "UPDATE {}.{} SET z = {} where group_id = \'{}\' and feat=\'{}\'".format(schema, ngramTable, z, group_id, escaped)
            except UnicodeEncodeError:
                # Formatting failed on non-ASCII text: store z = 0 for the
                # UTF-8 encoded feature instead.
                query = "UPDATE {}.{} SET z = 0 where group_id = \'{}\' and feat=\'{}\'".format(schema, ngramTable, group_id, escaped.encode('utf-8'))
            if counter % 1000 == 0:
                print(query)
            mm.executeGetList(schema, dbCursor, query)
            counter += 1
def getFeatValueAndZ(user, schema, ngramTable, min_value = 5, ordered = True, z_threshold = 0):
    """Return the (feat, value, z) rows of schema.ngramTable for one user.

    Only rows with value >= min_value and z > z_threshold are selected.
    When `ordered` is true the rows are sorted by z, largest first.
    The query is printed before it is executed.
    """
    (_conn, cursor, _dict_cursor) = mm.dbConnect(schema)

    z_filter = " AND z > {}".format(z_threshold)
    ordering = " ORDER BY z DESC" if ordered else ""

    query = 'SELECT feat, value, z FROM {}.{} WHERE group_id = \'{}\' and value >= {}{}{};'.format(schema, ngramTable, user, min_value, z_filter, ordering)
    print(query)
    return mm.executeGetList(schema, cursor, query)
def getUniqueNgrams(schema, ngramTable, user = '', max = -1):
    """Return (feat, group_norm) rows whose z-score is 0, largest group_norm first.

    user -- when non-empty, only rows of this group_id are returned.
    max  -- maximum number of rows; -1 (the default) means no LIMIT clause.

    The result is whatever mm.executeGetList() returns for the query.
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

    # NOTE(review): the literal that built this filter was corrupted in the
    # source and did not parse. It is reconstructed here as an extra
    # "AND group_id = ..." condition, because the query template appends it
    # directly after "WHERE z = 0" and the other queries in this file
    # identify a user by group_id -- confirm against version control.
    if user != '':
        select_user = ' AND group_id = \'{}\''.format(user)
    else:
        select_user = ''

    if max != -1:
        limit = ' LIMIT {}'.format(max)
    else:
        limit = ''

    query = 'SELECT feat, group_norm FROM {}.{} WHERE z = 0{} ORDER BY group_norm DESC{}'.format(schema, ngramTable, select_user, limit)
    return mm.executeGetList(schema, dbCursor, query)
def getOneGram(schema, ngramTable):
    """Return one (feat, count) row per feature of schema.ngramTable.

    `count` is the sum of the value column over all rows of that feature.
    The query is printed before it is executed.
    """
    (_conn, cursor, _dict_cursor) = mm.dbConnect(schema)
    totals_query = "SELECT feat, sum(value) as count FROM {}.{} group by feat".format(schema, ngramTable)
    print(totals_query)
    rows = mm.executeGetList(schema, cursor, totals_query)
    return rows
def createZColumn(schema, ngramTable):
    """Add a DOUBLE column named z to schema.ngramTable."""
    (_conn, cursor, _dict_cursor) = mm.dbConnect(schema)
    alter_statement = "ALTER TABLE {}.{} ADD COLUMN z DOUBLE;".format(schema, ngramTable)
    mm.executeGetList(schema, cursor, alter_statement)
def getUsers(schema, ngramTable):
    """Return the distinct group_id values found in schema.ngramTable."""
    (_conn, cursor, _dict_cursor) = mm.dbConnect(schema)
    query = "SELECT distinct(group_id) FROM {}.{};".format(schema, ngramTable)
    rows = mm.executeGetList(schema, cursor, query)
    # Each row holds a single column; unwrap it.
    return [row[0] for row in rows]
def getNgrams(ngramTable, schema):
    """Return one single-column row per distinct feat in schema.ngramTable."""
    (_conn, cursor, _dict_cursor) = mm.dbConnect(schema)
    feat_query = "SELECT feat FROM {}.{} GROUP BY feat".format(schema, ngramTable)
    rows = mm.executeGetList(schema, cursor, feat_query)
    return rows
def getMeanAndStd(word, ngramTable, schema, num_groups = -1, distTable = '', distTableSource = None):
    """Return (mean, std) of group_norm for the feature `word`.

    word       -- feature to look up; it is embedded verbatim in a SQL
                  literal, so the caller must already have escaped quotes.
    ngramTable -- table scanned when no distTable is given.
    distTable  -- when non-empty, mean and std are read from this table
                  instead of being computed; (0, 0) if it has no row for
                  the feature.
    num_groups, distTableSource -- only select which group-count query is
                  issued; see the review note in the body.

    Without a distTable the statistics are computed in two passes over the
    rows of ngramTable that contain the feature, using the sample variance
    (n - 1 denominator). Fewer than two rows yields (0, 0).
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

    # NOTE(review): the group count obtained here never reaches the result;
    # the two-pass branch uses the number of rows that contain the feature
    # and the distTable branch does not read it at all. The queries are
    # kept so database access is unchanged. Decide whether the count should
    # become the denominator or the queries should be dropped.
    if num_groups == -1:
        query = 'SELECT count(distinct(group_id)) FROM {}.{}'.format(schema, ngramTable)
        result = mm.executeGetList(schema, dbCursor, query)
        num_groups = int(result[0][0])
    elif distTableSource is not None:
        #TODO: let user specify distTableSource
        query = 'SELECT count(distinct(group_id)) FROM {}.{}'.format(schema, distTableSource)
        result = mm.executeGetList(schema, dbCursor, query)
        num_groups = int(result[0][0])

    if distTable != '':
        query = "SELECT mean, std FROM {}.{} where feat = \'{}\'".format(schema, distTable, word)
        result = mm.executeGetList(schema, dbCursor, query)
        if not result:
            return (0, 0)
        return (result[0][0], result[0][1])

    ########### two pass algorithm
    query = u'SELECT group_norm FROM {}.{} WHERE feat = \'{}\''.format(schema, ngramTable, word)
    group_norms = mm.executeGetList(schema, dbCursor, query)
    count = len(group_norms)
    # A sample variance needs at least two observations. An empty result
    # previously raised ZeroDivisionError; it now yields (0, 0), the same
    # as a single row.
    if count < 2:
        return (0, 0)

    # First pass: mean.
    total = 0.0
    for group_norm in group_norms:
        total += group_norm[0]
    mean = float(total)/count

    # Second pass: sum of squared deviations from the mean.
    diff_squared_sum = 0.0
    for group_norm in group_norms:
        diff_squared_sum += (group_norm[0] - mean) ** 2

    variance = diff_squared_sum / (count - 1) #sample variance
    std = sqrt(variance)
    ########### algorithm end
    return (mean, std)
labels = dataHolder.keys() counts = dataHolder.values() ro_labels = ro.StrVector(labels) ro_counts = ro.IntVector(counts) if filename: self.grdevices.png(file="%s_hist_cats.png"%(filename), width=self.widths, height=self.heights) self.graphics.par(las=2, mar=[5.1, 7.1, 4.1, 2.1]) ro.r.barplot(ro_counts, main = "Category Histogram, N=%d"%(total_count), beside=True, horiz=True, col='royalblue4', **{"names.arg":ro_labels}) self.grdevices.dev_off() if __name__=="__main__": sp = StatsPlotter() #floatCols = [2, 3, 4] + range(6,14) + [23] + [28] #prefix = '600_' #sp.getFloatColStats("userstats_en", floatCols, "plots/%sdesc"%prefix) #sp.getCategoricalColStats("userstats_en", range(14, 23), "plots/%srelnbins"%prefix) (conn, cur, dcur) = mm.dbConnect('fb20') #sp.getCategoricalColStats('fb20', cur, "userstats_en", range(24, 28), '/data/ml/plots/fb20/age_category') #sp.getCategoricalColStats('fb20', cur, "userstats_en", range(14, 23), '/data/ml/plots/fb20/reln_category') # N=100000 # d1 = list(rand.normal(0,2,N)) # d2 = list(rand.normal(0,1,N)) # d3 = list(rand.normal(0,17,N)) # d_all = {"d1":d1, "d2":d2, "d3":d3} # e1 = list(rand.exponential(2,N)) # e2 = list(rand.exponential(14,N)) # e_all = {'e1':e1, 'e2':e2} # sp.plot2dHist('d1', d1, 'd2', d2) # sp.plot2dHistGeneralized(d_all, e_all, 'plots/samba') mm.warn("descStats.py exits with success :)")