def updateZscore(schema, ngramTable, user = '', use_feat_table = False, distTable = ''):
    """Update the z column of schema.ngramTable with z-scores.

    For every (group, ngram) pair a z-score is obtained from getZscore and
    written back with one UPDATE statement per pair.

    schema         -- database/schema name, also passed to mm.dbConnect
    ngramTable     -- feature table holding group_id, feat and z columns
    user           -- if non-empty, only this group_id is updated; otherwise
                      every group returned by getUsers is processed
    use_feat_table -- when true, getZscore is called without a distribution
                      table (mean/std are derived from ngramTable itself)
    distTable      -- distribution table forwarded to getZscore when
                      use_feat_table is false
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

    if user != '':
        users = [user]
    else:
        users = getUsers(schema, ngramTable)

    # The feature list is the same for every group and the UPDATEs below only
    # touch z, so it is fetched once instead of once per group.
    ngrams = [row[0] for row in getNgrams(ngramTable, schema)]

    # getZscore's own default for distTable is '', which is what the
    # use_feat_table path relies on.
    dist = '' if use_feat_table else distTable

    counter = 0
    for group in users:
        for ngram in ngrams:
            z = getZscore(ngram, group, ngramTable, schema, distTable = dist)
            # Double single quotes so the ngram can sit inside a SQL literal.
            escaped = ngram.replace('\'', '\'\'')
            try:
                query = "UPDATE {}.{} SET z = {} where group_id = \'{}\' and feat=\'{}\'".format(schema, ngramTable, z, group, escaped)
            except UnicodeEncodeError:
                # Formatting a non-ASCII ngram failed: store z = 0 and address
                # the row through its UTF-8 encoded form.
                query = "UPDATE {}.{} SET z = 0 where group_id = \'{}\' and feat=\'{}\'".format(schema, ngramTable, group, escaped.encode('utf-8'))
            # Progress trace: echo every 1000th statement.
            if counter % 1000 == 0:
                print(query)
            mm.executeGetList(schema, dbCursor, query)
            counter += 1
def getCategoricalColStats(self, db, dbCursor, tableName, colsOfNote, filename=None):
    """Count, per categorical column, how many rows have a truthy value, and bar-plot the counts.

    db, dbCursor -- passed straight through to mm.executeGetList
    tableName    -- table whose columns are inspected; must have a user_id column
    colsOfNote   -- positional column indexes; used both to look up names in
                    information_schema and to index each data row
    filename     -- when truthy, a PNG device named "<filename>_hist_cats.png" is opened

    Rows with no truthy value in any of the columns are counted under
    "none specified". Raises Exception if a row has more than one truthy column.
    """
    # Resolve the names of the requested columns.
    sql = "SELECT column_name from information_schema.columns where table_name='%s'"%tableName
    schemaRows = mm.executeGetList(db, dbCursor, sql)
    colNames = [schemaRows[col][0] for col in colsOfNote]

    # Restrict to users that appear in the 1gram feature table.
    sql = "SELECT DISTINCT group_id FROM feat$1gram$messages$user_id$16to16"
    user_ids = [record[0] for record in mm.executeGetList(db, dbCursor, sql)]
    sql_user_ids = ",".join(map(str, user_ids))
    total_count = len(user_ids)

    # One counter per column plus a bucket for rows with nothing set.
    dataHolder = dict((name, 0) for name in colNames)
    dataHolder["none specified"] = 0

    sql = "SELECT * FROM %s WHERE user_id IN (%s)"%(tableName, sql_user_ids)
    for record in mm.executeGetList(db, dbCursor, sql):
        has_specified_value = False
        for name, col in zip(colNames, colsOfNote):
            if not record[col]:
                continue
            dataHolder[name] += 1
            if has_specified_value:
                raise Exception("incorrect assumption; more than one category allowed")
            has_specified_value = True
        if not has_specified_value:
            dataHolder["none specified"] += 1

    # Hand the counts to R and draw a horizontal bar plot.
    ro_labels = ro.StrVector(dataHolder.keys())
    ro_counts = ro.IntVector(dataHolder.values())

    # NOTE(review): the original layout was lost; this assumes only the png()
    # call is conditional on filename and the plot is always drawn - confirm.
    if filename:
        self.grdevices.png(file="%s_hist_cats.png"%(filename), width=self.widths, height=self.heights)
    self.graphics.par(las=2, mar=[5.1, 7.1, 4.1, 2.1])
    ro.r.barplot(ro_counts, main = "Category Histogram, N=%d"%(total_count), beside=True, horiz=True, col='royalblue4', **{"names.arg":ro_labels})
    self.grdevices.dev_off()
def getZscore(word, user, ngramTable, schema, distTable = ''):
    """Return the z-score of word's group_norm for one group.

    word       -- feature (ngram) text; single quotes are doubled here before
                  it is placed in SQL and before it is passed to getMeanAndStd
    user       -- group_id to look up
    ngramTable -- feature table with group_id, feat and group_norm columns
    schema     -- database/schema name
    distTable  -- optional distribution table forwarded to getMeanAndStd

    Returns 0 when the query text cannot be built (UnicodeEncodeError), when
    the group has no row (or a NULL group_norm) for the word, or when the
    standard deviation is 0; otherwise (group_norm - mean) / std as a float.
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)
    word = word.replace('\'', '\'\'')
    try:
        query = 'SELECT group_norm FROM {}.{} where group_id = \'{}\' and feat = \'{}\''.format(schema, ngramTable, user, word)
    except UnicodeEncodeError:
        return 0

    group_norm = mm.executeGetList(schema, dbCursor, query)
    if not group_norm:
        return 0

    # Unwrap rows -> first row -> first column. Both tuples and lists are
    # accepted so the result shape does not depend on the DB driver.
    if isinstance(group_norm, (tuple, list)):
        group_norm = group_norm[0]
    if isinstance(group_norm, (tuple, list)):
        group_norm = group_norm[0]
    if group_norm is None:
        # NULL group_norm: treat like a missing row instead of failing below.
        return 0

    (mean, std) = getMeanAndStd(word, ngramTable = ngramTable, schema = schema, distTable = distTable)
    if std == 0:
        return 0
    # "+ 0.0" forces float division even if std comes back as an integer.
    return (group_norm - mean) / (std + 0.0)
def getFeatWithLimit(schema, table, group='', amount=50, orderBy='group_norm', desc=True):
    """Return the first rows of (feat, group_norm) from schema.table.

    schema  -- database/schema name
    table   -- feature table to read
    group   -- if non-empty, restrict to rows with this group_id
    amount  -- maximum number of rows; values <= 0 disable the LIMIT
    orderBy -- column used for sorting
    desc    -- sort descending when true (the default), ascending otherwise

    Returns whatever mm.executeGetList yields for the query.
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

    if group != '':
        select_group = 'where group_id = \'{}\''.format(group)
    else:
        select_group = ''

    if amount <= 0:
        limit = ''
    else:
        limit = ' LIMIT {}'.format(int(amount))

    # Previously the direction was hard-coded to DESC and `desc` was ignored.
    direction = 'DESC' if desc else 'ASC'

    query = 'SELECT feat, group_norm FROM {}.{} {} ORDER BY {} {}{}'.format(
        schema, table, select_group, orderBy, direction, limit)
    return mm.executeGetList(schema, dbCursor, query)
def getFloatColStats(self, db, dbCursor, tableName, colsOfNote, filename=None):
    """Collect the values of selected columns and pass them to self.plotDescStats.

    db, dbCursor -- passed straight through to mm.executeGetList
    tableName    -- table to read; must have a user_id column
    colsOfNote   -- positional column indexes; used both to look up names in
                    information_schema and to index each data row
    filename     -- forwarded unchanged to self.plotDescStats

    Returns the result of self.plotDescStats(dataDict, total_count, filename),
    where dataDict maps each column name to its list of values.
    """
    # Resolve the names of the requested columns.
    sql = "SELECT column_name from information_schema.columns where table_name='%s'"%tableName
    schemaRows = mm.executeGetList(db, dbCursor, sql)
    colNames = [schemaRows[col][0] for col in colsOfNote]

    # Restrict to users that appear in the 1gram feature table.
    sql = "SELECT DISTINCT group_id FROM feat$1gram$messages$user_id$16to16"
    user_ids = [record[0] for record in mm.executeGetList(db, dbCursor, sql)]
    sql_user_ids = ",".join(map(str, user_ids))
    total_count = len(user_ids)

    # One value list per column, sized by the number of users; slots that
    # never receive a row stay None.
    dataHolder = [[None] * total_count for _ in colsOfNote]

    sql = "SELECT * FROM %s WHERE user_id IN (%s)"%(tableName, sql_user_ids)
    for ii, record in enumerate(mm.executeGetList(db, dbCursor, sql)):
        for jj, col in enumerate(colsOfNote):
            dataHolder[jj][ii] = record[col]

    # Pair every column name with its collected values.
    dataDict = dict(zip(colNames, dataHolder))
    return self.plotDescStats(dataDict, total_count, filename)
def getFeatValueAndZ(user, schema, ngramTable, min_value = 5, ordered = True, z_threshold = 0):
    """Return the (feat, value, z) rows of one group.

    Only rows with value >= min_value and z > z_threshold are selected; when
    ordered is true they are sorted by z, highest first. The query is echoed
    to stdout before it runs.
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)
    z_filter = " AND z > {}".format(z_threshold)
    order_clause = " ORDER BY z DESC" if ordered else ""
    query = 'SELECT feat, value, z FROM {}.{} WHERE group_id = \'{}\' and value >= {}{}{};'.format(
        schema, ngramTable, user, min_value, z_filter, order_clause)
    print(query)
    return mm.executeGetList(schema, dbCursor, query)
def getUniqueNgrams(schema, ngramTable, user = '', max = -1):
    """Return (feat, group_norm) rows whose z-score is 0, highest group_norm first.

    schema     -- database/schema name
    ngramTable -- feature table with feat, group_norm, group_id and z columns
    user       -- if non-empty, only rows of this group_id are returned
    max        -- maximum number of rows; -1 means no LIMIT
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

    # NOTE(review): the original literal here was garbled ('******') and did
    # not parse. The clause is reconstructed from the query it is appended to
    # ("... WHERE z = 0{} ...") and from the group_id filters used by the
    # other functions in this file - confirm against version history.
    if user != '':
        select_user = ' AND group_id = \'{}\''.format(user)
    else:
        select_user = ''

    if max != -1:
        limit = ' LIMIT {}'.format(max)
    else:
        limit = ''

    query = 'SELECT feat, group_norm FROM {}.{} WHERE z = 0{} ORDER BY group_norm DESC{}'.format(schema, ngramTable, select_user, limit)
    return mm.executeGetList(schema, dbCursor, query)
def getOneGram(schema, ngramTable):
    """Return (feat, count) rows, where count is the sum of value over all rows of that feat.

    The query is echoed to stdout before it runs.
    """
    _conn, cursor, _dictCursor = mm.dbConnect(schema)
    sql = "SELECT feat, sum(value) as count FROM {}.{} group by feat".format(schema, ngramTable)
    print(sql)
    totals = mm.executeGetList(schema, cursor, sql)
    return totals
def createZColumn(schema, ngramTable):
    """Add a DOUBLE column named z to schema.ngramTable."""
    _conn, cursor, _dictCursor = mm.dbConnect(schema)
    alter_sql = "ALTER TABLE {}.{} ADD COLUMN z DOUBLE;".format(schema, ngramTable)
    mm.executeGetList(schema, cursor, alter_sql)
def getUsers(schema, ngramTable):
    """Return the distinct group_id values found in schema.ngramTable."""
    _conn, cursor, _dictCursor = mm.dbConnect(schema)
    sql = "SELECT distinct(group_id) FROM {}.{};".format(schema, ngramTable)
    rows = mm.executeGetList(schema, cursor, sql)
    # Each row has a single column; keep just that value.
    return [row[0] for row in rows]
def getNgrams(ngramTable, schema):
    """Return one row per distinct feat in schema.ngramTable.

    Note the argument order: the table comes first, then the schema. Rows are
    returned as delivered by mm.executeGetList; callers read the feat from
    column 0.
    """
    _conn, cursor, _dictCursor = mm.dbConnect(schema)
    sql = "SELECT feat FROM {}.{} GROUP BY feat".format(schema, ngramTable)
    feats = mm.executeGetList(schema, cursor, sql)
    return feats
def getMeanAndStd(word, ngramTable, schema, num_groups = -1, distTable = '', distTableSource = None):
    """Return (mean, std) of group_norm for a word.

    word            -- feature text, interpolated into the SQL as given (the
                       caller is expected to have escaped single quotes, as
                       getZscore does)
    ngramTable      -- feature table with feat and group_norm columns
    schema          -- database/schema name
    num_groups      -- accepted for backward compatibility; it never affected
                       the result (the value was always replaced by the number
                       of rows found for the word) and is no longer queried
    distTable       -- if non-empty, mean and std are read from this table's
                       mean/std columns instead of being computed
    distTableSource -- accepted for backward compatibility; unused

    Without distTable the mean and the sample standard deviation (n - 1
    denominator) are computed over the rows that contain the word. (0, 0) is
    returned when fewer than two such rows exist, or when distTable has no
    entry for the word.
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

    if distTable != '':
        # Precomputed distribution: just look the word up.
        query = "SELECT mean, std FROM {}.{} where feat = \'{}\'".format(schema, distTable, word)
        result = mm.executeGetList(schema, dbCursor, query)
        if not result:
            return (0, 0)
        return (result[0][0], result[0][1])

    # Two-pass algorithm: first the mean, then the squared deviations.
    query = u'SELECT group_norm FROM {}.{} WHERE feat = \'{}\''.format(schema, ngramTable, word)
    group_norms = mm.executeGetList(schema, dbCursor, query)
    values = [row[0] for row in (group_norms or ())]
    count = len(values)

    # A sample deviation needs at least two observations; an unknown word
    # previously caused a ZeroDivisionError here.
    if count < 2:
        return (0, 0)

    mean = sum(values, 0.0) / count
    diff_squared_sum = sum(((value - mean) ** 2 for value in values), 0.0)
    variance = diff_squared_sum / (count - 1)  # sample variance
    return (mean, sqrt(variance))