def makeTopicLabelMap(self, topiclexicon, numtopicwords=5, is_weighted_lexicon=False):
    """Build a table that maps every topic category to a label of its leading terms.

    (Re)creates ``feat_to_label$<topiclexicon>$<numtopicwords>`` in
    ``self.lexicondb`` and inserts one row per distinct category found in
    ``topiclexicon``.  Note the column usage: the category is written to the
    ``term`` column and the space-joined label to the ``category`` column.

    :param topiclexicon: name of the lexicon table the terms are read from
    :param numtopicwords: maximum number of terms joined into one label
    :param is_weighted_lexicon: when true, terms are sorted by descending
        ``weight`` before the first ``numtopicwords`` are taken; otherwise the
        order in which the query returns them is used
    :returns: the name of the label table that was created
    """
    featlabel_tablename = 'feat_to_label$%s$%d' % (topiclexicon, numtopicwords)
    pldb = self.lexicondb
    (plconn, plcur, plcurD) = mm.dbConnect(pldb, charset=self.encoding, use_unicode=self.use_unicode)

    sql = 'DROP TABLE IF EXISTS `%s`' % featlabel_tablename
    mm.execute(pldb, plcur, sql, charset=self.encoding, use_unicode=self.use_unicode)
    sql = 'CREATE TABLE `%s` (`id` int(16) unsigned NOT NULL AUTO_INCREMENT, `term` varchar(128) DEFAULT NULL, `category` varchar(64) DEFAULT NULL, PRIMARY KEY (`id`), KEY `term` (`term`), KEY `category` (`category`) )' % featlabel_tablename
    mm.execute(pldb, plcur, sql, charset=self.encoding, use_unicode=self.use_unicode)

    # NOTE(review): unlike every other query here, this one is issued without
    # charset/use_unicode -- kept as in the original; confirm it is intended.
    sql = 'SELECT DISTINCT category FROM %s' % topiclexicon
    categories = [row[0] for row in mm.executeGetList(pldb, plcur, sql)]

    for category in categories:
        # Only the SELECT (and the ordering) differs between the two lexicon
        # kinds; label construction and the INSERT are shared below.
        if is_weighted_lexicon:
            sql = 'SELECT term, weight from %s WHERE category = \'%s\'' % (topiclexicon, category)
            rows = mm.executeGetList(pldb, plcur, sql, charset=self.encoding, use_unicode=self.use_unicode)
            rows = sorted(rows, key=lambda x: x[1], reverse=True)
        else:
            sql = 'SELECT term from %s WHERE category = \'%s\'' % (topiclexicon, category)
            rows = mm.executeGetList(pldb, plcur, sql, charset=self.encoding, use_unicode=self.use_unicode)

        terms = [row[0] for row in rows]
        label = ' '.join(map(str, terms[0:numtopicwords]))
        escaped_label = MySQLdb.escape_string(label)
        sql = 'INSERT INTO `%s` (`term`, `category`) VALUES(\'%s\', \'%s\')' % (featlabel_tablename, category, escaped_label)
        mm.execute(pldb, plcur, sql, charset=self.encoding, use_unicode=self.use_unicode)

    return featlabel_tablename
def createTfIdfTable(self, ngram_table):
    """Create a new feature table whose group_norm is tf-idf.

    tf-idf = tf * idf, where
      * tf is the ``group_norm`` stored in ``ngram_table`` for a (group, feat) row,
      * idf = log(N / dt),
      * N is ``COUNT(DISTINCT group_id)`` in ``ngram_table``,
      * dt is the per-feature count returned by ``self.getFeatureCounts()``.

    :param ngram_table: table containing words/ngrams, collocs, etc.; its name
        must look like ``feat$<name>$...`` because ``<name>`` is reused to name
        the new table
    :returns: name of the created tf-idf feature table
    :raises ValueError: if ``ngram_table`` does not start with ``feat$<name>$``

    Written by Phil
    """
    name_match = re.match(r'^feat\$([^\$]+)\$', ngram_table)
    if name_match is None:
        raise ValueError("cannot derive a feature name from table '%s'" % ngram_table)
    feat_name = name_match.group(1)  # e.g. 1gram, 1to3gram
    short_name = 'tf_idf_{}'.format(feat_name)
    idf_table = self.createFeatureTable(short_name, valueType = 'DOUBLE')

    # N: total number of documents (groups)
    sql = "SELECT COUNT(DISTINCT group_id) FROM %s" % ngram_table
    N = mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)[0][0]

    feat_counts = self.getFeatureCounts()  # tuples of (feat, count)

    fwc.warn('Inserting idf values into new table')
    inserted = 0
    for (feat, dt) in feat_counts:
        idf = log(N / float(dt))

        # Escape once per feature; the same value is used by the SELECT and by
        # every INSERT for this feature.
        escaped_feat = mm.MySQLdb.escape_string(feat.encode('utf-8')).decode('utf-8')

        sql = u"""SELECT group_id, value, group_norm from %s WHERE feat = \'%s\'""" % (ngram_table, escaped_feat)
        group_id_freq = mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)

        for (group_id, value, tf) in group_id_freq:
            tf_idf = tf * idf
            insert_sql = u"INSERT INTO {} (group_id, feat, value, group_norm) VALUES (\'{}\', \'{}\', {}, {});".format(
                idf_table, group_id, escaped_feat, value, tf_idf)
            mm.execute(self.corpdb, self.dbCursor, insert_sql)
            # Count first, then report, so the message states how many rows
            # have really been written.
            inserted += 1
            if inserted % 50000 == 0:
                print('%d tf_idf values inserted!' % inserted)

    fwc.warn('Finished inserting.')
    return idf_table
def getMessagesWithFieldForCorrelField(self, cf_id, extraField, messageTable = None, warnMsg = True):
    """Query (message id, message, extraField) for rows whose correl_field equals cf_id.

    :param cf_id: value compared against ``self.correl_field`` (interpolated
        into the SQL as a quoted literal, not escaped here)
    :param extraField: name of the additional column to select
    :param messageTable: table to read from; defaults to ``self.corptable``
    :param warnMsg: forwarded positionally to ``mm.executeGetList``
    :returns: whatever ``mm.executeGetList`` returns for the query
    """
    sourceTable = messageTable if messageTable else self.corptable
    queryArgs = (self.messageid_field, self.message_field, extraField,
                 sourceTable, self.correl_field, cf_id)
    msql = """SELECT %s, %s, %s FROM %s WHERE %s = '%s'""" % queryArgs
    return mm.executeGetList(self.corpdb, self.dbCursor, msql, warnMsg,
                             charset=self.encoding, use_unicode=self.use_unicode)
def createFeatureTable(self, featureName, featureType = 'VARCHAR(64)', valueType = 'INTEGER', tableName = None, valueFunc = None, correlField=None, extension = None):
    """Creates a feature table based on self data and feature name.

    Any existing table with the same name is dropped first.

    :param featureName: feature name embedded in the generated table name
    :param featureType: SQL type of the ``feat`` column
    :param valueType: SQL type of the ``value`` column
    :param tableName: explicit table name; when empty the name is built as
        ``feat$<featureName>$<corptable>$<correl_field>`` plus optional suffixes
    :param valueFunc: if given, ``$16to<round(valueFunc(16))>`` is appended to
        the generated name
    :param correlField: SQL column type to use for ``group_id``; when empty it
        is looked up from ``self.getCorrelFieldType``
    :param extension: if given, ``$<extension>`` is appended to the generated name
    :returns: name of the created table
    """
    if not tableName:
        tableName = 'feat$'+featureName+'$'+self.corptable+'$'+self.correl_field
        if valueFunc:
            tableName += '$' + str(16)+'to'+"%d"%round(valueFunc(16))
        if extension:
            tableName += '$' + extension

    # Resolve the column type used for group_id.
    if not correlField:
        correlField = self.getCorrelFieldType(self.correl_field)
    if correlField:
        correl_fieldType = correlField
    else:
        # Fallback used only when the lookup above produced nothing: ask
        # information_schema directly.
        sql = """SELECT column_type FROM information_schema.columns WHERE table_schema='%s' AND table_name='%s' AND column_name='%s'""" % (
            self.corpdb, self.corptable, self.correl_field)
        correl_fieldType = mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)[0][0]

    drop = """DROP TABLE IF EXISTS %s""" % tableName
    sql = """CREATE TABLE %s (id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, group_id %s, feat %s, value %s, group_norm DOUBLE, KEY `correl_field` (`group_id`), KEY `feature` (`feat`)) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin""" %(tableName, correl_fieldType, featureType, valueType)

    mm.execute(self.corpdb, self.dbCursor, drop, charset=self.encoding, use_unicode=self.use_unicode)
    mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
    return tableName
def makeBlackWhiteList(args_featlist, args_lextable, args_categories, args_lexdb):
    """Build a black/white list of features from a lexicon table or an explicit list.

    If both ``args_lextable`` and ``args_categories`` are given, the list is the
    set of ``term`` values of that lexicon table (restricted to the given
    categories unless the first category is ``'*'``).  Otherwise the entries of
    ``args_featlist`` are used: an entry is upper-cased when more than half of
    its characters are upper case, lower-cased otherwise.

    NOTE(review): this function reads ``args.use_unicode`` and
    ``self.encoding`` / ``self.use_unicode`` although neither ``args`` nor
    ``self`` is a parameter -- presumably they come from an enclosing scope;
    confirm against the surrounding file.

    :param args_featlist: iterable of feature strings (may be empty or None)
    :param args_lextable: lexicon table name (may be empty)
    :param args_categories: list of lexicon categories (may be empty)
    :param args_lexdb: database holding the lexicon table
    :returns: list of whitespace-stripped, de-duplicated features
    :raises Exception: if neither a lexicon (table + categories) nor a
        feature list is provided
    """
    use_unicode = args.use_unicode
    # The feature list is optional when a lexicon is given; never iterate None.
    featlist = args_featlist or []

    if use_unicode:
        shown = [unicode(feat, 'utf-8') if isinstance(feat, str) else feat for feat in featlist]
    else:
        shown = list(featlist)
    print("making black or white list: [%s] [%s] [%s]" % (shown, args_lextable, args_categories))

    newlist = set()
    if args_lextable and args_categories:
        (conn, cur, dcur) = mm.dbConnect(args_lexdb, charset=self.encoding, use_unicode=self.use_unicode)
        sql = 'SELECT term FROM %s' % (args_lextable)
        if args_categories[0] != '*':
            quoted = ','.join('\'' + str(cat) + '\'' for cat in args_categories)
            sql = 'SELECT term FROM %s WHERE category in (%s)' % (args_lextable, quoted)
        rows = mm.executeGetList(args_lexdb, cur, sql, charset=self.encoding, use_unicode=self.use_unicode)
        newlist.update(row[0] for row in rows)
    elif args_featlist:
        for feat in args_featlist:
            if use_unicode:
                if isinstance(feat, str):
                    feat = unicode(feat, 'utf-8')
                is_upper = unicode.isupper
            else:
                is_upper = str.isupper
            # "more than half upper case", written without division so the
            # result does not depend on integer vs. true division
            mostly_upper = 2 * sum(map(is_upper, feat)) > len(feat)
            newlist.add(feat.upper() if mostly_upper else feat.lower())
    else:
        raise Exception('blacklist / whitelist flag specified without providing features.')

    return [w.strip() for w in newlist]
def getValuesAndGroupNormsForFeat(self, feat, where = '', warnMsg = False):
    """returns a list of (group_id, value, group_norm) triples for one feature

    :param feat: feature to look up; when ``self.use_unicode`` is set a byte
        string is decoded as UTF-8 first, a unicode object is used as given
    :param where: optional extra SQL condition, ANDed to the feature filter
    :param warnMsg: forwarded positionally to ``mm.executeGetList``
    """
    # Only byte strings can be decoded: unicode(u'...', 'utf8') raises
    # "TypeError: decoding Unicode is not supported".
    if self.use_unicode and isinstance(feat, str):
        feat = unicode(feat, 'utf8')
    sql = """SELECT group_id, value, group_norm FROM %s WHERE feat = '%s'"""%(self.featureTable, MySQLdb.escape_string(feat))
    if where:
        sql += ' AND ' + where
    return mm.executeGetList(self.corpdb, self.dbCursor, sql, warnMsg, charset=self.encoding, use_unicode=self.use_unicode)
def getSumValue(self, where=''):
    """returns the sum of the ``value`` column of the feature table

    :param where: optional SQL condition restricting the summed rows
    """
    query = """select sum(value) from %s""" % (self.featureTable)
    if where:
        query = query + ' WHERE ' + where
    resultRows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                                   charset=self.encoding, use_unicode=self.use_unicode)
    return resultRows[0][0]
def getNumWordsByCorrelField(self, where = ''):
    """Sum ``num_words`` per correl_field value.

    The inner query selects (correl_field, num_words) from the corpus table
    grouped by message id; the outer query sums num_words per correl_field.
    Assumes the corpus table has a ``num_words`` column.

    :param where: optional SQL condition applied to the inner query
    """
    # e.g. SELECT user_id, sum(num_words) FROM
    #        (SELECT user_id, num_words FROM messages GROUP BY message_id) as a
    #      GROUP BY user_id
    pieces = ["""SELECT %s, sum(num_words) FROM (SELECT %s, num_words FROM %s """ % (
        self.correl_field, self.correl_field, self.corptable)]
    if where:
        pieces.append(' WHERE ' + where)
    pieces.append(""" GROUP BY %s) as a """ % self.messageid_field)
    pieces.append(""" GROUP BY %s """ % self.correl_field)
    query = ''.join(pieces)
    return mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getFeatureTables(self, where=''):
    """Return the feature tables of corpdb that match the current corptable and correl_field.

    Tables are matched with ``LIKE 'feat$%$<corptable>$<correl_field>$%'``.

    :param where: accepted but not used by this method
    """
    likePattern = 'feat$%%$%s$%s$%%' % (self.corptable, self.correl_field)
    query = """SHOW TABLES FROM %s LIKE '%s' """ % (self.corpdb, likePattern)
    return mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getValuesAndGroupNormsForFeats(self, feats, where = '', warnMsg = False):
    """returns a list of (group_id, value, group_norm) triples for the given features

    :param feats: iterable of features; each is escaped (after ``unicode()``
        conversion when ``self.use_unicode`` is set) and placed in an IN list
    :param where: optional extra SQL condition, ANDed to the feature filter
    :param warnMsg: forwarded positionally to ``mm.executeGetList``
    """
    if self.use_unicode:
        escapedFeats = [MySQLdb.escape_string(unicode(name)) for name in feats]
    else:
        escapedFeats = [MySQLdb.escape_string(name) for name in feats]
    fCond = " feat in ('%s')" % "','".join(escapedFeats)
    query = """SELECT group_id, value, group_norm FROM %s WHERE %s""" % (self.featureTable, fCond)
    if where:
        query = query + ' AND ' + where
    return mm.executeGetList(self.corpdb, self.dbCursor, query, warnMsg,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getValues(self, where=''):
    """returns a list of (group_id, feature, value) triples from the feature table

    :param where: optional SQL condition restricting the rows
    """
    query = """select group_id, feat, value from %s""" % (self.featureTable)
    if where:
        query = query + ' WHERE ' + where
    return mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getFeatureZeros(self, where=''):
    """returns (feat, zero_feat_norm) rows read from the ``mean_<featureTable>`` table

    :param where: optional SQL condition restricting the rows
    """
    meanTableName = 'mean_' + self.featureTable
    query = "select feat, zero_feat_norm from %s" % (meanTableName)
    if where:
        query = query + ' WHERE ' + where
    return mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getSumValuesByFeat(self, where=''):
    """returns (feat, sum(value)) rows, one per feature of the feature table

    :param where: optional SQL condition applied before grouping
    """
    pieces = ["""SELECT feat, sum(value) FROM %s """ % self.featureTable]
    if where:
        pieces.append(' WHERE ' + where)
    pieces.append(""" GROUP BY feat """)
    query = ''.join(pieces)
    return mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getGroupNorms(self, where=''):
    """returns a list of (group_id, feature, group_norm) triples from the feature table

    :param where: optional SQL condition restricting the rows
    """
    query = """SELECT group_id, feat, group_norm from %s""" % (self.featureTable)
    if where:
        query = query + ' WHERE ' + where
    return mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getFeatMeanData(self, where = ''):
    """returns a dict of feature => (mean, std, zero_feat_norm), read from ``mean_<featureTable>``

    :param where: optional SQL condition restricting the rows
    """
    query = """select feat, mean, std, zero_feat_norm from %s""" % ('mean_' + self.featureTable)
    if where:
        query = query + ' WHERE ' + where
    rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
    # first column is the key, the remaining columns are kept as the value
    return dict((row[0], row[1:]) for row in rows)
def getFeatureValueSums(self, where=''):
    """returns a list of (feature, sum(value)) tuples, one per feature of the feature table

    :param where: optional SQL condition; it is applied before grouping
    """
    sql = """select feat, sum(value) from %s""" % (self.featureTable)
    # WHERE has to precede GROUP BY, otherwise the statement is not valid SQL.
    if where:
        sql += ' WHERE ' + where
    sql += ' group by feat'
    return mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
def getDistinctGroupsFromFeatTable(self, where=""):
    """Returns the distinct group ids that are in the feature table

    :param where: optional SQL condition restricting the rows
    """
    query = "select distinct group_id from %s" % self.featureTable
    if where:
        query = query + ' WHERE ' + where
    rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
    # unwrap the single selected column of every row
    return map(lambda row: row[0], rows)
def getGroupNormsForFeat(self, feat, where='', warnMsg=False):
    """returns a list of (group_id, group_norm) pairs for one feature

    :param feat: feature to look up (escaped with ``MySQLdb.escape_string``)
    :param where: optional extra SQL condition, ANDed to the feature filter
    :param warnMsg: forwarded positionally to ``mm.executeGetList``
    """
    escapedFeat = MySQLdb.escape_string(feat)
    query = """SELECT group_id, group_norm FROM %s WHERE feat = '%s'""" % (self.featureTable, escapedFeat)
    if where:
        query = query + ' AND ' + where
    return mm.executeGetList(self.corpdb, self.dbCursor, query, warnMsg,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getGroupAndFeatureValues(self, featName=None, where=''):
    """returns a list of (group_id, group_norm) tuples for one feature

    :param featName: feature to look up; defaults to ``self.featNames[0]``
    :param where: optional extra SQL condition, ANDed to the feature filter
    """
    # NOTE(review): featName is interpolated into the SQL without escaping,
    # so a name containing a quote will break the query -- confirm callers.
    chosenFeat = featName if featName else self.featNames[0]
    query = "select group_id, group_norm from %s WHERE feat = '%s'" % (self.featureTable, chosenFeat)
    if where:
        query = query + ' AND ' + where
    return mm.executeGetList(self.corpdb, self.dbCursor, query, False,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getDistinctOutcomeValues(self, outcome = None, includeNull = True, where = ''):
    """returns a list of the distinct values of one outcome column

    :param outcome: outcome column; defaults to ``self.outcome_value_fields[0]``
    :param includeNull: when false, rows where the outcome is NULL are excluded
    :param where: optional extra SQL condition
    """
    column = outcome if outcome else self.outcome_value_fields[0]
    query = "SELECT DISTINCT %s FROM %s" % (column, self.outcome_table)

    conditions = []
    if where:
        conditions.append(where)
    if not includeNull:
        conditions.append("%s IS NOT NULL" % column)
    if conditions:
        query += ' WHERE ' + ' AND '.join(conditions)

    rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
    return map(lambda row: row[0], rows)
def getDistinctOutcomeValues(self, outcome = None, includeNull = True, where = ''):
    """returns a list of the distinct values of one outcome column

    :param outcome: outcome column; defaults to ``self.outcome_value_fields[0]``
    :param includeNull: when false, rows where the outcome is NULL are excluded
    :param where: optional extra SQL condition
    """
    column = outcome if outcome else self.outcome_value_fields[0]
    query = "SELECT DISTINCT %s FROM %s" % (column, self.outcome_table)

    conditions = []
    if where:
        conditions.append(where)
    if not includeNull:
        conditions.append("%s IS NOT NULL" % column)
    if conditions:
        query += ' WHERE ' + ' AND '.join(conditions)

    rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
    return map(lambda row: row[0], rows)
def getDistinctGroups(self, where=''):
    """returns the distinct groups (note that this runs on the corptable to be accurate)

    :param where: optional SQL condition restricting the rows
    """
    query = """select DISTINCT %s from %s""" % (self.correl_field, self.corptable)
    if where:
        query = query + ' WHERE ' + where
    rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
    return map(lambda row: row[0], rows)
def makeContingencyTable(self, featureGetter, featureValueField, outcome_filter_where='', feature_value_group_sum_min=0):
    """Make a contingency table with one column per distinct feature of featureGetter.

    Follows http://www.artfulsoftware.com/infotree/queries.php, section
    "Group Column Statistics in Rows".  Assumes both correl_fields are the
    same.  Outcome values are only involved through ``outcome_filter_where``.

    :param featureGetter: object providing ``getDistinctFeatures()``,
        ``getFeatureZeros()``, ``featureTable`` and ``correl_field``
    :param featureValueField: column of the feature table used as cell value
    :param outcome_filter_where: optional SQL condition on the outcome table
        selecting which correl ids are kept
    :param feature_value_group_sum_min: when > 0, only groups whose summed
        ``value`` is strictly greater than this are kept
    :returns: ``[distinctFeatureList, rows]`` where rows is the result of the query
    """
    fg = featureGetter
    distinctFeatureList = fg.getDistinctFeatures()
    featZeroDict = dict(fg.getFeatureZeros())

    def caseExpression(featureRow):
        # featureRow is a one-element row; its first item is the feature
        rawFeat = featureRow[0]
        zeroValue = featZeroDict.get(rawFeat, .0000001)
        escapedFeat = MySQLdb.escape_string(rawFeat)
        if not escapedFeat:
            return ''
        return "( CASE feat WHEN '%s' THEN %s ELSE %s END ) AS '%s'"%(escapedFeat, featureValueField, str(zeroValue), escapedFeat)

    # one CASE column per distinct feature
    caseColumns = ", ".join([caseExpression(row) for row in distinctFeatureList]) + " "
    selectClause = "SELECT %s, "%(fg.correl_field)
    selectClause += caseColumns

    # correl ids of the outcome table, optionally filtered
    outcomeIds = "( SELECT %s FROM %s "%(self.correl_field, self.outcome_table)
    if outcome_filter_where:
        outcomeIds += "WHERE " + outcome_filter_where
    outcomeIds += ")"

    # optionally keep only groups with a large enough total value
    featureSource = fg.featureTable
    if feature_value_group_sum_min > 0:
        groupSums = "( SELECT %s, SUM(value) AS value_sum FROM %s GROUP BY %s )"%(fg.correl_field, fg.featureTable, fg.correl_field)
        sufficientGroups = "( SELECT %s FROM %s AS groupIdsAndSums WHERE value_sum > %s )"%(fg.correl_field, groupSums, feature_value_group_sum_min)
        featureSource = "( SELECT featuresOriginal.* FROM %s AS featuresOriginal, %s AS featuresSubset WHERE featuresOriginal.%s = featuresSubset.%s )"%(
            fg.featureTable, sufficientGroups, fg.correl_field, fg.correl_field)

    # restrict the features to the filtered outcome ids
    joinedFeatures = "( SELECT filteredFeatures.* FROM %s AS filteredFeatures, %s AS filteredOutcomes WHERE filteredFeatures.%s = filteredOutcomes.%s)"%(featureSource, outcomeIds, fg.correl_field, self.correl_field)

    query = selectClause + "FROM %s AS updatedFeatures GROUP BY %s"%(joinedFeatures, fg.correl_field)
    resultRows = mm.executeGetList(self.corpdb, self.dbCursor, query, False,
                                   charset=self.encoding, use_unicode=self.use_unicode)
    return [distinctFeatureList, resultRows]
def makeContingencyTable(self, featureGetter, featureValueField, outcome_filter_where='', feature_value_group_sum_min=0):
    """Make a contingency table with one column per distinct feature of featureGetter.

    Follows http://www.artfulsoftware.com/infotree/queries.php, section
    "Group Column Statistics in Rows".  Assumes both correl_fields are the
    same.  Outcome values are only involved through ``outcome_filter_where``.

    :param featureGetter: object providing ``getDistinctFeatures()``,
        ``getFeatureZeros()``, ``featureTable`` and ``correl_field``
    :param featureValueField: column of the feature table used as cell value
    :param outcome_filter_where: optional SQL condition on the outcome table
        selecting which correl ids are kept
    :param feature_value_group_sum_min: when > 0, only groups whose summed
        ``value`` is strictly greater than this are kept
    :returns: ``[distinctFeatureList, rows]`` where rows is the result of the query
    """
    fg = featureGetter
    distinctFeatureList = fg.getDistinctFeatures()
    featZeroDict = dict(fg.getFeatureZeros())

    def caseExpression(featureRow):
        # featureRow is a one-element row; its first item is the feature
        rawFeat = featureRow[0]
        zeroValue = featZeroDict.get(rawFeat, .0000001)
        escapedFeat = MySQLdb.escape_string(rawFeat)
        if not escapedFeat:
            return ''
        return "( CASE feat WHEN '%s' THEN %s ELSE %s END ) AS '%s'"%(escapedFeat, featureValueField, str(zeroValue), escapedFeat)

    # one CASE column per distinct feature
    caseColumns = ", ".join([caseExpression(row) for row in distinctFeatureList]) + " "
    selectClause = "SELECT %s, "%(fg.correl_field)
    selectClause += caseColumns

    # correl ids of the outcome table, optionally filtered
    outcomeIds = "( SELECT %s FROM %s "%(self.correl_field, self.outcome_table)
    if outcome_filter_where:
        outcomeIds += "WHERE " + outcome_filter_where
    outcomeIds += ")"

    # optionally keep only groups with a large enough total value
    featureSource = fg.featureTable
    if feature_value_group_sum_min > 0:
        groupSums = "( SELECT %s, SUM(value) AS value_sum FROM %s GROUP BY %s )"%(fg.correl_field, fg.featureTable, fg.correl_field)
        sufficientGroups = "( SELECT %s FROM %s AS groupIdsAndSums WHERE value_sum > %s )"%(fg.correl_field, groupSums, feature_value_group_sum_min)
        featureSource = "( SELECT featuresOriginal.* FROM %s AS featuresOriginal, %s AS featuresSubset WHERE featuresOriginal.%s = featuresSubset.%s )"%(
            fg.featureTable, sufficientGroups, fg.correl_field, fg.correl_field)

    # restrict the features to the filtered outcome ids
    joinedFeatures = "( SELECT filteredFeatures.* FROM %s AS filteredFeatures, %s AS filteredOutcomes WHERE filteredFeatures.%s = filteredOutcomes.%s)"%(featureSource, outcomeIds, fg.correl_field, self.correl_field)

    query = selectClause + "FROM %s AS updatedFeatures GROUP BY %s"%(joinedFeatures, fg.correl_field)
    resultRows = mm.executeGetList(self.corpdb, self.dbCursor, query, False,
                                   charset=self.encoding, use_unicode=self.use_unicode)
    return [distinctFeatureList, resultRows]
def countGroups(self, groupThresh = 0, where=''):
    """returns the number of distinct groups (note that this runs on the corptable to be accurate)

    :param groupThresh: when non-zero, only groups whose word count (from
        ``self.getGroupWordCounts``) is at least this value are counted
    :param where: optional SQL condition
    """
    if not groupThresh:
        query = """select count(DISTINCT %s) from %s""" % (self.correl_field, self.corptable)
        if where:
            query = query + ' WHERE ' + where
        rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                                 charset=self.encoding, use_unicode=self.use_unicode)
        return rows[0][0]

    wordCountsByGroup = self.getGroupWordCounts(where)
    return sum(1 for total in wordCountsByGroup.itervalues() if total >= groupThresh)
def makeBlackWhiteList(args_featlist, args_lextable, args_categories, args_lexdb, args_use_unicode):
    """Build a black/white list of features from a lexicon table or an explicit list.

    If both ``args_lextable`` and ``args_categories`` are given, the list is the
    set of ``term`` values of that lexicon table (restricted to the given
    categories unless the first category is ``'*'``).  Otherwise the entries of
    ``args_featlist`` are used: an entry is upper-cased when more than half of
    its characters are upper case, lower-cased otherwise.

    NOTE(review): the lexicon branch reads ``self.encoding`` and
    ``self.use_unicode`` although ``self`` is not a parameter -- presumably it
    comes from an enclosing scope; confirm against the surrounding file.

    :param args_featlist: iterable of feature strings (may be empty or None)
    :param args_lextable: lexicon table name (may be empty)
    :param args_categories: list of lexicon categories (may be empty)
    :param args_lexdb: database holding the lexicon table
    :param args_use_unicode: when true, byte-string features are decoded as
        UTF-8 before case normalisation
    :returns: list of whitespace-stripped, de-duplicated features
    :raises Exception: if neither a lexicon (table + categories) nor a
        feature list is provided
    """
    # The feature list is optional when a lexicon is given; never iterate None.
    featlist = args_featlist or []

    if args_use_unicode:
        shown = [unicode(feat, 'utf-8') if isinstance(feat, str) else feat for feat in featlist]
    else:
        shown = list(featlist)
    print("making black or white list: [%s] [%s] [%s]" % (shown, args_lextable, args_categories))

    newlist = set()
    if args_lextable and args_categories:
        (conn, cur, dcur) = mm.dbConnect(args_lexdb, charset=self.encoding, use_unicode=self.use_unicode)
        sql = 'SELECT term FROM %s' % (args_lextable)
        if args_categories[0] != '*':
            quoted = ','.join('\'' + str(cat) + '\'' for cat in args_categories)
            sql = 'SELECT term FROM %s WHERE category in (%s)' % (args_lextable, quoted)
        rows = mm.executeGetList(args_lexdb, cur, sql, charset=self.encoding, use_unicode=self.use_unicode)
        newlist.update(row[0] for row in rows)
    elif args_featlist:
        for feat in args_featlist:
            if args_use_unicode:
                if isinstance(feat, str):
                    feat = unicode(feat, 'utf-8')
                is_upper = unicode.isupper
            else:
                is_upper = str.isupper
            # "more than half upper case", written without division so the
            # result does not depend on integer vs. true division
            mostly_upper = 2 * sum(map(is_upper, feat)) > len(feat)
            newlist.add(feat.upper() if mostly_upper else feat.lower())
    else:
        raise Exception('blacklist / whitelist flag specified without providing features.')

    return [w.strip() for w in newlist]
def getNumWordsByCorrelField(self, where=''):
    """Sum ``num_words`` per correl_field value.

    The inner query selects (correl_field, num_words) from the corpus table
    grouped by message id; the outer query sums num_words per correl_field.
    Assumes the corpus table has a ``num_words`` column.

    :param where: optional SQL condition applied to the inner query
    """
    # e.g. SELECT user_id, sum(num_words) FROM
    #        (SELECT user_id, num_words FROM messages GROUP BY message_id) as a
    #      GROUP BY user_id
    pieces = ["""SELECT %s, sum(num_words) FROM (SELECT %s, num_words FROM %s """ % (
        self.correl_field, self.correl_field, self.corptable)]
    if where:
        pieces.append(' WHERE ' + where)
    pieces.append(""" GROUP BY %s) as a """ % self.messageid_field)
    pieces.append(""" GROUP BY %s """ % self.correl_field)
    query = ''.join(pieces)
    return mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getFeatMeanData(self, where=''):
    """returns a dict of feature => (mean, std, zero_feat_norm), read from ``mean_<featureTable>``

    :param where: optional SQL condition restricting the rows
    """
    query = """select feat, mean, std, zero_feat_norm from %s""" % ('mean_' + self.featureTable)
    if where:
        query = query + ' WHERE ' + where
    rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
    # first column is the key, the remaining columns are kept as the value
    return dict((row[0], row[1:]) for row in rows)
def getMessagesForCorrelField(self, cf_id, messageTable=None, warnMsg=True):
    """Query (message id, message) for rows whose correl_field equals cf_id.

    :param cf_id: value compared against ``self.correl_field`` (interpolated
        into the SQL as a quoted literal, not escaped here)
    :param messageTable: table to read from; defaults to ``self.corptable``
    :param warnMsg: forwarded positionally to ``mm.executeGetList``
    :returns: whatever ``mm.executeGetList`` returns for the query
    """
    sourceTable = messageTable if messageTable else self.corptable
    queryArgs = (self.messageid_field, self.message_field,
                 sourceTable, self.correl_field, cf_id)
    msql = """SELECT %s, %s FROM %s WHERE %s = '%s'""" % queryArgs
    return mm.executeGetList(self.corpdb, self.dbCursor, msql, warnMsg,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getFeatureCounts(self, groupFreqThresh=0, where='', SS=False, groups=None):
    """
    Gets feature occurence by group

    Args:
        groupFreqThresh (int): Minimum number of words a group must contain
            to be considered valid; qualifying groups (per
            ``self.getGroupWordCounts(where)``) are added to ``groups``
        where (string): Conditional sql string to limit the search to
            elements meeting a specified criteria
        SS (boolean): Indicates the use of SSCursor (see NOTE below)
        groups (set): Set of group ID's to restrict the count to; the
            caller's set is not modified

    Returns:
        returns a list of (feature, count) tuples, where count is the number
        of rows of the feature table with that feature
    """
    # Work on a private copy: a shared default set (or the caller's set)
    # must not accumulate group ids across calls.
    group_ids = set(groups) if groups else set()

    if groupFreqThresh:
        groupCnts = self.getGroupWordCounts(where)
        group_ids.update(group for group, wordCount in groupCnts.iteritems()
                         if wordCount >= groupFreqThresh)

    conditions = []
    if where:
        conditions.append(where)
    if group_ids:
        conditions.append("group_id in ('%s')" % "','".join(str(g) for g in group_ids))
    where_clause = ' WHERE ' + ' AND '.join(conditions) if conditions else ''

    sql = """select feat, count(*) from %s %s group by feat""" % (self.featureTable, where_clause)
    if SS:
        # NOTE(review): the SSCursor result is discarded and the query is run
        # again below -- looks like a missing `return`; kept as-is so the
        # return type does not change. Confirm intent.
        mm.executeGetSSCursor(self.corpdb, sql, charset=self.encoding, use_unicode=self.use_unicode, host=self.mysql_host)
    return mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
def getDistinctOutcomeValueCounts(self, outcome = None, requireControls = False, includeNull = True, where = ''):
    """returns a dict of outcome_value => number of rows with that value

    :param outcome: outcome column; defaults to ``self.outcome_value_fields[0]``
    :param requireControls: when true, every column in
        ``self.outcome_controls`` must be non-NULL
    :param includeNull: when false, rows where the outcome is NULL are excluded
    :param where: optional extra SQL condition
    """
    if not outcome:
        outcome = self.outcome_value_fields[0]
    sql = "SELECT %s, count(*) FROM %s"%(outcome, self.outcome_table)

    wheres = []
    if where:
        wheres.append(where)
    if requireControls:
        for control in self.outcome_controls:
            wheres.append("%s IS NOT NULL" % control)
    if not includeNull:
        wheres.append("%s IS NOT NULL" % outcome)
    # Emit WHERE only when there is something to filter on; requireControls
    # with no controls configured must not leave a dangling WHERE.
    if wheres:
        sql += ' WHERE ' + ' AND '.join(wheres)

    sql += ' group by %s ' % outcome
    return dict(mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode))
def getValuesAndGroupNormsForFeats(self, feats, where='', warnMsg=False):
    """returns a list of (group_id, value, group_norm) triples for the given features

    :param feats: iterable of features; each is escaped (after ``unicode()``
        conversion when ``self.use_unicode`` is set) and placed in an IN list
    :param where: optional extra SQL condition, ANDed to the feature filter
    :param warnMsg: forwarded positionally to ``mm.executeGetList``
    """
    if self.use_unicode:
        escapedFeats = [MySQLdb.escape_string(unicode(name)) for name in feats]
    else:
        escapedFeats = [MySQLdb.escape_string(name) for name in feats]
    fCond = " feat in ('%s')" % "','".join(escapedFeats)
    query = """SELECT group_id, value, group_norm FROM %s WHERE %s""" % (self.featureTable, fCond)
    if where:
        query = query + ' AND ' + where
    return mm.executeGetList(self.corpdb, self.dbCursor, query, warnMsg,
                             charset=self.encoding, use_unicode=self.use_unicode)
def getDistinctOutcomeValueCounts(self, outcome = None, requireControls = False, includeNull = True, where = ''):
    """returns a dict of outcome_value => number of rows with that value

    :param outcome: outcome column; defaults to ``self.outcome_value_fields[0]``
    :param requireControls: when true, every column in
        ``self.outcome_controls`` must be non-NULL
    :param includeNull: when false, rows where the outcome is NULL are excluded
    :param where: optional extra SQL condition
    """
    if not outcome:
        outcome = self.outcome_value_fields[0]
    sql = "SELECT %s, count(*) FROM %s"%(outcome, self.outcome_table)

    wheres = []
    if where:
        wheres.append(where)
    if requireControls:
        for control in self.outcome_controls:
            wheres.append("%s IS NOT NULL" % control)
    if not includeNull:
        wheres.append("%s IS NOT NULL" % outcome)
    # Emit WHERE only when there is something to filter on; requireControls
    # with no controls configured must not leave a dangling WHERE.
    if wheres:
        sql += ' WHERE ' + ' AND '.join(wheres)

    sql += ' group by %s ' % outcome
    return dict(mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode))
def countGroups(self, groupThresh=0, where=''):
    """returns the number of distinct groups (note that this runs on the corptable to be accurate)

    :param groupThresh: when non-zero, only groups whose word count (from
        ``self.getGroupWordCounts``) is at least this value are counted
    :param where: optional SQL condition
    """
    if not groupThresh:
        query = """select count(DISTINCT %s) from %s""" % (self.correl_field, self.corptable)
        if where:
            query = query + ' WHERE ' + where
        rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                                 charset=self.encoding, use_unicode=self.use_unicode)
        return rows[0][0]

    wordCountsByGroup = self.getGroupWordCounts(where)
    return sum(1 for total in wordCountsByGroup.itervalues() if total >= groupThresh)
def getFeatureCounts(self, groupFreqThresh = 0, where = '', SS = False, groups = None):
    """returns a list of (feature, count) tuples, where count is the number of feature-table rows with the feature

    :param groupFreqThresh: when non-zero, groups whose word count (from
        ``self.getGroupWordCounts(where)``) reaches this value are added to
        the group filter
    :param where: optional SQL condition limiting the counted rows
    :param SS: see NOTE below
    :param groups: set of group ids to restrict the count to; the caller's
        set is not modified
    """
    # Work on a private copy: a shared default set (or the caller's set)
    # must not accumulate group ids across calls.
    group_ids = set(groups) if groups else set()

    if groupFreqThresh:
        groupCnts = self.getGroupWordCounts(where)
        group_ids.update(group for group, wordCount in groupCnts.iteritems()
                         if wordCount >= groupFreqThresh)

    conditions = []
    if where:
        conditions.append(where)
    if group_ids:
        conditions.append("group_id in ('%s')" % "','".join(str(g) for g in group_ids))
    where_clause = ' WHERE ' + ' AND '.join(conditions) if conditions else ''

    sql = """select feat, count(*) from %s %s group by feat"""%(self.featureTable, where_clause)
    if SS:
        # NOTE(review): the SSCursor result is discarded and the query is run
        # again below -- looks like a missing `return`; kept as-is so the
        # return type does not change. Confirm intent.
        mm.executeGetSSCursor(self.corpdb, sql, charset=self.encoding, use_unicode=self.use_unicode)
    return mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
def addFeatTableMeans(self, field='group_norm', groupNorms = None):
    """(Re)create the mean table of the current feature table and fill it.

    The table ``mean$<featureTable>`` gets one row per feature with mean,
    standard deviation and zero_feat_norm, taken from
    ``self.findMeans(field, True, groupNorms)``.

    NOTE(review): getFeatMeanData / getFeatureZeros in this file read from
    ``'mean_' + featureTable`` whereas this method writes
    ``'mean$' + featureTable`` -- confirm which prefix is intended.

    :param field: forwarded to ``self.findMeans``
    :param groupNorms: forwarded to ``self.findMeans``
    :returns: the dict returned by ``self.findMeans``
    """
    meanTable = 'mean$' + self.featureTable

    # drop and recreate, reusing the column type of `feat`
    dropSql = "DROP TABLE IF EXISTS %s" % meanTable
    mm.execute(self.corpdb, self.dbCursor, dropSql, charset=self.encoding, use_unicode=self.use_unicode)
    columnInfo = mm.executeGetList(self.corpdb, self.dbCursor, "SHOW COLUMNS FROM %s like 'feat'" % self.featureTable)
    featType = columnInfo[0][1]
    createSql = """CREATE TABLE %s (feat %s, mean DOUBLE, std DOUBLE, zero_feat_norm DOUBLE, PRIMARY KEY (`feat`))""" % (meanTable, featType)
    mm.execute(self.corpdb, self.dbCursor, createSql, charset=self.encoding, use_unicode=self.use_unicode)

    fMeans = self.findMeans(field, True, groupNorms)
    meanRows = []
    for featName, stats in fMeans.iteritems():
        meanRows.append((featName, stats[0], stats[1], stats[2]))

    # the %s below are placeholders filled in by executeWriteMany
    insertSql = "INSERT INTO " + meanTable + " (feat, mean, std, zero_feat_norm) VALUES (%s, %s, %s, %s)"
    mm.executeWriteMany(self.corpdb, self.dbCursor, insertSql, meanRows,
                        writeCursor=self.dbConn.cursor(),
                        charset=self.encoding, use_unicode=self.use_unicode)
    return fMeans
def getDistinctOutcomeAndControlValueCounts(self, outcome = None, control = None, includeNull = True, where = ''):
    """returns a nested dict: outcome_value => {control_value => count}

    :param outcome: outcome column; defaults to ``self.outcome_value_fields[0]``
    :param control: control column; defaults to ``self.outcome_controls[0]``
    :param includeNull: when false, rows where outcome or control is NULL
        are excluded
    :param where: optional extra SQL condition
    """
    #TODO: multiple controls
    outcomeCol = outcome if outcome else self.outcome_value_fields[0]
    controlCol = control if control else self.outcome_controls[0]
    query = "SELECT %s, %s, count(*) FROM %s" % (outcomeCol, controlCol, self.outcome_table)

    conditions = []
    if where:
        conditions.append(where)
    if not includeNull:
        conditions.extend(["%s IS NOT NULL" % outcomeCol, "%s IS NOT NULL" % controlCol])
    if conditions:
        query += ' WHERE ' + ' AND '.join(conditions)
    query += ' group by %s, %s ' % (outcomeCol, controlCol)

    countDict = dict()
    rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                             charset=self.encoding, use_unicode=self.use_unicode)
    for (outcomeValue, controlValue, count) in rows:
        countDict.setdefault(outcomeValue, dict())[controlValue] = count
    return countDict
def getDistinctOutcomeAndControlValueCounts(self, outcome = None, control = None, includeNull = True, where = ''):
    """Tally how often each (outcome value, control value) combination occurs.

    The result maps outcome value -> {control value -> row count}, taken from
    self.outcome_table.

    outcome     -- column to group by first (default: self.outcome_value_fields[0])
    control     -- column to group by second (default: self.outcome_controls[0])
    includeNull -- pass False to skip rows with NULL in either column
    where       -- extra raw SQL condition (optional)
    """
    # NOTE(review): an earlier definition with this name appears in the file;
    # if both sit in the same class this one overrides it -- confirm.
    #TODO: multiple controls
    if not outcome:
        outcome = self.outcome_value_fields[0]
    if not control:
        control = self.outcome_controls[0]

    filters = [where] if where else []
    if not includeNull:
        filters.extend("%s IS NOT NULL" % column for column in (outcome, control))

    pieces = ["SELECT %s, %s, count(*) FROM %s"%(outcome, control, self.outcome_table)]
    if filters:
        pieces.append(' WHERE ' + ' AND '.join(filters))
    pieces.append(' group by %s, %s ' % (outcome, control))
    sql = ''.join(pieces)

    countDict = dict()
    for (outcomeVal, controlVal, count) in mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode):
        perControl = countDict.get(outcomeVal)
        if perControl is None:
            perControl = countDict[outcomeVal] = dict()
        perControl[controlVal] = count
    return countDict
def getGroupAndOutcomeValues(self, outcomeField = None, where=''):
    """Fetch (group, outcome value) rows for every group whose outcome is not NULL.

    outcomeField -- outcome column to read; defaults to self.outcome_value_fields[0]
    where        -- optional raw SQL condition ANDed onto the NOT NULL filter

    Returns the result of mm.executeGetList for the query; each row holds
    self.correl_field followed by the outcome column.
    """
    # NOTE(review): another definition with this name appears later in the file;
    # if both sit in the same class the later one takes effect -- confirm.
    field = outcomeField or self.outcome_value_fields[0]
    query = "select %s, %s from `%s` WHERE %s IS NOT NULL"%(self.correl_field, field, self.outcome_table, field)
    if where:
        query = query + ' AND ' + where
    # The positional False is forwarded as-is; its meaning is defined by mm.executeGetList.
    return mm.executeGetList(self.corpdb, self.dbCursor, query, False, charset=self.encoding, use_unicode=self.use_unicode)
def getFeatureZeros(self, where=''):
    """Return (feat, zero_feat_norm) rows from the feature mean table.

    The rows are read from `mean$<featureTable>`, which is the table name
    addFeatTableMeans creates and fills (the previous `mean_` prefix did not
    match any table written by this file).

    where -- optional raw SQL condition restricting the rows
    """
    # NOTE(review): if tables named `mean_<featureTable>` exist from older runs,
    # they are no longer read -- confirm none are still relied upon.
    meanTable = 'mean$' + self.featureTable
    sql = "select feat, zero_feat_norm from %s" % meanTable
    if where:
        sql += ' WHERE ' + where
    return mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
def getDistinctGroupsFromFeatTable(self, where=""):
    """List the distinct group_id values present in the feature table.

    where -- optional raw SQL condition restricting which rows are considered
    """
    query = "select distinct group_id from %s" % self.featureTable
    if where:
        query = query + ' WHERE ' + where
    rows = mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
    # Each row is a one-column result; unwrap it.
    return [row[0] for row in rows]
def getDistinctGroups(self, where=''):
    """List the distinct values of self.correl_field found in the corpus table.

    This queries self.corptable rather than a feature table.

    where -- optional raw SQL condition restricting which rows are considered
    """
    query = """select DISTINCT %s from %s""" %(self.correl_field, self.corptable)
    if where:
        query = query + ' WHERE ' + where
    rows = mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
    # Each row is a one-column result; unwrap it.
    return [row[0] for row in rows]
def ttestWithOtherFG(self, other, maskTable= None, groupFreqThresh = 0):
    """Run a PAIRED t-test, feature by feature, between this feature table and `other`'s.

    other           -- object offering the same feature-table methods as self
    maskTable       -- optional table holding self.correl_field plus one column per
                       mask; a group belongs to a mask when its cell equals 1
    groupFreqThresh -- minimum word count a group needs in BOTH tables; when it is
                       0/None, every group present in both feature tables is used

    Returns {mask_id: {feat: {'t': ..., 'p': ..., 'd': ..., 'N': ...}}}; the
    unmasked run is stored under the key 'no mask'.

    Raises AssertionError when the group field is missing from the mask table,
    when no group survives the filtering, or when the two tables yield
    different features/groups.

    to-do: switch for paired ttest or not
    """
    # NOTE(review): another definition with this name appears later in the file;
    # if both sit in the same class the later one takes effect -- confirm.
    db_opts = dict(charset=self.encoding, use_unicode=self.use_unicode)

    #read mask table and figure out groups for each mask:
    masks = {'no mask': set()}
    if maskTable:
        maskList = mm.getTableColumnNameList(self.corpdb, self.dbCursor, maskTable, **db_opts)
        print(maskList)
        assert self.correl_field in maskList, "group field, %s, not in mask table" % self.correl_field
        maskToIndex = dict((maskName, i) for i, maskName in enumerate(maskList))
        groupIndex = maskToIndex[self.correl_field]

        #get data:
        sql = """SELECT %s FROM %s""" % (', '.join(maskList), maskTable)
        maskData = mm.executeGetList(self.corpdb, self.dbCursor, sql, **db_opts)
        for maskId in maskList:
            if maskId != self.correl_field:
                masks[maskId] = set()
        for row in maskData:
            groupId = row[groupIndex]
            for i, cell in enumerate(row):
                if i != groupIndex and cell == 1:
                    masks[maskList[i]].add(groupId)

    # The candidate groups do not depend on the mask, so they are computed once.
    if groupFreqThresh:
        # get groups passing GFT for BOTH
        print('groupFreqThresh set to '+str(groupFreqThresh))
        groupCnts1 = self.getGroupWordCounts(lexicon_count_table=self.getWordTable(self.featureTable.split('$')[2]))
        threshGroups1 = set(group for group, wordCount in groupCnts1.iteritems() if wordCount >= groupFreqThresh)
        groupCnts2 = other.getGroupWordCounts(lexicon_count_table=other.getWordTable(other.featureTable.split('$')[2]))
        threshGroups2 = set(group for group, wordCount in groupCnts2.iteritems() if wordCount >= groupFreqThresh)
        print(str(len(threshGroups1))+' groups pass groupFreqThresh for feat table 1')
        print(str(len(threshGroups2))+' groups pass groupFreqThresh for feat table 2')
    else:
        # No threshold requested: previously both sets stayed empty and the
        # call could never succeed. Use every group found in each feature table.
        threshGroups1 = set(self.getDistinctGroupsFromFeatTable())
        threshGroups2 = set(other.getDistinctGroupsFromFeatTable())
    groupsInBoth = threshGroups1 & threshGroups2

    #find features (also independent of the mask):
    feats1 = self.getDistinctFeatures()
    feats2 = other.getDistinctFeatures()
    featsInCommon = list(set(feats1) & set(feats2))

    #apply masks
    results = dict() #mask => results
    for mid, mask in masks.iteritems():
        # NOTE(review): an empty mask (including 'no mask') applies no restriction.
        threshGroups = list(groupsInBoth & mask) if mask else list(groupsInBoth)
        print(str(len(threshGroups))+' groups pass groupFreqThresh for BOTH')
        assert len(threshGroups) > 0, "No groups passing frequency threshold"

        ttestResults = dict()
        featYielder1 = self.yieldGroupNormsWithZerosByFeat(groups = threshGroups, feats = featsInCommon)
        featYielder2 = other.yieldGroupNormsWithZerosByFeat(groups = threshGroups, feats = featsInCommon)
        # Both yielders are advanced in lockstep; they must agree on feature and groups.
        for (feat1, dataDict1, Nfeats1) in featYielder1:
            (feat2, dataDict2, Nfeats2) = next(featYielder2)
            assert feat1==feat2, 'feats do not match'
            sortedGroups = sorted(dataDict1)
            assert sortedGroups==sorted(dataDict2), 'groups do not match'
            # Pair the observations by group, in sorted group order.
            gns1 = [dataDict1[group] for group in sortedGroups]
            gns2 = [dataDict2[group] for group in sortedGroups]
            t, p, d = self.pairedTTest(gns1, gns2)
            ttestResults[feat1] = {'t': t, 'p': p, 'd': d, 'N': len(gns1)}
        results[mid] = ttestResults # dict for each feat

    return results
def getSumValuesByFeat(self, where = ''):
    """Fetch (feat, sum(value)) rows from the feature table, one per feature.

    where -- optional raw SQL condition applied before grouping
    """
    clauses = ["""SELECT feat, sum(value) FROM %s """ % self.featureTable]
    if where:
        clauses.append(' WHERE ' + where)
    clauses.append(""" GROUP BY feat """)
    return mm.executeGetList(self.corpdb, self.dbCursor, ''.join(clauses), charset=self.encoding, use_unicode=self.use_unicode)
def getGroupAndOutcomeValues(self, outcomeField = None, where=''):
    """Read the group field and one outcome column for all rows with a non-NULL outcome.

    outcomeField -- name of the outcome column (default: self.outcome_value_fields[0])
    where        -- optional raw SQL condition, appended with AND

    Returns what mm.executeGetList produces for the query; each row carries
    self.correl_field and then the outcome column.
    """
    # NOTE(review): an earlier definition with this name appears in the file;
    # if both sit in the same class this one overrides it -- confirm.
    if not outcomeField:
        outcomeField = self.outcome_value_fields[0]
    sqlParts = ["select %s, %s from `%s` WHERE %s IS NOT NULL"%(self.correl_field, outcomeField, self.outcome_table, outcomeField)]
    if where:
        sqlParts.append(where)
    sql = ' AND '.join(sqlParts)
    # The positional False is forwarded as-is; its meaning is defined by mm.executeGetList.
    return mm.executeGetList(self.corpdb, self.dbCursor, sql, False, charset=self.encoding, use_unicode=self.use_unicode)
def getSumValue(self, where = ''):
    """Return sum(value) over the feature table as a single scalar.

    where -- optional raw SQL condition restricting the summed rows
    """
    query = """select sum(value) from %s"""%(self.featureTable)
    if where:
        query = query + ' WHERE ' + where
    rows = mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
    # The aggregate comes back as one row with one column.
    firstRow = rows[0]
    return firstRow[0]
def getFeatureValueSums(self, where = ''):
    """Return (feat, sum(value)) rows from the feature table, one per feature.

    where -- optional raw SQL condition; it is placed before GROUP BY (it used
             to be appended after it, which is not valid SQL)
    """
    sql = """select feat, sum(value) from %s"""%(self.featureTable)
    if where:
        sql += ' WHERE ' + where
    # GROUP BY must follow the WHERE clause.
    sql += ' group by feat'
    return mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
def getFeatAll(self, where = ''):
    """Fetch (group_id, feat, value, group_norm) rows from the feature table.

    where -- optional raw SQL condition restricting the rows
    """
    columns = 'group_id, feat, value, group_norm'
    query = 'select ' + columns + ' from %s' % (self.featureTable)
    if where:
        query = query + ' WHERE ' + where
    return mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
def getGroupAndFeatureValues(self, featName=None, where=''):
    """Fetch (group_id, group_norm) rows of the feature table for a single feature.

    featName -- feature to select; defaults to self.featNames[0]
    where    -- optional raw SQL condition, appended with AND
    """
    feature = featName or self.featNames[0]
    # NOTE(review): the feature name is interpolated without escaping, so a
    # name containing a single quote would presumably break the query -- confirm.
    query = "select group_id, group_norm from %s WHERE feat = '%s'"%(self.featureTable, feature)
    if where:
        query = query + ' AND ' + where
    # The positional False is forwarded as-is; its meaning is defined by mm.executeGetList.
    return mm.executeGetList(self.corpdb, self.dbCursor, query, False, charset=self.encoding, use_unicode=self.use_unicode)
def ttestWithOtherFG(self, other, maskTable=None, groupFreqThresh=0):
    """Run a PAIRED t-test, feature by feature, between this feature table and `other`'s.

    other           -- object offering the same feature-table methods as self
    maskTable       -- optional table holding self.correl_field plus one column per
                       mask; a group belongs to a mask when its cell equals 1
    groupFreqThresh -- minimum word count a group needs in BOTH tables; when it is
                       0/None, every group present in both feature tables is used

    Returns {mask_id: {feat: {'t': ..., 'p': ..., 'd': ..., 'N': ...}}}; the
    unmasked run is stored under the key 'no mask'.

    Raises AssertionError when the group field is missing from the mask table,
    when no group survives the filtering, or when the two tables yield
    different features/groups.

    to-do: switch for paired ttest or not
    """
    # NOTE(review): an earlier definition with this name appears in the file;
    # if both sit in the same class this one overrides it -- confirm.
    db_opts = dict(charset=self.encoding, use_unicode=self.use_unicode)

    #read mask table and figure out groups for each mask:
    masks = {'no mask': set()}
    if maskTable:
        maskList = mm.getTableColumnNameList(
            self.corpdb, self.dbCursor, maskTable, **db_opts)
        print(maskList)
        assert self.correl_field in maskList, "group field, %s, not in mask table" % self.correl_field
        maskToIndex = dict((maskName, i) for i, maskName in enumerate(maskList))
        groupIndex = maskToIndex[self.correl_field]

        #get data:
        sql = """SELECT %s FROM %s""" % (', '.join(maskList), maskTable)
        maskData = mm.executeGetList(self.corpdb, self.dbCursor, sql, **db_opts)
        for maskId in maskList:
            if maskId != self.correl_field:
                masks[maskId] = set()
        for row in maskData:
            groupId = row[groupIndex]
            for i, cell in enumerate(row):
                if i != groupIndex and cell == 1:
                    masks[maskList[i]].add(groupId)

    # The candidate groups do not depend on the mask, so they are computed once.
    if groupFreqThresh:
        # get groups passing GFT for BOTH
        print('groupFreqThresh set to ' + str(groupFreqThresh))
        groupCnts1 = self.getGroupWordCounts(
            lexicon_count_table=self.getWordTable(
                self.featureTable.split('$')[2]))
        threshGroups1 = set(
            group for group, wordCount in groupCnts1.iteritems()
            if wordCount >= groupFreqThresh)
        groupCnts2 = other.getGroupWordCounts(
            lexicon_count_table=other.getWordTable(
                other.featureTable.split('$')[2]))
        threshGroups2 = set(
            group for group, wordCount in groupCnts2.iteritems()
            if wordCount >= groupFreqThresh)
        print(str(len(threshGroups1)) +
              ' groups pass groupFreqThresh for feat table 1')
        print(str(len(threshGroups2)) +
              ' groups pass groupFreqThresh for feat table 2')
    else:
        # No threshold requested: previously both sets stayed empty and the
        # call could never succeed. Use every group found in each feature table.
        threshGroups1 = set(self.getDistinctGroupsFromFeatTable())
        threshGroups2 = set(other.getDistinctGroupsFromFeatTable())
    groupsInBoth = threshGroups1 & threshGroups2

    #find features (also independent of the mask):
    feats1 = self.getDistinctFeatures()
    feats2 = other.getDistinctFeatures()
    featsInCommon = list(set(feats1) & set(feats2))

    #apply masks
    results = dict()  #mask => results
    for mid, mask in masks.iteritems():
        # NOTE(review): an empty mask (including 'no mask') applies no restriction.
        threshGroups = list(groupsInBoth & mask) if mask else list(groupsInBoth)
        print(str(len(threshGroups)) + ' groups pass groupFreqThresh for BOTH')
        assert len(threshGroups) > 0, "No groups passing frequency threshold"

        ttestResults = dict()
        featYielder1 = self.yieldGroupNormsWithZerosByFeat(
            groups=threshGroups, feats=featsInCommon)
        featYielder2 = other.yieldGroupNormsWithZerosByFeat(
            groups=threshGroups, feats=featsInCommon)
        # Both yielders are advanced in lockstep; they must agree on feature and groups.
        for (feat1, dataDict1, Nfeats1) in featYielder1:
            (feat2, dataDict2, Nfeats2) = next(featYielder2)
            assert feat1 == feat2, 'feats do not match'
            sortedGroups = sorted(dataDict1)
            assert sortedGroups == sorted(dataDict2), 'groups do not match'
            # Pair the observations by group, in sorted group order.
            gns1 = [dataDict1[group] for group in sortedGroups]
            gns2 = [dataDict2[group] for group in sortedGroups]
            t, p, d = self.pairedTTest(gns1, gns2)
            ttestResults[feat1] = {'t': t, 'p': p, 'd': d, 'N': len(gns1)}
        results[mid] = ttestResults  # dict for each feat

    return results
def createTableWithBinnedFeats(self, num_bins, group_id_range, groupfreqthresh, valueFunc = lambda x:x, gender=None, genderattack=False, reporting_percent=0.04, outcomeTable = fwc.DEF_OUTCOME_TABLE, skip_binning=False):
    """Aggregate the current feature table into bins of consecutive group_ids.

    Consecutive group_ids are merged until their summed N reaches
    total_N / (num_bins + 2); each bin becomes one group labelled
    "<lower>_<upper>" in a new table named
    `<featureTable>$<num_bins>b_<group_id_range joined by '_'>`. The new table
    is created LIKE the feature table and gains the columns bin_center,
    bin_center_w (N-weighted center) and bin_width. For every bin and feature
    it stores the summed value, the N-weighted mean group_norm and the pooled
    standard deviation.

    num_bins          -- requested number of bins (the number produced can differ)
    group_id_range    -- values converted to int and used only in the table name
    reporting_percent -- handed to fwc._getReportingInt for progress reporting
    skip_binning      -- when true, only return the table name; nothing is written
    groupfreqthresh, valueFunc, gender, genderattack, outcomeTable
                      -- accepted for interface compatibility; not used here

    Returns the name of the new table. group_ids must be integers because
    they are expanded with range().
    """
    db_opts = dict(charset=self.encoding, use_unicode=self.use_unicode)
    featureTable = self.featureTable
    group_id_range = [int(g) for g in group_id_range]
    newTable = featureTable+'$'+str(num_bins)+'b_'+'_'.join(map(str,group_id_range))
    if skip_binning:
        return newTable

    mm.execute(self.corpdb, self.dbCursor, 'DROP TABLE IF EXISTS %s'%newTable, **db_opts)
    mm.execute(self.corpdb, self.dbCursor, "CREATE TABLE %s like %s" % (newTable, featureTable), **db_opts)

    # N per group, read from the feature table itself. The binning below walks
    # the groups in ascending order, so the order is requested explicitly
    # instead of relying on GROUP BY to sort.
    groupNs = mm.executeGetList(self.corpdb, self.dbCursor, 'SELECT group_id, N FROM %s GROUP BY group_id ORDER BY group_id'%featureTable, **db_opts)
    groupIdToN = dict(groupNs)
    total_freq = sum(n for _, n in groupNs)
    bin_size = float(total_freq) / float(num_bins+2)
    num_groups = len(groupNs)
    reporting_int = fwc._getReportingInt(reporting_percent, num_groups)

    # figure out the bins, i.e. if group_id's 1,2,3 total value is greater than "bin_size" our first bin is 1_3.
    fwc.warn('determining the number of bins...')
    current_sum = 0
    current_lower_group = groupNs[0][0]
    current_upper_group = None
    next_group_is_lower_group = False
    bin_groups = OrderedDict()
    gg = 0
    for group, value in groupNs:
        if next_group_is_lower_group:
            current_lower_group = group
            next_group_is_lower_group = False
        current_sum += value
        current_upper_group = group
        if current_sum >= bin_size:
            current_sum = 0
            bin_groups[(current_lower_group, current_upper_group)] = '_'.join(map(str,[current_lower_group, current_upper_group]))
            next_group_is_lower_group = True
        gg += 1
        fwc._report('group_id\'s', gg, reporting_int, num_groups)
    # Close the trailing (possibly partial) bin. If the last group already
    # closed a bin, this rewrites the same key with the same label.
    bin_groups[(current_lower_group, current_upper_group)] = '_'.join(map(str,[current_lower_group, current_upper_group]))

    max_label_length = max(map(len, bin_groups.values()))
    alterations = [
        'ALTER TABLE %s MODIFY COLUMN group_id VARCHAR(%d)'%(newTable, max_label_length), #this action preserves the index
        'ALTER TABLE %s ADD COLUMN `bin_center` float(6) not null default -1.0'%(newTable),
        'ALTER TABLE %s ADD COLUMN `bin_center_w` float(6) not null default -1.0'%(newTable),
        'ALTER TABLE %s ADD COLUMN `bin_width` int(10) not null default -1'%(newTable),
    ]
    for sql in alterations:
        mm.execute(self.corpdb, self.dbCursor, sql, **db_opts)
    mm.disableTableKeys(self.corpdb, self.dbCursor, newTable, **db_opts)

    # for each newly denoted bin: e.g. 1_3, 4_5, 6_6, ... get the new feature value counts / group norms; insert them into the new table
    # e.g. 1 'hi' 5, 2 'hi' 10, 3 'hi' 30 ==> 1_3 'hi' 45 (of course include group_norm also)
    fwc.warn('aggreagating the newly binned feature values / group_norms into the new table...')
    isql = 'INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N, bin_center, bin_center_w, bin_width) VALUES (%s)'%(newTable, '%s, %s, %s, %s, %s, %s, %s, %s, %s')
    ii_bins = 0
    num_bins = len(bin_groups.keys())
    reporting_int = fwc._getReportingInt(reporting_percent, num_bins)
    for (lower_group, upper_group), label in bin_groups.iteritems():
        # Bin geometry: plain center, N-weighted center, and number of ids covered.
        bin_N_sum = 0
        bin_width = 0
        bin_center = sum((lower_group, upper_group)) / 2.0
        bin_center_w = 0
        for ii in range(lower_group, upper_group+1):
            groupN = groupIdToN.get(ii, 0)
            bin_width += 1
            bin_N_sum += groupN
            bin_center_w += groupN * ii
        bin_center_w = float(bin_center_w) / float(bin_N_sum)

        sql = 'SELECT group_id, feat, value, group_norm, std_dev FROM %s where group_id >= %d AND group_id <= %d'%(featureTable, lower_group, upper_group)
        groupFeatValueNorm = mm.executeGetList(self.corpdb, self.dbCursor, sql, **db_opts)

        # Normalise the feature name ONCE so that both passes below use the
        # same dictionary keys (the second pass used to skip the lowercasing
        # and failed with KeyError on mixed-case features).
        binRows = []
        for group_id, feat, value, norm, sd in groupFeatValueNorm:
            if fwc.LOWERCASE_ONLY:
                feat = str(feat).lower()
            binRows.append((groupIdToN[group_id], feat, value, norm, sd))

        # Pass 1: summed value and N-weighted summed norm per feature.
        featToValue = {}
        featToSummedNorm = {}
        for currentN, feat, value, norm, sd in binRows:
            if feat in featToValue:
                featToValue[feat] += value
                featToSummedNorm[feat] += norm * currentN
            else:
                featToValue[feat] = value
                featToSummedNorm[feat] = norm * currentN

        # Pass 2: mean norm and pooled variance (between-group + within-group).
        featToMeanNorm = {}
        featToSummedVar = {}
        for currentN, feat, _, norm, sd in binRows:
            meanNorm = featToSummedNorm[feat]/bin_N_sum
            contribution = currentN*((meanNorm - norm)**2 + (sd*sd))
            if feat in featToSummedVar:
                featToSummedVar[feat] += contribution
            else:
                featToSummedVar[feat] = contribution
            featToMeanNorm[feat] = meanNorm

        current_batch = [
            (label, feat, value, featToMeanNorm[feat], sqrt(featToSummedVar[feat] / bin_N_sum), bin_N_sum, bin_center, bin_center_w, bin_width)
            for feat, value in featToValue.iteritems()
        ]
        mm.executeWriteMany(self.corpdb, self.dbCursor, isql, current_batch, writeCursor=self.dbConn.cursor(), **db_opts)
        ii_bins += 1
        fwc._report('group_id bins', ii_bins, reporting_int, num_bins)

    mm.enableTableKeys(self.corpdb, self.dbCursor, newTable, **db_opts)
    fwc.warn('Done creating new group_id-binned feature table.')

    outputdata = mm.executeGetList(self.corpdb, self.dbCursor, 'select group_id, N from `%s` group by group_id'%(newTable,), **db_opts)
    pprint(outputdata)
    return newTable