def yieldValuesWithZerosByGroup(self, groups = [], where = '', allFeats = None):
    """Yield (group_id, feature_values) per group, zero-filling missing features.

    :param groups: group ids to restrict to; when empty, the groups returned by
        getDistinctGroups(where) are used. The default list is never mutated.
    :param where: optional SQL condition (without the WHERE keyword).
    :param allFeats: features every yielded dict must contain; when empty,
        getDistinctFeatures(where) is used.
    :returns: generator of (group_id, dict of feat => value); a feature with no
        stored value for the group is reported as 0.
    """
    if groups:
        gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
        valuelist = self.getValues((where + " AND " + gCond) if where else gCond)
    elif where:
        # The condition used to be dropped on this path, so rows outside
        # `where` leaked into the yielded dicts.
        valuelist = self.getValues(where)
    else:
        valuelist = self.getValues()

    # group_id => feat => value for the rows that actually exist
    values = dict()
    for (gid, feat, value) in valuelist:
        values.setdefault(gid, dict())[feat] = value

    if not groups:
        groups = self.getDistinctGroups(where)
    if not allFeats:
        allFeats = self.getDistinctFeatures(where)

    #fill in zeros (this can get quite big!)
    fwc.warn("Yielding values with zeros for %d groups * %d feats." %(len(groups), len(allFeats)))
    for gid in groups:
        # copy so the zero fill never touches the shared `values` mapping
        thisValues = dict(values.get(gid, {}))
        for feat in allFeats:
            thisValues.setdefault(feat, 0)
        yield (gid, thisValues)
def getFeatNormsWithZeros(self, groups=[], where=''):
    """Return feat_norms for every (group, feature) pair, filling the gaps.

    :param groups: group ids to restrict to; when empty, the groups returned by
        getDistinctGroups(where) are used. The default list is never mutated.
    :param where: optional SQL condition (without the WHERE keyword).
    :returns: tuple (fns, feats) where fns maps group_id => feat => feat_norm
        and feats is ``meanData.keys()``. A missing (group, feat) pair gets
        element [2] of that feature's getFeatMeanData() entry.
    """
    if groups:
        gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
        # Was assigned to a misspelled `gnlist`, which left fnlist empty
        # whenever both `where` and `groups` were given.
        fnlist = self.getFeatNorms((where + " AND " + gCond) if where else gCond)
    elif where:
        # The condition used to be dropped on this path.
        fnlist = self.getFeatNorms(where)
    else:
        fnlist = self.getFeatNorms()

    fns = dict()
    for (gid, feat, fn) in fnlist:
        fns.setdefault(gid, dict())[feat] = float(fn)

    if not groups:
        groups = self.getDistinctGroups(where)

    # Must be loaded before the warning below reads it (it used to be
    # referenced first, raising UnboundLocalError on every call).
    meanData = self.getFeatMeanData() # feat : (mean, std, zero_mean)

    #fill in zeros (this can get quite big!)
    fwc.warn("Adding zeros to feat norms (%d groups * %d feats)." % (len(groups), len(meanData)))
    for gid in groups:
        groupFns = fns.setdefault(gid, dict())
        for feat in meanData:
            if feat not in groupFns:
                groupFns[feat] = meanData[feat][2]
    return fns, meanData.keys()
def getGroupNormsWithZeros(self, groups=[], where=''):
    """Return group_norms for every (group, feature) pair, using 0 for gaps.

    :param groups: group ids to restrict to; when empty, the groups returned by
        getDistinctGroups(where) are used. The default list is never mutated.
    :param where: optional SQL condition (without the WHERE keyword).
    :returns: tuple (gns, allFeats) where gns maps
        group_id => feat => group_norm and allFeats is the result of
        getDistinctFeatures(where).
    """
    #This function gets killed on large feature sets
    if groups:
        gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
        gnlist = self.getGroupNorms((where + " AND " + gCond) if where else gCond)
    elif where:
        # The condition used to be dropped on this path, so rows outside
        # `where` leaked into the result.
        gnlist = self.getGroupNorms(where)
    else:
        gnlist = self.getGroupNorms()

    gns = dict()
    for (gid, feat, gn) in gnlist:
        gns.setdefault(gid, dict())[feat] = gn

    if not groups:
        groups = self.getDistinctGroups(where)
    allFeats = self.getDistinctFeatures(where)

    #fill in zeros (this can get quite big!)
    fwc.warn("Adding zeros to group norms (%d groups * %d feats)." % (len(groups), len(allFeats)))
    for gid in groups:
        groupGns = gns.setdefault(gid, dict())
        for feat in allFeats:
            if feat not in groupGns:
                groupGns[feat] = 0
    return gns, allFeats
def getCollocsWithPMI(self):
    '''
    Calculates PMI for every feature of self.featureTable that tokenizes
    into more than one word.

    :inputs: self.featureTable
    :returns: a dict of colloc => [colloc, freq, pmi, num_tokens, pmi_threshold_val]

    **pmi_threshold_val is pmi/(num_tokens-1), thats what --feat_colloc_filter is based on
    '''
    fwc.warn(self.featureTable)
    wordGetter = self.getWordGetter()
    tokenizer = Tokenizer(use_unicode=self.use_unicode)

    jointFreqs = self.getSumValuesByFeat()
    wordFreqs = dict(wordGetter.getSumValuesByFeat())
    allFreqs = wordGetter.getSumValue()

    collocPMIs = {}
    print("len(jointFreqs): " + str(len(jointFreqs)))
    for count, (colloc, freq) in enumerate(jointFreqs, 1):
        if count % 50000 == 0:
            print("calculating pmi for {}th feature".format(count))
        # Words are cut to VARCHAR_WORD_LENGTH before the frequency lookup.
        words = [word[:fwc.VARCHAR_WORD_LENGTH] for word in tokenizer.tokenize(colloc)]
        if len(words) > 1:
            # words missing from the word table are left out of the PMI input
            indFreqs = [wordFreqs[w] for w in words if w in wordFreqs]
            pmi = FeatureRefiner.pmi(freq, indFreqs, allFreqs, words = words)
            collocPMIs[colloc] = [colloc, freq, pmi, len(words), pmi/(len(words)-1)]
    return collocPMIs
def getFeatValuesAndGNs(feat):
    """Return (values, group_norms) dicts keyed by group_id for one feature.

    Reads these names from the enclosing scope: gns, vals, values, gCond,
    where, getGroupNormsForFeat and fwc.

    :param feat: feature to look up.
    :returns: (valDict, gnDict); valDict is None unless `values` is true. If
        `gns` is populated but has no entry for `feat`, (None, {}) is returned
        after a warning.
    """
    if gns:
        # everything was preloaded: hand out copies so callers can add zeros
        try:
            if values:
                return (vals[feat].copy(), gns[feat].copy())
            return (None, gns[feat].copy())
        except KeyError:
            fwc.warn(
                "Couldn't find gns for feat: %s (group_freq_thresh may be too high)"
                % feat)
            return (None, dict())

    #must query for feat
    if gCond:
        gnlist = getGroupNormsForFeat(feat, (where + " AND " + gCond) if where else gCond)
    elif where:
        gnlist = getGroupNormsForFeat(feat, where)
    else:
        # Uses the same getter as the branches above. This used to call
        # self.getGroupNormsForFeat directly, which bypassed the getter the
        # other branches use and broke the 3-tuple unpack below.
        gnlist = getGroupNormsForFeat(feat)

    valDict = None
    if values:
        gnDict = dict([(g, float(gn)) for g, _, gn in gnlist])
        valDict = dict([(g, float(v)) for g, v, _ in gnlist])
    else:
        gnDict = dict([(g, float(gn)) for g, gn in gnlist])
    return (valDict, gnDict)
def createCollocRefinedFeatTable(self, threshold = 3.0, featNormTable=False):
    """Create a feature table without the multi-word features whose PMI is too low.

    A feature that tokenizes into a single word is always kept. A multi-word
    feature is kept only when its PMI exceeds (number of words - 1) * threshold.

    :param threshold: per-extra-word PMI cut-off.
    :param featNormTable: forwarded to createNewTableWithGivenFeats.
    :returns: whatever createNewTableWithGivenFeats returns for the kept set.
    """
    fwc.warn(self.featureTable)
    unigramGetter = self.getWordGetter()
    tok = Tokenizer(use_unicode=self.use_unicode)

    collocFreqs = self.getSumValuesByFeat()
    unigramFreqs = dict(unigramGetter.getSumValuesByFeat())
    totalFreq = unigramGetter.getSumValue()

    kept = set()
    for (colloc, freq) in collocFreqs:
        # If words got truncated in the creation of 1grams, we need to account for that
        tokens = [t[:fwc.VARCHAR_WORD_LENGTH] for t in tok.tokenize(colloc)]
        if len(tokens) <= 1:
            kept.add(colloc)
            continue
        knownFreqs = [unigramFreqs[t] for t in tokens if t in unigramFreqs]
        score = FeatureRefiner.pmi(freq, knownFreqs, totalFreq, words = tokens)
        if score > (len(tokens)-1)*threshold:
            kept.add(colloc)

    label = "pmi%s"%str(threshold).replace('.', '_')
    return self.createNewTableWithGivenFeats(kept, label, featNormTable)
def addFeatNorms(self, ReCompute = False):
    """Adds the mean normalization by feature (z-score) for each feature

    :param ReCompute: when false, only rows matching 'feat_norm is null' are
        processed; when true, every row is.
    :returns: True once all updates have been written.
    """
    where = None
    if not ReCompute:
        where = 'feat_norm is null'
    groupNorms = self.getGroupNorms(where = where) #contains group_id, feat, group_norm
    fMeans = self.addFeatTableMeans(groupNorms = groupNorms) #mean, std, zero

    wsql = """UPDATE """+self.featureTable+""" SET feat_norm = %s where group_id = %s AND feat = %s"""

    def writeBatch(rows):
        # one executemany-style write of (feat_norm, group_id, feat) rows
        mm.executeWriteMany(self.corpdb, self.dbCursor, wsql, rows, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)

    featNorms = []
    num_at_time = 2000
    numWritten = 0
    for (group_id, feat, group_norm) in groupNorms:
        if fwc.LOWERCASE_ONLY:
            feat = feat.lower()
        if not feat:
            continue
        mean = fMeans[feat][0]
        stdDev = float(fMeans[feat][1])
        # A std of 0 used to raise ZeroDivisionError; report 0.0 instead of
        # crashing mid-update.
        zscore = (group_norm - mean) / stdDev if stdDev else 0.0
        featNorms.append((zscore, group_id, feat))
        if len(featNorms) >= num_at_time:
            writeBatch(featNorms)
            featNorms = []
            numWritten += num_at_time
            if numWritten % 100000 == 0:
                fwc.warn("%.1fm feature instances updated out of %dm" % ((numWritten/float(1000000)), len(groupNorms)/1000000))

    #write values back in
    if featNorms:
        writeBatch(featNorms)
    return True
def yieldValuesWithZerosByGroup(self, groups=[], where='', allFeats=None):
    """Yield (group_id, feature_values) per group, zero-filling missing features.

    :param groups: group ids to restrict to; when empty, the groups returned by
        getDistinctGroups(where) are used. The default list is never mutated.
    :param where: optional SQL condition (without the WHERE keyword).
    :param allFeats: features every yielded dict must contain; when empty,
        getDistinctFeatures(where) is used.
    :returns: generator of (group_id, dict of feat => value); a feature with no
        stored value for the group is reported as 0.
    """
    if groups:
        gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
        valuelist = self.getValues((where + " AND " + gCond) if where else gCond)
    elif where:
        # The condition used to be dropped on this path, so rows outside
        # `where` leaked into the yielded dicts.
        valuelist = self.getValues(where)
    else:
        valuelist = self.getValues()

    # group_id => feat => value for the rows that actually exist
    values = dict()
    for (gid, feat, value) in valuelist:
        values.setdefault(gid, dict())[feat] = value

    if not groups:
        groups = self.getDistinctGroups(where)
    if not allFeats:
        allFeats = self.getDistinctFeatures(where)

    #fill in zeros (this can get quite big!)
    fwc.warn("Yielding values with zeros for %d groups * %d feats." % (len(groups), len(allFeats)))
    for gid in groups:
        # copy so the zero fill never touches the shared `values` mapping
        thisValues = dict(values.get(gid, {}))
        for feat in allFeats:
            thisValues.setdefault(feat, 0)
        yield (gid, thisValues)
def getGroupNormsWithZerosFeatsFirst(self, groups = [], where = '', blacklist = None):
    """Return group_norms keyed feature-first, using 0 for missing groups.

    :param groups: group ids to restrict to; when empty, the groups returned by
        getDistinctGroups(where) are used. The default list is never mutated.
    :param where: optional SQL condition (without the WHERE keyword).
    :param blacklist: optional iterable of objects with a ``match(feat)``
        method (e.g. compiled regexes); any feature matched by one of them is
        left out of the result.
    :returns: tuple (gns, allFeats) where gns maps
        feat => group_id => group_norm and allFeats lists the features kept.
    """
    #This function gets killed on large feature sets
    if groups:
        gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
        gnlist = self.getGroupNorms((where + " AND " + gCond) if where else gCond)
    elif where:
        # The condition used to be dropped on this path.
        gnlist = self.getGroupNorms(where)
    else:
        gnlist = self.getGroupNorms()

    def isBlacklisted(feat):
        return bool(blacklist) and any(r.match(feat) for r in blacklist)

    gns = dict()
    print("USING BLACKLIST (from getgroupnorms): %s" % str(blacklist))
    for (gid, feat, gn) in gnlist:
        if not isBlacklisted(feat):
            gns.setdefault(feat, dict())[gid] = gn

    if not groups:
        groups = self.getDistinctGroups(where)
    allFeats = self.getDistinctFeatures(where)
    if blacklist:
        # The old set difference compared feature strings with the pattern
        # objects themselves, so it removed nothing and blacklisted features
        # were zero-filled back in below.
        allFeats = [feat for feat in allFeats if not isBlacklisted(feat)]

    #fill in zeros (this can get quite big!)
    fwc.warn("Adding zeros to group norms (%d groups * %d feats)." %(len(groups), len(allFeats)))
    for feat in allFeats:
        thisGn = gns.setdefault(feat, dict())
        for gid in groups:
            if gid not in thisGn:
                thisGn[gid] = 0
    return gns, allFeats
def createTfIdfTable(self, ngram_table):
    '''
    Builds a feature table whose group_norm column holds tf-idf scores.

    For each feature, idf = log(N / dt), where N is the number of distinct
    group_ids in ngram_table and dt is the count reported for that feature by
    self.getFeatureCounts(). Each row's tf is its group_norm in ngram_table,
    and the stored score is tf * idf. The value column is copied unchanged.

    :param ngram_table: source table; its name must start with
        feat$<feat_name>$ because <feat_name> is used to name the new table
    :returns: the name of the new table, as returned by createFeatureTable
    '''
    # grabs feat_name (i.e. 1gram, 1to3gram)
    name_match = re.compile(r'^feat\$([^\$]+)\$').match(ngram_table)
    idf_table = self.createFeatureTable('tf_idf_{}'.format(name_match.group(1)), valueType = 'DOUBLE')

    #getting N
    count_sql = "SELECT COUNT(DISTINCT group_id) FROM %s" % ngram_table
    N = mm.executeGetList(self.corpdb, self.dbCursor, count_sql, charset=self.encoding, use_unicode=self.use_unicode)[0][0]

    feat_counts = self.getFeatureCounts() #tuples of: feat, count (number of groups feature appears with)

    fwc.warn('Inserting idf values into new table')
    select_template = u"""SELECT group_id, value, group_norm from %s WHERE feat = \'%s\'"""
    insert_template = u"INSERT INTO {} (group_id, feat, value, group_norm) VALUES (\'{}\', \'{}\', {}, {});"
    inserted = 0
    for (feat, dt) in feat_counts:
        idf = log(N/float(dt))
        # escaped once per feature and reused for the select and every insert
        safe_feat = mm.MySQLdb.escape_string(feat.encode('utf-8')).decode('utf-8')

        rows = mm.executeGetList(self.corpdb, self.dbCursor, select_template % (ngram_table, safe_feat), charset=self.encoding, use_unicode=self.use_unicode)
        for (group_id, value, tf) in rows:
            row_sql = insert_template.format(idf_table, group_id, safe_feat, value, tf * idf)
            mm.execute(self.corpdb, self.dbCursor, row_sql)
            if inserted % 50000 == 0:
                print('%d tf_idf values inserted!' % (inserted))
            inserted += 1

    fwc.warn('Finished inserting.')
    return idf_table
def getWordGetter(self, lexicon_count_table=None):
    """Build a FeatureGetter that reads from the word table.

    :param lexicon_count_table: table to use instead of self.getWordTable();
        when given, its name is also passed to fwc.warn.
    :returns: a FeatureGetter constructed with this object's connection
        settings and with featureTable and wordTable both set to that table.
    :raises AssertionError: if mm.tableExists reports the table is missing.
    """
    from featureGetter import FeatureGetter

    if lexicon_count_table:
        fwc.warn(lexicon_count_table)
        wordTable = lexicon_count_table
    else:
        wordTable = self.getWordTable()

    tablePresent = mm.tableExists(self.corpdb, self.dbCursor, wordTable)
    assert tablePresent, "Need to create word table to use current functionality: %s" % wordTable

    getterArgs = (
        self.corpdb, self.corptable, self.correl_field, self.mysql_host,
        self.message_field, self.messageid_field, self.encoding,
        self.use_unicode, self.lexicondb,
    )
    return FeatureGetter(*getterArgs, featureTable=wordTable, wordTable=wordTable)
def printJoinedFeatureLines(self, filename, delimeter = ' '):
    """prints feature table like a message table in format mallet can use

    Writes one line per group from self.yieldValuesSparseByGroup(), formatted
    as "<group_id> en <message>". The message repeats each feature name
    `value` times, with spaces inside a feature name replaced by underscores.

    :param filename: path of the file to (over)write.
    :param delimeter: separator placed between the repeated feature names.
    """
    # `with` closes the file even if iteration or encoding raises part-way.
    with open(filename, 'w') as f:
        for (gid, featValues) in self.yieldValuesSparseByGroup():
            message = delimeter.join([delimeter.join([feat.replace(' ', '_')]*value) for feat, value in featValues.iteritems()])
            f.write("""%s en %s\n""" %(gid, message.encode('utf-8')))
    fwc.warn("Wrote joined features file to: %s"%filename)
def getGroupsAndFeats(self, where=''):
    """Load the group => value mapping for each name in self.featNames.

    :param where: forwarded to getGroupAndFeatureValues.
    :returns: tuple (groups, features) where features maps
        featName => dict(getGroupAndFeatureValues(featName, where)) and groups
        is the set of all keys seen in those dicts.
    """
    fwc.warn("Loading Features and Getting Groups.")
    features = dict()
    groups = set()
    for featName in set(self.featNames):
        byGroup = dict(self.getGroupAndFeatureValues(featName, where))
        features[featName] = byGroup
        groups.update(byGroup.keys())
    return (groups, features)
def getGroupsAndFeats(self, where=''):
    """Load the group => value mapping for each name in self.featNames.

    :param where: forwarded to getGroupAndFeatureValues.
    :returns: tuple (groups, features) where features maps
        featName => dict(getGroupAndFeatureValues(featName, where)) and groups
        is the set of all keys seen in those dicts.
    """
    fwc.warn("Loading Features and Getting Groups.")
    features = dict()
    groups = set()
    for featName in set(self.featNames):
        byGroup = dict(self.getGroupAndFeatureValues(featName, where))
        features[featName] = byGroup
        groups.update(byGroup.keys())
    return (groups, features)
def printJoinedFeatureLines(self, filename, delimeter=' '):
    """prints feature table like a message table in format mallet can use

    Writes one line per group from self.yieldValuesSparseByGroup(), formatted
    as "<group_id> en <message>". The message repeats each feature name
    `value` times, with spaces inside a feature name replaced by underscores.

    :param filename: path of the file to (over)write.
    :param delimeter: separator placed between the repeated feature names.
    """
    # `with` closes the file even if iteration or encoding raises part-way.
    with open(filename, 'w') as f:
        for (gid, featValues) in self.yieldValuesSparseByGroup():
            message = delimeter.join([
                delimeter.join([feat.replace(' ', '_')] * value)
                for feat, value in featValues.iteritems()
            ])
            f.write("""%s en %s\n""" % (gid, message.encode('utf-8')))
    fwc.warn("Wrote joined features file to: %s" % filename)
def getAnnotationTableAsDF(self, fields=['unit_id', 'worker_id', 'score'], where='', index=['unit_id', 'worker_id'], pivot=True, fillNA=False):
    """Return the selected columns of self.outcome_table as a dataframe.

    :param fields: column names to select; any number is accepted (the query
        used to be hard-wired to exactly three). The default list is never
        mutated.
    :param where: optional SQL condition (without the WHERE keyword).
    :param index: column(s) passed to pandas as ``index_col``.
    :param pivot: when true, the frame is unstack()ed before returning.
    :param fillNA: when true (and pivot is true), missing cells are set to 0.
    :returns: the pandas DataFrame read from the database.
    """
    if fillNA and not pivot:
        fwc.warn("fillNA set to TRUE but pivot set to FALSE. No missing values will be filled.")
    db_eng = get_db_engine(self.corpdb)
    sql = """SELECT %s from %s""" % (', '.join(fields), self.outcome_table)
    if (where):
        sql += ' WHERE ' + where

    df = pd.read_sql(sql=sql, con=db_eng, index_col=index)
    if not pivot:
        return df
    df = df.unstack()
    if fillNA:
        return df.fillna(value=0)
    return df
def getWordGetter(self, lexicon_count_table=None):
    """Build a FeatureGetter that reads from the word table.

    :param lexicon_count_table: table to use instead of self.getWordTable();
        when given, its name is also passed to fwc.warn.
    :returns: a FeatureGetter constructed with this object's connection
        settings and with featureTable and wordTable both set to that table.
    :raises AssertionError: if mm.tableExists reports the table is missing.
    """
    from featureGetter import FeatureGetter

    if lexicon_count_table:
        fwc.warn(lexicon_count_table)
        wordTable = lexicon_count_table
    else:
        wordTable = self.getWordTable()

    tablePresent = mm.tableExists(self.corpdb, self.dbCursor, wordTable)
    assert tablePresent, "Need to create word table to use current functionality: %s" % wordTable

    getterArgs = (
        self.corpdb, self.corptable, self.correl_field, self.mysql_host,
        self.message_field, self.messageid_field, self.encoding,
        self.use_unicode, self.lexicondb,
    )
    return FeatureGetter(*getterArgs, featureTable=wordTable, wordTable=wordTable)
def _getKeepSet(self, p, minimumFeatSum = 0, groupFreqThresh = 0):
    """Build the set of (lower-cased) features that pass the occurrence filter.

    A feature is kept when its count from getFeatureCounts is at least
    round(p * number of groups). If minimumFeatSum > 1, kept features whose sum
    from getFeatureValueSums is below minimumFeatSum are removed again.

    :param p: fraction of groups a feature must reach; must be <= 1.
    :param minimumFeatSum: secondary cut-off on the feature's value sum.
    :param groupFreqThresh: forwarded to countGroups and getFeatureCounts.
    :returns: set of lower-cased feature names to keep.
    :raises AssertionError: if there are no groups or p > 1.
    """
    #acquire the number of groups (need to base on corp table):
    featureTable = self.featureTable
    totalGroups = self.countGroups(groupFreqThresh)
    assert totalGroups > 0, 'NO GROUPS TO FILTER BASED ON (LIKELY group_freq_thresh IS TOO HIGH)'
    assert p <= 1, 'p_occ > 1 not implemented yet'
    threshold = int(round(p*totalGroups))
    fwc.warn (" %s [threshold: %d]" %(featureTable, threshold))

    def normalized(feat):
        # same lower-casing rule for both filters
        if self.use_unicode:
            return unicode(feat).lower()
        return feat.lower()

    #acquire counts per feature (each row will come from a different correl_field)
    featCounts = self.getFeatureCounts(groupFreqThresh) #tuples of: feat, count (number of groups feature appears with)

    #apply filter:
    toKeep = set()
    for numChecked, (feat, count) in enumerate(featCounts, 1):
        if count >= threshold:
            toKeep.add(normalized(feat))
        if (numChecked % 1000000) == 0:
            print(" checked %d features" % numChecked)

    #apply secondary filter
    if minimumFeatSum > 1:
        for (feat, fsum) in self.getFeatureValueSums():
            feat = normalized(feat)
            if fsum < minimumFeatSum and feat in toKeep:
                toKeep.remove(feat)

    return toKeep
def getGroupNormsWithZerosFeatsFirst(self, groups=[], where='', blacklist=None):
    """Return group_norms keyed feature-first, using 0 for missing groups.

    :param groups: group ids to restrict to; when empty, the groups returned by
        getDistinctGroups(where) are used. The default list is never mutated.
    :param where: optional SQL condition (without the WHERE keyword).
    :param blacklist: optional iterable of objects with a ``match(feat)``
        method (e.g. compiled regexes); any feature matched by one of them is
        left out of the result.
    :returns: tuple (gns, allFeats) where gns maps
        feat => group_id => group_norm and allFeats lists the features kept.
    """
    #This function gets killed on large feature sets
    if groups:
        gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
        gnlist = self.getGroupNorms((where + " AND " + gCond) if where else gCond)
    elif where:
        # The condition used to be dropped on this path.
        gnlist = self.getGroupNorms(where)
    else:
        gnlist = self.getGroupNorms()

    def isBlacklisted(feat):
        return bool(blacklist) and any(r.match(feat) for r in blacklist)

    gns = dict()
    print("USING BLACKLIST (from getgroupnorms): %s" % str(blacklist))
    for (gid, feat, gn) in gnlist:
        if not isBlacklisted(feat):
            gns.setdefault(feat, dict())[gid] = gn

    if not groups:
        groups = self.getDistinctGroups(where)
    allFeats = self.getDistinctFeatures(where)
    if blacklist:
        # The old set difference compared feature strings with the pattern
        # objects themselves, so it removed nothing and blacklisted features
        # were zero-filled back in below.
        allFeats = [feat for feat in allFeats if not isBlacklisted(feat)]

    #fill in zeros (this can get quite big!)
    fwc.warn("Adding zeros to group norms (%d groups * %d feats)." % (len(groups), len(allFeats)))
    for feat in allFeats:
        thisGn = gns.setdefault(feat, dict())
        for gid in groups:
            if gid not in thisGn:
                thisGn[gid] = 0
    return gns, allFeats
def getFeatNormsWithZeros(self, groups = [], where = ''):
    """Return feat_norms for every (group, feature) pair, filling the gaps.

    :param groups: group ids to restrict to; when empty, the groups returned by
        getDistinctGroups(where) are used. The default list is never mutated.
    :param where: optional SQL condition (without the WHERE keyword).
    :returns: tuple (fns, feats) where fns maps group_id => feat => feat_norm
        and feats is ``meanData.keys()``. A missing (group, feat) pair gets
        element [2] of that feature's getFeatMeanData() entry.
    """
    if groups:
        gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
        # Was assigned to a misspelled `gnlist`, which left fnlist empty
        # whenever both `where` and `groups` were given.
        fnlist = self.getFeatNorms((where + " AND " + gCond) if where else gCond)
    elif where:
        # The condition used to be dropped on this path.
        fnlist = self.getFeatNorms(where)
    else:
        fnlist = self.getFeatNorms()

    fns = dict()
    for (gid, feat, fn) in fnlist:
        fns.setdefault(gid, dict())[feat] = float(fn)

    if not groups:
        groups = self.getDistinctGroups(where)

    # Must be loaded before the warning below reads it (it used to be
    # referenced first, raising UnboundLocalError on every call).
    meanData = self.getFeatMeanData() # feat : (mean, std, zero_mean)

    #fill in zeros (this can get quite big!)
    fwc.warn("Adding zeros to feat norms (%d groups * %d feats)." %(len(groups), len(meanData)))
    for gid in groups:
        groupFns = fns.setdefault(gid, dict())
        for feat in meanData:
            if feat not in groupFns:
                groupFns[feat] = meanData[feat][2]
    return fns, meanData.keys()
def getFeatValuesAndGNs(feat):
    """Return (values, group_norms) dicts keyed by group_id for one feature.

    Reads these names from the enclosing scope: gns, vals, values, gCond,
    where, getGroupNormsForFeat and fwc.

    :param feat: feature to look up.
    :returns: (valDict, gnDict); valDict is None unless `values` is true. If
        `gns` is populated but has no entry for `feat`, (None, {}) is returned
        after a warning.
    """
    if gns:
        # everything was preloaded: hand out copies so callers can add zeros
        try:
            if values:
                return (vals[feat].copy(), gns[feat].copy())
            return (None, gns[feat].copy())
        except KeyError:
            fwc.warn("Couldn't find gns for feat: %s (group_freq_thresh may be too high)" % feat)
            return (None, dict())

    #must query for feat
    if gCond:
        gnlist = getGroupNormsForFeat(feat, (where + " AND " + gCond) if where else gCond)
    elif where:
        gnlist = getGroupNormsForFeat(feat, where)
    else:
        # Uses the same getter as the branches above. This used to call
        # self.getGroupNormsForFeat directly, which bypassed the getter the
        # other branches use and broke the 3-tuple unpack below.
        gnlist = getGroupNormsForFeat(feat)

    valDict = None
    if values:
        gnDict = dict([(g, float(gn)) for g, _, gn in gnlist])
        valDict = dict([(g, float(v)) for g, v, _ in gnlist])
    else:
        gnDict = dict([(g, float(gn)) for g, gn in gnlist])
    return (valDict, gnDict)
def getGroupNormsWithZeros(self, groups = [], where = ''):
    """Return group_norms for every (group, feature) pair, using 0 for gaps.

    :param groups: group ids to restrict to; when empty, the groups returned by
        getDistinctGroups(where) are used. The default list is never mutated.
    :param where: optional SQL condition (without the WHERE keyword).
    :returns: tuple (gns, allFeats) where gns maps
        group_id => feat => group_norm and allFeats is the result of
        getDistinctFeatures(where).
    """
    #This function gets killed on large feature sets
    if groups:
        gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
        gnlist = self.getGroupNorms((where + " AND " + gCond) if where else gCond)
    elif where:
        # The condition used to be dropped on this path, so rows outside
        # `where` leaked into the result.
        gnlist = self.getGroupNorms(where)
    else:
        gnlist = self.getGroupNorms()

    gns = dict()
    for (gid, feat, gn) in gnlist:
        gns.setdefault(gid, dict())[feat] = gn

    if not groups:
        groups = self.getDistinctGroups(where)
    allFeats = self.getDistinctFeatures(where)

    #fill in zeros (this can get quite big!)
    fwc.warn("Adding zeros to group norms (%d groups * %d feats)." %(len(groups), len(allFeats)))
    for gid in groups:
        groupGns = gns.setdefault(gid, dict())
        for feat in allFeats:
            if feat not in groupGns:
                groupGns[feat] = 0
    return gns, allFeats
def createNewTableWithGivenFeats(self, toKeep, label, featNorm=False):
    """Creates a new table only containing the given features

    The new table is named ``<self.featureTable>$<label>``, dropped if it
    already exists, created LIKE the current feature table, and filled with
    the rows from getFeatAllSS whose lower-cased feature is in toKeep.
    self.featureTable is switched to the new table afterwards.

    :param toKeep: collection of lower-cased feature names to copy.
    :param label: suffix appended to the current table name.
    :param featNorm: when true, the feat_norm column is copied as well.
    :returns: the name of the new table.
    """
    featureTable = self.featureTable
    numToKeep = len(toKeep)
    newTable = featureTable+'$'+label
    mm.execute(self.corpdb, self.dbCursor, "DROP TABLE IF EXISTS %s" % newTable, charset=self.encoding, use_unicode=self.use_unicode)
    fwc.warn(" %s <new table %s will have %d distinct features.>" %(featureTable, newTable, numToKeep))
    sql = """CREATE TABLE %s like %s""" % (newTable, featureTable)
    mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
    mm.disableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)

    if featNorm:
        wsql = """INSERT INTO """+newTable+""" (group_id, feat, value, group_norm, feat_norm) values (%s, %s, %s, %s, %s)"""
    else:
        wsql = """INSERT INTO """+newTable+""" (group_id, feat, value, group_norm) values (%s, %s, %s, %s)"""

    def writeBatch(rows):
        mm.executeWriteMany(self.corpdb, self.dbCursor, wsql, rows, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)

    num_at_time = 2000
    total = 0
    toWrite = []

    #iterate through each row, deciding whether to keep or not
    for featRow in self.getFeatAllSS(featNorm=featNorm):
        feat = unicode(featRow[1]) if self.use_unicode else featRow[1]
        if feat.lower() in toKeep:
            toWrite.append(featRow)

        # `>=` keeps every batch at exactly num_at_time rows. The old `>`
        # flushed 2001 rows while counting 2000, so the progress total drifted.
        if len(toWrite) >= num_at_time:
            #write those past the filter to the table
            writeBatch(toWrite)
            total += len(toWrite)
            if total % 100000 == 0:
                fwc.warn("%.1fm feature instances written" % (total/float(1000000)))
            toWrite = []

    #catch rest:
    if toWrite:
        writeBatch(toWrite)

    fwc.warn("Done inserting.\nEnabling keys.")
    mm.enableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)
    fwc.warn("done.")

    self.featureTable = newTable
    return newTable
def createAggregateFeatTableByGroup(self, valueFunc = lambda d: d):
    """Re-aggregate the current feature table at the level of self.correl_field.

    The new table is named feat$agg_<first 12 chars of name>$<corptable part>$<correl_field>
    and is created LIKE the current table with group_id widened to
    VARCHAR(255). Values are summed per (correl_field, feat) by joining the
    feature table to self.corptable on the old group field, then group_norm is
    recomputed as value / sum(value) within each group. self.featureTable is
    switched to the new table afterwards.

    :param valueFunc: accepted for interface compatibility; not used here.
    :returns: the name of the new table.
    """
    sourceTable = self.featureTable
    # needs at least four '$'-separated parts: feat, name, corptable, group field
    (_, name, oldCorpTable, oldGroupField) = sourceTable.split('$')[:4]
    newTable = 'feat$agg_'+name[:12]+'$'+oldCorpTable+'$'+self.correl_field

    def run(statement):
        mm.execute(self.corpdb, self.dbCursor, statement, charset=self.encoding, use_unicode=self.use_unicode)

    run("""DROP TABLE IF EXISTS %s""" % (newTable))
    run("""CREATE TABLE %s like %s""" % (newTable, sourceTable))
    run("""ALTER TABLE %s MODIFY group_id VARCHAR(255)""" % (newTable))
    mm.disableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)

    fwc.warn("Inserting group_id, feat, and values")
    run("INSERT INTO %s SELECT m.%s, f.feat, sum(f.value), 0 FROM %s AS f, %s AS m where m.%s = f.group_id GROUP BY m.%s, f.feat" % (newTable,self.correl_field, sourceTable, self.corptable, oldGroupField, self.correl_field))

    fwc.warn("Recalculating group_norms")
    run("UPDATE %s a INNER JOIN (SELECT group_id,sum(value) sum FROM %s GROUP BY group_id) b ON a.group_id=b.group_id SET a.group_norm=a.value/b.sum" % (newTable,newTable))

    # patrick changed this to be all SQL 7/21/15. Values and group norms were being calculated wrong before
    fwc.warn("Done inserting.\nEnabling keys.")
    mm.enableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)
    fwc.warn("done.")

    self.featureTable = newTable
    return newTable
def createFeatTableByDistinctOutcomes(self, outcomeGetter, controlValuesToAvg = [], outcomeRestriction = None, nameSuffix=None ):
    """Creates a new feature table, by combining values based on an outcome, then applies an averaging based on controls

    Step 1 creates one ``feat_grpd...`` table per control value (or a single
    table when there is no control) in which group_id is the outcome value,
    feat_norm is renamed to std_dev and an N column is added. Step 2, only
    when a control is set and more than one table was created, builds an
    ``...avg`` table that averages those tables.

    :param outcomeGetter: object providing outcome_table,
        outcome_value_fields (exactly one is used), outcome_controls (at most
        one) and the getDistinct*/getGroupAndOutcomeValues accessors.
    :param controlValuesToAvg: control values to build tables for; when empty
        and a control is set, all distinct non-null values are used. The
        default list is never mutated.
    :param outcomeRestriction: optional SQL condition on the outcome table.
    :param nameSuffix: optional suffix for the ``feat_grpd`` table prefix.
    :raises AssertionError: if more than one outcome or control is specified.
    """
    ##TODO: perform outcome restriction by using group freq thresh instead of uwt, for flexibility
    featureTable = self.featureTable
    outcomeTable = outcomeGetter.outcome_table
    assert len(outcomeGetter.outcome_value_fields) < 2, 'Currently, only allowed to specify one outcome.'
    outcomeField = outcomeGetter.outcome_value_fields[0]
    controlField = None
    if outcomeGetter.outcome_controls:
        assert len(outcomeGetter.outcome_controls) < 2, 'Currently, only allowed to specify one control.'
        controlField = outcomeGetter.outcome_controls[0]
        if len(controlValuesToAvg) < 1:
            fwc.warn("getting distinct values for controls")
            controlValuesToAvg = outcomeGetter.getDistinctOutcomeValues(outcome = controlField, includeNull = False, where=outcomeRestriction)

    #create new table name:
    nameParts = [part.replace('16to', '').replace('messages', 'msgs') for part in featureTable.split('$')]
    nameSuffix = '' if not nameSuffix else '_%s'%(nameSuffix,)

    def groupedTableName(groupName):
        # feat_grpd<suffix>$<name>$<corptable>$<groupName>$<rest of the source name>
        return 'feat_grpd'+ nameSuffix +'$' + '$'.join(nameParts[1:3]) + '$' + groupName + '$' + '$'.join(nameParts[4:])

    def runSql(sql, *args):
        mm.execute(self.corpdb, self.dbCursor, sql, *args, charset=self.encoding, use_unicode=self.use_unicode)

    if controlField:
        newTables = [groupedTableName(outcomeField + '_' + controlField + '_' + str(value)) for value in controlValuesToAvg]
    else:
        newTables = [groupedTableName(outcomeField)]

    #1. create table where outcome is group_id and insert values
    for newTable in newTables:
        runSql("""DROP TABLE IF EXISTS %s""" % (newTable))
        runSql("create table %s like %s" % (newTable, featureTable))
        runSql('ALTER TABLE %s ADD COLUMN `N` int(16) not null default -1'%(newTable))
        runSql('ALTER TABLE %s CHANGE feat_norm std_dev FLOAT' % newTable)

    outres = outcomeRestriction
    outres = outres + ' AND ' if outres else '' #only need and if it exists
    if controlField:
        # The outer SELECT names the outcome column via outcomeField. It used
        # to be the literal `age`, which only worked when the outcome was age.
        # NOTE(review): the join still hard-codes b.user_id = a.group_id --
        # confirm whether that should follow self.correl_field.
        groupedSql = (
            "INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N) "
            "SELECT %s, feat, total_freq, mean_rel_freq, "
            "SQRT((N_no_zero*(POW((mean_no_zero - mean_rel_freq), 2) + std_no_zero*std_no_zero) + (N - N_no_zero)*(mean_rel_freq * mean_rel_freq)) / N) as std, N "
            "from (SELECT b.%s, feat, SUM(value) as total_freq, SUM(group_norm)/%d as mean_rel_freq, AVG(group_norm) as mean_no_zero, std(group_norm) as std_no_zero, %d as N, count(*) as N_no_zero "
            "FROM %s AS a, %s AS b WHERE %s b.%s = '%s' AND b.%s = '%s' AND b.user_id = a.group_id group by b.%s, a.feat) as stats"
        )
        valueCounts = outcomeGetter.getDistinctOutcomeAndControlValueCounts(control = controlField, includeNull = False, where=outcomeRestriction)
        for outcomeValue, cntrlcounts in valueCounts.items():
            for cvalue, count in cntrlcounts.items():
                if cvalue in controlValuesToAvg:
                    newTable = groupedTableName(outcomeField + '_' + controlField + '_' + str(cvalue))
                    print("on %s %s and %s %s, count: %d" % (outcomeField, str(outcomeValue), controlField, str(cvalue), count))
                    sql = groupedSql % (newTable, outcomeField, outcomeField, count, count, featureTable, outcomeTable, outres, controlField, str(cvalue), outcomeField, str(outcomeValue), outcomeField)
                    runSql(sql, False)
                else:
                    print("skipping %s %s and %s %s, count: %d because control value not in list" % (outcomeField, str(outcomeValue), controlField, str(cvalue), count))
    else: #no controls to avg
        newTable = newTables[0] # exactly one table exists on this path

        # invert (group, outcome value) pairs into outcome value => [groups]
        groupsByOutcome = {}
        for k, v in outcomeGetter.getGroupAndOutcomeValues():
            groupsByOutcome.setdefault(v, []).append(k)

        total_sum_values = {i[0]: int(i[1]) for i in self.getSumValuesByGroup()}

        insertSql = "INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N) " % newTable
        insertSql += "VALUES (%s)" % ', '.join(['%s'] * 6)

        def writeRows(rows):
            mm.executeWriteMany(self.corpdb, self.dbCursor, insertSql, rows, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)

        i = 0
        j = 0
        for outcomeValue, groups in groupsByOutcome.items():
            i += 1
            rows = []
            groups_nonZero = [g for g in groups if g in total_sum_values]
            # depends only on the groups, so compute it once per outcome value
            total_sum_value = sum(total_sum_values[g] for g in groups_nonZero)

            for feat, values, gns, Nfeats in self.yieldGroupNormsWithZerosByFeat(groups = groups, values = True):
                if not values:
                    continue
                sum_value = sum(values.values())
                group_norm = float(sum_value)/total_sum_value
                std_dev = std(list(gns.values()))
                N = len(gns)

                rows.append([outcomeValue, feat, sum_value, group_norm, std_dev, N])
                if len(rows) >= 10000:
                    writeRows(rows)
                    j += len(rows)
                    print(" wrote %d rows [finished %d outcome_values]" % (j, i))
                    rows = []
            if rows:
                writeRows(rows)
                j += len(rows)
                print(" wrote %d rows [finished %d outcome_values]" % (j, i))
        print("Inserted into %s" % newTable)

    #2: Combine feature table to take average of controls:
    if controlField and len(newTables) > 1:
        controlGroupAvgName = outcomeField + '_' + controlField + 'avg'
        avgTable = groupedTableName(controlGroupAvgName)
        runSql("""DROP TABLE IF EXISTS %s""" % (avgTable))
        runSql("create table %s like %s" % (avgTable, newTables[0]))

        #create insert fields:
        numTables = len(newTables)
        shortNames = [chr(ord('a')+i) for i in range(numTables)] # aliases a, b, c, ...
        tableNames = ', '.join("%s as %s" % (newTables[i], shortNames[i]) for i in range(numTables))
        values = "(%s)" % (' + '.join("%s.value" % (name) for name in shortNames)) + ' / ' + str(numTables)
        groupNorms = "(%s)" % (' + '.join("%s.group_norm" % (name) for name in shortNames)) + ' / ' + str(numTables)
        Ns = ' + '.join("%s.N" % (name) for name in shortNames)
        stdDev = "(%s)" % (' + '.join("POW(%s.group_norm - (%s), 2) + POW(%s.std_dev, 2)" % (name, groupNorms, name) for name in shortNames)) + ' / ' + str(numTables)
        stdDev = "SQRT(%s)" % stdDev

        #create joins (each alias is chained to the next on group_id and feat)
        groupIdJoins = ' AND '.join('%s.group_id = %s.group_id' % (shortNames[i], shortNames[i+1]) for i in range(numTables - 1))
        featJoins = ' AND '.join('%s.feat = %s.feat' % (shortNames[i], shortNames[i+1]) for i in range(numTables - 1))

        #call SQL
        sql = "INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N) SELECT a.group_id, a.feat, %s, %s, %s, %s FROM %s where %s AND %s" % \
              (avgTable, values, groupNorms, stdDev, Ns, tableNames, groupIdJoins, featJoins)
        print("Populating AVG table with command: %s" % sql)
        runSql(sql, False)
def yieldGroupNormsWithZerosByFeat(self, groups=[], where='', values=False, feats=[]):
    """Yield zero-filled group norms, one feature at a time.

    Yields ``(feat, groupnorms, number of features)`` or, if ``values`` is
    true, ``(feat, values, groupnorms, number of features)``. ``groupnorms``
    (and ``values``) are dicts keyed by group_id; every group in ``groups``
    that has no row for the feature is added with a 0.

    groups -- group_ids to report on; when empty, ``getDistinctGroups(where)``
              is used and no group condition is added to the queries.
    where  -- extra SQL condition, combined with the group condition via AND.
    values -- when true, the ``getValuesAndGroupNorms*`` getters are used and
              a values dict is yielded next to the group norms.
    feats  -- features to yield; when empty, ``getDistinctFeatures(where)``.

    When ``numFeats * numGroups`` is below ``12500000 * fwc.GIGS_OF_MEMORY``
    all rows are loaded with a single query; otherwise (or when that query
    returned nothing) each feature is queried separately.
    """
    allFeats = feats
    if not feats:
        allFeats = self.getDistinctFeatures(where)
    else:
        fwc.warn("feats restricted to %s" % feats)
    numFeats = len(allFeats)

    gCond = None
    if groups:
        gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
    else:
        groups = self.getDistinctGroups(where)
    numGroups = len(groups)

    # Choose the getters once so the bulk path and the per-feature path
    # always agree on the row shape (with or without the value column).
    if values:
        getGroupNorms = self.getValuesAndGroupNorms
        getGroupNormsForFeat = self.getValuesAndGroupNormsForFeat
    else:
        getGroupNorms = self.getGroupNorms
        getGroupNormsForFeat = self.getGroupNormsForFeat

    #figure out if too big for memory:
    fwc.warn("Yielding norms with zeros (%d groups * %d feats)." % (len(groups), numFeats))
    gns = dict()
    vals = dict()  # only populated when values is true
    if (numFeats * numGroups) < 12500000 * fwc.GIGS_OF_MEMORY:
        #statically acquire all gns
        if gCond:
            if where:
                gnlist = getGroupNorms(where + " AND " + gCond)
            else:
                gnlist = getGroupNorms(gCond)
        else:
            #don't need to specify groups
            gnlist = getGroupNorms()
        if feats:
            # NOTE: this runs after the bulk query, so it only affects the
            # per-feature queries issued by getFeatValuesAndGNs below.
            featCond = "feat IN ('" + "','".join(feats) + "')"
            if where:
                where = " AND ".join([where, featCond])
            else:
                where = " " + featCond
        for tup in gnlist:
            (gid, feat) = tup[0:2]
            if feat not in gns:
                gns[feat] = dict()
                if values:
                    vals[feat] = dict()
            gns[feat][gid] = float(tup[-1])  # group norm is the last column
            if values:
                vals[feat][gid] = float(tup[2])
    else:
        fwc.warn("Too big to keep gns in memory, querying for each feature (slower, but less memory intensive)")

    def getFeatValuesAndGNs(feat):
        """Return (valDict or None, gnDict) for one feature, without zeros."""
        if gns:
            try:
                if values:
                    return (vals[feat].copy(), gns[feat].copy())
                return (None, gns[feat].copy())
            except KeyError:
                fwc.warn("Couldn't find gns for feat: %s (group_freq_thresh may be too high)" % feat)
                # Hand back an (empty) values dict when values were asked
                # for, so the caller gets zero-filled values instead of None.
                return (dict() if values else None, dict())
        #must query for feat
        if gCond:
            if where:
                featRows = getGroupNormsForFeat(feat, where + " AND " + gCond)
            else:
                featRows = getGroupNormsForFeat(feat, gCond)
        else:
            # Use the getter selected above: the plain group-norm getter
            # returns rows without the value column, which broke values=True.
            featRows = getGroupNormsForFeat(feat)
        if values:
            valDict = dict()
            gnDict = dict()
            for g, v, gn in featRows:
                gnDict[g] = float(gn)
                valDict[g] = float(v)
            return (valDict, gnDict)
        return (None, dict((g, float(gn)) for g, gn in featRows))

    #fill in zeros (this can get quite big!)
    for feat in allFeats:
        (valDict, gnDict) = getFeatValuesAndGNs(feat)
        for gid in groups:
            if gid not in gnDict:
                gnDict[gid] = 0
                # "is not None" rather than truthiness: an empty values dict
                # still needs its zeros.
                if valDict is not None:
                    valDict[gid] = 0
        if values:
            yield (feat, valDict, gnDict, numFeats)
        else:
            yield (feat, gnDict, numFeats)
def getGroupsAndOutcomes(self, lexicon_count_table=None, groupsWhere = '', includeFoldLabels=False):
    """Collect the usable group_ids plus their outcome and control values.

    Outcome values are loaded with ``getGroupAndOutcomeValues`` for every field
    in ``outcome_value_fields``, ``outcome_controls`` and
    ``outcome_interaction``. When ``self.group_freq_thresh`` is set, groups
    whose count from ``getGroupWordCounts`` is below it are dropped. Groups
    are always intersected with the control/interaction fields, with the
    groups matching ``groupsWhere`` (if given), and - when
    ``self.oneGroupSetForAllOutcomes`` is set - with every outcome field.

    Returns ``(groups, ocs, controls)`` or, when ``includeFoldLabels`` is
    true, ``(groups, ocs, controls, folds)``. ``ocs`` and ``controls`` map
    field name -> {group_id: value}; ``folds`` is only filled when
    ``self.fold_column`` is set.
    """
    if self.group_freq_thresh and self.wordTable != self.get1gramTable():
        fwc.warn("""You specified a --word_table and --group_freq_thresh is enabled, so the total word count for your groups might be off (remove "--word_table WT" to solve this issue)""", attention=False)

    groups = set()
    outcomes = dict()
    outcomeFieldList = set(self.outcome_value_fields) | set(self.outcome_controls) | set(self.outcome_interaction)
    ocs = dict()
    controls = dict()
    folds = dict()

    #get outcome values:
    fwc.warn("Loading Outcomes and Getting Groups for: %s" % str(outcomeFieldList)) #debug
    if outcomeFieldList:
        for field in outcomeFieldList:
            loaded = dict(self.getGroupAndOutcomeValues(field))
            outcomes[field] = loaded
            if field in self.outcome_value_fields:
                groups.update(loaded)

        if self.group_freq_thresh:
            where = " group_id in ('%s')" % "','".join(str(g) for g in groups)
            groupCnts = self.getGroupWordCounts(where, lexicon_count_table = lexicon_count_table)
            # rebuild the group set from only those groups that pass the threshold
            groups = set()
            for field in list(outcomes):
                kept = dict(
                    (gId, val) for gId, val in outcomes[field].items()
                    if gId in groupCnts and groupCnts[gId] >= self.group_freq_thresh)
                outcomes[field] = kept
                if field in self.outcome_value_fields:
                    groups.update(kept)

        #set groups:
        controlFields = self.outcome_controls + self.outcome_interaction
        for field in controlFields:
            #always intersect with controls
            groups = groups & set(outcomes[field])

        if groupsWhere:
            # first whitespace-separated token is taken as the outcome column
            outcm = groupsWhere.split()[0].strip()
            whereusers = set(row[0] for row in self.getGroupAndOutcomeValues(outcm, where=groupsWhere))
            groups = groups & whereusers

        if self.oneGroupSetForAllOutcomes:
            # only intersect if wanting all the same groups
            for field in self.outcome_value_fields:
                groups = groups & set(outcomes[field])

        def keepSelectedGroups(field):
            """Copy outcomes[field], keeping only the selected groups."""
            return dict((g, v) for g, v in outcomes[field].items() if g in groups)

        #split into outcomes and controls:
        ocs = dict()
        controls = dict()
        for field in controlFields:
            controls[field] = keepSelectedGroups(field)
        for field in self.outcome_value_fields:
            ocs[field] = keepSelectedGroups(field)

    elif self.group_freq_thresh:
        # no outcome fields at all: groups come from the word counts alone
        groupCnts = self.getGroupWordCounts(where = None, lexicon_count_table = lexicon_count_table)
        groups = set(gId for gId, cnt in groupCnts.items() if cnt >= self.group_freq_thresh)
        if groupsWhere:
            pieces = groupsWhere.split('=')
            outcm = pieces[0].strip()
            val = pieces[1].strip()
            whereusers = set(row[0] for row in self.getGroupAndOutcomeValues(outcm) if str(row[1]) == val)
            groups = groups & whereusers

    if self.fold_column:
        folds = dict(self.getGroupAndOutcomeValues(self.fold_column))

    if includeFoldLabels:
        return (groups, ocs, controls, folds)
    return (groups, ocs, controls)
def getContingencyArrayFeatNorm(self, where = ''):
    """Build a dense group-by-feature matrix of feat norms.

    Rows follow the order of ``getDistinctGroups(where)`` and columns the
    order of ``getDistinctFeatures(where)``. Each column is first filled with
    that feature's value from ``getFeatureZeros(where)``; the cells are then
    overwritten with the rows streamed from ``getFeatNormsSS(where)``.

    Returns a three-item list:
    ``[matrix, distinct group_ids (row names), distinct features (column names)]``.
    The matrix is whatever ``zeros`` creates (presumably ``numpy.zeros`` -
    confirm against the file's imports).
    """
    fwc.warn("running getContingencyArrayFeatNorm")

    fwc.warn("Getting distinct feature / groupId lists and (feat, featNormZero) list")
    featNames = self.getDistinctFeatures( where )
    zeroNorms = self.getFeatureZeros( where )
    groupIds = self.getDistinctGroups( where )

    fwc.warn("Converting feature / groupId lists to dictionaries (item: index) for quick insertion")
    columnOf = dict((feature, col) for col, feature in enumerate(featNames))
    rowOf = dict((group, row) for row, group in enumerate(groupIds))

    fwc.warn("Making a 2d array (matrix) with ncol = nDistinctFeatures and nrow = nDistinctGroupIds")
    fwc.warn("For each distinct feature, intializing that column with feat norm zeros' value")
    matrix = zeros((len(groupIds), len(featNames)))
    for feat, featNormZero in zeroNorms:
        matrix[:, columnOf[feat]] = featNormZero

    fwc.warn("calling getFeatNormsSS, iterating through (with SS cursor)")
    fwc.warn("for each iteration, using the index dictionaries to insert the entry into the matrix")
    for gid, feat, featNorm in self.getFeatNormsSS( where ):
        col = columnOf[feat]
        row = rowOf[gid]
        matrix[row, col] = featNorm

    fwc.warn("returning [contingency matrix, rownames (distinct groups), and colnames (distinct features)]")
    return [matrix, groupIds, featNames]
def getGroupsAndOutcomes(self, groupThresh = 0, lexicon_count_table=None, groupsWhere = ''):
    """Collect the usable group_ids plus their outcome and control values.

    Outcome values are loaded with ``getGroupAndOutcomeValues`` for every field
    in ``outcome_value_fields``, ``outcome_controls`` and
    ``outcome_interaction``. When ``groupThresh`` is non-zero, groups whose
    count from ``getGroupWordCounts`` is below it are dropped. Groups are
    always intersected with the control/interaction fields, with the groups
    whose ``groupsWhere`` column equals the requested value (``groupsWhere``
    is parsed as ``"<column>=<value>"``), and - when
    ``self.oneGroupSetForAllOutcomes`` is set - with every outcome field.

    Returns ``(groups, ocs, controls)`` where ``ocs`` and ``controls`` map
    field name -> {group_id: value}.
    """
    if groupThresh and self.wordTable != self.get1gramTable():
        fwc.warn("""################################################################### WARNING: You specified a --word_table and --group_freq_thresh is enabled, so the total word count for your groups might be off (remove "--word_table WT" to solve this issue) ###################################################################""")

    groups = set()
    outcomes = dict()
    outcomeFieldList = set(self.outcome_value_fields) | set(self.outcome_controls) | set(self.outcome_interaction)
    ocs = dict()
    controls = dict()

    def groupsMatchingWhere():
        """Return the group_ids whose groupsWhere column equals its value."""
        pieces = groupsWhere.split('=')
        outcm = pieces[0].strip()
        val = pieces[1].strip()
        return set(row[0] for row in self.getGroupAndOutcomeValues(outcm) if str(row[1]) == val)

    #get outcome values:
    fwc.warn("Loading Outcomes and Getting Groups for: %s" % str(outcomeFieldList)) #debug
    if outcomeFieldList:
        for field in outcomeFieldList:
            loaded = dict(self.getGroupAndOutcomeValues(field))
            outcomes[field] = loaded
            if field in self.outcome_value_fields:
                groups.update(loaded)

        if groupThresh:
            where = " group_id in ('%s')" % "','".join(str(g) for g in groups)
            groupCnts = self.getGroupWordCounts(where, lexicon_count_table = lexicon_count_table)
            # rebuild the group set from only those groups that pass the threshold
            groups = set()
            for field in list(outcomes):
                kept = dict(
                    (gId, val) for gId, val in outcomes[field].items()
                    if gId in groupCnts and groupCnts[gId] >= groupThresh)
                outcomes[field] = kept
                if field in self.outcome_value_fields:
                    groups.update(kept)

        #set groups:
        controlFields = self.outcome_controls + self.outcome_interaction
        for field in controlFields:
            #always intersect with controls
            groups = groups & set(outcomes[field])

        if groupsWhere:
            groups = groups & groupsMatchingWhere()

        if self.oneGroupSetForAllOutcomes:
            #only intersect if wanting all the same groups
            for field in self.outcome_value_fields:
                groups = groups & set(outcomes[field])

        def keepSelectedGroups(field):
            """Copy outcomes[field], keeping only the selected groups."""
            return dict((g, v) for g, v in outcomes[field].items() if g in groups)

        #split into outcomes and controls:
        ocs = dict()
        controls = dict()
        for field in controlFields:
            controls[field] = keepSelectedGroups(field)
        for field in self.outcome_value_fields:
            ocs[field] = keepSelectedGroups(field)

    elif groupThresh:
        # no outcome fields at all: groups come from the word counts alone
        groupCnts = self.getGroupWordCounts(where = None, lexicon_count_table = lexicon_count_table)
        groups = set(gId for gId, cnt in groupCnts.items() if cnt >= groupThresh)
        if groupsWhere:
            groups = groups & groupsMatchingWhere()

    return (groups, ocs, controls)
def yieldGroupNormsWithZerosByFeat(self, groups = [], where = '', values = False, feats = []):
    """Yield zero-filled group norms, one feature at a time.

    Yields ``(feat, groupnorms, number of features)`` or, if ``values`` is
    true, ``(feat, values, groupnorms, number of features)``. ``groupnorms``
    (and ``values``) are dicts keyed by group_id; every group in ``groups``
    that has no row for the feature is added with a 0.

    groups -- group_ids to report on; when empty, ``getDistinctGroups(where)``
              is used and no group condition is added to the queries.
    where  -- extra SQL condition, combined with the group condition via AND.
    values -- when true, the ``getValuesAndGroupNorms*`` getters are used and
              a values dict is yielded next to the group norms.
    feats  -- features to yield; when empty, ``getDistinctFeatures(where)``.

    When ``numFeats * numGroups`` is below ``12500000 * fwc.GIGS_OF_MEMORY``
    all rows are loaded with a single query; otherwise (or when that query
    returned nothing) each feature is queried separately.
    """
    allFeats = feats
    if not feats:
        allFeats = self.getDistinctFeatures(where)
    else:
        fwc.warn("feats restricted to %s" % feats)
    numFeats = len(allFeats)

    gCond = None
    if groups:
        gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
    else:
        groups = self.getDistinctGroups(where)
    numGroups = len(groups)

    # Choose the getters once so the bulk path and the per-feature path
    # always agree on the row shape (with or without the value column).
    if values:
        getGroupNorms = self.getValuesAndGroupNorms
        getGroupNormsForFeat = self.getValuesAndGroupNormsForFeat
    else:
        getGroupNorms = self.getGroupNorms
        getGroupNormsForFeat = self.getGroupNormsForFeat

    #figure out if too big for memory:
    fwc.warn("Yielding norms with zeros (%d groups * %d feats)." %(len(groups), numFeats))
    gns = dict()
    vals = dict()  # only populated when values is true
    if (numFeats * numGroups) < 12500000*fwc.GIGS_OF_MEMORY:
        #statically acquire all gns
        if gCond:
            if where:
                gnlist = getGroupNorms(where+" AND "+gCond)
            else:
                gnlist = getGroupNorms(gCond)
        else:
            #don't need to specify groups
            gnlist = getGroupNorms()
        if feats:
            # NOTE: this runs after the bulk query, so it only affects the
            # per-feature queries issued by getFeatValuesAndGNs below.
            featCond = "feat IN ('"+"','".join(feats)+"')"
            if where:
                where = " AND ".join([where, featCond])
            else:
                where = " " + featCond
        for tup in gnlist:
            (gid, feat) = tup[0:2]
            if feat not in gns:
                gns[feat] = dict()
                if values:
                    vals[feat] = dict()
            gns[feat][gid] = float(tup[-1])  # group norm is the last column
            if values:
                vals[feat][gid] = float(tup[2])
    else:
        fwc.warn("Too big to keep gns in memory, querying for each feature (slower, but less memory intensive)")

    def getFeatValuesAndGNs(feat):
        """Return (valDict or None, gnDict) for one feature, without zeros."""
        if gns:
            try:
                if values:
                    return (vals[feat].copy(), gns[feat].copy())
                return (None, gns[feat].copy())
            except KeyError:
                fwc.warn("Couldn't find gns for feat: %s (group_freq_thresh may be too high)" % feat)
                # Hand back an (empty) values dict when values were asked
                # for, so the caller gets zero-filled values instead of None.
                return (dict() if values else None, dict())
        #must query for feat
        if gCond:
            if where:
                featRows = getGroupNormsForFeat(feat, where+" AND "+gCond)
            else:
                featRows = getGroupNormsForFeat(feat, gCond)
        else:
            # Use the getter selected above: the plain group-norm getter
            # returns rows without the value column, which broke values=True.
            featRows = getGroupNormsForFeat(feat)
        if values:
            valDict = dict()
            gnDict = dict()
            for g, v, gn in featRows:
                gnDict[g] = float(gn)
                valDict[g] = float(v)
            return (valDict, gnDict)
        return (None, dict((g, float(gn)) for g, gn in featRows))

    #fill in zeros (this can get quite big!)
    for feat in allFeats:
        (valDict, gnDict) = getFeatValuesAndGNs(feat)
        for gid in groups:
            if gid not in gnDict:
                gnDict[gid] = 0
                # "is not None" rather than truthiness: an empty values dict
                # still needs its zeros.
                if valDict is not None:
                    valDict[gid] = 0
        if values:
            yield (feat, valDict, gnDict, numFeats)
        else:
            yield (feat, gnDict, numFeats)
# Interfaces with FeatureWorker and scikit-learn # to perform prediction of outcomes for language features. # # example: predicting satisfaction with life score given language use # # example usage: ./featureWorker.py --outcome_fields SWL --train_regression from fwConstants import warn import cPickle as pickle try: from rpy2.robjects.packages import importr import rpy2.robjects as ro from rpy2.rinterface import RNULLType except ImportError: warn("rpy2 cannot be imported") pass import pandas as pd try: import pandas.rpy.common as com except ImportError: warn("pandas.rpy.common cannot be imported") pass from inspect import ismethod import sys import random from itertools import combinations # scikit-learn imports
def getContingencyArrayFeatNorm(self, where=''):
    """Build a dense group-by-feature matrix of feat norms.

    Rows follow the order of ``getDistinctGroups(where)`` and columns the
    order of ``getDistinctFeatures(where)``. Each column is first filled with
    that feature's value from ``getFeatureZeros(where)``; the cells are then
    overwritten with the rows streamed from ``getFeatNormsSS(where)``.

    Returns a three-item list:
    ``[matrix, distinct group_ids (row names), distinct features (column names)]``.
    The matrix is whatever ``zeros`` creates (presumably ``numpy.zeros`` -
    confirm against the file's imports).
    """
    fwc.warn("running getContingencyArrayFeatNorm")

    fwc.warn("Getting distinct feature / groupId lists and (feat, featNormZero) list")
    featNames = self.getDistinctFeatures(where)
    zeroNorms = self.getFeatureZeros(where)
    groupIds = self.getDistinctGroups(where)

    fwc.warn("Converting feature / groupId lists to dictionaries (item: index) for quick insertion")
    columnOf = dict((feature, col) for col, feature in enumerate(featNames))
    rowOf = dict((group, row) for row, group in enumerate(groupIds))

    fwc.warn("Making a 2d array (matrix) with ncol = nDistinctFeatures and nrow = nDistinctGroupIds")
    fwc.warn("For each distinct feature, intializing that column with feat norm zeros' value")
    matrix = zeros((len(groupIds), len(featNames)))
    for feat, featNormZero in zeroNorms:
        matrix[:, columnOf[feat]] = featNormZero

    fwc.warn("calling getFeatNormsSS, iterating through (with SS cursor)")
    fwc.warn("for each iteration, using the index dictionaries to insert the entry into the matrix")
    for gid, feat, featNorm in self.getFeatNormsSS(where):
        col = columnOf[feat]
        row = rowOf[gid]
        matrix[row, col] = featNorm

    fwc.warn("returning [contingency matrix, rownames (distinct groups), and colnames (distinct features)]")
    return [matrix, groupIds, featNames]
def createTableWithBinnedFeats(self, num_bins, group_id_range, groupfreqthresh, valueFunc = lambda x:x, gender=None, genderattack=False, reporting_percent=0.04, outcomeTable = fwc.DEF_OUTCOME_TABLE, skip_binning=False):
    """Create a copy of the feature table with adjacent group_ids merged into bins.

    Consecutive group_ids are accumulated (by their ``N``) until the running
    total reaches ``total N / (num_bins + 2)``; each such run becomes one bin
    labelled ``"<lower>_<upper>"``. For every bin and feature the new table
    receives the summed value, the N-weighted mean group_norm, the pooled
    std_dev, the bin's total N, and the columns ``bin_center`` (midpoint of
    the range), ``bin_center_w`` (N-weighted mean group_id) and ``bin_width``.

    num_bins          -- requested number of bins (used for the bin size and
                         the table name; the real number of bins may differ).
    group_id_range    -- values converted to int and used in the table name.
    reporting_percent -- passed to ``fwc._getReportingInt`` for progress output.
    skip_binning      -- when true, only the would-be table name is returned.
    groupfreqthresh, valueFunc, gender, genderattack, outcomeTable -- accepted
                         for interface compatibility; not used by this body.

    Returns the name of the binned table.
    Raises IndexError when the feature table has no groups.
    NOTE(review): group_ids are assumed to be integers (they are fed to
    ``range`` and ``%d``) - confirm against callers.
    """
    featureTable = self.featureTable
    group_id_range = [int(g) for g in group_id_range]
    newTable = featureTable + '$' + str(num_bins) + 'b_' + '_'.join(map(str, group_id_range))
    if skip_binning:
        return newTable

    def runSql(statement):
        """Execute one statement with this object's connection settings."""
        mm.execute(self.corpdb, self.dbCursor, statement, charset=self.encoding, use_unicode=self.use_unicode)

    def fetchList(statement):
        """Run a query with this object's connection settings and return its rows."""
        return mm.executeGetList(self.corpdb, self.dbCursor, statement, charset=self.encoding, use_unicode=self.use_unicode)

    runSql('DROP TABLE IF EXISTS %s' % newTable)
    runSql("CREATE TABLE %s like %s" % (newTable, featureTable))

    # Bins are contiguous runs of group_ids, so the rows must arrive in
    # ascending group_id order; GROUP BY alone does not guarantee that.
    groupNs = fetchList('SELECT group_id, N FROM %s GROUP BY group_id ORDER BY group_id' % featureTable)
    if not groupNs:
        raise IndexError("no group_ids found in %s; nothing to bin" % featureTable)
    groupIdToN = dict(groupNs)

    total_freq = sum(n for _, n in groupNs)
    bin_size = float(total_freq) / float(num_bins + 2)
    num_groups = len(groupNs)
    reporting_int = fwc._getReportingInt(reporting_percent, num_groups)

    # figure out the bins, i.e. if group_id's 1,2,3 total value is greater
    # than "bin_size" our first bin is 1_3.
    fwc.warn('determining the number of bins...')
    bin_groups = OrderedDict()  # (lower, upper) -> "lower_upper"
    current_sum = 0
    current_lower_group = groupNs[0][0]
    current_upper_group = None
    bin_just_closed = False
    for gg, (group, value) in enumerate(groupNs, 1):
        if bin_just_closed:
            current_lower_group = group
            bin_just_closed = False
        current_sum += value
        current_upper_group = group
        if current_sum >= bin_size:
            current_sum = 0
            bin_groups[(current_lower_group, current_upper_group)] = '_'.join(map(str, [current_lower_group, current_upper_group]))
            bin_just_closed = True
        fwc._report('group_id\'s', gg, reporting_int, num_groups)
    if not bin_just_closed:
        # trailing groups that never reached bin_size still form a last bin
        bin_groups[(current_lower_group, current_upper_group)] = '_'.join(map(str, [current_lower_group, current_upper_group]))

    max_label_length = max(map(len, bin_groups.values()))
    #this action preserves the index
    runSql('ALTER TABLE %s MODIFY COLUMN group_id VARCHAR(%d)' % (newTable, max_label_length))
    runSql('ALTER TABLE %s ADD COLUMN `bin_center` float(6) not null default -1.0' % (newTable))
    runSql('ALTER TABLE %s ADD COLUMN `bin_center_w` float(6) not null default -1.0' % (newTable))
    runSql('ALTER TABLE %s ADD COLUMN `bin_width` int(10) not null default -1' % (newTable))
    mm.disableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)

    # for each newly denoted bin: e.g. 1_3, 4_5, 6_6, ... get the new feature
    # value counts / group norms; insert them into the new table
    # e.g. 1 'hi' 5, 2 'hi' 10, 3 'hi' 30 ==> 1_3 'hi' 45 (of course include group_norm also)
    fwc.warn('aggreagating the newly binned feature values / group_norms into the new table...')
    isql = 'INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N, bin_center, bin_center_w, bin_width) VALUES (%s)' % (newTable, '%s, %s, %s, %s, %s, %s, %s, %s, %s')
    total_bins = len(bin_groups)
    reporting_int = fwc._getReportingInt(reporting_percent, total_bins)
    for ii_bins, ((lower_group, upper_group), label) in enumerate(bin_groups.items(), 1):
        bin_N_sum = 0
        bin_width = 0
        weighted_id_sum = 0
        for ii in range(lower_group, upper_group + 1):
            groupN = groupIdToN.get(ii, 0)
            bin_width += 1
            bin_N_sum += groupN
            weighted_id_sum += groupN * ii
        bin_center = sum((lower_group, upper_group)) / 2.0
        bin_center_w = float(weighted_id_sum) / float(bin_N_sum)

        sql = 'SELECT group_id, feat, value, group_norm, std_dev FROM %s where group_id >= %d AND group_id <= %d' % (featureTable, lower_group, upper_group)
        fetchedRows = fetchList(sql)

        # First pass: per-feature value sums and N-weighted norm sums. The
        # feature key is normalised once here and reused below, so both
        # passes index the dicts with the same (possibly lowercased) key.
        normalisedRows = []
        featToValue = {}
        featToSummedNorm = {}
        for group_id, feat, value, norm, sd in fetchedRows:
            if fwc.LOWERCASE_ONLY:
                feat = str(feat).lower()
            currentN = groupIdToN[group_id]
            normalisedRows.append((feat, norm, sd, currentN))
            if feat in featToValue:
                featToValue[feat] += value
                featToSummedNorm[feat] += norm * currentN
            else:
                featToValue[feat] = value
                featToSummedNorm[feat] = norm * currentN

        #calculate mean and std_dev, using above info
        featToMeanNorm = {}
        featToSummedVar = {}
        for feat, norm, sd, currentN in normalisedRows:
            meanNorm = featToSummedNorm[feat] / bin_N_sum
            # pooled variance: spread of the group means plus within-group variance
            contribution = currentN * ((meanNorm - norm) ** 2 + (sd * sd))
            if feat in featToSummedVar:
                featToSummedVar[feat] += contribution
            else:
                featToSummedVar[feat] = contribution
            featToMeanNorm[feat] = meanNorm

        current_batch = [
            (label, feat, value, featToMeanNorm[feat], sqrt(featToSummedVar[feat] / bin_N_sum),
             bin_N_sum, bin_center, bin_center_w, bin_width)
            for feat, value in featToValue.items()
        ]
        mm.executeWriteMany(self.corpdb, self.dbCursor, isql, current_batch, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)
        fwc._report('group_id bins', ii_bins, reporting_int, total_bins)

    mm.enableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)
    fwc.warn('Done creating new group_id-binned feature table.')

    outputdata = fetchList('select group_id, N from `%s` group by group_id' % (newTable,))
    pprint(outputdata)
    return newTable