コード例 #1
0
    def makeTopicLabelMap(self, topiclexicon, numtopicwords=5, is_weighted_lexicon=False):
        """Build a `feat_to_label$<lexicon>$<n>` table labelling each lexicon category.

        The label table is dropped and recreated in ``self.lexicondb``.  For
        every distinct category of ``topiclexicon`` the first ``numtopicwords``
        terms are joined with spaces to form the label; with
        ``is_weighted_lexicon`` the terms are first ordered by descending
        weight, otherwise they are taken in the order the query returns them.

        Note: the category name is written to the `term` column and the label
        to the `category` column of the new table.

        :param topiclexicon: name of the lexicon table to read
        :param numtopicwords: number of terms that make up a label
        :param is_weighted_lexicon: rank terms by the lexicon's ``weight`` column
        :returns: name of the label table
        """
        label_table = 'feat_to_label$%s$%d'%(topiclexicon, numtopicwords)
        db_opts = dict(charset=self.encoding, use_unicode=self.use_unicode)

        lexdb = self.lexicondb
        (lexconn, lexcur, lexcurD) = mm.dbConnect(lexdb, **db_opts)

        # start from an empty label table
        mm.execute(lexdb, lexcur, 'DROP TABLE IF EXISTS `%s`'%label_table, **db_opts)
        create_sql = 'CREATE TABLE `%s` (`id` int(16) unsigned NOT NULL AUTO_INCREMENT, `term` varchar(128) DEFAULT NULL, `category` varchar(64) DEFAULT NULL, PRIMARY KEY (`id`), KEY `term` (`term`), KEY `category` (`category`) )'%label_table
        mm.execute(lexdb, lexcur, create_sql, **db_opts)

        category_rows = mm.executeGetList(lexdb, lexcur, 'SELECT DISTINCT category FROM %s'%topiclexicon)
        for category in [row[0] for row in category_rows]:
            if is_weighted_lexicon:
                select_sql = "SELECT term, weight from %s WHERE category = '%s'"%(topiclexicon, category)
                weighted = mm.executeGetList(lexdb, lexcur, select_sql, **db_opts)
                # heaviest terms first
                ranked = sorted(weighted, key=lambda pair: pair[1], reverse=True)
                terms = [pair[0] for pair in ranked]
            else:
                select_sql = "SELECT term from %s WHERE category = '%s'"%(topiclexicon, category)
                terms = [row[0] for row in mm.executeGetList(lexdb, lexcur, select_sql, **db_opts)]

            label = ' '.join(str(term) for term in terms[:numtopicwords])
            insert_sql = "INSERT INTO `%s` (`term`, `category`) VALUES('%s', '%s')"%(
                label_table, category, MySQLdb.escape_string(label))
            mm.execute(lexdb, lexcur, insert_sql, **db_opts)

        return label_table
コード例 #2
0
    def createTfIdfTable(self, ngram_table):
        '''
        Creates new feature table where group_norm = tf-idf (term frequency-inverse document frequency)

        tf-idf = tf * idf, where
          * tf (term frequency) is the group_norm stored in ``ngram_table``
            for a given (group_id, feat) row;
          * idf = log(N / df), N being the number of distinct group_ids in
            ``ngram_table`` and df the per-feature count returned by
            ``self.getFeatureCounts()``.

        :param ngram_table: table containing words/ngrams, collocs, etc...
            Its name must start with ``feat$<feat_name>$``; ``<feat_name>``
            (i.e. 1gram, 1to3gram) is reused in the new table's name.
        :returns: name of the table the tf-idf rows were inserted into

        Written by Phil
        '''
        # create new feature table
        feat_name_grabber = re.compile(r'^feat\$([^\$]+)\$')
        feat_name = feat_name_grabber.match(ngram_table).group(1) # grabs feat_name (i.e. 1gram, 1to3gram)

        short_name = 'tf_idf_{}'.format(feat_name)
        idf_table = self.createFeatureTable(short_name, valueType = 'DOUBLE')

        # getting N
        sql = "SELECT COUNT(DISTINCT group_id) FROM %s" % ngram_table
        N = mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)[0][0]

        feat_counts = self.getFeatureCounts() #tuples of: feat, count (number of groups feature appears with)

        fwc.warn('Inserting idf values into new table')
        counter = 0
        for (feat, dt) in feat_counts:
            idf = log(N/float(dt))

            # Escape the feature once per feature; it is reused by the SELECT
            # and by every INSERT generated for this feature.
            clean_feat = mm.MySQLdb.escape_string(feat.encode('utf-8')).decode('utf-8')

            # get (group_id, value, group_norm) where feat = feat
            sql = u"""SELECT group_id, value, group_norm from %s WHERE feat = '%s'"""%(ngram_table, clean_feat)
            group_id_freq = mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)

            for (group_id, value, tf) in group_id_freq:
                tf_idf = tf * idf

                insert_sql = u"INSERT INTO {} (group_id, feat, value, group_norm) VALUES ('{}', '{}', {}, {});".format(
                                                idf_table,
                                                group_id,
                                                clean_feat,
                                                value,
                                                tf_idf)
                mm.execute(self.corpdb, self.dbCursor, insert_sql)

                # Count the row before reporting so the message states the
                # number of rows actually inserted so far.
                counter += 1
                if counter % 50000 == 0:
                    print('%d tf_idf values inserted!' % counter)

        fwc.warn('Finished inserting.')

        return idf_table
コード例 #3
0
 def getMessagesWithFieldForCorrelField(self, cf_id, extraField, messageTable = None, warnMsg = True):
     """Select (message id, message, extraField) rows whose correl field equals cf_id.

     :param cf_id: value compared against self.correl_field (interpolated as a quoted literal)
     :param extraField: name of the additional column to select
     :param messageTable: table to query; self.corptable when not given
     :param warnMsg: forwarded positionally to mm.executeGetList
     :returns: whatever mm.executeGetList returns for the query
     """
     table = messageTable or self.corptable
     selected = (self.messageid_field, self.message_field, extraField)
     query = "SELECT %s, %s, %s FROM %s" % (selected + (table,))
     query += " WHERE %s = '%s'" % (self.correl_field, cf_id)
     return mm.executeGetList(
         self.corpdb, self.dbCursor, query, warnMsg,
         charset=self.encoding, use_unicode=self.use_unicode)
コード例 #4
0
    def createFeatureTable(self, featureName, featureType = 'VARCHAR(64)', valueType = 'INTEGER', tableName = None, valueFunc = None, correlField=None, extension = None):
        """Drop and recreate a feature table, returning its name.

        When ``tableName`` is not given it is derived as
        ``feat$<featureName>$<corptable>$<correl_field>``, followed by
        ``$16to<round(valueFunc(16))>`` if ``valueFunc`` is given and by
        ``$<extension>`` if ``extension`` is given.

        The column type of ``group_id`` is ``correlField`` when supplied,
        otherwise the result of ``self.getCorrelFieldType``; if that is empty
        too, the type is looked up in information_schema.

        :param featureName: feature name used in the derived table name
        :param featureType: SQL type of the ``feat`` column
        :param valueType: SQL type of the ``value`` column
        :param tableName: explicit table name (skips name derivation)
        :param valueFunc: callable applied to 16 to build the name suffix
        :param correlField: explicit SQL type for the ``group_id`` column
        :param extension: extra suffix for the derived table name
        :returns: name of the created table
        """
        db_opts = dict(charset=self.encoding, use_unicode=self.use_unicode)

        # derive the table name when none was supplied
        if not tableName:
            name_parts = ['feat$', featureName, '$', self.corptable, '$', self.correl_field]
            tableName = name_parts[0] + name_parts[1] + name_parts[2] + name_parts[3] + name_parts[4] + name_parts[5]
            if valueFunc:
                tableName += '$16to%d' % round(valueFunc(16))
            if extension:
                tableName += '$' + extension

        # resolve the SQL type of the group_id column
        if not correlField:
            correlField = self.getCorrelFieldType(self.correl_field)
        if correlField:
            correl_fieldType = correlField
        else:
            schema_sql = "SELECT column_type FROM information_schema.columns WHERE table_schema='%s' AND table_name='%s' AND column_name='%s'" % (
                self.corpdb, self.corptable, self.correl_field)
            correl_fieldType = mm.executeGetList(self.corpdb, self.dbCursor, schema_sql, **db_opts)[0][0]

        drop_sql = "DROP TABLE IF EXISTS %s" % tableName
        create_sql = ("CREATE TABLE %s (id INT UNSIGNED NOT NULL AUTO_INCREMENT PRIMARY KEY, "
                      "group_id %s, feat %s, value %s, group_norm DOUBLE, "
                      "KEY `correl_field` (`group_id`), KEY `feature` (`feat`)) "
                      "CHARACTER SET utf8mb4 COLLATE utf8mb4_bin") % (
                          tableName, correl_fieldType, featureType, valueType)

        # replace any existing table of that name
        mm.execute(self.corpdb, self.dbCursor, drop_sql, **db_opts)
        mm.execute(self.corpdb, self.dbCursor, create_sql, **db_opts)

        return tableName
コード例 #5
0
    def makeBlackWhiteList(args_featlist, args_lextable, args_categories, args_lexdb, args_use_unicode=None, args_encoding=None):
        """Build a black/white list of features from a lexicon table or an explicit list.

        If both ``args_lextable`` and ``args_categories`` are given, the list
        holds the ``term`` column of the lexicon table (all terms when the
        first category is ``'*'``, otherwise only the listed categories).
        Otherwise each entry of ``args_featlist`` is upper-cased when more
        than half of its characters are upper case and lower-cased if not.
        All entries are stripped of surrounding whitespace.

        :param args_featlist: explicit features (may be None when a lexicon is used)
        :param args_lextable: lexicon table name
        :param args_categories: lexicon categories to include, or ``['*']``
        :param args_lexdb: database holding the lexicon table
        :param args_use_unicode: decode byte-string features as utf-8; when
            None the value of ``args.use_unicode`` from the enclosing scope is
            used, as before
        :param args_encoding: charset for the lexicon connection; when None
            no charset is passed to ``mm``
        :returns: list of feature strings
        :raises Exception: when neither a lexicon nor a feature list is given
        """
        if args_use_unicode is None:
            # previous behaviour: the switch came from the surrounding `args` object
            args_use_unicode = args.use_unicode

        def _decode(feat):
            # only byte strings need decoding; anything else is passed through
            if args_use_unicode and isinstance(feat, str):
                return unicode(feat, 'utf-8')
            return feat

        # args_featlist may legitimately be None when a lexicon table is used
        featlist = args_featlist or []
        print("making black or white list: [%s] [%s] [%s]" % (
            [_decode(feat) for feat in featlist], args_lextable, args_categories))

        newlist = set()
        if args_lextable and args_categories:
            # This function has no `self`; connection options come from the arguments.
            db_kwargs = {'use_unicode': args_use_unicode}
            if args_encoding is not None:
                db_kwargs['charset'] = args_encoding
            (conn, cur, dcur) = mm.dbConnect(args_lexdb, **db_kwargs)
            sql = 'SELECT term FROM %s' % (args_lextable)
            if args_categories[0] != '*':
                # NOTE(review): category values are interpolated unescaped
                quoted = ','.join("'" + str(cat) + "'" for cat in args_categories)
                sql = 'SELECT term FROM %s WHERE category in (%s)' % (args_lextable, quoted)

            for row in mm.executeGetList(args_lexdb, cur, sql, **db_kwargs):
                newlist.add(row[0])
        elif featlist:
            for feat in featlist:
                feat = _decode(feat)
                # keep mostly-upper-case features upper case, lower-case the rest
                upper_count = sum(1 for ch in feat if ch.isupper())
                newlist.add(feat.upper() if upper_count > (len(feat) / 2) else feat.lower())
        else:
            raise Exception('blacklist / whitelist flag specified without providing features.')
        return [w.strip() for w in newlist]
コード例 #6
0
 def getValuesAndGroupNormsForFeat(self, feat, where = '', warnMsg = False):
     """returns a list of (group_id, value, group_norm) triples for one feature

     :param feat: feature to look up; escaped before it is placed in the query
     :param where: optional extra SQL condition, ANDed to the feature filter
     :param warnMsg: forwarded positionally to mm.executeGetList
     """
     if self.use_unicode and isinstance(feat, str):
         # Decode byte strings only: unicode(x, 'utf8') raises TypeError
         # when x is already a unicode object.
         feat = unicode(feat, 'utf8')
     sql = """SELECT group_id, value, group_norm FROM %s WHERE feat = '%s'"""%(self.featureTable, MySQLdb.escape_string(feat))
     if (where): sql += ' AND ' + where
     return mm.executeGetList(self.corpdb, self.dbCursor, sql, warnMsg, charset=self.encoding, use_unicode=self.use_unicode)
コード例 #7
0
 def getSumValue(self, where=''):
     """Return sum(value) over the feature table.

     :param where: optional SQL condition (without the WHERE keyword)
     :returns: first column of the first result row
     """
     query = "select sum(value) from %s" % (self.featureTable)
     if where:
         query = query + ' WHERE ' + where
     rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                              charset=self.encoding,
                              use_unicode=self.use_unicode)
     return rows[0][0]
コード例 #8
0
 def getNumWordsByCorrelField(self, where = ''):
     """Return (correl_field, sum(num_words)) rows computed from the corpus table.

     Assumes corptable has a num_words field for each message.  Query shape:
     SELECT user_id, sum(num_words) FROM (SELECT user_id, num_words FROM messages GROUP BY message_id) as a GROUP BY user_id

     :param where: optional SQL condition applied inside the sub-select
     """
     group_col = self.correl_field
     pieces = ["SELECT %s, sum(num_words) FROM (SELECT %s, num_words FROM %s " % (
         group_col, group_col, self.corptable)]
     if where:
         pieces.append(' WHERE ' + where)
     # inner grouping is per message, outer grouping per correl field
     pieces.append(" GROUP BY %s) as a " % self.messageid_field)
     pieces.append(" GROUP BY %s " % group_col)
     query = ''.join(pieces)
     return mm.executeGetList(self.corpdb, self.dbCursor, query,
                              charset=self.encoding, use_unicode=self.use_unicode)
コード例 #9
0
ファイル: featureWorker.py プロジェクト: xionglong58/dlatk
 def getFeatureTables(self, where=''):
     """List the feature tables of self.corpdb matching corptable and correl_field.

     Runs SHOW TABLES with the pattern ``feat$%$<corptable>$<correl_field>$%``.

     :param where: accepted but not used by the query
     """
     pattern_args = (self.corpdb, self.corptable, self.correl_field)
     query = "SHOW TABLES FROM %s LIKE 'feat$%%$%s$%s$%%' " % pattern_args
     return mm.executeGetList(
         self.corpdb, self.dbCursor, query,
         charset=self.encoding, use_unicode=self.use_unicode)
コード例 #10
0
 def getValuesAndGroupNormsForFeats(self, feats, where = '', warnMsg = False):
     """Return (group_id, value, group_norm) rows for any of the given features.

     :param feats: iterable of features; each one is escaped and placed in an IN (...) list
     :param where: optional extra SQL condition, ANDed to the feature filter
     :param warnMsg: forwarded positionally to mm.executeGetList
     """
     if self.use_unicode:
         escaped = [MySQLdb.escape_string(unicode(f)) for f in feats]
     else:
         escaped = [MySQLdb.escape_string(f) for f in feats]
     feat_filter = " feat in ('%s')" % "','".join(escaped)
     query = "SELECT group_id, value, group_norm FROM %s WHERE %s" % (self.featureTable, feat_filter)
     if where:
         query += ' AND ' + where
     return mm.executeGetList(self.corpdb, self.dbCursor, query, warnMsg,
                              charset=self.encoding, use_unicode=self.use_unicode)
コード例 #11
0
 def getValues(self, where=''):
     """Return (group_id, feat, value) rows from the feature table.

     :param where: optional SQL condition (without the WHERE keyword)
     """
     query = "select group_id, feat, value from %s" % (self.featureTable)
     if where:
         query = query + ' WHERE ' + where
     return mm.executeGetList(
         self.corpdb, self.dbCursor, query,
         charset=self.encoding, use_unicode=self.use_unicode)
コード例 #12
0
 def getFeatureZeros(self, where=''):
     """Return (feat, zero_feat_norm) rows from the ``mean_<featureTable>`` table.

     :param where: optional SQL condition (without the WHERE keyword)
     """
     mean_table = 'mean_' + self.featureTable
     query = "select feat, zero_feat_norm from %s" % (mean_table)
     if where:
         query = query + ' WHERE ' + where
     return mm.executeGetList(
         self.corpdb, self.dbCursor, query,
         charset=self.encoding, use_unicode=self.use_unicode)
コード例 #13
0
 def getSumValuesByFeat(self, where=''):
     """Return (feat, sum(value)) rows, one per feature of the feature table.

     :param where: optional SQL condition applied before grouping
     """
     pieces = ["SELECT feat, sum(value) FROM %s " % self.featureTable]
     if where:
         pieces.append(' WHERE ' + where)
     pieces.append(" GROUP BY feat ")
     query = ''.join(pieces)
     return mm.executeGetList(
         self.corpdb, self.dbCursor, query,
         charset=self.encoding, use_unicode=self.use_unicode)
コード例 #14
0
 def getGroupNorms(self, where=''):
     """Return (group_id, feat, group_norm) rows from the feature table.

     :param where: optional SQL condition (without the WHERE keyword)
     """
     query = "SELECT group_id, feat, group_norm from %s" % (self.featureTable)
     if where:
         query = query + ' WHERE ' + where
     return mm.executeGetList(
         self.corpdb, self.dbCursor, query,
         charset=self.encoding, use_unicode=self.use_unicode)
コード例 #15
0
 def getFeatMeanData(self, where = ''):
     """Return a dict mapping feature => (mean, std, zero_feat_norm).

     Rows are read from the ``mean_<featureTable>`` table.

     :param where: optional SQL condition (without the WHERE keyword)
     """
     query = "select feat, mean, std, zero_feat_norm from %s" % ('mean_' + self.featureTable)
     if where:
         query = query + ' WHERE ' + where
     rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                              charset=self.encoding, use_unicode=self.use_unicode)
     # first column is the key, the remaining columns are the value
     return dict((row[0], row[1:]) for row in rows)
コード例 #16
0
 def getFeatureValueSums(self, where=''):
     """returns a list of (feature, sum) tuples, where sum is sum(value) over the rows of that feature

     :param where: optional SQL condition (without the WHERE keyword),
         applied to the rows before they are grouped by feature
     """
     sql = """select feat, sum(value) from %s""" % (self.featureTable)
     # WHERE has to precede GROUP BY; appending it after the grouping clause
     # yields invalid SQL.
     if (where): sql += ' WHERE ' + where
     sql += ' group by feat'
     return mm.executeGetList(self.corpdb,
                              self.dbCursor,
                              sql,
                              charset=self.encoding,
                              use_unicode=self.use_unicode)
コード例 #17
0
 def getDistinctGroupsFromFeatTable(self, where=""):
     """Return the distinct group_id values found in the feature table.

     :param where: optional SQL condition (without the WHERE keyword)
     """
     query = "select distinct group_id from %s" % self.featureTable
     if where:
         query = query + ' WHERE ' + where
     rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                              charset=self.encoding,
                              use_unicode=self.use_unicode)
     # unwrap the single-column rows
     return map(lambda row: row[0], rows)
コード例 #18
0
 def getGroupNormsForFeat(self, feat, where='', warnMsg=False):
     """Return (group_id, group_norm) rows for a single feature.

     :param feat: feature to look up; escaped before it is placed in the query
     :param where: optional extra SQL condition, ANDed to the feature filter
     :param warnMsg: forwarded positionally to mm.executeGetList
     """
     escaped_feat = MySQLdb.escape_string(feat)
     query = "SELECT group_id, group_norm FROM %s WHERE feat = '%s'" % (
         self.featureTable, escaped_feat)
     if where:
         query = query + ' AND ' + where
     return mm.executeGetList(
         self.corpdb, self.dbCursor, query, warnMsg,
         charset=self.encoding, use_unicode=self.use_unicode)
コード例 #19
0
 def getGroupAndFeatureValues(self, featName=None, where=''):
     """returns a list of (group_id, group_norm) tuples for one feature

     :param featName: feature to look up; defaults to self.featNames[0]
     :param where: optional extra SQL condition, ANDed to the feature filter
     """
     if not featName: featName = self.featNames[0]
     # Escape the feature so names containing quotes (e.g. don't) do not
     # break the statement.  unicode values are escaped through their utf-8
     # bytes and decoded back.
     if isinstance(featName, unicode):
         safeName = MySQLdb.escape_string(featName.encode('utf-8')).decode('utf-8')
     else:
         safeName = MySQLdb.escape_string(str(featName))
     sql = "select group_id, group_norm from %s WHERE feat = '%s'" % (
         self.featureTable, safeName)
     if (where): sql += ' AND ' + where
     return mm.executeGetList(self.corpdb,
                              self.dbCursor,
                              sql,
                              False,
                              charset=self.encoding,
                              use_unicode=self.use_unicode)
コード例 #20
0
ファイル: outcomeGetter.py プロジェクト: xionglong58/dlatk
 def getDistinctOutcomeValues(self, outcome = None, includeNull = True, where = ''):
     """Return the distinct values of an outcome column in the outcome table.

     :param outcome: column to read; self.outcome_value_fields[0] when not given
     :param includeNull: when false, an ``IS NOT NULL`` condition is added for the column
     :param where: optional SQL condition, ANDed with the NULL filter
     """
     column = outcome if outcome else self.outcome_value_fields[0]
     query = "SELECT DISTINCT %s FROM %s"%(column, self.outcome_table)

     conditions = []
     if where:
         conditions.append(where)
     if not includeNull:
         conditions.append("%s IS NOT NULL" % column)
     if conditions:
         query += ' WHERE ' + ' AND '.join(conditions)

     rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                              charset=self.encoding, use_unicode=self.use_unicode)
     return map(lambda row: row[0], rows)
コード例 #21
0
 def getDistinctOutcomeValues(self, outcome = None, includeNull = True, where = ''):
     """Return the distinct values of an outcome column in the outcome table.

     :param outcome: column to read; self.outcome_value_fields[0] when not given
     :param includeNull: when false, an ``IS NOT NULL`` condition is added for the column
     :param where: optional SQL condition, ANDed with the NULL filter
     """
     column = outcome if outcome else self.outcome_value_fields[0]
     query = "SELECT DISTINCT %s FROM %s"%(column, self.outcome_table)

     conditions = []
     if where:
         conditions.append(where)
     if not includeNull:
         conditions.append("%s IS NOT NULL" % column)
     if conditions:
         query += ' WHERE ' + ' AND '.join(conditions)

     rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                              charset=self.encoding, use_unicode=self.use_unicode)
     return map(lambda row: row[0], rows)
コード例 #22
0
 def getDistinctGroups(self, where=''):
     """Return the distinct correl_field values of the corpus table.

     The query runs on corptable (not on a feature table) to be accurate.

     :param where: optional SQL condition (without the WHERE keyword)
     """
     query = "select DISTINCT %s from %s" % (self.correl_field, self.corptable)
     if where:
         query = query + ' WHERE ' + where
     rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                              charset=self.encoding,
                              use_unicode=self.use_unicode)
     # unwrap the single-column rows
     return map(lambda row: row[0], rows)
コード例 #23
0
ファイル: outcomeGetter.py プロジェクト: xionglong58/dlatk
    def makeContingencyTable(self, featureGetter, featureValueField, outcome_filter_where='', feature_value_group_sum_min=0):
        """makes a contingency table from this outcome value, a featureGetter, and the desired column of the featureGetter, assumes both correl_field's are the same

        Builds a single SELECT with one CASE column per distinct feature, reading
        from the feature table restricted to the ids found in self.outcome_table
        (optionally filtered by outcome_filter_where) and, when
        feature_value_group_sum_min > 0, to the ids whose SUM(value) is strictly
        greater than that minimum.

        :param featureGetter: object providing getDistinctFeatures(), getFeatureZeros(), correl_field and featureTable
        :param featureValueField: column expression emitted by each CASE when the row's feat matches
        :param outcome_filter_where: optional SQL condition applied to self.outcome_table
        :param feature_value_group_sum_min: minimum (exclusive) per-id sum of value; 0 disables the filter
        :returns: two-element list [distinctFeatureList, rows returned by the query]
        """
        """follows http://www.artfulsoftware.com/infotree/queries.php at section: Group Column Statistics in Rows"""
        """the only time this uses outcome_value's would be in the outcome_filter_where statement"""
        fg = featureGetter
        distinctFeatureList = fg.getDistinctFeatures() #access single idx
        # feature -> value used in the ELSE branch of that feature's CASE
        featZeroDict = dict(fg.getFeatureZeros())
        
        sql = "SELECT %s, "%(fg.correl_field)

        def makeCaseStrings( distinctFeature ):
            # Builds the CASE column for one feature row; the feature name is its first element.
            df = distinctFeature[0]
            # fallback ELSE value for features missing from featZeroDict
            zero = .0000001
            if df in featZeroDict:
                zero = featZeroDict[df]
            df = MySQLdb.escape_string(df)
            if df:#debug
                return "( CASE feat WHEN '%s' THEN %s ELSE %s END ) AS '%s'"%(df, featureValueField, str(zero), df)
            # NOTE(review): an empty feature name yields '' here, which leaves an
            # empty item in the comma-joined select list below -- confirm intent.
            return ''

        case_statements = map(makeCaseStrings, distinctFeatureList)
        sql_cases_features = ", ".join(case_statements) + " "
        #debugN = 1000 #DEBUG
        #_warn( distinctFeatureList[0:debugN] ) #DEBUG
        #sql_cases_features = "".join(case_statements[0:debugN]) #DEBUG
        
        # update the main sql statement to select distinct features as columns
        sql += sql_cases_features

        # filter out the outcomes based on the outcome_filter_where statement
        # an example would be outcome_filter_where = "self.featureValueField is not null and self.featureValueField > 0.50"
        sql_new_outcome_correl_ids = "( SELECT %s FROM %s "%(self.correl_field, self.outcome_table)
        if outcome_filter_where: sql_new_outcome_correl_ids += "WHERE " + outcome_filter_where
        sql_new_outcome_correl_ids += ")"

        # This piece takes care of "users with > 4000 words"
        sql_filtered_feature_table = fg.featureTable
        if feature_value_group_sum_min > 0:
            # Get a sum of "value" for each group_id
            sql_group_ids_and_value_counts = "( SELECT %s, SUM(value) AS value_sum FROM %s GROUP BY %s )"%(fg.correl_field, fg.featureTable, fg.correl_field)
            # Select group_id that have a "value_sum" > N (strictly greater; N is the function argument feature_value_group_sum_min)
            sql_group_ids_with_sufficient_value = "( SELECT %s FROM %s AS groupIdsAndSums WHERE value_sum > %s )"%(fg.correl_field, sql_group_ids_and_value_counts, feature_value_group_sum_min)
            # Select the subset of the original fg.featureTable where group_id meets the "value_sum > N" condition
            sql_filtered_feature_table = "( SELECT featuresOriginal.* FROM %s AS featuresOriginal, %s AS featuresSubset WHERE featuresOriginal.%s = featuresSubset.%s )"%( fg.featureTable, sql_group_ids_with_sufficient_value, fg.correl_field, fg.correl_field)


        # update the feature table to contain only the outcomes from the filtered id's
        sql_filtered_feature_table_2 = "( SELECT filteredFeatures.* FROM %s AS filteredFeatures, %s AS filteredOutcomes WHERE filteredFeatures.%s = filteredOutcomes.%s)"%(sql_filtered_feature_table, sql_new_outcome_correl_ids, fg.correl_field, self.correl_field)

        # finish the original query with the updated feature table
        # NOTE(review): the CASE columns are selected without an aggregate while
        # grouping by the correl field -- verify this gives the intended per-group values.
        sql += "FROM %s AS updatedFeatures GROUP BY %s"%(sql_filtered_feature_table_2, fg.correl_field)
        return [distinctFeatureList, mm.executeGetList(self.corpdb, self.dbCursor, sql, False, charset=self.encoding, use_unicode=self.use_unicode)]
コード例 #24
0
    def makeContingencyTable(self, featureGetter, featureValueField, outcome_filter_where='', feature_value_group_sum_min=0):
        """makes a contingency table from this outcome value, a featureGetter, and the desired column of the featureGetter, assumes both correl_field's are the same

        Builds a single SELECT with one CASE column per distinct feature, reading
        from the feature table restricted to the ids found in self.outcome_table
        (optionally filtered by outcome_filter_where) and, when
        feature_value_group_sum_min > 0, to the ids whose SUM(value) is strictly
        greater than that minimum.

        :param featureGetter: object providing getDistinctFeatures(), getFeatureZeros(), correl_field and featureTable
        :param featureValueField: column expression emitted by each CASE when the row's feat matches
        :param outcome_filter_where: optional SQL condition applied to self.outcome_table
        :param feature_value_group_sum_min: minimum (exclusive) per-id sum of value; 0 disables the filter
        :returns: two-element list [distinctFeatureList, rows returned by the query]
        """
        """follows http://www.artfulsoftware.com/infotree/queries.php at section: Group Column Statistics in Rows"""
        """the only time this uses outcome_value's would be in the outcome_filter_where statement"""
        fg = featureGetter
        distinctFeatureList = fg.getDistinctFeatures() #access single idx
        # feature -> value used in the ELSE branch of that feature's CASE
        featZeroDict = dict(fg.getFeatureZeros())
        
        sql = "SELECT %s, "%(fg.correl_field)

        def makeCaseStrings( distinctFeature ):
            # Builds the CASE column for one feature row; the feature name is its first element.
            df = distinctFeature[0]
            # fallback ELSE value for features missing from featZeroDict
            zero = .0000001
            if df in featZeroDict:
                zero = featZeroDict[df]
            df = MySQLdb.escape_string(df)
            if df:#debug
                return "( CASE feat WHEN '%s' THEN %s ELSE %s END ) AS '%s'"%(df, featureValueField, str(zero), df)
            # NOTE(review): an empty feature name yields '' here, which leaves an
            # empty item in the comma-joined select list below -- confirm intent.
            return ''

        case_statements = map(makeCaseStrings, distinctFeatureList)
        sql_cases_features = ", ".join(case_statements) + " "
        #debugN = 1000 #DEBUG
        #_warn( distinctFeatureList[0:debugN] ) #DEBUG
        #sql_cases_features = "".join(case_statements[0:debugN]) #DEBUG
        
        # update the main sql statement to select distinct features as columns
        sql += sql_cases_features

        # filter out the outcomes based on the outcome_filter_where statement
        # an example would be outcome_filter_where = "self.featureValueField is not null and self.featureValueField > 0.50"
        sql_new_outcome_correl_ids = "( SELECT %s FROM %s "%(self.correl_field, self.outcome_table)
        if outcome_filter_where: sql_new_outcome_correl_ids += "WHERE " + outcome_filter_where
        sql_new_outcome_correl_ids += ")"

        # This piece takes care of "users with > 4000 words"
        sql_filtered_feature_table = fg.featureTable
        if feature_value_group_sum_min > 0:
            # Get a sum of "value" for each group_id
            sql_group_ids_and_value_counts = "( SELECT %s, SUM(value) AS value_sum FROM %s GROUP BY %s )"%(fg.correl_field, fg.featureTable, fg.correl_field)
            # Select group_id that have a "value_sum" > N (strictly greater; N is the function argument feature_value_group_sum_min)
            sql_group_ids_with_sufficient_value = "( SELECT %s FROM %s AS groupIdsAndSums WHERE value_sum > %s )"%(fg.correl_field, sql_group_ids_and_value_counts, feature_value_group_sum_min)
            # Select the subset of the original fg.featureTable where group_id meets the "value_sum > N" condition
            sql_filtered_feature_table = "( SELECT featuresOriginal.* FROM %s AS featuresOriginal, %s AS featuresSubset WHERE featuresOriginal.%s = featuresSubset.%s )"%( fg.featureTable, sql_group_ids_with_sufficient_value, fg.correl_field, fg.correl_field)


        # update the feature table to contain only the outcomes from the filtered id's
        sql_filtered_feature_table_2 = "( SELECT filteredFeatures.* FROM %s AS filteredFeatures, %s AS filteredOutcomes WHERE filteredFeatures.%s = filteredOutcomes.%s)"%(sql_filtered_feature_table, sql_new_outcome_correl_ids, fg.correl_field, self.correl_field)

        # finish the original query with the updated feature table
        # NOTE(review): the CASE columns are selected without an aggregate while
        # grouping by the correl field -- verify this gives the intended per-group values.
        sql += "FROM %s AS updatedFeatures GROUP BY %s"%(sql_filtered_feature_table_2, fg.correl_field)
        return [distinctFeatureList, mm.executeGetList(self.corpdb, self.dbCursor, sql, False, charset=self.encoding, use_unicode=self.use_unicode)]
コード例 #25
0
 def countGroups(self, groupThresh = 0, where=''):
     """Return the number of distinct groups.

     With a truthy ``groupThresh`` the groups are counted from
     ``self.getGroupWordCounts(where)``, keeping those whose word count is
     at least ``groupThresh``.  Otherwise the distinct correl_field values
     are counted directly on corptable (to be accurate).

     :param groupThresh: minimum word count for a group to be counted
     :param where: optional SQL condition (without the WHERE keyword)
     """
     if not groupThresh:
         query = "select count(DISTINCT %s) from %s" % (self.correl_field, self.corptable)
         if where:
             query = query + ' WHERE ' + where
         rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                                  charset=self.encoding, use_unicode=self.use_unicode)
         return rows[0][0]

     word_counts = self.getGroupWordCounts(where)
     return sum(1 for n_words in word_counts.itervalues() if n_words >= groupThresh)
コード例 #26
0
ファイル: featureWorker.py プロジェクト: xionglong58/dlatk
    def makeBlackWhiteList(args_featlist, args_lextable, args_categories,
                           args_lexdb, args_use_unicode, args_encoding=None):
        """Build a black/white list of features from a lexicon table or an explicit list.

        If both ``args_lextable`` and ``args_categories`` are given, the list
        holds the ``term`` column of the lexicon table (all terms when the
        first category is ``'*'``, otherwise only the listed categories).
        Otherwise each entry of ``args_featlist`` is upper-cased when more
        than half of its characters are upper case and lower-cased if not.
        All entries are stripped of surrounding whitespace.

        :param args_featlist: explicit features (may be None when a lexicon is used)
        :param args_lextable: lexicon table name
        :param args_categories: lexicon categories to include, or ``['*']``
        :param args_lexdb: database holding the lexicon table
        :param args_use_unicode: decode byte-string features as utf-8 and
            request unicode results from the lexicon connection
        :param args_encoding: charset for the lexicon connection; when None
            no charset is passed to ``mm``
        :returns: list of feature strings
        :raises Exception: when neither a lexicon nor a feature list is given
        """

        def _decode(feat):
            # only byte strings need decoding; anything else is passed through
            if args_use_unicode and isinstance(feat, str):
                return unicode(feat, 'utf-8')
            return feat

        # args_featlist may legitimately be None when a lexicon table is used
        featlist = args_featlist or []
        print("making black or white list: [%s] [%s] [%s]" % (
            [_decode(feat) for feat in featlist], args_lextable,
            args_categories))

        newlist = set()
        if args_lextable and args_categories:
            # This function has no `self`; connection options come from the
            # arguments instead.
            db_kwargs = {'use_unicode': args_use_unicode}
            if args_encoding is not None:
                db_kwargs['charset'] = args_encoding
            (conn, cur, dcur) = mm.dbConnect(args_lexdb, **db_kwargs)
            sql = 'SELECT term FROM %s' % (args_lextable)
            if args_categories[0] != '*':
                # NOTE(review): category values are interpolated unescaped
                quoted = ','.join(
                    "'" + str(cat) + "'" for cat in args_categories)
                sql = 'SELECT term FROM %s WHERE category in (%s)' % (
                    args_lextable, quoted)

            for row in mm.executeGetList(args_lexdb, cur, sql, **db_kwargs):
                newlist.add(row[0])
        elif featlist:
            for feat in featlist:
                feat = _decode(feat)
                # keep mostly-upper-case features upper case, lower-case the rest
                upper_count = sum(1 for ch in feat if ch.isupper())
                newlist.add(feat.upper() if upper_count > (len(feat) / 2)
                            else feat.lower())
        else:
            raise Exception(
                'blacklist / whitelist flag specified without providing features.'
            )
        return [w.strip() for w in newlist]
コード例 #27
0
ファイル: featureWorker.py プロジェクト: xionglong58/dlatk
 def getNumWordsByCorrelField(self, where=''):
     """Return (correl_field, sum(num_words)) rows computed from the corpus table.

     Assumes corptable has a num_words field for each message.  Query shape:
     SELECT user_id, sum(num_words) FROM (SELECT user_id, num_words FROM messages GROUP BY message_id) as a GROUP BY user_id

     :param where: optional SQL condition applied inside the sub-select
     """
     group_col = self.correl_field
     pieces = ["SELECT %s, sum(num_words) FROM (SELECT %s, num_words FROM %s " % (
         group_col, group_col, self.corptable)]
     if where:
         pieces.append(' WHERE ' + where)
     # inner grouping is per message, outer grouping per correl field
     pieces.append(" GROUP BY %s) as a " % self.messageid_field)
     pieces.append(" GROUP BY %s " % group_col)
     query = ''.join(pieces)
     return mm.executeGetList(self.corpdb, self.dbCursor, query,
                              charset=self.encoding,
                              use_unicode=self.use_unicode)
コード例 #28
0
 def getFeatMeanData(self, where=''):
     """Return a dict mapping feature => (mean, std, zero_feat_norm).

     Rows are read from the ``mean_<featureTable>`` table.

     :param where: optional SQL condition (without the WHERE keyword)
     """
     query = "select feat, mean, std, zero_feat_norm from %s" % (
         'mean_' + self.featureTable)
     if where:
         query = query + ' WHERE ' + where
     rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                              charset=self.encoding,
                              use_unicode=self.use_unicode)
     # first column is the key, the remaining columns are the value
     return dict((row[0], row[1:]) for row in rows)
コード例 #29
0
ファイル: featureWorker.py プロジェクト: xionglong58/dlatk
 def getMessagesForCorrelField(self,
                               cf_id,
                               messageTable=None,
                               warnMsg=True):
     """Select (message id, message) rows whose correl field equals cf_id.

     :param cf_id: value compared against self.correl_field (interpolated as a quoted literal)
     :param messageTable: table to query; self.corptable when not given
     :param warnMsg: forwarded positionally to mm.executeGetList
     :returns: whatever mm.executeGetList returns for the query
     """
     table = messageTable or self.corptable
     query = "SELECT %s, %s FROM %s" % (
         self.messageid_field, self.message_field, table)
     query += " WHERE %s = '%s'" % (self.correl_field, cf_id)
     return mm.executeGetList(
         self.corpdb, self.dbCursor, query, warnMsg,
         charset=self.encoding, use_unicode=self.use_unicode)
コード例 #30
0
    def getFeatureCounts(self,
                         groupFreqThresh=0,
                         where='',
                         SS=False,
                         groups=set()):
        """
        Gets per-feature row counts from the feature table

        Args:
            groupFreqThresh (int): Minimum number of words a group must contain to be considered valid;
                groups reaching it (per self.getGroupWordCounts) are added to the group filter
            where (string): Conditional sql string (without the WHERE keyword) to limit the search
            SS (boolean): When true the query is additionally issued through mm.executeGetSSCursor
            groups (set): Set of group ID's to restrict the count to; it is never modified
        Returns:
            returns a list of (feature, count) tuples,
            where count is the number of feature-table rows for that feature
            among the selected groups

        """
        # Work on a private copy: the original added to `groups` in place, so
        # the shared default set (or a caller-owned set) accumulated group ids
        # from one call to the next.
        selected = set(groups) if groups else set()

        if groupFreqThresh:
            groupCnts = self.getGroupWordCounts(where)
            for group, wordCount in groupCnts.iteritems():
                if wordCount >= groupFreqThresh:
                    selected.add(group)

        # Build a single WHERE clause. The original did
        # `where += ' WHERE ' + where`, producing "<cond> WHERE <cond>".
        conditions = []
        if where:
            conditions.append(where)
        if selected:
            conditions.append("group_id in ('%s')" % "','".join(
                str(g) for g in selected))
        whereClause = ''
        if conditions:
            whereClause = ' WHERE ' + ' AND '.join(conditions)

        sql = """select feat, count(*) from %s %s group by feat""" % (
            self.featureTable, whereClause)
        if SS:
            # NOTE(review): the cursor returned here is discarded and the query
            # runs again below; kept as-is so the return type stays a list.
            # Confirm whether callers were meant to receive the SS cursor.
            mm.executeGetSSCursor(self.corpdb,
                                  sql,
                                  charset=self.encoding,
                                  use_unicode=self.use_unicode,
                                  host=self.mysql_host)
        return mm.executeGetList(self.corpdb,
                                 self.dbCursor,
                                 sql,
                                 charset=self.encoding,
                                 use_unicode=self.use_unicode)
コード例 #31
0
ファイル: outcomeGetter.py プロジェクト: xionglong58/dlatk
 def getDistinctOutcomeValueCounts(self, outcome = None, requireControls = False, includeNull = True, where = ''):
     """Count outcome-table rows per distinct value of an outcome column.

     Args:
         outcome: outcome column to count; defaults to
             self.outcome_value_fields[0].
         requireControls: when true, every column in self.outcome_controls
             must be NOT NULL.
         includeNull: when false, rows whose outcome is NULL are excluded.
         where: optional SQL condition (without the WHERE keyword).

     Returns:
         dict of outcome_value => count.
     """
     if not outcome:
         outcome = self.outcome_value_fields[0]

     conditions = []
     if where:
         conditions.append(where)
     if requireControls:
         conditions.extend("%s IS NOT NULL" % control
                           for control in self.outcome_controls)
     if not includeNull:
         conditions.append("%s IS NOT NULL" % outcome)

     sql = "SELECT %s, count(*) FROM %s" % (outcome, self.outcome_table)
     # Only emit WHERE when there is something to put after it: with
     # requireControls set but no controls configured the original produced a
     # dangling "WHERE" followed directly by "group by".
     if conditions:
         sql += ' WHERE ' + ' AND '.join(conditions)
     sql += ' group by %s ' % outcome
     return dict(mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode))
コード例 #32
0
 def getValuesAndGroupNormsForFeats(self, feats, where='', warnMsg=False):
     """Return (group_id, value, group_norm) rows for the given features.

     Args:
         feats: iterable of feature names; each one is escaped with
             MySQLdb.escape_string (after unicode() conversion when
             self.use_unicode is set).
         where: optional extra SQL condition, ANDed onto the feature filter.
         warnMsg: passed positionally to mm.executeGetList after the query.
     """
     if self.use_unicode:
         escaped = [MySQLdb.escape_string(unicode(f)) for f in feats]
     else:
         escaped = [MySQLdb.escape_string(f) for f in feats]
     featFilter = " feat in ('%s')" % "','".join(escaped)

     query = """SELECT group_id, value, group_norm FROM %s WHERE %s""" % (
         self.featureTable, featFilter)
     if where:
         query += ' AND ' + where
     return mm.executeGetList(self.corpdb, self.dbCursor, query, warnMsg,
                              charset=self.encoding,
                              use_unicode=self.use_unicode)
コード例 #33
0
 def getDistinctOutcomeValueCounts(self, outcome = None, requireControls = False, includeNull = True, where = ''):
     """Count outcome-table rows per distinct value of an outcome column.

     Args:
         outcome: outcome column to count; defaults to
             self.outcome_value_fields[0].
         requireControls: when true, every column in self.outcome_controls
             must be NOT NULL.
         includeNull: when false, rows whose outcome is NULL are excluded.
         where: optional SQL condition (without the WHERE keyword).

     Returns:
         dict of outcome_value => count.
     """
     if not outcome:
         outcome = self.outcome_value_fields[0]

     conditions = []
     if where:
         conditions.append(where)
     if requireControls:
         conditions.extend("%s IS NOT NULL" % control
                           for control in self.outcome_controls)
     if not includeNull:
         conditions.append("%s IS NOT NULL" % outcome)

     sql = "SELECT %s, count(*) FROM %s" % (outcome, self.outcome_table)
     # Only emit WHERE when there is something to put after it: with
     # requireControls set but no controls configured the original produced a
     # dangling "WHERE" followed directly by "group by".
     if conditions:
         sql += ' WHERE ' + ' AND '.join(conditions)
     sql += ' group by %s ' % outcome
     return dict(mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode))
コード例 #34
0
 def countGroups(self, groupThresh=0, where=''):
     """Return the number of distinct groups.

     Without a threshold this counts DISTINCT correl_field values in the
     corpus table (it runs on the corptable to be accurate). With a
     threshold it counts the entries of self.getGroupWordCounts(where)
     whose word count is at least groupThresh.

     Args:
         groupThresh: minimum word count for a group to be counted; 0
             disables the threshold.
         where: optional SQL condition (without the WHERE keyword).
     """
     if not groupThresh:
         query = """select count(DISTINCT %s) from %s""" % (self.correl_field,
                                                           self.corptable)
         if where:
             query += ' WHERE ' + where
         rows = mm.executeGetList(self.corpdb, self.dbCursor, query,
                                  charset=self.encoding,
                                  use_unicode=self.use_unicode)
         return rows[0][0]

     wordCounts = self.getGroupWordCounts(where)
     return sum(1 for total in wordCounts.itervalues()
                if total >= groupThresh)
コード例 #35
0
    def getFeatureCounts(self, groupFreqThresh = 0, where = '', SS = False, groups = set()):
        """Return a list of (feature, count) tuples from the feature table.

        count is the number of feature-table rows for the feature among the
        selected groups (i.e. the number of groups with the feature, assuming
        one row per group and feature).

        Args:
            groupFreqThresh: minimum word count for a group; groups reaching
                it (per self.getGroupWordCounts) are added to the group filter.
            where: optional SQL condition (without the WHERE keyword).
            SS: when true the query is additionally issued through
                mm.executeGetSSCursor.
            groups: set of group ids to restrict the count to; never modified.
        """
        # Work on a private copy: the original added to `groups` in place, so
        # the shared default set (or a caller-owned set) accumulated group ids
        # from one call to the next.
        selected = set(groups) if groups else set()

        if groupFreqThresh:
            groupCnts = self.getGroupWordCounts(where)
            for group, wordCount in groupCnts.iteritems():
                if wordCount >= groupFreqThresh:
                    selected.add(group)

        # Build a single WHERE clause. The original did
        # `where += ' WHERE ' + where`, producing "<cond> WHERE <cond>".
        conditions = []
        if where:
            conditions.append(where)
        if selected:
            conditions.append("group_id in ('%s')" % "','".join(str(g) for g in selected))
        whereClause = ''
        if conditions:
            whereClause = ' WHERE ' + ' AND '.join(conditions)

        sql = """select feat, count(*) from %s %s group by feat"""%(self.featureTable, whereClause)
        if SS:
            # NOTE(review): the cursor returned here is discarded and the query
            # runs again below; kept as-is so the return type stays a list.
            # Confirm whether callers were meant to receive the SS cursor.
            mm.executeGetSSCursor(self.corpdb, sql, charset=self.encoding, use_unicode=self.use_unicode)
        return mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
コード例 #36
0
    def addFeatTableMeans(self, field='group_norm', groupNorms = None):
        """(Re)build the 'mean$' + featureTable table and fill it from self.findMeans.

        The table is dropped if it exists and recreated with columns
        feat, mean, std and zero_feat_norm; feat is the primary key and takes
        its column type from the feature table's own feat column.

        Args:
            field: passed through to self.findMeans.
            groupNorms: passed through to self.findMeans.

        Returns:
            whatever self.findMeans(field, True, groupNorms) returned; its
            values are indexed 0..2 to fill mean, std and zero_feat_norm.
        """
        statsTable = 'mean$' + self.featureTable

        # start from a clean table
        mm.execute(self.corpdb, self.dbCursor, "DROP TABLE IF EXISTS %s" % statsTable, charset=self.encoding, use_unicode=self.use_unicode)
        # second field of the SHOW COLUMNS row is reused as the column type
        featColumn = mm.executeGetList(self.corpdb, self.dbCursor, "SHOW COLUMNS FROM %s like 'feat'" % self.featureTable)
        featType = featColumn[0][1]
        createSql = """CREATE TABLE %s (feat %s, mean DOUBLE, std DOUBLE, zero_feat_norm DOUBLE, PRIMARY KEY (`feat`))""" % (statsTable, featType)
        mm.execute(self.corpdb, self.dbCursor, createSql, charset=self.encoding, use_unicode=self.use_unicode)

        fMeans = self.findMeans(field, True, groupNorms)
        rows = []
        for feat, stats in fMeans.iteritems():
            rows.append((feat, stats[0], stats[1], stats[2]))

        # the %s markers below are placeholders for executeWriteMany, not string formatting
        insertSql = """INSERT INTO """ + statsTable + """ (feat, mean, std, zero_feat_norm) VALUES (%s, %s, %s, %s)"""
        mm.executeWriteMany(self.corpdb, self.dbCursor, insertSql, rows, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)

        return fMeans
コード例 #37
0
ファイル: outcomeGetter.py プロジェクト: xionglong58/dlatk
    def getDistinctOutcomeAndControlValueCounts(self, outcome = None, control = None, includeNull = True, where = ''):
        """Count outcome-table rows for every (outcome value, control value) pair.

        Args:
            outcome: outcome column; defaults to self.outcome_value_fields[0].
            control: control column; defaults to self.outcome_controls[0].
            includeNull: when false, rows with a NULL outcome or NULL control
                are excluded.
            where: optional SQL condition (without the WHERE keyword).

        Returns:
            nested dict: outcome_value => {control_value => count}.
        """
        #TODO: multiple controls
        outcomeField = outcome if outcome else self.outcome_value_fields[0]
        controlField = control if control else self.outcome_controls[0]

        conditions = []
        if where:
            conditions.append(where)
        if not includeNull:
            conditions.append("%s IS NOT NULL" % outcomeField)
            conditions.append("%s IS NOT NULL" % controlField)

        query = "SELECT %s, %s, count(*) FROM %s" % (outcomeField, controlField, self.outcome_table)
        if conditions:
            query += ' WHERE ' + ' AND '.join(conditions)
        query += ' group by %s, %s ' % (outcomeField, controlField)

        countDict = dict()
        rows = mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
        for outcomeValue, controlValue, count in rows:
            countDict.setdefault(outcomeValue, dict())[controlValue] = count
        return countDict
コード例 #38
0
    def getDistinctOutcomeAndControlValueCounts(self, outcome = None, control = None, includeNull = True, where = ''):
        """Count outcome-table rows for every (outcome value, control value) pair.

        Args:
            outcome: outcome column; defaults to self.outcome_value_fields[0].
            control: control column; defaults to self.outcome_controls[0].
            includeNull: when false, rows with a NULL outcome or NULL control
                are excluded.
            where: optional SQL condition (without the WHERE keyword).

        Returns:
            nested dict: outcome_value => {control_value => count}.
        """
        #TODO: multiple controls
        outcomeField = outcome if outcome else self.outcome_value_fields[0]
        controlField = control if control else self.outcome_controls[0]

        conditions = []
        if where:
            conditions.append(where)
        if not includeNull:
            conditions.append("%s IS NOT NULL" % outcomeField)
            conditions.append("%s IS NOT NULL" % controlField)

        query = "SELECT %s, %s, count(*) FROM %s" % (outcomeField, controlField, self.outcome_table)
        if conditions:
            query += ' WHERE ' + ' AND '.join(conditions)
        query += ' group by %s, %s ' % (outcomeField, controlField)

        countDict = dict()
        rows = mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
        for outcomeValue, controlValue, count in rows:
            countDict.setdefault(outcomeValue, dict())[controlValue] = count
        return countDict
コード例 #39
0
ファイル: outcomeGetter.py プロジェクト: xionglong58/dlatk
 def getGroupAndOutcomeValues(self, outcomeField = None, where=''):
     """Return (group_id, outcome_value) rows whose outcome is not NULL.

     Args:
         outcomeField: outcome column to read; defaults to
             self.outcome_value_fields[0].
         where: optional extra SQL condition, ANDed onto the NOT NULL filter.
     """
     field = outcomeField or self.outcome_value_fields[0]
     query = "select %s, %s from `%s` WHERE %s IS NOT NULL" % (
         self.correl_field, field, self.outcome_table, field)
     if where:
         query = ' AND '.join([query, where])
     # the positional False sits in the slot right after the query
     return mm.executeGetList(self.corpdb, self.dbCursor, query, False, charset=self.encoding, use_unicode=self.use_unicode)
コード例 #40
0
 def getFeatureZeros(self, where=''):
     """Return (feat, zero_feat_norm) rows from the 'mean_' + featureTable table.

     Args:
         where: optional SQL condition (without the WHERE keyword).
     """
     meanTable = 'mean_' + self.featureTable
     query = "select feat, zero_feat_norm from %s" % meanTable
     if where:
         query = query + ' WHERE ' + where
     return mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
コード例 #41
0
 def getDistinctGroupsFromFeatTable(self, where=""):
     """Return a list of the distinct group ids present in the feature table.

     Args:
         where: optional SQL condition (without the WHERE keyword).
     """
     query = "select distinct group_id from %s" % self.featureTable
     if where:
         query += ' WHERE ' + where
     rows = mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
     # unwrap the single-column rows
     return [row[0] for row in rows]
コード例 #42
0
 def getDistinctGroups(self, where=''):
     """Return a list of the distinct groups (this runs on the corptable to be accurate).

     Args:
         where: optional SQL condition (without the WHERE keyword).
     """
     query = """select DISTINCT %s from %s""" % (self.correl_field, self.corptable)
     if where:
         query += ' WHERE ' + where
     rows = mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
     # unwrap the single-column rows
     return [row[0] for row in rows]
コード例 #43
0
    def ttestWithOtherFG(self, other, maskTable= None, groupFreqThresh = 0):
        """Paired t-test, per shared feature, between this table's group norms and `other`'s.

        Args:
            other: object offering the same methods used here
                (getGroupWordCounts, getWordTable, getDistinctFeatures,
                yieldGroupNormsWithZerosByFeat) and a featureTable attribute.
            maskTable: optional table holding a correl_field column plus one
                column per mask; a group belongs to a mask when its cell
                equals 1.
            groupFreqThresh: minimum word count a group needs in BOTH tables.

        Returns:
            dict: mask id => {feat => {'t': ..., 'p': ..., 'd': ..., 'N': ...}};
            the key 'no mask' is always present.

        Raises:
            AssertionError: if correl_field is not a column of maskTable, if
                no group passes the threshold, or if the two feature
                iterators disagree on feature or groups.
        """
        # to-do: switch for paired ttest or not

        # mask id => set of member group ids; 'no mask' stays empty
        masks = {'no mask': set()}
        if maskTable:
            columns = mm.getTableColumnNameList(self.corpdb, self.dbCursor, maskTable, charset=self.encoding, use_unicode=self.use_unicode)
            print(columns)
            assert self.correl_field in columns, "group field, %s, not in mask table" % self.correl_field
            columnIndex = dict((name, pos) for pos, name in enumerate(columns))
            groupIndex = columnIndex[self.correl_field]

            # read the whole mask table
            sql = """SELECT %s FROM %s""" % (', '.join(columns), maskTable)
            maskRows = mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
            for name in columns:
                if name != self.correl_field:
                    masks[name] = set()
            for row in maskRows:
                groupId = row[groupIndex]
                for pos, flag in enumerate(row):
                    if pos != groupIndex and flag == 1:
                        masks[columns[pos]].add(groupId)

        results = dict()  # mask id => per-feature results
        for mid, mask in masks.iteritems():

            threshGroups1 = set()
            threshGroups2 = set()

            # collect the groups passing the threshold in each table
            # NOTE(review): with groupFreqThresh == 0 both sets stay empty, so
            # the "No groups" assertion below always fires - confirm intent.
            if groupFreqThresh:
                print('groupFreqThresh set to ' + str(groupFreqThresh))
                groupCnts1 = self.getGroupWordCounts(lexicon_count_table=self.getWordTable(self.featureTable.split('$')[2]))
                threshGroups1 = set(group for group, wordCount in groupCnts1.iteritems()
                                    if wordCount >= groupFreqThresh)
                groupCnts2 = other.getGroupWordCounts(lexicon_count_table=other.getWordTable(other.featureTable.split('$')[2]))
                threshGroups2 = set(group for group, wordCount in groupCnts2.iteritems()
                                    if wordCount >= groupFreqThresh)

            print(str(len(threshGroups1)) + ' groups pass groupFreqThresh for feat table 1')
            print(str(len(threshGroups2)) + ' groups pass groupFreqThresh for feat table 2')
            threshGroups = threshGroups1 & threshGroups2
            # NOTE(review): an empty named mask is falsy and therefore not applied.
            if mask:
                threshGroups = threshGroups & mask
            threshGroups = list(threshGroups)
            print(str(len(threshGroups)) + ' groups pass groupFreqThresh for BOTH')
            assert threshGroups, "No groups passing frequency threshold"

            # features present in both tables
            feats1 = self.getDistinctFeatures()
            feats2 = other.getDistinctFeatures()
            featsInCommon = list(set(feats1) & set(feats2))

            ttestResults = dict()

            featYielder1 = self.yieldGroupNormsWithZerosByFeat(groups = threshGroups, feats = featsInCommon)
            featYielder2 = other.yieldGroupNormsWithZerosByFeat(groups = threshGroups, feats = featsInCommon)

            # both yielders are advanced in lock step, one feature at a time
            for (feat1, dataDict1, Nfeats1) in featYielder1:
                (feat2, dataDict2, Nfeats2) = featYielder2.next()

                assert feat1 == feat2, 'feats do not match'
                assert sorted(dataDict1) == sorted(dataDict2), 'groups do not match'

                # group norms ordered by group id so that the pairs line up
                gns1 = [dataDict1[group] for group in sorted(dataDict1)]
                gns2 = [dataDict2[group] for group in sorted(dataDict2)]

                #t,p = ttest_rel(gns1,gns2)
                t, p, d = self.pairedTTest(gns1, gns2)
                ttestResults[feat1] = {'t': t, 'p': p, 'd': d, 'N': len(gns1)}

            results[mid] = ttestResults  # dict for each feat

        return results
コード例 #44
0
 def getSumValuesByFeat(self, where = ''):
     """Return (feat, sum(value)) rows from the feature table, one per feature.

     Args:
         where: optional SQL condition (without the WHERE keyword), applied
             before grouping.
     """
     pieces = ["""SELECT feat, sum(value) FROM %s """ % self.featureTable]
     if where:
         pieces.append(' WHERE ' + where)
     pieces.append(""" GROUP BY feat """)
     query = ''.join(pieces)
     return mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
コード例 #45
0
 def getGroupAndOutcomeValues(self, outcomeField = None, where=''):
     """Return (group_id, outcome_value) rows whose outcome is not NULL.

     Args:
         outcomeField: outcome column to read; defaults to
             self.outcome_value_fields[0].
         where: optional extra SQL condition, ANDed onto the NOT NULL filter.
     """
     field = outcomeField or self.outcome_value_fields[0]
     query = "select %s, %s from `%s` WHERE %s IS NOT NULL" % (
         self.correl_field, field, self.outcome_table, field)
     if where:
         query = ' AND '.join([query, where])
     # the positional False sits in the slot right after the query
     return mm.executeGetList(self.corpdb, self.dbCursor, query, False, charset=self.encoding, use_unicode=self.use_unicode)
コード例 #46
0
 def getSumValue(self, where = ''):
     """Return the sum of the value column over the feature table.

     Args:
         where: optional SQL condition (without the WHERE keyword).
     """
     query = """select sum(value) from %s""" % (self.featureTable)
     if where:
         query = query + ' WHERE ' + where
     rows = mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
     # single row, single column
     return rows[0][0]
コード例 #47
0
 def getFeatureValueSums(self, where = ''):
     """Return a list of (feature, sum of value) tuples from the feature table.

     Args:
         where: optional SQL condition (without the WHERE keyword), applied
             to the rows before they are grouped by feature.
     """
     sql = """select feat, sum(value) from %s""" % (self.featureTable)
     # WHERE has to come before GROUP BY; the original appended it after
     # "group by feat", which is not valid SQL.
     if where:
         sql += ' WHERE ' + where
     sql += ' group by feat'
     return mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
コード例 #48
0
 def getFeatAll(self, where = ''):
     """Return (group_id, feat, value, group_norm) rows from the feature table.

     Args:
         where: optional SQL condition (without the WHERE keyword).
     """
     query = """select group_id, feat, value, group_norm from %s""" % (self.featureTable)
     if where:
         query = query + ' WHERE ' + where
     return mm.executeGetList(self.corpdb, self.dbCursor, query, charset=self.encoding, use_unicode=self.use_unicode)
コード例 #49
0
 def getGroupAndFeatureValues(self, featName=None, where=''):
     """Return (group_id, group_norm) rows for a single feature.

     Args:
         featName: feature to select; defaults to self.featNames[0]. It is
             interpolated into the query between single quotes, unescaped.
         where: optional extra SQL condition, ANDed onto the feature filter.
     """
     feature = featName if featName else self.featNames[0]
     query = "select group_id, group_norm from %s WHERE feat = '%s'" % (self.featureTable, feature)
     if where:
         query = ' AND '.join([query, where])
     # the positional False sits in the slot right after the query
     return mm.executeGetList(self.corpdb, self.dbCursor, query, False, charset=self.encoding, use_unicode=self.use_unicode)
コード例 #50
0
    def ttestWithOtherFG(self, other, maskTable=None, groupFreqThresh=0):
        """Paired t-test, per shared feature, between this table's group norms and `other`'s.

        Args:
            other: object offering the same methods used here
                (getGroupWordCounts, getWordTable, getDistinctFeatures,
                yieldGroupNormsWithZerosByFeat) and a featureTable attribute.
            maskTable: optional table holding a correl_field column plus one
                column per mask; a group belongs to a mask when its cell
                equals 1.
            groupFreqThresh: minimum word count a group needs in BOTH tables.

        Returns:
            dict: mask id => {feat => {'t': ..., 'p': ..., 'd': ..., 'N': ...}};
            the key 'no mask' is always present.

        Raises:
            AssertionError: if correl_field is not a column of maskTable, if
                no group passes the threshold, or if the two feature
                iterators disagree on feature or groups.
        """
        # to-do: switch for paired ttest or not

        # mask id => set of member group ids; 'no mask' stays empty
        masks = {'no mask': set()}
        if maskTable:
            columns = mm.getTableColumnNameList(self.corpdb,
                                                self.dbCursor,
                                                maskTable,
                                                charset=self.encoding,
                                                use_unicode=self.use_unicode)
            print(columns)
            assert self.correl_field in columns, "group field, %s, not in mask table" % self.correl_field
            columnIndex = dict((name, pos) for pos, name in enumerate(columns))
            groupIndex = columnIndex[self.correl_field]

            # read the whole mask table
            sql = """SELECT %s FROM %s""" % (', '.join(columns), maskTable)
            maskRows = mm.executeGetList(self.corpdb,
                                         self.dbCursor,
                                         sql,
                                         charset=self.encoding,
                                         use_unicode=self.use_unicode)
            for name in columns:
                if name != self.correl_field:
                    masks[name] = set()
            for row in maskRows:
                groupId = row[groupIndex]
                for pos, flag in enumerate(row):
                    if pos != groupIndex and flag == 1:
                        masks[columns[pos]].add(groupId)

        results = dict()  # mask id => per-feature results
        for mid, mask in masks.iteritems():

            threshGroups1 = set()
            threshGroups2 = set()

            # collect the groups passing the threshold in each table
            # NOTE(review): with groupFreqThresh == 0 both sets stay empty, so
            # the "No groups" assertion below always fires - confirm intent.
            if groupFreqThresh:
                print('groupFreqThresh set to ' + str(groupFreqThresh))
                groupCnts1 = self.getGroupWordCounts(
                    lexicon_count_table=self.getWordTable(
                        self.featureTable.split('$')[2]))
                threshGroups1 = set(
                    group for group, wordCount in groupCnts1.iteritems()
                    if wordCount >= groupFreqThresh)
                groupCnts2 = other.getGroupWordCounts(
                    lexicon_count_table=other.getWordTable(
                        other.featureTable.split('$')[2]))
                threshGroups2 = set(
                    group for group, wordCount in groupCnts2.iteritems()
                    if wordCount >= groupFreqThresh)

            print(str(len(threshGroups1)) +
                  ' groups pass groupFreqThresh for feat table 1')
            print(str(len(threshGroups2)) +
                  ' groups pass groupFreqThresh for feat table 2')
            threshGroups = threshGroups1 & threshGroups2
            # NOTE(review): an empty named mask is falsy and therefore not applied.
            if mask:
                threshGroups = threshGroups & mask
            threshGroups = list(threshGroups)
            print(str(len(threshGroups)) +
                  ' groups pass groupFreqThresh for BOTH')
            assert threshGroups, "No groups passing frequency threshold"

            # features present in both tables
            feats1 = self.getDistinctFeatures()
            feats2 = other.getDistinctFeatures()
            featsInCommon = list(set(feats1) & set(feats2))

            ttestResults = dict()

            featYielder1 = self.yieldGroupNormsWithZerosByFeat(
                groups=threshGroups, feats=featsInCommon)
            featYielder2 = other.yieldGroupNormsWithZerosByFeat(
                groups=threshGroups, feats=featsInCommon)

            # both yielders are advanced in lock step, one feature at a time
            for (feat1, dataDict1, Nfeats1) in featYielder1:
                (feat2, dataDict2, Nfeats2) = featYielder2.next()

                assert feat1 == feat2, 'feats do not match'
                assert sorted(dataDict1) == sorted(
                    dataDict2), 'groups do not match'

                # group norms ordered by group id so that the pairs line up
                gns1 = [dataDict1[group] for group in sorted(dataDict1)]
                gns2 = [dataDict2[group] for group in sorted(dataDict2)]

                #t,p = ttest_rel(gns1,gns2)
                t, p, d = self.pairedTTest(gns1, gns2)
                ttestResults[feat1] = {'t': t, 'p': p, 'd': d, 'N': len(gns1)}

            results[mid] = ttestResults  # dict for each feat

        return results
コード例 #51
0
    def createTableWithBinnedFeats(self, num_bins, group_id_range, groupfreqthresh, valueFunc = lambda x:x, 
                                   gender=None, genderattack=False, reporting_percent=0.04, outcomeTable = fwc.DEF_OUTCOME_TABLE, skip_binning=False):
        """Create a copy of the feature table whose group_ids are merged into bins.

        Consecutive group_ids (in the order the database returns them) are
        merged until their summed N reaches total N / (num_bins + 2). Each
        bin is labelled '<lower>_<upper>' and written to a new table with the
        summed value, the N-weighted mean group_norm, a combined std_dev,
        the bin's N, and three extra columns: bin_center, bin_center_w and
        bin_width.

        Args:
            num_bins: requested number of bins; used in the new table's name
                and to size the bins (later rebound to the actual bin count).
            group_id_range: pair of values convertible to int; only used to
                build the new table's name.
            groupfreqthresh: not referenced by the active code.
            valueFunc: not referenced by the active code.
            gender: not referenced by the active code.
            genderattack: not referenced by the active code.
            reporting_percent: passed to fwc._getReportingInt to pace the
                progress reports.
            outcomeTable: not referenced by the active code.
            skip_binning: when true, return the table name without running
                any SQL.

        Returns:
            The name of the binned table.
        """
        featureTable = self.featureTable
        group_id_range = map(int, group_id_range)
        newTable = featureTable+'$'+str(num_bins)+'b_'+'_'.join(map(str,group_id_range))
        # With skip_binning only the name is computed; the table is not touched.
        if skip_binning: return newTable

        # Start from an empty table with the same definition as the feature table.
        sql = 'DROP TABLE IF EXISTS %s'%newTable
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        sql = "CREATE TABLE %s like %s" % (newTable, featureTable)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)

        #groupValues = self.getSumValuesByGroup(where) # [(gid1, val1), ...]
        # OLD N calculation.... same as new one....
        # sql = 'SELECT age, COUNT(DISTINCT user_id) from userstats_en_ageadj where age >= %d AND age <= %d AND uwt >= %d'%(group_id_range[0], group_id_range[1], groupfreqthresh)
        # if gender:
        #     gender = gender.lower()
        #     if gender == 'm':
        #         gender = 0
        #     elif gender == 'f':
        #         gender = 1
        #     sql += ' AND gender = %d'%gender
        # else:
        #     sql += ' AND gender IS NOT NULL'
        # sql += ' group by age'
        # groupValues = mm.executeGetList(self.corpdb, self.dbCursor, sql) # [(gid1, N1), ...]
        # groupIdToN = dict(groupValues)
        # One (group_id, N) row per group of the source feature table.
        groupNs = mm.executeGetList(self.corpdb, self.dbCursor, 'SELECT group_id, N FROM %s GROUP BY group_id'%self.featureTable, charset=self.encoding, use_unicode=self.use_unicode)
        groupIdToN = dict(groupNs)
        #pprint(groupIdToN)
        #pprint(groupIdToN)
        total_freq = sum(map(lambda x:x[1], groupNs))
        # Target N per bin; note the divisor is num_bins + 2, not num_bins.
        bin_size = float(total_freq) / float(num_bins+2)

        num_groups = len(groupNs)
        reporting_int = fwc._getReportingInt(reporting_percent, num_groups)

        # figure out the bins, i.e. if group_id's 1,2,3 total value is greater than "bin_size" our first bin is 1_3.
        fwc.warn('determining the number of bins...')
        current_sum = 0
        # NOTE(review): raises IndexError when the feature table has no rows.
        current_lower_group = groupNs[0][0]

        current_upper_group = None
        next_group_is_lower_group = False
        # (lower_group, upper_group) => 'lower_upper' label, in creation order
        bin_groups = OrderedDict()
        gg = 0
        for group, value in groupNs:
            # The group following a closed bin opens the next one.
            if next_group_is_lower_group:
                current_lower_group = group
                next_group_is_lower_group = False
            current_sum += value
            current_upper_group = group
            if current_sum >= bin_size:
                current_sum = 0
                bin_groups[(current_lower_group, current_upper_group)]  = '_'.join(map(str,[current_lower_group, current_upper_group]))
                next_group_is_lower_group = True
            gg += 1
            fwc._report('group_id\'s', gg, reporting_int, num_groups)
        # Store the trailing, partially filled bin. The condition also holds
        # right after a bin was closed (current_sum == 0); in that case the key
        # equals the last stored one, so the same entry is simply rewritten.
        if current_sum >= 0:
            bin_groups[(current_lower_group, current_upper_group)]  = '_'.join(map(str,[current_lower_group, current_upper_group]))

        # group_id must be wide enough to hold the longest bin label
        max_label_length = max(map(len, bin_groups.values()))

        sql = 'ALTER TABLE %s MODIFY COLUMN group_id VARCHAR(%d)'%(newTable, max_label_length) #this action preserves the index
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        sql = 'ALTER TABLE %s ADD COLUMN `bin_center` float(6) not null default -1.0'%(newTable)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        sql = 'ALTER TABLE %s ADD COLUMN `bin_center_w` float(6) not null default -1.0'%(newTable)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        sql = 'ALTER TABLE %s ADD COLUMN `bin_width` int(10) not null default -1'%(newTable)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        mm.disableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)

        # for each newly denoted bin: e.g. 1_3, 4_5, 6_6, ... get the new feature value counts / group norms; insert them into the new table
        # e.g. 1 'hi' 5, 2 'hi' 10, 3 'hi' 30 ==> 1_3 'hi' 45  (of course include group_norm also)
        fwc.warn('aggreagating the newly binned feature values / group_norms into the new table...')
        # The inner '%s' markers are kept as placeholders for executeWriteMany.
        isql = 'INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N, bin_center, bin_center_w, bin_width) VALUES (%s)'%(newTable, '%s, %s, %s, %s, %s, %s, %s, %s, %s')
        #isql = 'INSERT INTO %s (group_id, feat, value, group_norm, N, bin_center, bin_width) VALUES (%s)'%(newTable, '%s, %s, %s, %s, %s, %s, %s')
        ii_bins = 0
        # From here on num_bins is the number of bins actually built.
        num_bins = len(bin_groups.keys())
        reporting_int = fwc._getReportingInt(reporting_percent, num_bins)
        #_warn('#############BIN NUMBER############### [[%d]] #############'%len(bin_groups))
        for (lower_group, upper_group), label in bin_groups.iteritems():
            bin_N_sum = 0
            bin_width = 0
            # unweighted midpoint of the bin's bounds
            bin_center = sum((lower_group, upper_group)) / 2.0
            bin_center_w = 0
            # Walks every integer id in the bin, so group ids are assumed to be
            # integers here; ids missing from the table contribute N = 0.
            for ii in range(lower_group, upper_group+1):
                #_warn('for bin %d_%d ii:%d'%(lower_group, upper_group, ii))
                bin_width += 1
                bin_N_sum += groupIdToN.get(ii, 0)
                bin_center_w += groupIdToN.get(ii, 0) * ii
            # N-weighted mean group id of the bin.
            # NOTE(review): divides by zero if every N in the bin is 0.
            bin_center_w = float(bin_center_w) / float(bin_N_sum)

            #_warn('number of users in range [%d, %d] is %d'%(lower_group, upper_group, bin_N_sum))
            
            # sql = 'SELECT group_id, feat, value, group_norm, N FROM %s where group_id >= %d AND group_id <= %d'%(self.featureTable, lower_group, upper_group)
            sql = 'SELECT group_id, feat, value, group_norm, std_dev FROM %s where group_id >= %d AND group_id <= %d'%(self.featureTable, lower_group, upper_group)
            groupFeatValueNorm = mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
            #pprint(groupFeatValueNorm)

            # totalFeatCountForThisBin is accumulated below but not read afterwards.
            totalFeatCountForThisBin = float(0)
            featToValue = {}
            featToSummedNorm = {}
            # First pass: per feature, sum the values and the N-weighted group norms.
            for group_id, feat, value, norm, sd in groupFeatValueNorm:
            # for group_id, feat, value, norm, N in groupFeatValueNorm:
                if fwc.LOWERCASE_ONLY: feat = str(feat).lower()
                totalFeatCountForThisBin += value
                currentN = groupIdToN[group_id]
                try:
                    featToValue[feat] += value
                    featToSummedNorm[feat] += norm * currentN
                except KeyError:
                    featToValue[feat] = value
                    featToSummedNorm[feat] = norm * currentN

            #calculate mean and std_dev, using above info
            featToMeanNorm = {}
            featToSummedVar = {}
            # Second pass: per feature, N-weighted sum of (squared distance of the
            # group's norm from the bin mean + the group's own variance).
            # NOTE(review): feat is not lower-cased in this pass, unlike the first
            # one, so the lookups can miss when fwc.LOWERCASE_ONLY is set.
            for group_id, feat, _, norm, sd in groupFeatValueNorm:
                currentN = groupIdToN[group_id]
                meanNorm = featToSummedNorm[feat]/bin_N_sum
                try: 
                    featToSummedVar[feat] += currentN*((meanNorm - norm)**2 + (sd*sd))
                except KeyError:
                    featToSummedVar[feat] = currentN*((meanNorm - norm)**2 + (sd*sd))
                featToMeanNorm[feat] = meanNorm

            # One row per feature; std_dev is sqrt(summed variance / bin N).
            current_batch = [ ('_'.join(map(str,(lower_group, upper_group))),  k,  v, featToMeanNorm[k], sqrt(featToSummedVar[k] / bin_N_sum),
                               bin_N_sum, bin_center, bin_center_w, bin_width) for k, v in featToValue.iteritems() ]
            mm.executeWriteMany(self.corpdb, self.dbCursor, isql, current_batch, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)
            # print 'N bin sum:', bin_N_sum
            # isql = 'INSERT INTO %s (group_id, feat, value, group_norm, N, bin_center, bin_center_w, bin_width) VALUES (%s)'%(newTable, '%s, %s, %s, %s, %s, %s, %s, %s')
            ii_bins += 1
            fwc._report('group_id bins', ii_bins, reporting_int, num_bins)

        mm.enableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)
        fwc.warn('Done creating new group_id-binned feature table.')

        # Print the (bin label, N) pairs of the new table.
        outputdata = mm.executeGetList(self.corpdb, self.dbCursor, 'select group_id, N from `%s` group by group_id'%(newTable,), charset=self.encoding, use_unicode=self.use_unicode)
        pprint(outputdata)

        # mm.execute(self.corpdb, self.dbCursor, 'drop table if exists `%s`'%(newTable,))
        return newTable