Exemple #1
0
def executeWriteMany( db, dbConn, sql, rows, writeCursor=None, warnQuery=False, charset=DEF_ENCODING, use_unicode=DEF_UNICODE_SWITCH, mysql_config_file=MYSQL_CONFIG_FILE):
    """Executes a write query"""
    if warnQuery:
        warn("SQL (write many) QUERY: %s"% sql)                                                 
    if not writeCursor:
        writeCursor = dbConn.cursor()
    attempts = 0;
    while (1):
        try:
            writeCursor.executemany(sql, rows)
            break
        except MySQLdb.Error as e:
            attempts += 1
            warn(" *MYSQL Corpus DB ERROR on %s:\n%s (%d attempt)"% (sql, e, attempts))
            time.sleep(MYSQL_ERROR_SLEEP)
            (dbConn, dbCursor, dictCursor) = dbConnect(db, charset=charset, use_unicode=use_unicode, mysql_config_file=mysql_config_file)
            writeCursor = dbConn.cursor()
            if (attempts > MAX_ATTEMPTS):
                sys.exit(1)
    return writeCursor
Exemple #2
0
def executeGetList( db, dbCursor, sql, warnQuery=True, charset=DEF_ENCODING, use_unicode=DEF_UNICODE_SWITCH, mysql_config_file=MYSQL_CONFIG_FILE):
    """Executes a given query, returns results as a list of lists"""
    warnQuery = False
    if warnQuery:
        warn("SQL QUERY: %s"% sql[:MAX_SQL_PRINT_CHARS])
    data = []
    attempts = 0;
    while (1):
        try:
            dbCursor.execute(sql)
            data = dbCursor.fetchall()
            break
        except MySQLdb.Error as e:
            attempts += 1
            warn(" *MYSQL Corpus DB ERROR on %s:\n%s (%d attempt)"% (sql, e, attempts))
            time.sleep(MYSQL_ERROR_SLEEP)
            (dbConn, dbCursor, dictCursor) = dbConnect(db, charset=charset, use_unicode=use_unicode, mysql_config_file=mysql_config_file)
            if (attempts > MAX_ATTEMPTS):
                sys.exit(1)
    return data
Exemple #3
0
def dbConnect(db, charset=DEF_ENCODING, use_unicode=DEF_UNICODE_SWITCH, mysql_config_file=MYSQL_CONFIG_FILE):
    """ Connects to specified database. Returns tuple of (dbConn, dbCursor, dictCursor) """
    dbConn = None
    attempts = 0;
    while (1):
        try:
            dbConn = MySQLdb.connect (
                db = db,
                charset = charset,
                use_unicode = use_unicode, 
                read_default_file = mysql_config_file
            )
            break
        except MySQLdb.Error as e:
            attempts += 1
            warn(" *MYSQL Connect ERROR on db:%s\n%s\n (%d attempt)"% (db, e, attempts))
            time.sleep(MYSQL_ERROR_SLEEP)
            if (attempts > MAX_ATTEMPTS):
                sys.exit(1)
    dbCursor = dbConn.cursor()
    dictCursor = dbConn.cursor(MySQLdb.cursors.DictCursor)
    return dbConn, dbCursor, dictCursor
Exemple #4
0
def executeGetSSCursor(db, sql, warnMsg = True, charset=DEF_ENCODING, use_unicode=DEF_UNICODE_SWITCH, mysql_config_file=MYSQL_CONFIG_FILE):
    """Executes a given query (ss cursor is good to iterate over for large returns)"""
    if warnMsg: 
        warn("SQL (SSCursor) QUERY: %s"% sql[:MAX_SQL_PRINT_CHARS])
    #print("\n\nmysql config file from mm:", mysql_config_file)#debug
    ssCursor = dbConnect(db, 
                         charset=charset, 
                         use_unicode=use_unicode,
                         mysql_config_file=mysql_config_file,
                         )[0].cursor(MySQLdb.cursors.SSCursor)
    data = []
    attempts = 0;
    while (1):
        try:
            ssCursor.execute(sql)
            break
        except MySQLdb.Error as e:
            attempts += 1
            warn(" *MYSQL Corpus DB ERROR on %s:\n%s (%d attempt)"% (sql, e, attempts))
            time.sleep(MYSQL_ERROR_SLEEP*attempts**2)
            ssCursor = dbConnect(db, charset=charset, use_unicode=use_unicode, mysql_config_file=mysql_config_file)[0].cursor(MySQLdb.cursors.SSCursor)
            if (attempts > MAX_ATTEMPTS):
                sys.exit(1)
    return ssCursor
Exemple #5
0
import shutil
from subprocess import check_output

from dlatk.messageTransformer import MessageTransformer

sys.path.append(os.path.dirname(os.path.realpath(__file__)).replace("/dlatk/LexicaInterface",""))

from dlatk.featureExtractor import FeatureExtractor
from dlatk import dlaConstants as dlac

try:
    from gensim import corpora
    from gensim.models.wrappers import LdaMallet
except:
    LdaMallet = object
    dlac.warn("Warning: Cannot import gensim (cannot run LDA)")
    pass

from numpy import log2, isnan

try:
    from pymallet import defaults
    from pymallet.lda import estimate_topics
except:
    dlac.warn("Warning: Cannot import pymallet (cannot run LDA)")
    pass

from json import loads


class TopicExtractor(FeatureExtractor):
Exemple #6
0
    def addLDAFeatTable(self,
                        ldaMessageTable,
                        tableName=None,
                        valueFunc=lambda d: d):
        """Creates feature tuples (correl_field, feature, values) table where features are ngrams"""
        """Optional ValueFunc program scales that features by the function given"""
        #CREATE TABLE:
        featureName = 'lda' + '_' + ldaMessageTable.split('$')[1]
        featureTableName = self.createFeatureTable(featureName,
                                                   'SMALLINT UNSIGNED',
                                                   'INTEGER', tableName,
                                                   valueFunc)

        #SELECT / LOOP ON CORREL FIELD FIRST:
        usql = """SELECT %s FROM %s GROUP BY %s""" % (
            self.correl_field, self.corptable, self.correl_field)
        msgs = 0  #keeps track of the number of messages read
        cfRows = self._executeGetList(
            usql)  #SSCursor woudl be better, but it loses connection
        dlac.warn("finding messages for %d '%s's" %
                  (len(cfRows), self.correl_field))
        self._disableTableKeys(
            featureTableName
        )  #for faster, when enough space for repair by sorting
        for cfRow in cfRows:
            cf_id = cfRow[0]
            mids = set()  #currently seen message ids
            freqs = dict()  #holds frequency of n-grams
            totalInsts = 0  #total number of (non-distinct) topics

            #grab topics by messages for that cf:
            for messageRow in self.getMessagesForCorrelField(
                    cf_id, messageTable=ldaMessageTable):
                message_id = messageRow[0]
                topicsEncoded = messageRow[1]
                if not message_id in mids and topicsEncoded:
                    msgs += 1
                    if msgs % PROGRESS_AFTER_ROWS == 0:  #progress update
                        dlac.warn("Messages Read: %dk" % int(msgs / 1000))
                    #print topicsEncoded
                    topics = json.loads(topicsEncoded)

                    for topic in topics:
                        totalInsts += 1
                        topicId = topic['topic_id']
                        if not topicId in freqs:
                            freqs[topicId] = 1
                        else:
                            freqs[topicId] += 1
                    mids.add(message_id)

            #write topic to database (no need for "REPLACE" because we are creating the table)
            wsql = """INSERT INTO """ + featureTableName + """ (group_id, feat, value, group_norm) values ('""" + str(
                cf_id) + """', %s, %s, %s)"""
            totalInsts = float(totalInsts)  #to avoid casting each time below
            rows = [
                (k, v, valueFunc((v / totalInsts))) for k, v in freqs.items()
            ]  #adds group_norm and applies freq filter
            self._executeWriteMany(wsql, rows)

        dlac.warn("Done Reading / Inserting.")

        dlac.warn(
            "Adding Keys (if goes to keycache, then decrease MAX_TO_DISABLE_KEYS or run myisamchk -n)."
        )
        self._enableTableKeys(featureTableName)  #rebuilds keys
        dlac.warn("Done\n")
        return featureTableName
Exemple #7
0
def randomSubsetTable( db, sourceTableName, destinationTableName, keyField, percentToSubset=.10, distinct=True, charset=DEF_ENCODING, use_unicode=DEF_UNICODE_SWITCH, mysql_config_file=MYSQL_CONFIG_FILE):
    warn("making TABLE %s, a %2.2f percent random subset of TABLE %s on unique key %s..."%(destinationTableName, percentToSubset, sourceTableName, keyField))

    warn("connecting to DATABASE %s..."%(db))
    (dbConn, dbCursor, dictCursor) = dbConnect(db, charset=charset, use_unicode=use_unicode, mysql_config_file=mysql_config_file)

    warn("removing destination table if it exists...")
    sql = 'DROP TABLE IF EXISTS %s'%(destinationTableName)
    execute(db, dbCursor, sql, True, charset=charset, use_unicode=use_unicode)

    warn("cloning structure of table...")
    sql = 'CREATE TABLE %s LIKE %s'%(destinationTableName, sourceTableName)
    execute(db, dbCursor, sql, True, charset=charset, use_unicode=use_unicode)
    
    isDistinctText = ' distinct' if distinct else ''
    warn('grabbing a%s subset (%2.6f percent) of the keys on which to base the new table'%(isDistinctText, 100*percentToSubset))
    sql = 'SELECT DISTINCT(%s) FROM %s'%(keyField, sourceTableName) if distinct else 'SELECT %s FROM %s'%(keyField, sourceTableName)
    uniqueKeyList = executeGetList1(db, dbCursor, sql, True, charset=charset, use_unicode=use_unicode)
    
    warn(str(uniqueKeyList[1:5]))

    newKeys = sample(uniqueKeyList, int(floor(len(uniqueKeyList)*percentToSubset)))
    newKeys = list(map(str, newKeys))

    warn("populating newly created table, TABLE %s"%(destinationTableName))
    populateQuery = "INSERT INTO %s SELECT * FROM %s WHERE %s IN (%s)"%(destinationTableName, sourceTableName, keyField, ','.join(newKeys))
    execute(db, dbCursor, populateQuery, False, charset=charset, use_unicode=use_unicode)

    warn("finished making TABLE %s, a %2.2f percent random subset of TABLE %s on unique key %s!"%(destinationTableName, percentToSubset, sourceTableName, keyField))