def executeWriteMany( db, dbConn, sql, rows, writeCursor=None, warnQuery=False, charset=DEF_ENCODING, use_unicode=DEF_UNICODE_SWITCH, mysql_config_file=MYSQL_CONFIG_FILE):
    """Executes a write query (executemany) against the given connection.

    Retries on MySQLdb.Error: sleeps MYSQL_ERROR_SLEEP, reconnects via
    dbConnect, and re-runs; exits the process after MAX_ATTEMPTS failures.
    Returns the cursor used for the write (a fresh one after a reconnect).
    """
    if warnQuery:
        warn("SQL (write many) QUERY: %s" % sql)
    if not writeCursor:
        writeCursor = dbConn.cursor()

    tries = 0
    succeeded = False
    while not succeeded:
        try:
            writeCursor.executemany(sql, rows)
            succeeded = True
        except MySQLdb.Error as e:
            tries += 1
            warn(" *MYSQL Corpus DB ERROR on %s:\n%s (%d attempt)" % (sql, e, tries))
            time.sleep(MYSQL_ERROR_SLEEP)
            # connection may have gone away; rebuild it and get a new cursor
            dbConn, _dbCursor, _dictCursor = dbConnect(db, charset=charset, use_unicode=use_unicode, mysql_config_file=mysql_config_file)
            writeCursor = dbConn.cursor()
            if tries > MAX_ATTEMPTS:
                sys.exit(1)
    return writeCursor
def executeGetList( db, dbCursor, sql, warnQuery=True, charset=DEF_ENCODING, use_unicode=DEF_UNICODE_SWITCH, mysql_config_file=MYSQL_CONFIG_FILE):
    """Executes a given query, returns results as a list of lists.

    Retries on MySQLdb.Error: sleeps MYSQL_ERROR_SLEEP, reconnects via
    dbConnect, and re-runs; exits the process after MAX_ATTEMPTS failures.
    """
    # BUG FIX: an unconditional `warnQuery = False` (leftover debugging)
    # previously overrode the parameter, making it dead; the caller-supplied
    # value is now honored again.
    if warnQuery:
        warn("SQL QUERY: %s" % sql[:MAX_SQL_PRINT_CHARS])
    data = []
    attempts = 0
    while True:
        try:
            dbCursor.execute(sql)
            data = dbCursor.fetchall()
            break
        except MySQLdb.Error as e:
            attempts += 1
            warn(" *MYSQL Corpus DB ERROR on %s:\n%s (%d attempt)" % (sql, e, attempts))
            time.sleep(MYSQL_ERROR_SLEEP)
            # reconnect and retry with a fresh cursor
            (dbConn, dbCursor, dictCursor) = dbConnect(db, charset=charset, use_unicode=use_unicode, mysql_config_file=mysql_config_file)
            if attempts > MAX_ATTEMPTS:
                sys.exit(1)
    return data
def dbConnect(db, charset=DEF_ENCODING, use_unicode=DEF_UNICODE_SWITCH, mysql_config_file=MYSQL_CONFIG_FILE):
    """Connects to specified database.

    Retries on MySQLdb.Error (sleeping MYSQL_ERROR_SLEEP between tries) and
    exits the process after MAX_ATTEMPTS failures.
    Returns tuple of (dbConn, dbCursor, dictCursor).
    """
    dbConn = None
    failures = 0
    # loop until MySQLdb.connect succeeds (dbConn becomes a connection object)
    while dbConn is None:
        try:
            dbConn = MySQLdb.connect(
                db=db,
                charset=charset,
                use_unicode=use_unicode,
                read_default_file=mysql_config_file,
            )
        except MySQLdb.Error as e:
            failures += 1
            warn(" *MYSQL Connect ERROR on db:%s\n%s\n (%d attempt)" % (db, e, failures))
            time.sleep(MYSQL_ERROR_SLEEP)
            if failures > MAX_ATTEMPTS:
                sys.exit(1)
    dbCursor = dbConn.cursor()
    dictCursor = dbConn.cursor(MySQLdb.cursors.DictCursor)
    return dbConn, dbCursor, dictCursor
def executeGetSSCursor(db, sql, warnMsg=True, charset=DEF_ENCODING, use_unicode=DEF_UNICODE_SWITCH, mysql_config_file=MYSQL_CONFIG_FILE):
    """Executes a given query and returns a server-side (SS) cursor.

    An SSCursor is good to iterate over for large result sets. Retries on
    MySQLdb.Error with quadratic backoff (MYSQL_ERROR_SLEEP * attempts**2),
    reconnecting each time; exits the process after MAX_ATTEMPTS failures.
    """
    if warnMsg:
        warn("SQL (SSCursor) QUERY: %s" % sql[:MAX_SQL_PRINT_CHARS])
    # NOTE(review): only the connection's SSCursor is kept; the connection
    # handle itself is discarded here, as in the original implementation.
    ssCursor = dbConnect(db, charset=charset, use_unicode=use_unicode, mysql_config_file=mysql_config_file)[0].cursor(MySQLdb.cursors.SSCursor)
    attempts = 0
    executed = False
    while not executed:
        try:
            ssCursor.execute(sql)
            executed = True
        except MySQLdb.Error as e:
            attempts += 1
            warn(" *MYSQL Corpus DB ERROR on %s:\n%s (%d attempt)" % (sql, e, attempts))
            time.sleep(MYSQL_ERROR_SLEEP * attempts ** 2)
            ssCursor = dbConnect(db, charset=charset, use_unicode=use_unicode, mysql_config_file=mysql_config_file)[0].cursor(MySQLdb.cursors.SSCursor)
            if attempts > MAX_ATTEMPTS:
                sys.exit(1)
    return ssCursor
import shutil from subprocess import check_output from dlatk.messageTransformer import MessageTransformer sys.path.append(os.path.dirname(os.path.realpath(__file__)).replace("/dlatk/LexicaInterface","")) from dlatk.featureExtractor import FeatureExtractor from dlatk import dlaConstants as dlac try: from gensim import corpora from gensim.models.wrappers import LdaMallet except: LdaMallet = object dlac.warn("Warning: Cannot import gensim (cannot run LDA)") pass from numpy import log2, isnan try: from pymallet import defaults from pymallet.lda import estimate_topics except: dlac.warn("Warning: Cannot import pymallet (cannot run LDA)") pass from json import loads class TopicExtractor(FeatureExtractor):
    def addLDAFeatTable(self, ldaMessageTable, tableName=None, valueFunc=lambda d: d):
        """Creates feature tuples (correl_field, feature, values) table where features are ngrams.

        Optional ValueFunc program scales that features by the function given.
        Returns the name of the created feature table.
        """
        # CREATE TABLE: feature table name is derived from the lda message table suffix
        featureName = 'lda' + '_' + ldaMessageTable.split('$')[1]
        featureTableName = self.createFeatureTable(featureName, 'SMALLINT UNSIGNED', 'INTEGER', tableName, valueFunc)

        # SELECT / LOOP ON CORREL FIELD FIRST:
        usql = """SELECT %s FROM %s GROUP BY %s""" % (
            self.correl_field, self.corptable, self.correl_field)
        msgs = 0  # keeps track of the number of messages read
        cfRows = self._executeGetList(usql)  # SSCursor woudl be better, but it loses connection
        dlac.warn("finding messages for %d '%s's" % (len(cfRows), self.correl_field))
        self._disableTableKeys(featureTableName)  # for faster, when enough space for repair by sorting
        for cfRow in cfRows:
            cf_id = cfRow[0]
            mids = set()       # currently seen message ids
            freqs = dict()     # holds frequency of n-grams (topic_id -> count)
            totalInsts = 0     # total number of (non-distinct) topics

            # grab topics by messages for that cf:
            for messageRow in self.getMessagesForCorrelField(cf_id, messageTable=ldaMessageTable):
                message_id = messageRow[0]
                topicsEncoded = messageRow[1]
                if not message_id in mids and topicsEncoded:
                    msgs += 1
                    if msgs % PROGRESS_AFTER_ROWS == 0:  # progress update
                        dlac.warn("Messages Read: %dk" % int(msgs / 1000))
                    # NOTE(review): this file imports `from json import loads`;
                    # `json.loads` requires `import json` elsewhere — confirm.
                    topics = json.loads(topicsEncoded)
                    for topic in topics:
                        totalInsts += 1
                        topicId = topic['topic_id']
                        if not topicId in freqs:
                            freqs[topicId] = 1
                        else:
                            freqs[topicId] += 1
                    mids.add(message_id)

            # write topic to database (no need for "REPLACE" because we are creating the table)
            wsql = """INSERT INTO """ + featureTableName + """ (group_id, feat, value, group_norm) values ('""" + str(
                cf_id) + """', %s, %s, %s)"""
            totalInsts = float(totalInsts)  # to avoid casting each time below
            rows = [
                (k, v, valueFunc((v / totalInsts))) for k, v in freqs.items()
            ]  # adds group_norm and applies freq filter
            self._executeWriteMany(wsql, rows)

        dlac.warn("Done Reading / Inserting.")
        dlac.warn(
            "Adding Keys (if goes to keycache, then decrease MAX_TO_DISABLE_KEYS or run myisamchk -n)."
        )
        self._enableTableKeys(featureTableName)  # rebuilds keys
        dlac.warn("Done\n")
        return featureTableName
def randomSubsetTable( db, sourceTableName, destinationTableName, keyField, percentToSubset=.10, distinct=True, charset=DEF_ENCODING, use_unicode=DEF_UNICODE_SWITCH, mysql_config_file=MYSQL_CONFIG_FILE):
    """Creates destinationTableName as a random subset of sourceTableName.

    Samples a fraction `percentToSubset` (e.g. .10 == 10%) of the (optionally
    distinct) values of keyField and copies all matching rows into a freshly
    cloned table. Any existing destination table is dropped first.
    """
    # BUG FIX: percentToSubset is a fraction (0.10 == 10%); the first and last
    # progress messages previously printed the raw fraction as a "percent",
    # inconsistent with the middle message which multiplies by 100.
    warn("making TABLE %s, a %2.2f percent random subset of TABLE %s on unique key %s..."%(destinationTableName, 100*percentToSubset, sourceTableName, keyField))
    warn("connecting to DATABASE %s..."%(db))
    (dbConn, dbCursor, dictCursor) = dbConnect(db, charset=charset, use_unicode=use_unicode, mysql_config_file=mysql_config_file)

    warn("removing destination table if it exists...")
    sql = 'DROP TABLE IF EXISTS %s'%(destinationTableName)
    execute(db, dbCursor, sql, True, charset=charset, use_unicode=use_unicode)

    warn("cloning structure of table...")
    sql = 'CREATE TABLE %s LIKE %s'%(destinationTableName, sourceTableName)
    execute(db, dbCursor, sql, True, charset=charset, use_unicode=use_unicode)

    isDistinctText = ' distinct' if distinct else ''
    warn('grabbing a%s subset (%2.6f percent) of the keys on which to base the new table'%(isDistinctText, 100*percentToSubset))
    sql = 'SELECT DISTINCT(%s) FROM %s'%(keyField, sourceTableName) if distinct else 'SELECT %s FROM %s'%(keyField, sourceTableName)
    uniqueKeyList = executeGetList1(db, dbCursor, sql, True, charset=charset, use_unicode=use_unicode)
    warn(str(uniqueKeyList[1:5]))

    # draw the random subset of keys (floor of fraction * population size)
    newKeys = sample(uniqueKeyList, int(floor(len(uniqueKeyList)*percentToSubset)))
    newKeys = list(map(str, newKeys))

    warn("populating newly created table, TABLE %s"%(destinationTableName))
    # NOTE(review): keys are interpolated unquoted into the IN (...) clause;
    # this assumes numeric keyField values — string keys would need quoting/
    # parameterization.
    populateQuery = "INSERT INTO %s SELECT * FROM %s WHERE %s IN (%s)"%(destinationTableName, sourceTableName, keyField, ','.join(newKeys))
    execute(db, dbCursor, populateQuery, False, charset=charset, use_unicode=use_unicode)

    warn("finished making TABLE %s, a %2.2f percent random subset of TABLE %s on unique key %s!"%(destinationTableName, 100*percentToSubset, sourceTableName, keyField))