def __init__(self, dbname=None,
             variableFile=".bookworm/metadata/jsoncatalog_derived.txt"):
    """
    You can initialize it with a database name;
    otherwise it defaults to finding a Bookworm configuration file.

    It also may be initialized with a set of metadata. This is a little
    wonky, and may be deprecated in favor of a cleaner interface.
    """
    try:
        self.config_manager = Configfile("local")
        logging.debug("Connecting from the local config file")
    except IOError:
        try:
            self.config_manager = Configfile("global")
            logging.debug("No local bookworm.cnf: connecting from global defaults")
        except IOError:
            self.config_manager = Configfile("admin")
            logging.debug("No local or global bookworm.cnf: connecting from admin defaults")

    self.config_manager.read_config_files()
    config = self.config_manager.config

    if dbname is None:
        self.dbname = config.get("client", "database")
    else:
        self.dbname = dbname

    self.conn = None
    self.db = DB(dbname=self.dbname)

    if variableFile is not None:
        self.setVariables(originFile=variableFile)
def connect(self, setengine=True):
    # These scripts run as the Bookworm _Administrator_ on this machine;
    # defined by the location of this my.cnf file.
    conf = Configfile("admin")
    conf.read_config_files()

    connect_args = {
        "user": conf.config.get("client", "user"),
        "passwd": conf.config.get("client", "password"),
        "use_unicode": True,
        "charset": "utf8",
        "db": "",
        "local_infile": 1}

    try:
        self.conn = MySQLdb.connect(**connect_args)
    except MySQLdb.OperationalError:
        # Sometimes mysql wants to connect over TCP rather than a socket:
        # falling back to it for backward-compatibility.
        connect_args["host"] = "127.0.0.1"
        self.conn = MySQLdb.connect(**connect_args)

    cursor = self.conn.cursor()
    cursor.execute("CREATE DATABASE IF NOT EXISTS %s default character set utf8" % self.dbname)

    # Don't use the native query attribute here to avoid infinite loops
    cursor.execute("SET NAMES 'utf8'")
    cursor.execute("SET CHARACTER SET 'utf8'")

    if setengine:
        try:
            cursor.execute("SET default_storage_engine=MYISAM")
        except MySQLdb.Error:
            logging.error("Forcing the default engine failed. On some versions of MySQL, "
                          "you may need to add \"default-storage-engine=MYISAM\" manually "
                          "to the [mysqld] section of /etc/my.cnf. Trying again to connect...")
            self.connect(setengine=False)

    logging.debug("Connecting to %s" % self.dbname)
    cursor.execute("USE %s" % self.dbname)
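# Note on LOAD DATA LOCAL INFILE (standard MySQL behavior, not specific to
# this code): the local_infile=1 client flag above is only half of it; the
# server must also allow it, e.g.
#
#   SET GLOBAL local_infile = 1;     -- or local-infile=1 under [mysqld]
#
# Otherwise the bulk loaders further down fail with
# "The used command is not allowed with this MySQL version".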
def __init__(self, dbname=None):
    try:
        configuration = Configfile("local")
        logging.debug("Connecting from the local config file")
    except IOError:
        try:
            configuration = Configfile("global")
            logging.debug("No local bookworm.cnf: connecting from global defaults")
        except IOError:
            configuration = Configfile("admin")
            logging.debug("No local or global bookworm.cnf: connecting from admin defaults")

    configuration.read_config_files()
    config = configuration.config

    if dbname is None:
        self.dbname = config.get("client", "database")
    else:
        self.dbname = dbname

    self.username = config.get("client", "user")
    self.password = config.get("client", "password")
    self.conn = None
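# A sketch of the bookworm.cnf [client] section that the config.get()
# calls above expect (values illustrative; "database" is only consulted
# when no dbname is passed in):
#
#   [client]
#   user = bookworm_admin
#   password = changeme
#   database = my_bookworm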
def grantPrivileges(self):
    """
    Grants select-only privileges to a non-admin mysql user for the API
    to query with (safer).

    The username for these privileges is pulled from the bookworm.cnf file.
    """
    globalfile = Configfile("global")
    globalfile.read_config_files()
    username = globalfile.config.get("client", "user")
    password = globalfile.config.get("client", "password")
    self.db.query("GRANT SELECT ON %s.* TO '%s'@'localhost' IDENTIFIED BY '%s'"
                  % (self.dbname, username, password))
def config(self, args):
    """
    Performs useful configuration tasks, such as setting up a MySQL installation.
    """
    if args.target == "mysql":
        import bookwormDB.configuration
        bookwormDB.configuration.recommend_my_cnf()
    if args.target == "mysql-info":
        from bookwormDB.configuration import Configfile
        config = Configfile("admin")
        print("The admin configuration login currently being used should be the following.\n")
        config.write_out()
    if args.target == "apache":
        from bookwormDB.configuration import apache
        apache()
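# Hedged sketch: a minimal argparse front end that could feed config()
# above. The real CLI wiring may differ; only the three "target" values
# are taken from the method itself.
import argparse

parser = argparse.ArgumentParser(description="Bookworm configuration tasks")
parser.add_argument("target", choices=["mysql", "mysql-info", "apache"])
args = parser.parse_args(["mysql-info"])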
def __init__(self, dbname=None,
             variableFile=".bookworm/metadata/jsoncatalog_derived.txt"):
    """
    You can initialize it with a database name;
    otherwise it defaults to finding a Bookworm configuration file.
    """
    self.config_manager = Configfile("admin")
    config = self.config_manager.config
    self.dbname = dbname
    self.conn = None

    if self.dbname is not None:
        # Sometimes this may be called just to access the
        # variables elements.
        self.db = DB(dbname=self.dbname)
    else:
        self.db = None

    if variableFile is not None:
        try:
            self.setVariables(originFile=variableFile)
        except FileNotFoundError:
            pass
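# Usage sketch for the constructor above (assuming it belongs to
# BookwormSQLDatabase, as in the full class below; the database name is
# illustrative):
bw_meta = BookwormSQLDatabase(dbname=None, variableFile=None)  # metadata-only: self.db is None
bw_full = BookwormSQLDatabase(dbname="my_bookworm")            # opens a DB() handle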
def test_config_files(self):
    logging.info("\n\nTESTING CONFIG FILE ACCESS\n\n")

    def test_config_file(conf):
        user = conf.config.get("client", "user")
        pw = conf.config.get("client", "password")
        return (user, pw)

    # "read_only" holds the client credentials; "admin" holds the
    # administrator credentials.
    read_only_configuration_file = Configfile("read_only")
    admin_configuration_file = Configfile("admin")

    (admin_user, admin_pw) = test_config_file(admin_configuration_file)
    (client_user, client_pw) = test_config_file(read_only_configuration_file)

    logging.info("admin user is {} and password is {}".format(admin_user, admin_pw))
    logging.info("client user is {} and password is {}".format(client_user, client_pw))

    logging.info("Checking that admin and client users are distinct")
    self.assertTrue(admin_user != client_user)
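# Run sketch (assumes this test lives on a unittest.TestCase subclass, as
# the self.assertTrue call implies; discovery path illustrative):
#
#   python -m unittest discover -k test_config_files -v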
def grantPrivileges(self):
    """
    Grants select-only privileges to a non-admin mysql user for the API to
    query with, without risking exposing write access to the Internet.

    The username for these privileges is usually just 'bookworm' without a
    password, but if you place a file at '/etc/bookworm.cnf', it will be
    read from there.
    """
    globalfile = Configfile("read_only")
    username = globalfile.config.get("client", "user")
    password = globalfile.config.get("client", "password")

    try:
        self.db.query("GRANT SELECT ON %s.* TO '%s'@'localhost' IDENTIFIED BY '%s'"
                      % (self.dbname, username, password))
    except MySQLdb.OperationalError:
        # Newer MySQL versions refuse to create a missing account
        # implicitly through GRANT, so create the user first.
        self.db.query("CREATE USER '%s'@'localhost' IDENTIFIED BY '%s'"
                      % (username, password))
        self.db.query("GRANT SELECT ON %s.* TO '%s'@'localhost' IDENTIFIED BY '%s'"
                      % (self.dbname, username, password))
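# Context for the CREATE USER fallback above (standard MySQL behavior):
# from MySQL 5.7 the default sql_mode includes NO_AUTO_CREATE_USER, so a
# GRANT no longer creates missing accounts implicitly, and MySQL 8.0
# removed GRANT's IDENTIFIED BY clause altogether. To inspect the result
# from a mysql shell:
#
#   SHOW GRANTS FOR 'bookworm'@'localhost';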
class BookwormSQLDatabase:
    """
    This class gives interaction methods to a MySQL database storing
    Bookworm data.

    Although the primary methods are about loading data already created
    into the SQL database, it has a few other operations that write out
    text files needed by the API and the web front end: I take it as
    logical to do those here, since that's how it fits chronologically in
    the bookworm-creation sequence.
    """

    def __init__(self, dbname=None,
                 variableFile=".bookworm/metadata/jsoncatalog_derived.txt"):
        """
        You can initialize it with a database name;
        otherwise it defaults to finding a Bookworm configuration file.

        It also may be initialized with a set of metadata. This is a little
        wonky, and may be deprecated in favor of a cleaner interface.
        """
        try:
            self.config_manager = Configfile("local")
            logging.debug("Connecting from the local config file")
        except IOError:
            try:
                self.config_manager = Configfile("global")
                logging.debug("No local bookworm.cnf: connecting from global defaults")
            except IOError:
                self.config_manager = Configfile("admin")
                logging.debug("No local or global bookworm.cnf: connecting from admin defaults")

        self.config_manager.read_config_files()
        config = self.config_manager.config

        if dbname is None:
            self.dbname = config.get("client", "database")
        else:
            self.dbname = dbname

        self.conn = None
        self.db = DB(dbname=self.dbname)

        if variableFile is not None:
            self.setVariables(originFile=variableFile)

    def grantPrivileges(self):
        """
        Grants select-only privileges to a non-admin mysql user for the API
        to query with (safer).

        The username for these privileges is pulled from the bookworm.cnf file.
        """
        globalfile = Configfile("global")
        globalfile.read_config_files()
        username = globalfile.config.get("client", "user")
        password = globalfile.config.get("client", "password")
        self.db.query("GRANT SELECT ON %s.* TO '%s'@'localhost' IDENTIFIED BY '%s'"
                      % (self.dbname, username, password))

    def setVariables(self, originFile, anchorField="bookid",
                     jsonDefinition=".bookworm/metadata/field_descriptions_derived.json"):
        self.variableSet = variableSet(originFile=originFile,
                                       anchorField=anchorField,
                                       jsonDefinition=jsonDefinition,
                                       db=self.db)

    def importNewFile(self, originFile, anchorField, jsonDefinition):
        """
        Add additional metadata from a source collection of json-formatted rows.

        originFile is the filename of the new metadata, in the same input
        format as the original jsoncatalog.txt.

        anchorField is the field in the existing dataset it should be
        anchored onto.

        jsonDefinition is a filename pointing to a file of the format of
        field_descriptions.json describing the new data to ingest. If it is
        None, one will be guessed at.
        """
        self.setVariables(originFile, anchorField=anchorField,
                          jsonDefinition=jsonDefinition)
        self.variableSet.writeMetadata()
        self.load_book_list()
        self.variableSet.updateMasterVariableTable()
        for variable in self.variableSet.variables:
            variable.clear_associated_memory_tables()
        self.reloadMemoryTables()

    def create_database(self):
        dbname = self.dbname
        db = self.db
        # The web user's credentials come from the global bookworm.cnf,
        # as in grantPrivileges.
        globalfile = Configfile("global")
        globalfile.read_config_files()
        dbuser = globalfile.config.get("client", "user")
        dbpassword = globalfile.config.get("client", "password")

        # This must be run as a MySQL user with create_table privileges.
        try:
            db.query("CREATE DATABASE " + dbname)
        except Exception:
            logging.info("Database %s already exists: that might be intentional, so not dying" % dbname)

        logging.info("Setting up permissions for web user...")
        db.query("GRANT SELECT ON " + dbname + ".*" + " TO '" + dbuser +
                 "'@'localhost' IDENTIFIED BY '" + dbpassword + "'")
        db.query("FLUSH PRIVILEGES")

        try:
            # A field to store stuff we might need later.
db.query("CREATE TABLE IF NOT EXISTS bookworm_information (entry VARCHAR(255), PRIMARY KEY (entry), value VARCHAR(50000))") except: raise def load_word_list(self): db = self.db logging.info("Making a SQL table to hold the words") db.query("""DROP TABLE IF EXISTS words""") db.query("""CREATE TABLE IF NOT EXISTS words ( wordid MEDIUMINT UNSIGNED NOT NULL, word VARCHAR(255), INDEX (word), count BIGINT UNSIGNED, casesens VARBINARY(255), stem VARCHAR(255) );""") db.query("ALTER TABLE words DISABLE KEYS") logging.info("loading data using LOAD DATA LOCAL INFILE") db.query("""LOAD DATA LOCAL INFILE '.bookworm/texts/wordlist/wordlist.txt' INTO TABLE words CHARACTER SET binary (wordid,word,count) """) logging.info("creating indexes on words table") db.query("ALTER TABLE words ENABLE KEYS") db.query("UPDATE words SET casesens=word") def load_book_list(self): """ Loads in the tables that have already been created by a previous call to `Bookworm.variableSet.writeMetadata()` """ self.variableSet.loadMetadata() def create_unigram_book_counts(self, newtable=True, ingest=True, index=True, reverse_index=True, table_count=1): import time t0 = time.time() db = self.db ngramname = "unigrams" tablenameroot = "master_bookcounts" # If you are splitting the input into multiple tables # to be joined as a merge table, come up with multiple # table names and we'll cycle through. if table_count == 1: tablenames = [tablenameroot] elif table_count > 1: tablenames = ["%s_p%d" % (tablenameroot, i) for i in range(1, table_count+1)] else: logging.error("You need a positive integer for table_count") raise grampath = ".bookworm/texts/encoded/%s" % ngramname tmpdir = "%s/tmp" % grampath if (len(grampath) == 0) or (grampath == "/"): logging.error("Woah! Don't set the ngram path to your system root!") raise if newtable: if os.path.exists(tmpdir): import shutil shutil.rmtree(tmpdir) logging.info("Dropping older %s table, if it exists" % ngramname) for tablename in tablenames: db.query("DROP TABLE IF EXISTS " + tablename) logging.info("Making a SQL table to hold the %s" % ngramname) reverse_index_sql = "INDEX(bookid,wordid,count), " if reverse_index else "" for tablename in tablenames: db.query("CREATE TABLE IF NOT EXISTS " + tablename + " (" "bookid MEDIUMINT UNSIGNED NOT NULL, " + reverse_index_sql + "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), " "count MEDIUMINT UNSIGNED NOT NULL);") if ingest: for tablename in tablenames: db.query("ALTER TABLE " + tablename + " DISABLE KEYS") db.query("set NAMES utf8;") db.query("set CHARACTER SET utf8;") logging.info("loading data using LOAD DATA LOCAL INFILE") files = os.listdir(grampath) for i, filename in enumerate(files): if filename.endswith('.txt'): # With each input file, cycle through each table in tablenames tablename = tablenames[i % len(tablenames)] logging.debug("Importing txt file, %s (%d/%d)" % (filename, i, len(files))) try: db.query("LOAD DATA LOCAL INFILE '" + grampath + "/" + filename + "' INTO TABLE " + tablename +" CHARACTER SET utf8 (bookid,wordid,count);") except KeyboardInterrupt: raise except: logging.debug("Falling back on insert without LOCAL DATA INFILE. 
Slower.") try: import pandas as pd df = pd.read_csv(grampath + "/" + filename, sep='\t', header=None) to_insert = df.apply(tuple, axis=1).tolist() db.query( "INSERT INTO " + tablename + " (bookid,wordid,count) " "VALUES (%s, %s, %s);""", many_params=to_insert ) except KeyboardInterrupt: raise except: logging.exception("Error inserting %s from %s" % (ngramname, filename)) continue elif filename.endswith('.h5'): logging.info("Importing h5 file, %s (%d/%d)" % (filename, i, len(files))) try: # When encountering an .h5 file, this looks for ngram information # in a /#{ngramnames} table (e.g. /unigrams) and writes it out to # temporary TSV files. # Dask is used here simply because it's a dead simple way to multithread # the TSV writing and lower the overhead versus having a TSV already staged. import csv import pandas as pd try: import dask.dataframe as dd except: logging.exception("Ingesting h5 files requires dask") try: os.makedirs(tmpdir) except OSError: if not os.path.isdir(tmpdir): raise # Dask will use #{n_cores-1} threads when saving CSVs. # Ingest and key reload times are identical to txt import, so the only # additional overhead is reading the file (small effect) and writing the csv. ddf = dd.read_hdf(grampath + "/" + filename, ngramname, mode='r', chunksize=2000000) ddf.reset_index().to_csv(tmpdir + '/tmp.*.tsv', index=False, sep='\t', header=False, quoting=csv.QUOTE_NONNUMERIC) logging.info("CSV written from H5. Time passed: %.2f s" % (time.time() - t0)) for j, tmpfile in enumerate(os.listdir(tmpdir)): # With each input file, cycle through each table in tablenames tablename = tablenames[j % len(tablenames)] path = "%s/%s" % (tmpdir, tmpfile) db.query("LOAD DATA LOCAL INFILE '" + path + "' " "INTO TABLE " + tablename + " " "CHARACTER SET utf8 (bookid,wordid,count);") try: os.remove(path) except: pass logging.info("CSVs input. Time passed: %.2f s" % (time.time() - t0)) except KeyboardInterrupt: raise except: logging.exception("Error inserting %s from %s" % (ngramname, filename)) continue else: continue if index: logging.info("Creating Unigram Indexes. 
Time passed: %.2f s" % (time.time() - t0)) for tablename in tablenames: db.query("ALTER TABLE " + tablename + " ENABLE KEYS") if table_count > 1: logging.info("Creating a merge table for " + ",".join(tablenames)) db.query("CREATE TABLE IF NOT EXISTS " + tablenameroot + " (" "bookid MEDIUMINT UNSIGNED NOT NULL, " + reverse_index_sql + "wordid MEDIUMINT UNSIGNED NOT NULL, INDEX(wordid,bookid,count), " "count MEDIUMINT UNSIGNED NOT NULL) " "ENGINE=MERGE UNION=(" + ",".join(tablenames) + ") INSERT_METHOD=LAST;") logging.info("Unigram index created in: %.2f s" % ((time.time() - t0))) def create_bigram_book_counts(self): db = self.db logging.info("Making a SQL table to hold the bigram counts") db.query("""DROP TABLE IF EXISTS master_bigrams""") db.query("""CREATE TABLE master_bigrams ( bookid MEDIUMINT UNSIGNED NOT NULL, word1 MEDIUMINT UNSIGNED NOT NULL, INDEX (word1,word2,bookid,count), word2 MEDIUMINT UNSIGNED NOT NULL, count MEDIUMINT UNSIGNED NOT NULL);""") db.query("ALTER TABLE master_bigrams DISABLE KEYS") logging.info("loading data using LOAD DATA LOCAL INFILE") for filename in os.listdir(".bookworm/texts/encoded/bigrams"): try: db.query("LOAD DATA LOCAL INFILE '.bookworm/texts/encoded/bigrams/"+filename+"' INTO TABLE master_bigrams CHARACTER SET utf8 (bookid,word1,word2,count);") except: raise logging.info("Creating bigram indexes") db.query("ALTER TABLE master_bigrams ENABLE KEYS") def loadVariableDescriptionsIntoDatabase(self): """ This adds a description of files to the master variable table: also, crucially, it puts code specifying their fast creation there, where it will be executed on startup for all eternity. """ logging.debug("Building masterVariableTable") db = self.db db.query("DROP TABLE IF EXISTS masterVariableTable") m = db.query(""" CREATE TABLE IF NOT EXISTS masterVariableTable (dbname VARCHAR(255), PRIMARY KEY (dbname), name VARCHAR(255), type VARCHAR(255), tablename VARCHAR(255), anchor VARCHAR(255), alias VARCHAR(255), status VARCHAR(255), description VARCHAR(5000) ) ENGINE=MYISAM; """) tableTable = db.query(""" CREATE TABLE IF NOT EXISTS masterTableTable (tablename VARCHAR(255), PRIMARY KEY (tablename), dependsOn VARCHAR(255), memoryCode VARCHAR(20000)) ENGINE=MYISAM; """) self.addFilesToMasterVariableTable() self.addWordsToMasterVariableTable() self.variableSet.updateMasterVariableTable() def reloadMemoryTables(self,force=False): """ Checks to see if memory tables need to be repopulated (by seeing if they are empty) and then does so if necessary. """ existingCreateCodes = self.db.query("SELECT tablename,memoryCode FROM masterTableTable").fetchall() for row in existingCreateCodes: """ For each table, it checks to see if the table is currently populated; if not, it runs the stored code to repopulate the table. (It checks length because memory tables are emptied on a restart). 
""" tablename = row[0] try: cursor = self.db.query("SELECT count(*) FROM %s" %(tablename), silent = True) currentLength = cursor.fetchall()[0][0] logging.debug("Current Length is %d" %currentLength) except: currentLength = 0 if currentLength==0 or force: for query in splitMySQLcode(row[1]): self.db.query("SET optimizer_search_depth=0") self.db.query(query) def addFilesToMasterVariableTable(self): fastFieldsCreateList = ["bookid MEDIUMINT UNSIGNED NOT NULL, PRIMARY KEY (bookid)","nwords MEDIUMINT UNSIGNED NOT NULL"] +\ [variable.fastSQL() for variable in self.variableSet.variables if (variable.unique and variable.fastSQL() is not None)] fileCommand = """DROP TABLE IF EXISTS tmp; CREATE TABLE tmp (""" +",\n".join(fastFieldsCreateList) + """ ) ENGINE=MEMORY;""" #Also update the wordcounts for each text. fastFields = ["bookid","nwords"] + [variable.fastField for variable in self.variableSet.variables if variable.unique and variable.fastSQL() is not None] fileCommand += "INSERT INTO tmp SELECT " + ",".join(fastFields) + " FROM catalog USE INDEX () " + " ".join([" JOIN %(field)s__id USING (%(field)s ) " % variable.__dict__ for variable in self.variableSet.variables if variable.unique and variable.fastSQL() is not None and variable.datatype=="categorical"])+ ";" fileCommand += "DROP TABLE IF EXISTS fastcat;" fileCommand += "RENAME TABLE tmp TO fastcat;" self.db.query('DELETE FROM masterTableTable WHERE masterTableTable.tablename="fastcat";') self.db.query("""INSERT IGNORE INTO masterTableTable VALUES ('fastcat','fastcat','""" + fileCommand + """')""") def addWordsToMasterVariableTable(self): wordCommand = "DROP TABLE IF EXISTS tmp;" wordCommand += "CREATE TABLE tmp (wordid MEDIUMINT UNSIGNED NOT NULL, PRIMARY KEY (wordid), word VARCHAR(30), INDEX (word), casesens VARBINARY(30),UNIQUE INDEX(casesens), lowercase CHAR(30), INDEX (lowercase) ) ENGINE=MEMORY;" wordCommand += "INSERT IGNORE INTO tmp SELECT wordid as wordid,word,casesens,LOWER(word) FROM words WHERE CHAR_LENGTH(word) <= 30 AND wordid <= 1500000 ORDER BY wordid;" wordCommand += "DROP TABLE IF EXISTS wordsheap;" wordCommand += "RENAME TABLE tmp TO wordsheap;" query = """INSERT IGNORE INTO masterTableTable VALUES ('wordsheap','wordsheap','""" + MySQLdb.escape_string(wordCommand) + """')""" logging.info("Creating wordsheap") self.db.query(query) def jsonify_data(self): variables = self.variableSet.variables dbname = self.dbname #This creates a JSON file compliant with the Bookworm web site. #Deprecated. 
        output = dict()
        output['settings'] = {
            "dbname": self.dbname,
            "itemName": " text",
            "sourceName": self.dbname,
            "sourceURL": self.dbname
        }
        ui_components = [
            {
                "type": "text",
                "dbfield": "word",
                "name": "Word(s)"
            }
        ]
        for variable in variables:
            newdict = variable.jsonDict()
            if newdict:  # It can be empty, in which case we don't want it in the json.
                ui_components.append(newdict)

        try:
            mytime = [variable.field for variable in variables
                      if variable.datatype == 'time'][0]
            output['default_search'] = [
                {
                    "search_limits": [{"word": ["test"]}],
                    "time_measure": mytime,
                    "words_collation": "Case_Sensitive",
                    "counttype": "Occurrences_per_Million_Words",
                    "smoothingSpan": 0
                }
            ]
        except IndexError:
            logging.warning("Not enough info for a default search (no time "
                            "variable, maybe?): there are likely to be some "
                            "big problems with your bookworm.")

        output['ui_components'] = ui_components

        with open('.bookworm/%s.json' % dbname, 'w') as outfile:
            outfile.write(json.dumps(output))

    def create_API_settings(self):
        db = self.db
        try:
            db.query("DROP TABLE IF EXISTS API_settings")
            db.query("CREATE TABLE API_settings (settings VARCHAR(8192));")
        except Exception:
            pass
        api_info = {
            "HOST": "10.102.15.45",
            "database": self.dbname,
            "read_default_file": "/etc/mysql/my.cnf",
        }
        addCode = json.dumps(api_info)
        logging.info(addCode)
        db.query("INSERT INTO API_settings VALUES ('%s');" % addCode)

    def update_Porter_stemming(self):  # We use stems occasionally.
        """
        Still not executed.
        """
        logging.info("Updating stems from Porter algorithm...")
        from nltk import PorterStemmer
        db = self.db
        stemmer = PorterStemmer()
        cursor = db.query("""SELECT word FROM words""")
        words = cursor.fetchall()
        for local in words:
            # Could probably take the first element of the tuple as well?
            word = ''.join(local)
            # Apostrophes have the same stem as the word, if they're included.
            word = word.replace("'s", "")
            if re.match("^[A-Za-z]+$", word):
                query = ("UPDATE words SET stem='" + stemmer.stem(''.join(local)) +
                         "' WHERE word='" + ''.join(local) + "';")
                cursor.execute(query)
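# End-to-end build sketch: the loading sequence implied by the class
# above, with an illustrative database name. Each step assumes the
# .bookworm/ staging files from earlier pipeline stages already exist.
bw = BookwormSQLDatabase(dbname="my_bookworm")
bw.create_database()
bw.load_word_list()
bw.load_book_list()
bw.create_unigram_book_counts(table_count=1)
bw.create_bigram_book_counts()
bw.loadVariableDescriptionsIntoDatabase()
bw.reloadMemoryTables(force=True)
bw.grantPrivileges()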