def dropTable(tablename):
    """Drops a SQL table in the user database

    Dropping is best-effort: a failure is logged (with traceback) and not
    re-raised, so the caller carries on either way.

    :arg tablename: str, name of the table
    """
    try:
        logger.info('Dropping %s' % tablename)
        cur = sql.getCursor()
        cur.execute("DROP TABLE IF EXISTS %s;" % tablename)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still propagate; the error is logged instead of
        # being silently discarded.
        logger.exception('Could not drop table %s', tablename)
def dropTable(tablename):
    """Drops a SQL table in the user database

    Dropping is best-effort: a failure is logged (with traceback) and not
    re-raised, so the caller carries on either way.

    :arg tablename: str, name of the table
    """
    try:
        logger.info('Dropping %s' % tablename)
        cur = sql.getCursor()
        cur.execute("DROP TABLE IF EXISTS %s;" % tablename)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still propagate; the error is logged instead of
        # being silently discarded.
        logger.exception('Could not drop table %s', tablename)
def createIndex(query, tablename):
    """Create an index on a SQL table in the user database

    A failing query is reported as a warning (including the traceback of the
    underlying error) and is not re-raised.

    :arg tablename: str, name of the table (only used in the log messages)
    :arg query: str, query to execute
    """
    try:
        cur = sql.getCursor()
        cur.execute(query)
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C during a
        # long-running index build is not mistaken for a failed index.
        # exc_info keeps the real cause visible, since "already exists" is
        # only a guess.
        logger.warning(
            "Could not create index on %s. Possibly it already exists" % tablename,
            exc_info=True)
    else:
        logger.info("Created indexes on %s" % tablename)
def tableExists(tablename):
    """Returns True if the table exists in the user database

    The second dot-separated component of `tablename` is looked up with
    ``show tables ... like`` in the database named by ``settings.sqluserdb``.

    :arg tablename: str, name of the table, in ``<database>.<table>`` form
    """
    cursor = sql.getCursor()
    lookup = "show tables from %s like '%s';" % (
        settings.sqluserdb, tablename.split('.')[1])
    cursor.execute(lookup)
    # No row back means no table matched the pattern.
    return cursor.fetchone() is not None
def tableExists(tablename):
    """Returns True if the table exists in the user database

    The second dot-separated component of `tablename` is looked up with
    ``show tables ... like`` in the database named by ``settings.sqluserdb``.

    :arg tablename: str, name of the table, in ``<database>.<table>`` form
    """
    cursor = sql.getCursor()
    cursor.execute("show tables from %s like '%s';"
                   % (settings.sqluserdb, tablename.split('.')[1]))
    first_match = cursor.fetchone()
    if first_match is not None:
        return True
    # No row back means no table matched the pattern.
    return False
def createIndex(query, tablename):
    """Create an index on a SQL table in the user database

    A failing query is reported as a warning (including the traceback of the
    underlying error) and is not re-raised.

    :arg tablename: str, name of the table (only used in the log messages)
    :arg query: str, query to execute
    """
    try:
        cur = sql.getCursor()
        cur.execute(query)
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C during a
        # long-running index build is not mistaken for a failed index.
        # exc_info keeps the real cause visible, since "already exists" is
        # only a guess.
        logger.warning(
            "Could not create index on %s. Possibly it already exists" % tablename,
            exc_info=True)
    else:
        logger.info("Created indexes on %s" % tablename)
def process():
    """Creates the auxiliary SQL tables on the user database.

    Creates the user database first, then each auxiliary table (and its
    index, where one is defined), and finally exports the bot list.

    .. warning:
        This can take a long time. Especially for larger Wikipedias. For the
        English Wikipedia, it will take over a week :(
    """
    if settings.language in ('en', 'de'):
        patience_note = 'YOU ARE ATTEMPTING TO RUN THE PREPROCESSING ON THE ENGLISH OR GERMAN WIKIPEDIA. HOPEFULLY YOU ARE PATIENT, THIS WILL TAKE A WHILE!'
    else:
        patience_note = 'Be patient, this can take a long time. I hope you used the screen command...'
    logger.warning(patience_note)

    logger.info('Preprocessing data for %swiki' % settings.language)

    # Create the user database if it doesn't exist
    logger.info('Creating Database %s' % settings.sqluserdb)
    db_cursor = sql.getCursor()
    db_cursor.execute(CREATE_USER_DATABASE)

    # Tables and indexes, in dependency order.
    createTable(CREATE_USER_COHORTS, USER_COHORT)
    createIndex(INDEX_USER_COHORTS, USER_COHORT)

    createTable(CREATE_REV_LEN_CHANGED, REV_LEN_CHANGED)
    createIndex(INDEX_REV_LEN_CHANGED, REV_LEN_CHANGED)

    # The per-day variants of the editor and time tables are deliberately
    # not created: they can be used for time series analysis but are not
    # useful for cohort visualizations.
    createTable(CREATE_EDITOR_YEAR_MONTH, EDITOR_YEAR_MONTH)
    createTable(CREATE_EDITOR_YEAR_MONTH_NAMESPACE, EDITOR_YEAR_MONTH_NAMESPACE)
    createTable(CREATE_TIME_YEAR_MONTH_NAMESPACE, TIME_YEAR_MONTH_NAMESPACE)

    createTable(CREATE_BOT_LIST, BOT_LIST)
    createIndex(INDEX_BOT_LIST, BOT_LIST)

    executeCommand(EXPORT_BOT_LIST, 'Exporting bot list for cohort analysis')
def process():
    """Creates the auxiliary SQL tables on the user database.

    Creates the user database first, then each auxiliary table (and its
    index, where one is defined), and finally exports the bot list.

    .. warning:
        This can take a long time. Especially for larger Wikipedias. For the
        English Wikipedia, it will take over a week :(
    """
    is_large_wiki = settings.language in ('en', 'de')
    if not is_large_wiki:
        logger.warning('Be patient, this can take a long time. I hope you used the screen command...')
    else:
        logger.warning('YOU ARE ATTEMPTING TO RUN THE PREPROCESSING ON THE ENGLISH OR GERMAN WIKIPEDIA. HOPEFULLY YOU ARE PATIENT, THIS WILL TAKE A WHILE!')

    logger.info('Preprocessing data for %swiki' % settings.language)

    # Create the user database if it doesn't exist
    logger.info('Creating Database %s' % settings.sqluserdb)
    cursor = sql.getCursor()
    cursor.execute(CREATE_USER_DATABASE)

    # --- user cohorts ---
    createTable(CREATE_USER_COHORTS, USER_COHORT)
    createIndex(INDEX_USER_COHORTS, USER_COHORT)

    # --- revision length changes ---
    createTable(CREATE_REV_LEN_CHANGED, REV_LEN_CHANGED)
    createIndex(INDEX_REV_LEN_CHANGED, REV_LEN_CHANGED)

    # --- aggregates per editor / per time bucket ---
    # The 'day' tables are intentionally skipped; they can be used for time
    # series analysis but are not useful for cohort visualizations.
    createTable(CREATE_EDITOR_YEAR_MONTH, EDITOR_YEAR_MONTH)
    createTable(CREATE_EDITOR_YEAR_MONTH_NAMESPACE, EDITOR_YEAR_MONTH_NAMESPACE)
    createTable(CREATE_TIME_YEAR_MONTH_NAMESPACE, TIME_YEAR_MONTH_NAMESPACE)

    # --- bot list ---
    createTable(CREATE_BOT_LIST, BOT_LIST)
    createIndex(INDEX_BOT_LIST, BOT_LIST)
    executeCommand(EXPORT_BOT_LIST, 'Exporting bot list for cohort analysis')
def createTable(query, tablename):
    """Create a SQL table in the user database

    When ``settings.sqldroptables`` is set, the table is dropped first. The
    creation query only runs if the table does not exist (anymore). Any
    error is logged with its traceback and not re-raised.

    :arg tablename: str, name of the table
    :arg query: str, query to execute
    """
    try:
        if settings.sqldroptables:
            dropTable(tablename)

        if tableExists(tablename):
            logger.info('Table %s exists already! Do nothing' % tablename)
        else:
            cur = sql.getCursor()
            logger.info('Creating %s table' % tablename)
            cur.execute(query)
            logger.info('Finished creating %s table' % tablename)
    except Exception:
        # `Exception` rather than a bare `except:`: table creation can run
        # for a very long time, and Ctrl-C / SystemExit must abort the run
        # instead of being logged as a failed table.
        logger.exception("Could not create table %s" % tablename)
def createTable(query, tablename):
    """Create a SQL table in the user database

    When ``settings.sqldroptables`` is set, the table is dropped first. The
    creation query only runs if the table does not exist (anymore). Any
    error is logged with its traceback and not re-raised.

    :arg tablename: str, name of the table
    :arg query: str, query to execute
    """
    try:
        if settings.sqldroptables:
            dropTable(tablename)

        if tableExists(tablename):
            logger.info('Table %s exists already! Do nothing' % tablename)
        else:
            cur = sql.getCursor()
            logger.info('Creating %s table' % tablename)
            cur.execute(query)
            logger.info('Finished creating %s table' % tablename)
    except Exception:
        # `Exception` rather than a bare `except:`: table creation can run
        # for a very long time, and Ctrl-C / SystemExit must abort the run
        # instead of being logged as a failed table.
        logger.exception("Could not create table %s" % tablename)
def _autoConfirmationTimestamp(reg_time, tenth_edit):
    """Return the auto-confirmation time as a ``%Y%m%d%H%M%S`` string.

    The result is the later of the tenth edit and four days after
    registration. Without a registration time the tenth edit is used.

    :arg reg_time: str or None, registration timestamp (``%Y%m%d%H%M%S``)
    :arg tenth_edit: str, timestamp of the tenth edit (``%Y%m%d%H%M%S``)
    """
    fmt = '%Y%m%d%H%M%S'
    confirmed = datetime.strptime(tenth_edit, fmt)
    if reg_time:
        reg_plus_four = datetime.strptime(reg_time, fmt) + timedelta(days=4)
        if reg_plus_four > confirmed:
            # 10 edits in less than 4 days: confirmed once the 4 days are over
            confirmed = reg_plus_four
    return confirmed.strftime(fmt)


def createAutoConfirmedUserTable():
    """Build the ``user_autoconfirmed`` table in the user database.

    This is a function rather than a SQL query because a script is used to
    create the dataset, which is then imported back into the MySQL database:

    1. Every user of the wiki database is streamed together with the
       timestamp of their tenth revision, and one tab-separated line
       ``user_id, user_name, auto_confirmation (0/1), confirmation_timestamp``
       is written to ``TEMPDIR/user_autoconfirmed.tsv``. Users with fewer
       than ten edits get flag 0 and the literal text ``NULL``.
    2. ``user_autoconfirmed`` is dropped and re-created.
    3. The file is loaded with ``mysqlimport``, which takes the target table
       name from the file name.
    4. An index on ``user_id`` is created.
    """
    from db import sql

    tempfile = os.path.join(TEMPDIR, 'user_autoconfirmed.tsv')

    # 'w' instead of 'a': every run writes the complete user list, so
    # appending would duplicate all rows left over from a previous run.
    output = open(tempfile, 'w')
    try:
        curSS = sql.getSSCursor()
        try:
            logger.info(
                'Creating temp file to store autoconfirmation date of all users')
            curSS.execute('''SELECT u.user_id,
                                    u.user_name,
                                    u.user_registration,
                                    (SELECT rev_timestamp
                                       FROM %s.revision
                                      WHERE rev_user=u.user_id
                                      ORDER BY rev_timestamp ASC
                                      LIMIT 9, 1) AS tenthedit
                               FROM %s.user u;'''
                          % (settings.sqlwikidb, settings.sqlwikidb))

            for i, res in enumerate(curSS):
                u_id, u_text, reg_time, tenedits = res[0], res[1], res[2], res[3]

                if tenedits:
                    # an editor has to have ten edits to be auto-confirmed
                    ins = (u_id, u_text, 1,
                           _autoConfirmationTimestamp(reg_time, tenedits))
                else:
                    ins = (u_id, u_text, 0, 'NULL')

                line = '\t'.join(str(v) for v in ins)
                logger.info(line)
                output.write(line + '\n')

                # progress indicator: one dot per 100 users, new row of dots
                # every 3000 users
                if i % 100 == 0:
                    if i % 3000 == 0:
                        sys.stdout.write('\n.')
                    else:
                        sys.stdout.write('.')
                    sys.stdout.flush()
        finally:
            curSS.close()
    finally:
        # make sure the file is flushed and closed even if the query fails
        output.close()

    # Re-create the target table. It has to be named exactly like the temp
    # file (minus extension), because that is the table mysqlimport loads
    # into and the table the index below is created on. The statements are
    # executed separately and the drop tolerates a missing table, so the
    # first run works as well.
    cur = sql.getCursor()
    cur.execute('DROP TABLE IF EXISTS %s.user_autoconfirmed;'
                % settings.sqluserdb)
    cur.execute('''CREATE TABLE IF NOT EXISTS %s.user_autoconfirmed
                   (user_id int(5) unsigned,
                    user_name varchar(255),
                    auto_confirmation tinyint(1) unsigned,
                    confirmation_timestamp char(14));'''
                % settings.sqluserdb)
    cur.close()

    logger.info('Importing auto_confirmation data into MySQL')
    status = os.system('mysqlimport --local %s %s'
                       % (settings.sqluserdb, tempfile))
    if status != 0:
        logger.error('mysqlimport exited with status %s' % status)

    logger.info('Creating index on user_autoconfirmed table')
    cur = sql.getCursor()
    cur.execute("CREATE INDEX user_id ON %s.user_autoconfirmed (user_id);"
                % settings.sqluserdb)
    cur.close()
def _autoConfirmationTimestamp(reg_time, tenth_edit):
    """Return the auto-confirmation time as a ``%Y%m%d%H%M%S`` string.

    The result is the later of the tenth edit and four days after
    registration. Without a registration time the tenth edit is used.

    :arg reg_time: str or None, registration timestamp (``%Y%m%d%H%M%S``)
    :arg tenth_edit: str, timestamp of the tenth edit (``%Y%m%d%H%M%S``)
    """
    fmt = '%Y%m%d%H%M%S'
    confirmed = datetime.strptime(tenth_edit, fmt)
    if reg_time:
        reg_plus_four = datetime.strptime(reg_time, fmt) + timedelta(days=4)
        if reg_plus_four > confirmed:
            # 10 edits in less than 4 days: confirmed once the 4 days are over
            confirmed = reg_plus_four
    return confirmed.strftime(fmt)


def createAutoConfirmedUserTable():
    """Build the ``user_autoconfirmed`` table in the user database.

    This is a function rather than a SQL query because a script is used to
    create the dataset, which is then imported back into the MySQL database:

    1. Every user of the wiki database is streamed together with the
       timestamp of their tenth revision, and one tab-separated line
       ``user_id, user_name, auto_confirmation (0/1), confirmation_timestamp``
       is written to ``TEMPDIR/user_autoconfirmed.tsv``. Users with fewer
       than ten edits get flag 0 and the literal text ``NULL``.
    2. ``user_autoconfirmed`` is dropped and re-created.
    3. The file is loaded with ``mysqlimport``, which takes the target table
       name from the file name.
    4. An index on ``user_id`` is created.
    """
    from db import sql

    tempfile = os.path.join(TEMPDIR, 'user_autoconfirmed.tsv')

    # 'w' instead of 'a': every run writes the complete user list, so
    # appending would duplicate all rows left over from a previous run.
    output = open(tempfile, 'w')
    try:
        curSS = sql.getSSCursor()
        try:
            logger.info(
                'Creating temp file to store autoconfirmation date of all users')
            curSS.execute('''SELECT u.user_id,
                                    u.user_name,
                                    u.user_registration,
                                    (SELECT rev_timestamp
                                       FROM %s.revision
                                      WHERE rev_user=u.user_id
                                      ORDER BY rev_timestamp ASC
                                      LIMIT 9, 1) AS tenthedit
                               FROM %s.user u;'''
                          % (settings.sqlwikidb, settings.sqlwikidb))

            for i, res in enumerate(curSS):
                u_id, u_text, reg_time, tenedits = res[0], res[1], res[2], res[3]

                if tenedits:
                    # an editor has to have ten edits to be auto-confirmed
                    ins = (u_id, u_text, 1,
                           _autoConfirmationTimestamp(reg_time, tenedits))
                else:
                    ins = (u_id, u_text, 0, 'NULL')

                line = '\t'.join(str(v) for v in ins)
                logger.info(line)
                output.write(line + '\n')

                # progress indicator: one dot per 100 users, new row of dots
                # every 3000 users
                if i % 100 == 0:
                    if i % 3000 == 0:
                        sys.stdout.write('\n.')
                    else:
                        sys.stdout.write('.')
                    sys.stdout.flush()
        finally:
            curSS.close()
    finally:
        # make sure the file is flushed and closed even if the query fails
        output.close()

    # Re-create the target table. It has to be named exactly like the temp
    # file (minus extension), because that is the table mysqlimport loads
    # into and the table the index below is created on. The statements are
    # executed separately and the drop tolerates a missing table, so the
    # first run works as well.
    cur = sql.getCursor()
    cur.execute('DROP TABLE IF EXISTS %s.user_autoconfirmed;'
                % settings.sqluserdb)
    cur.execute('''CREATE TABLE IF NOT EXISTS %s.user_autoconfirmed
                   (user_id int(5) unsigned,
                    user_name varchar(255),
                    auto_confirmation tinyint(1) unsigned,
                    confirmation_timestamp char(14));'''
                % settings.sqluserdb)
    cur.close()

    logger.info('Importing auto_confirmation data into MySQL')
    status = os.system('mysqlimport --local %s %s'
                       % (settings.sqluserdb, tempfile))
    if status != 0:
        logger.error('mysqlimport exited with status %s' % status)

    logger.info('Creating index on user_autoconfirmed table')
    cur = sql.getCursor()
    cur.execute("CREATE INDEX user_id ON %s.user_autoconfirmed (user_id);"
                % settings.sqluserdb)
    cur.close()