Beispiel #1
0
def dropTable(tablename):
    """Drop a SQL table in the user database if it exists.

    The drop is best effort: a failure is logged with its traceback and is
    not propagated to the caller.

    :arg tablename: str, name of the table
    """
    logger.info('Dropping %s' % tablename)
    try:
        cur = sql.getCursor()
        # Identifiers cannot be bound as query parameters, so the table name
        # is interpolated; it must come from trusted configuration.
        cur.execute("DROP TABLE IF EXISTS %s;" % tablename)
    except Exception:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # abort the run, and record why the drop failed.
        logger.exception('Could not drop table %s' % tablename)
def dropTable(tablename):
    """Drop a SQL table in the user database if it exists.

    The drop is best effort: a failure is logged with its traceback and is
    not propagated to the caller.

    :arg tablename: str, name of the table
    """
    logger.info('Dropping %s' % tablename)
    try:
        cur = sql.getCursor()
        # Identifiers cannot be bound as query parameters, so the table name
        # is interpolated; it must come from trusted configuration.
        cur.execute("DROP TABLE IF EXISTS %s;" % tablename)
    except Exception:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # abort the run, and record why the drop failed.
        logger.exception('Could not drop table %s' % tablename)
def createIndex(query, tablename):
    """Create an index on a SQL table in the user database.

    Failures are logged as a warning and not raised, because the most common
    cause is that the index already exists.

    :arg query: str, query to execute
    :arg tablename: str, name of the table (used for log messages only)
    """
    try:
        cur = sql.getCursor()
        cur.execute(query)
    except Exception as exc:
        # Not a bare except: KeyboardInterrupt / SystemExit must still stop
        # the run. The real error is appended so that failures other than
        # "index already exists" can be diagnosed.
        logger.warning(
            "Could not create index on %s. Possibly it already exists (%s)" %
            (tablename, exc))
    else:
        logger.info("Created indexes on %s" % tablename)
def tableExists(tablename):
    """Return True if the table exists in the user database.

    The lookup is always done in ``settings.sqluserdb``; a database prefix in
    ``tablename`` is ignored.

    :arg tablename: str, name of the table, either ``db.table`` or ``table``
    """
    parts = tablename.split('.')
    # Use the component after the database prefix; fall back to the whole
    # name when it is unqualified instead of raising IndexError.
    table = parts[1] if len(parts) > 1 else parts[0]

    cur = sql.getCursor()
    # NOTE(review): LIKE treats '_' and '%' in the table name as wildcards,
    # so a similarly named table could produce a false positive.
    cur.execute("show tables from %s like '%s';" %
                (settings.sqluserdb, table))

    return cur.fetchone() is not None
Beispiel #5
0
def tableExists(tablename):
    """Return True if the table exists in the user database.

    The lookup is always done in ``settings.sqluserdb``; a database prefix in
    ``tablename`` is ignored.

    :arg tablename: str, name of the table, either ``db.table`` or ``table``
    """
    parts = tablename.split('.')
    # Use the component after the database prefix; fall back to the whole
    # name when it is unqualified instead of raising IndexError.
    table = parts[1] if len(parts) > 1 else parts[0]

    cur = sql.getCursor()
    # NOTE(review): LIKE treats '_' and '%' in the table name as wildcards,
    # so a similarly named table could produce a false positive.
    cur.execute("show tables from %s like '%s';" %
                (settings.sqluserdb, table))

    return cur.fetchone() is not None
Beispiel #6
0
def createIndex(query, tablename):
    """Create an index on a SQL table in the user database.

    Failures are logged as a warning and not raised, because the most common
    cause is that the index already exists.

    :arg query: str, query to execute
    :arg tablename: str, name of the table (used for log messages only)
    """
    try:
        cur = sql.getCursor()
        cur.execute(query)
    except Exception as exc:
        # Not a bare except: KeyboardInterrupt / SystemExit must still stop
        # the run. The real error is appended so that failures other than
        # "index already exists" can be diagnosed.
        logger.warning(
            "Could not create index on %s. Possibly it already exists (%s)" %
            (tablename, exc))
    else:
        logger.info("Created indexes on %s" % tablename)
Beispiel #7
0
def process():
    """Set up the user database with its auxiliary tables and indexes.

    Creates the user database, then each auxiliary table (with its index
    where one is defined), and finally exports the bot list.

    .. warning:
        This can take a long time. Especially for larger Wikipedias. For the English Wikipedia, it will take over a week :(
    """
    patience_note = (
        'YOU ARE ATTEMPTING TO RUN THE PREPROCESSING ON THE ENGLISH OR GERMAN WIKIPEDIA. HOPEFULLY YOU ARE PATIENT, THIS WILL TAKE A WHILE!'
        if settings.language in ['en', 'de'] else
        'Be patient, this can take a long time. I hope you used the screen command...'
    )
    logger.warning(patience_note)

    logger.info('Preprocessing data for %swiki' % settings.language)

    # The user database must exist before any table can be created in it.
    logger.info('Creating Database %s' % settings.sqluserdb)
    db_cursor = sql.getCursor()
    db_cursor.execute(CREATE_USER_DATABASE)

    # One entry per table, in creation order: (create query, index query or
    # None, table name). The per-day tables are deliberately left out: they
    # are only useful for time series analysis, not cohort visualizations.
    build_steps = (
        (CREATE_USER_COHORTS, INDEX_USER_COHORTS, USER_COHORT),
        (CREATE_REV_LEN_CHANGED, INDEX_REV_LEN_CHANGED, REV_LEN_CHANGED),
        (CREATE_EDITOR_YEAR_MONTH, None, EDITOR_YEAR_MONTH),
        (CREATE_EDITOR_YEAR_MONTH_NAMESPACE, None,
         EDITOR_YEAR_MONTH_NAMESPACE),
        (CREATE_TIME_YEAR_MONTH_NAMESPACE, None, TIME_YEAR_MONTH_NAMESPACE),
        (CREATE_BOT_LIST, INDEX_BOT_LIST, BOT_LIST),
    )
    for create_query, index_query, table in build_steps:
        createTable(create_query, table)
        if index_query is not None:
            createIndex(index_query, table)

    executeCommand(EXPORT_BOT_LIST, 'Exporting bot list for cohort analysis')
def process():
    """Set up the user database with its auxiliary tables and indexes.

    Creates the user database, then each auxiliary table (with its index
    where one is defined), and finally exports the bot list.

    .. warning:
        This can take a long time. Especially for larger Wikipedias. For the English Wikipedia, it will take over a week :(
    """
    patience_note = (
        'YOU ARE ATTEMPTING TO RUN THE PREPROCESSING ON THE ENGLISH OR GERMAN WIKIPEDIA. HOPEFULLY YOU ARE PATIENT, THIS WILL TAKE A WHILE!'
        if settings.language in ['en', 'de'] else
        'Be patient, this can take a long time. I hope you used the screen command...'
    )
    logger.warning(patience_note)

    logger.info('Preprocessing data for %swiki' % settings.language)

    # The user database must exist before any table can be created in it.
    logger.info('Creating Database %s' % settings.sqluserdb)
    db_cursor = sql.getCursor()
    db_cursor.execute(CREATE_USER_DATABASE)

    # One entry per table, in creation order: (create query, index query or
    # None, table name). The per-day tables are deliberately left out: they
    # are only useful for time series analysis, not cohort visualizations.
    build_steps = (
        (CREATE_USER_COHORTS, INDEX_USER_COHORTS, USER_COHORT),
        (CREATE_REV_LEN_CHANGED, INDEX_REV_LEN_CHANGED, REV_LEN_CHANGED),
        (CREATE_EDITOR_YEAR_MONTH, None, EDITOR_YEAR_MONTH),
        (CREATE_EDITOR_YEAR_MONTH_NAMESPACE, None,
         EDITOR_YEAR_MONTH_NAMESPACE),
        (CREATE_TIME_YEAR_MONTH_NAMESPACE, None, TIME_YEAR_MONTH_NAMESPACE),
        (CREATE_BOT_LIST, INDEX_BOT_LIST, BOT_LIST),
    )
    for create_query, index_query, table in build_steps:
        createTable(create_query, table)
        if index_query is not None:
            createIndex(index_query, table)

    executeCommand(EXPORT_BOT_LIST, 'Exporting bot list for cohort analysis')
Beispiel #9
0
def createTable(query, tablename):
    """Create a SQL table in the user database.

    When ``settings.sqldroptables`` is set the table is dropped first. If the
    table (still) exists afterwards nothing is done. Errors are logged with
    their traceback and not raised.

    :arg query: str, query to execute
    :arg tablename: str, name of the table
    """
    try:
        if settings.sqldroptables:
            dropTable(tablename)

        if tableExists(tablename):
            logger.info('Table %s exists already! Do nothing' % tablename)
            return

        cur = sql.getCursor()
        logger.info('Creating %s table' % tablename)
        cur.execute(query)
        logger.info('Finished creating %s table' % tablename)
    except Exception:
        # Not a bare except: an interrupt (Ctrl-C) during a long-running
        # CREATE must abort the run rather than be logged and skipped.
        logger.exception("Could not create table %s" % tablename)
Beispiel #10
0
def createTable(query, tablename):
    """Create a SQL table in the user database.

    When ``settings.sqldroptables`` is set the table is dropped first. If the
    table (still) exists afterwards nothing is done. Errors are logged with
    their traceback and not raised.

    :arg query: str, query to execute
    :arg tablename: str, name of the table
    """
    try:
        if settings.sqldroptables:
            dropTable(tablename)

        if tableExists(tablename):
            logger.info('Table %s exists already! Do nothing' % tablename)
            return

        cur = sql.getCursor()
        logger.info('Creating %s table' % tablename)
        cur.execute(query)
        logger.info('Finished creating %s table' % tablename)
    except Exception:
        # Not a bare except: an interrupt (Ctrl-C) during a long-running
        # CREATE must abort the run rather than be logged and skipped.
        logger.exception("Could not create table %s" % tablename)
Beispiel #11
0
def createAutoConfirmedUserTable():
    """Build the ``user_autoconfirmed`` table in the user database.

    This is a function rather than a SQL query because the dataset is
    computed row by row in Python, written to a TSV file and then imported
    back into the MySQL database with ``mysqlimport``.

    A user is treated as auto-confirmed once they have made ten edits and
    four days have passed since registration; the confirmation timestamp is
    the later of those two moments. Users without a registration time are
    confirmed at their tenth edit. Users without a tenth edit are written
    with ``auto_confirmation`` 0.
    """
    import subprocess

    from db import sql

    ts_format = '%Y%m%d%H%M%S'
    fourdays = timedelta(days=4)
    tempfile = os.path.join(TEMPDIR, 'user_autoconfirmed.tsv')

    logger.info(
        'Creating temp file to store autoconfirmation date of all users')

    curSS = sql.getSSCursor()
    try:
        curSS.execute('''SELECT
                            u.user_id,
                            u.user_name,
                            u.user_registration,
                            (SELECT rev_timestamp FROM %s.revision WHERE rev_user=u.user_id ORDER BY rev_timestamp ASC LIMIT 9, 1) AS tenthedit
                            FROM %s.user u;''' %
                      (settings.sqlwikidb, settings.sqlwikidb))

        # Truncate ('w') instead of appending: the whole file is imported
        # below, so rows left over from an earlier run would be duplicated.
        with open(tempfile, 'w') as output:
            for i, res in enumerate(curSS):
                u_id, u_text, reg_time, tenedits = res

                # Default: not auto-confirmed.
                # NOTE(review): the literal string 'NULL' is written here;
                # mysqlimport reads \N as SQL NULL, so this is presumably
                # stored as the text "NULL" - confirm before changing.
                ins = (u_id, u_text, 0, 'NULL')

                if tenedits:
                    # an editor has to have ten edits to be auto-confirmed
                    confirmed_at = datetime.strptime(tenedits, ts_format)
                    if reg_time:
                        # ... and be registered for at least four days, so
                        # the later of the two moments counts.
                        reg_plus_four = datetime.strptime(
                            reg_time, ts_format) + fourdays
                        confirmed_at = max(confirmed_at, reg_plus_four)
                    # no registration time: just use the tenth edit (there
                    # are only few like that)
                    ins = (u_id, u_text, 1,
                           datetime.strftime(confirmed_at, ts_format))

                line = '\t'.join(str(v) for v in ins)
                logger.info(line)
                output.write(line + '\n')

                # Progress indicator: one dot per 100 users, a new line
                # every 3000 users.
                if i % 100 == 0:
                    sys.stdout.write('\n.' if i % 3000 == 0 else '.')
                    sys.stdout.flush()
    finally:
        curSS.close()

    cur = sql.getCursor()
    try:
        # The table must be named exactly like the TSV file (minus the
        # extension) because mysqlimport derives the target table from the
        # file name; the index below uses the same name. The statements are
        # executed one at a time rather than as a multi-statement string.
        cur.execute('DROP TABLE IF EXISTS %s.user_autoconfirmed;' %
                    settings.sqluserdb)
        cur.execute('''CREATE TABLE IF NOT EXISTS %s.user_autoconfirmed
            (user_id int(5) unsigned,
            user_name varchar(255),
            auto_confirmation tinyint(1) unsigned,
            confirmation_timestamp char(14));
            ''' % settings.sqluserdb)
    finally:
        cur.close()

    logger.info('Importing auto_confirmation data into MySQL')

    # Argument list instead of a shell string, so paths or database names
    # containing spaces or shell metacharacters are passed through intact.
    try:
        status = subprocess.call(
            ['mysqlimport', '--local', settings.sqluserdb, tempfile])
    except OSError:
        logger.exception('Could not run mysqlimport')
    else:
        if status != 0:
            logger.warning('mysqlimport exited with status %s' % status)

    logger.info('Creating index on user_autoconfirmed table')
    cur = sql.getCursor()
    cur.execute("CREATE INDEX user_id ON %s.user_autoconfirmed  (user_id);" %
                settings.sqluserdb)
Beispiel #12
0
def createAutoConfirmedUserTable():
    """Build the ``user_autoconfirmed`` table in the user database.

    This is a function rather than a SQL query because the dataset is
    computed row by row in Python, written to a TSV file and then imported
    back into the MySQL database with ``mysqlimport``.

    A user is treated as auto-confirmed once they have made ten edits and
    four days have passed since registration; the confirmation timestamp is
    the later of those two moments. Users without a registration time are
    confirmed at their tenth edit. Users without a tenth edit are written
    with ``auto_confirmation`` 0.
    """
    import subprocess

    from db import sql

    ts_format = '%Y%m%d%H%M%S'
    fourdays = timedelta(days=4)
    tempfile = os.path.join(TEMPDIR, 'user_autoconfirmed.tsv')

    logger.info('Creating temp file to store autoconfirmation date of all users')

    curSS = sql.getSSCursor()
    try:
        curSS.execute('''SELECT
                            u.user_id,
                            u.user_name,
                            u.user_registration,
                            (SELECT rev_timestamp FROM %s.revision WHERE rev_user=u.user_id ORDER BY rev_timestamp ASC LIMIT 9, 1) AS tenthedit
                            FROM %s.user u;''' % (settings.sqlwikidb, settings.sqlwikidb))

        # Truncate ('w') instead of appending: the whole file is imported
        # below, so rows left over from an earlier run would be duplicated.
        with open(tempfile, 'w') as output:
            for i, res in enumerate(curSS):
                u_id, u_text, reg_time, tenedits = res

                # Default: not auto-confirmed.
                # NOTE(review): the literal string 'NULL' is written here;
                # mysqlimport reads \N as SQL NULL, so this is presumably
                # stored as the text "NULL" - confirm before changing.
                ins = (u_id, u_text, 0, 'NULL')

                if tenedits:
                    # an editor has to have ten edits to be auto-confirmed
                    confirmed_at = datetime.strptime(tenedits, ts_format)
                    if reg_time:
                        # ... and be registered for at least four days, so
                        # the later of the two moments counts.
                        reg_plus_four = datetime.strptime(reg_time, ts_format) + fourdays
                        confirmed_at = max(confirmed_at, reg_plus_four)
                    # no registration time: just use the tenth edit (there
                    # are only few like that)
                    ins = (u_id, u_text, 1, datetime.strftime(confirmed_at, ts_format))

                line = '\t'.join(str(v) for v in ins)
                logger.info(line)
                output.write(line + '\n')

                # Progress indicator: one dot per 100 users, a new line
                # every 3000 users.
                if i % 100 == 0:
                    sys.stdout.write('\n.' if i % 3000 == 0 else '.')
                    sys.stdout.flush()
    finally:
        curSS.close()

    cur = sql.getCursor()
    try:
        # The table must be named exactly like the TSV file (minus the
        # extension) because mysqlimport derives the target table from the
        # file name; the index below uses the same name. The statements are
        # executed one at a time rather than as a multi-statement string.
        cur.execute('DROP TABLE IF EXISTS %s.user_autoconfirmed;' % settings.sqluserdb)
        cur.execute('''CREATE TABLE IF NOT EXISTS %s.user_autoconfirmed
            (user_id int(5) unsigned,
            user_name varchar(255),
            auto_confirmation tinyint(1) unsigned,
            confirmation_timestamp char(14));
            ''' % settings.sqluserdb)
    finally:
        cur.close()

    logger.info('Importing auto_confirmation data into MySQL')

    # Argument list instead of a shell string, so paths or database names
    # containing spaces or shell metacharacters are passed through intact.
    try:
        status = subprocess.call(['mysqlimport', '--local', settings.sqluserdb, tempfile])
    except OSError:
        logger.exception('Could not run mysqlimport')
    else:
        if status != 0:
            logger.warning('mysqlimport exited with status %s' % status)

    logger.info('Creating index on user_autoconfirmed table')
    cur = sql.getCursor()
    cur.execute("CREATE INDEX user_id ON %s.user_autoconfirmed  (user_id);" % settings.sqluserdb)