def performAnalysis(self):
    ## Get DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", self.language+"_stub")
    ## Singleton objects to plot graphics in the class methods
    self.simpleGraph = graphic2D(self.filePath)
##    self.multiGraph=graphic2Dmulti(self.filePath)
##    self.giniGraph=graphicGini(self.filePath)
##    self.splitHistGraph=graphicSplitHist(self.filePath, self.dataPath)
    self.graph3D = graphic3D(self.filePath, self.dataPath)
    print "Starting analysis on DB "+self.language+"_stub\n"
##    self.UserNumContribsGroup(self.acceso[1])
##    self.UserNumContribsGenerations()
    authorsGini = [(95.9677, 4.046), (95.7015, 4.304), (96.2223, 4.363),
                   (95.7104, 4.395), (96.3844, 4.407), (92.4691, 4.528),
                   (95.0077, 4.603), (95.0071, 4.7298), (93.785, 5.051),
                   (93.6076, 5.888)]
    authorsGini.sort()
    ##authorsGini=[(4.046,95.9677),(4.304,95.7015),(4.363,96.2223),(4.395,95.7104),(4.407,96.3844),(4.528,92.4691),(4.603,95.0077),(4.7298,95.0071),(5.051,93.785),(5.888,93.6076)]
    self.simpleGraph.createGraphic("authors-Gini", (authorsGini,),
        "Gini coeff. (%)", "Number of different authors (log)",
        "Gini coeff. vs. number of registered authors in the top-ten Wikipedias.")
    ## Close DB connection
    dbaccess.close_Connection(self.acceso[0])
    print "This is finished"
def infoPages(self):
    ## Generates statistics per article
    ## Get new DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
        "wx_"+self.language+"_"+self.conf.dumptype)
    #Local configuration
    target="page_id"
    intervals=["months", "quarters", "weeks"]
    ###########################
    #Total num of revisions per page
    for nspace in self.nspaces:
        self.__total_rev(self.acceso[1], nspace+"_"+self.language, target)
    ###########################
    #Total number of different editors per page
    for nspace in self.nspaces:
        self.__total_rev_diff(self.acceso[1], nspace+"_"+self.language, target)
    ###########################
    #Total number of revisions per page for several time intervals
    #Currently, we are only interested in months, quarters and weeks
    for nspace in self.nspaces:
        for interval in intervals:
            self.__total_rev_time(self.acceso[1], interval, nspace+"_"+self.language, target)
    ###########################
    #Total number of different editors per page, per month, quarter and week
    for nspace in self.nspaces:
        for interval in intervals:
            self.__total_rev_diff_time(self.acceso[1], interval, nspace+"_"+self.language, target)
    #Close DB connection
    dbaccess.close_Connection(self.acceso[0])
def generalStatistics(self):
    ## Computes the views containing general statistics and overall information.
    ## For all namespaces (official and artificial):
    ################
    ## View _overall_statistics1_months, which includes:
    ## total num of pages with at least one edit in that month, total number of contribs,
    ## total num of users who made at least 1 edit in that month (alive_users)
    ####################################
    ## Parameters from Wikistats by Erik Zachte
    ####################################
    ## Wikipedians: contributors, active wikipedians, very active wikipedians
    ## Articles (WARNING: readable contents are not being filtered out yet):
    ## new articles per day, edits per article, bytes per article, % of articles over 0.5k,
    ## % of articles over 2k
    ## Total size of contribs per month
    ## Size of pages and number of different authors who have edited them
    ####################################
    ## Get new DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
        "wx_"+self.language+"_"+self.conf.dumptype)
    ## General statistics
    for nspace in self.nspaces:
        self.__gral_stats(self.acceso[1], nspace+"_"+self.language)
    ## Close DB connection
    dbaccess.close_Connection(self.acceso[0])
def prepro_pagelen(self):
    """
    Preprocessing tables for evolution of page length over time
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #VIEW page_main_nored (pages in main nspace, excluding redirects)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view page_main_nored as "+\
            "(select page_id from page where page_namespace=0 and page_is_redirect=0)")
        #VIEW rev_main_nored (revisions in main nspace in all pages, excluding redirects)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_main_nored as ("+\
            "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
            "(select page_id from page_main_nored))")
        #TABLES max_rev_YYYY (latest revision for each page in main nspace, up to year YYYY)
        self.minyear=dbaccess.raw_query_SQL(self.access[1], "select min(year(rev_timestamp)) from revision")
        self.years=range(int(self.minyear[0][0])+1, 2009)
        for self.year in self.years:
            dbaccess.raw_query_SQL(self.access[1], "drop table if exists max_rev_"+str(self.year))
            dbaccess.raw_query_SQL(self.access[1], "create table max_rev_"+str(self.year)+\
                " as (select max(rev_id) as max_id, rev_page from rev_main_nored "+\
                "where year(rev_timestamp)<"+str(self.year)+" group by rev_page)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_"+str(self.year)+" add primary key (max_id)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_"+str(self.year)+" add index (rev_page)")
        dbaccess.close_Connection(self.access[0])
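## Hedged usage sketch (not part of the original module): once the max_rev_YYYY
## tables above exist, the evolution of page length can be read off by joining
## each one back against rev_main_nored. The function name, year range and DB
## name below are illustrative assumptions; only the dbaccess helpers and the
## tables built by prepro_pagelen() are taken from the code above.
def sketch_avg_pagelen_per_year(dbuser, dbpassw, dbname="wx_eswiki_research"):
    access = dbaccess.get_Connection("localhost", 3306, dbuser, dbpassw, dbname)
    for year in range(2002, 2009):
        #Average length of the latest revision of every article, up to this year
        avg_len = dbaccess.raw_query_SQL(access[1],
            "select avg(rev_len) from rev_main_nored where rev_id in "+
            "(select max_id from max_rev_"+str(year)+")")
        print year, avg_len[0][0]
    dbaccess.close_Connection(access[0])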
def contributions(idiomas):
    """
    Create some graphs and files with statistical results about authors contributions

    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    for idioma in idiomas:
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        #acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_pages")
        #dbaccess.query_SQL(acceso[1], "page_id, page_namespace", "page", where="page_namespace=0", create="pag_namespace")
        tcnoann=dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_NoAnnons_author_"+idioma)
        tcauthor=dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_author_"+idioma)
        #tc_ann=dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_Annons_author_text_"+idioma)
        dbaccess.close_Connection(acceso[0])
        data=__tup_to_list(tcnoann)
        listay_tcnoann=data.pop()
        listax=data.pop()
        data=__tup_to_list(tcauthor)
        listay_tcauthor=data.pop()
        listax=data.pop()
        #data=__tup_to_list(tc_ann)
        #listay_tc_ann=data.pop()
        #listax=data.pop()
        r.png("graphics/"+idioma+"/gini_TContrib_NoAnn_"+idioma+".png")
        __lorenz_Curve(listay_tcnoann)
        r.png("graphics/"+idioma+"/gini_TContrib_"+idioma+".png")
        __lorenz_Curve(listay_tcauthor)
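## Hedged sketch (the real __tup_to_list is defined elsewhere in this module and
## is not shown here). From the call sites above it must return [x_values, y_values],
## so that the first pop() yields the y list and the second the x list. A minimal
## version consistent with that usage, assuming rows are (x, y) pairs:
def sketch_tup_to_list(rows):
    listax = [row[0] for row in rows]
    listay = [row[1] for row in rows]
    return [listax, listay]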
def infoAuthors(self):
    ## Generates statistics per user
    ## Get DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
        "wx_"+self.language+"_"+self.conf.dumptype)
    ## Local configuration: retrieving info for authors
    target="author"
    ## Intervals might be days, weeks, months, quarters, years
    intervals=["months", "quarters", "weeks"]
    ############################
    #Number of total revisions per author ID
    for nspace in self.nspaces:
        self.__total_rev(self.acceso[1], nspace+"_"+self.language, target)
    ############################
    #Different articles edited per user
    for nspace in self.nspaces:
        self.__total_rev_diff(self.acceso[1], nspace+"_"+self.language, target)
    ############################
    #Total num of articles started per author
    #We consider as the beginning of an article the first revision of that article
    for nspace in self.nspaces:
        self.__total_page_init_author(self.acceso[1], nspace+"_"+self.language)
    ############################
    #Total number of revisions per author for several time intervals
    #Currently, we are only interested in data per months, quarters and weeks
    for nspace in self.nspaces:
        for interval in intervals:
            self.__total_rev_time(self.acceso[1], interval, nspace+"_"+self.language, target)
    ############################
    #Num of different articles revised per author for several time intervals
    for nspace in self.nspaces:
        for interval in intervals:
            self.__total_rev_diff_time(self.acceso[1], interval, nspace+"_"+self.language, target)
    ############################
    #Num of different articles initiated per author, for several time intervals
    for nspace in self.nspaces:
        for interval in intervals:
            self.__total_page_init_author_time(self.acceso[1], interval, nspace+"_"+self.language)
    ############################
    # BIRTH AND DEATH ANALYSIS FOR THE AUTHOR COMMUNITY
    ############################
    #Close DB connection
    dbaccess.close_Connection(self.acceso[0])
def __init__(self, language="furwiki", dumptype="research", msqlu="", msqlp=""):
    """
    Receives the language and dumptype to download, then initializes the
    corresponding MySQL DB (database creation plus table definitions);
    errors are reported on stdout
    """
    self.language=language    #language to download
    self.dumptype=dumptype    #type of dump
    self.files=["pages-meta-history.xml.7z", "redirect.sql.gz", "page_restrictions.sql.gz",\
        "user_groups.sql.gz", "logging.sql.gz", "interwiki.sql.gz", "langlinks.sql.gz", "externallinks.sql.gz",\
        "templatelinks.sql.gz", "imagelinks.sql.gz", "categorylinks.sql.gz", "pagelinks.sql.gz", "oldimage.sql.gz",\
        "image.sql.gz"]
    self.filename=""
    #dump's filename in Wikimedia's server
    self.filenameTemplate=string.Template("""$language-latest-$file""")
    #URL to download the file
    self.urld=""
    self.urldTemplate=string.Template("""http://download.wikimedia.org/$language/latest/$language-latest-$file""")
    if (msqlu=="" or msqlp==""):
        print "Error initializing DB dump object. You must provide a valid MySQL username and password"
    else:
        self.msqlu=msqlu    #MySQL username for accessing and editing the DB
        self.msqlp=msqlp    #MySQL password
        #We can manage two different types of dumps: stubs (without the text of every revision)
        #and pages (containing the text of every revision)
        #self.urld="http://download.wikimedia.org/"+self.language+"/latest/"+\
        #self.language+"-latest-pages-meta-history.xml.7z"    #File to download
        #patterns for files
        #http://download.wikimedia.org/furwiki/20060921/furwiki-20060921-pages-meta-history.xml.7z
        #http://download.wikimedia.org/amwiki/20061014/amwiki-20061014-stub-meta-history.xml.gz
        #Create /dumps directory if it does not exist yet
        directories=os.listdir("./")
        if ("dumps" not in directories):
            os.makedirs("./dumps")
        ## Initialize DB in MySQL: create DB and tables definitions
        print "Initializing DB for --> "+self.language+"\n"
        #Retrieving connection and cursor to access the DB
        acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp, "mysql")
        dbaccess.createDB_SQL(acceso[1], "wx_"+self.language+"_"+self.dumptype)
        if self.dumptype=="research":
            command="mysql -u "+self.msqlu+" -p"+self.msqlp+" "+\
                "wx_"+self.language+"_"+self.dumptype+" < tables_research.sql > debug_mysql.log"
        elif self.dumptype=="standard":
            command="mysql -u "+self.msqlu+" -p"+self.msqlp+" "+\
                "wx_"+self.language+"_"+self.dumptype+" < tables_standard.sql > debug_mysql.log"
        ok=os.system(command)
        if ok == 0:
            acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp,\
                "wx_"+self.language+"_"+self.dumptype)
            dbaccess.raw_query_SQL(acceso[1], "alter table page max_rows = 200000000000 avg_row_length = 50")
            dbaccess.raw_query_SQL(acceso[1], "alter table revision max_rows = 200000000000 avg_row_length = 50")
            if self.dumptype=="standard":
                dbaccess.raw_query_SQL(acceso[1], "alter table text max_rows = 200000000000 avg_row_length = 50")
            dbaccess.close_Connection(acceso[0])
        else:
            print "Error! There was a problem initializing definitions for DB tables"
            dbaccess.close_Connection(acceso[0])
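## Hedged usage note (the enclosing class name is not shown in this snippet, so
## "DumpInstaller" below is only a placeholder): a typical setup would be
##
##    installer = DumpInstaller(language="furwiki", dumptype="research",
##                              msqlu="root", msqlp="phoenix")
##    installer.decompress()
##
## i.e. the constructor creates the empty wx_<language>_<dumptype> schema, and
## decompress() (defined later in this module) fills it from the dump files.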
def infoContents(self):
    ###########################
    #Contents analysis
    ###########################
    ## Get DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
        "wx_"+self.language+"_"+self.conf.dumptype)
    ## For all namespaces (official and artificial):
    ## Evolution in time of the length of user contributions (per month; per quarter)
    ## Evolution in time of the length of pages (per month; per quarter supported but commented)
    for nspace in self.nspaces:
        self.__content_evolution(self.acceso[1], nspace+"_"+self.language)
    dbaccess.close_Connection(self.acceso[0])
def cox_prop(self):
    """
    Creates intermediate files and tables for Cox-prop hazards analysis
    """
    #Initialize file header
    f=open("wkp_cox_prop_all.dat", 'w')
    f.write("Project,rev_user,min_ts,max_ts,in_talk,in_FAs\n")
    f.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        print "Starting language "+self.language+"\n"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        ##TABLE: Create table of users in talk pages
        dbaccess.raw_query_SQL(self.access[1], "drop table if exists users_in_talk")
        dbaccess.raw_query_SQL(self.access[1], "create table users_in_talk as (select distinct(rev_user) from revision "+\
            "where rev_page in (select page_id from page where page_namespace=1))")
        dbaccess.raw_query_SQL(self.access[1], "alter table users_in_talk add primary key (rev_user)")
        ##TABLE: Create table of users in FAs
        dbaccess.raw_query_SQL(self.access[1], "drop table if exists users_in_FAs")
        dbaccess.raw_query_SQL(self.access[1], "create table users_in_FAs as (select distinct(rev_user) from revision_FAs)")
        dbaccess.raw_query_SQL(self.access[1], "alter table users_in_FAs add primary key (rev_user)")
        ##TABLE: MIX previous info with time_range_authors --> save result in new table time_range_cox
        dbaccess.raw_query_SQL(self.access[1], "drop table if exists time_range_cox")
        dbaccess.raw_query_SQL(self.access[1], "create table time_range_cox as (select rev_user, "+\
            "date(min_ts) as min_ts, date(max_ts) as max_ts, "+\
            "case when rev_user in (select rev_user from users_in_talk) then 1 else 0 end as in_talk, "+\
            "case when rev_user in (select rev_user from users_in_FAs) then 1 else 0 end as in_FAs "+\
            "from time_range_authors)")
        ##IN SYSTEM
        print "Interm. tables created, proceeding to write out data... "+self.language+"\n"
        results=dbaccess.raw_query_SQL(self.access[1], "SELECT rev_user, min_ts, max_ts, in_talk, in_FAs "+\
            "from time_range_cox "+\
            "where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
        f=open("wkp_cox_prop_all.dat", 'a')
        for result in results:
            f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\","+\
                str(int(result[3]))+","+str(int(result[4]))+"\n")
        f.close()
        print "Finished all cox-prop tasks for "+self.language+"\n"
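## Hedged sketch (not in the original module): quick sanity check of the file
## written by cox_prop() above. The column layout follows the header written
## there; the function name and summary printed are illustrative only.
def sketch_check_cox_file(path="wkp_cox_prop_all.dat"):
    import csv
    f = open(path, 'rb')
    reader = csv.DictReader(f)
    rows = 0
    in_talk = 0
    for row in reader:
        rows += 1
        in_talk += int(row["in_talk"])
    f.close()
    print "%d editors, %d of them active in talk pages" % (rows, in_talk)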
def test_funciones(self):
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
        "wx_"+self.language+"_"+self.conf.dumptype)
    self.__total_rev(self.acceso[1], table="stats_nlwiki", target="author")
##    targets=["page_id"]
##    for target in targets:
##        self.__total_rev(self.acceso[1], language, target)
##        self.__total_rev_target(self.acceso[1], language, target)
##        self.__total_rev_time(self.acceso[1], "years", language, target)
##        self.__total_rev_target_time(self.acceso[1], "years", language, target)
##        self.__total_article_init_author(self.acceso[1], language)
##        self.__article_init_author_time(self.acceso[1], "years", language)
##        self.__article_rev_author_time(self.acceso[1], "years", language)
##    self.__total_rev_time(self.acceso[1], "months", language, "page_id")
##    self.__total_article_init_author(self.acceso[1], language, target="author")
##    self.__content_evolution(self.acceso[1], language)
    dbaccess.close_Connection(self.acceso[0])
def ratios(self):
    """
    .dat files showing interesting descriptive ratios
    """
    #FILE author-pages.dat: ratio no. logged editors/no. user pages
    file=open("overall/data/author-pages.dat", 'w')
    file.write("logged_authors\tuser_pages\tratio\tlang\n")
    file.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Obtain number of different logged authors
        self.logged_authors=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(rev_user)) from "+\
            "revision where rev_user!=0")
        #Obtain number of different user pages (nspace=2)
        self.user_pages=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
            "page where page_namespace=2")
        dbaccess.close_Connection(self.access[0])
        #Writing data to file
        file=open("overall/data/author-pages.dat", 'a')
        file.write(str(int(self.logged_authors[0][0]))+"\t"+str(int(self.user_pages[0][0]))+"\t"+\
            str(float(self.user_pages[0][0])/float(self.logged_authors[0][0]))+"\t"+self.language+"\n")
        file.close()
        #print "Completed lang "+self.language+"\n"
    #FILE articles-talk-ratio.dat: ratio of no. articles/no. talk pages (excluding redirects)
    file=open("overall/data/articles-talk-ratio.dat", 'w')
    file.write("articles\ttalk\tratio\tlang\n")
    file.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Obtain number of articles excluding redirects
        self.articles=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
            "page where page_namespace=0 and page_is_redirect=0")
        #Obtain number of talk pages
        self.talk=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
            "page where page_namespace=1")
        dbaccess.close_Connection(self.access[0])
        #Writing data to file
        file=open("overall/data/articles-talk-ratio.dat", 'a')
        file.write(str(int(self.articles[0][0]))+"\t"+str(int(self.talk[0][0]))+"\t"+\
            str(float(self.talk[0][0])/float(self.articles[0][0]))+"\t"+self.language+"\n")
        file.close()
def time_range(self):
    """
    Creates intermediate tables with time frame of editors activity
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        ##### TIME RANGE FOR AUTHORS IN ALL NAMESPACES
        #TABLE: Total no. of revisions made by every logged author
        dbaccess.raw_query_SQL(self.access[1], "CREATE TABLE IF NOT EXISTS user_revs AS "+\
            "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND "+\
            "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
        dbaccess.raw_query_SQL(self.access[1], "ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")
        print "Created table user_revs for "+self.language+"wiki...\n"
        #TABLE: Min and max timestamp for every logged author + total num_revs
        dbaccess.raw_query_SQL(self.access[1], "CREATE TABLE IF NOT EXISTS time_range_authors AS "+\
            "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM "+\
            "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x "+\
            "ORDER BY min_ts)")
        print "Created table time_range_authors for "+self.language+"wiki...\n"
        ##### TIME RANGE FOR AUTHORS IN MAIN ONLY
        print "Processing language "+self.language+"\n"
        #VIEW: Create view for filtering anons and bots
        #Filter from rev_main_nored revisions from logged authors only
        dbaccess.raw_query_SQL(self.access[1], "create or replace view revision_logged as (select * from rev_main_nored "+\
            "where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot'))")
        dbaccess.raw_query_SQL(self.access[1], "drop table if exists time_range_users")
        #TABLE: Intermediate table, storing for each logged author the min and max ts in the system
        dbaccess.raw_query_SQL(self.access[1], "create table time_range_users as (SELECT rev_user, "+\
            "min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision_logged group by rev_user)")
        dbaccess.raw_query_SQL(self.access[1], "alter table time_range_users add primary key (rev_user)")
        print "Created time_range_users for "+self.language+"\n"
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
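## Hedged sketch (not in the original module): with time_range_authors in place,
## the activity lifetime of every logged editor is a single query away. datediff()
## is standard MySQL; the function name and DB name are illustrative assumptions.
def sketch_editor_lifetimes(dbuser, dbpassw, dbname="wx_eswiki_research"):
    access = dbaccess.get_Connection("localhost", 3306, dbuser, dbpassw, dbname)
    #Top-10 longest-lived editors, measured from first to last revision
    results = dbaccess.raw_query_SQL(access[1],
        "select rev_user, datediff(max_ts, min_ts) as lifetime_days "+
        "from time_range_authors order by lifetime_days desc limit 10")
    for result in results:
        print result[0], result[1]
    dbaccess.close_Connection(access[0])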
def comparative_contributions():
    listaidiomas=["dewiki", "jawiki", "frwiki", "plwiki", "nlwiki", "itwiki", "ptwiki", "eswiki", "svwiki"]
##    lista=["eswiki", "svwiki"]
    r.png("graphics/AAA/gini_comparative_top10.png")
    flag=0
    for idioma in listaidiomas:
        print "Starting comparative Gini analysis for language... "+idioma+"\n"
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        tcnoann=dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_NoAnnons_author_"+idioma)
        dbaccess.close_Connection(acceso[0])
        data=__tup_to_list(tcnoann)
        listay_tcnoann=data.pop()
        listax=data.pop()
        #flag tells the plotting helper whether this is the first curve (new plot)
        #or an additional curve on the same graphics device
        _lorenz_Comp_Curves(listay_tcnoann, flag)
        flag=1
    r.dev_off()
    print "Comparative graphic for Gini curves finished!"
def histogram(idiomas):
    """
    Create histograms depicting article size distribution for a certain language version

    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    filenames=["boxplot_log.png", "histogram_log.png", "histogram_log_low.png", "histogram_log_high.png",
        "ecdf_log_low.png", "ecdf_log_high.png", "data/page_len_log.data", "/data/histograms.info",
        "ecdf_total.png"]
    for idioma in idiomas:
        print "Creating histograms for language... "+idioma
        #Print to another file the names of graphics files, following the order in the GNU R script histogram.R
        f=open("./data/hist_files_names.data", 'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        #Considering only database pages corresponding to articles, with NAMESPACE=MAIN=0
        #dbaccess.dropTab_SQL(acceso[1], "aux")
        #dbaccess.query_SQL(acceso[1], "page_id, page_len", "page", where="page_namespace=0", order="page_len", create="aux")
        result=dbaccess.query_SQL(acceso[1], "page_id, page_len", "aux")
        dbaccess.close_Connection(acceso[0])
        data=__tup_to_list(result)
        page_len=data.pop()
        #Log-transform page lengths (zero-length pages are left as 0)
        for i in range(len(page_len)):
            if page_len[i]!=0:
                page_len[i]=math.log10(page_len[i])
        #Print to another file a list with article sizes to plot histograms
        f=open("./graphics/"+idioma+"/data/page_len_log.data", 'w')
        for value in page_len:
            f.writelines(str(value)+"\n")
        f.close()
        #CALL THE GNU R SCRIPT histogram.R
        succ=os.system("R --vanilla < ./histogram.R > debug_R")
        if succ==0:
            print "histogram function successfully executed for language... "+idioma
def prepro_red_talk(self):
    """
    Data and evolution for redirects and talk pages
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #VIEW page_redirect (pages with redirect flag activated)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view page_redirect as "+\
            "(select page_id from page where page_namespace=0 and page_is_redirect=1)")
        #VIEW rev_redirect (revisions corresponding to redirect pages)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_redirect as ("+\
            "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
            "(select page_id from page_redirect))")
        #VIEW page_talk (pages in talk nspace)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view page_talk as "+\
            "(select page_id from page where page_namespace=1)")
        #VIEW rev_talk (revisions corresponding to talk pages)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_talk as ("+\
            "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
            "(select page_id from page_talk))")
        #TABLES max_rev_talk_YYYY (latest revision for each page in talk nspace, up to year YYYY)
        self.minyear=dbaccess.raw_query_SQL(self.access[1], "select min(year(rev_timestamp)) from revision")
        self.years=range(int(self.minyear[0][0])+1, 2009)
        for self.year in self.years:
            dbaccess.raw_query_SQL(self.access[1], "drop table if exists max_rev_talk_"+str(self.year))
            dbaccess.raw_query_SQL(self.access[1], "create table max_rev_talk_"+str(self.year)+\
                " as (select max(rev_id) as max_id, rev_page from rev_talk "+\
                "where year(rev_timestamp)<"+str(self.year)+" group by rev_page)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add primary key (max_id)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add index (rev_page)")
        dbaccess.close_Connection(self.access[0])
def general_stats(self):
    """
    Preprocessing actions for general statistics scripts
    """
    #FILE page_len.dat, with info about length of pages
    self.f=open("overall/data/page_len.dat", 'w')
    self.f.write("page_len\tns\tis_redirect\tis_stub\tis_new\tlang\n")
    self.f.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        print "Retrieving info from "+self.language+"\n"
        results=dbaccess.raw_query_SQL(self.access[1], "SELECT page_len, page_namespace, page_is_redirect, page_is_stub, "+\
            "page_is_new FROM page")
        print "Updating page_len info file with "+self.language+"\n"
        self.f=open("overall/data/page_len.dat", 'a')
        for result in results:
            self.f.write(str(int(result[0]))+"\t"+str(int(result[1]))+"\t"+str(int(result[2]))+"\t"+\
                str(int(result[3]))+"\t"+str(int(result[4]))+"\t"+self.language+"\n")
        self.f.close()
        #Release the result set before moving on to the next language
        results=None
        dbaccess.close_Connection(self.access[0])
def analyze(self):
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Total no. of revisions made by every logged author
        dbaccess.raw_query_SQL(self.access[1], "CREATE TABLE IF NOT EXISTS user_revs AS "+\
            "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND "+\
            "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
        dbaccess.raw_query_SQL(self.access[1], "ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")
        print "Created table user_revs for "+self.language+"wiki...\n"
        #Min and max timestamp for every logged author + total num_revs
        dbaccess.raw_query_SQL(self.access[1], "CREATE TABLE IF NOT EXISTS time_range_allns AS "+\
            "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM "+\
            "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x "+\
            "ORDER BY min_ts)")
        print "Created table time_range_allns for "+self.language+"wiki...\n"
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
self.access[1], "ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)" ) except Exception, e: print "An exception ocurred, the problem was the following:\n" print e print "*************\n\n" # try: # print "Generating index for rev_user_text and timestamp...\n" # dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)") # except Exception, e: # print "An exception ocurred, the problem was the following:\n" # print e # print "*************\n\n" print "Database" + "wx_" + self.language + "wiki_" + self.dumptype + " ready for quantitative analysis...\n" ##Close connection to DB server dbaccess.close_Connection(self.access[0]) if __name__ == "__main__": languages = ["pt", "it", "nl", "ja", "pl", "fr", "de"] # Normal languages for lang in languages: new_index = indexes("root", "phoenix", lang, "research") new_index.make_indexes() # The stub dump for enwiki # index_english=indexes("root","phoenix","en","stub_research") # index_english.make_indexes()
                print self.pageinsert.encode('utf_8')
            elif self.options.monitor:
                while 1:
                    try:
                        dbaccess.raw_query_SQL(self.acceso[1], self.pageinsert.encode('utf_8'))
                    except (Exception), e:
                        print e
                    else:
                        break
            #Reset status vars
            self.pageinsertrows = 0
            self.pageinsertsize = 0
        ########IF WE USE MONITOR MODE, CLOSE DB CONNECTION
        if options.monitor and (not options.fileout and not options.streamout):
            #Close the connection object (acceso[0]); acceso[1] is the cursor
            dbaccess.close_Connection(self.acceso[0])
        ################################################
        #Checking out total time consumed and display end message
        self.timeCheck = datetime.datetime.now()
        self.timeDelta = self.timeCheck - self.start
        print >> sys.stderr, "\n"
        print >> sys.stderr, "File successfully parsed..."
        print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)" % (self.page_num,\
            float(self.page_num)/self.timeDelta.seconds, self.rev_num,\
            float(self.rev_num)/self.timeDelta.seconds)

##Main zone
if __name__ == '__main__':
    usage = "usage: %prog [options]"
    parserc = OptionParser(usage)
    parserc.add_option(
def surv_files(self):
    """
    Creates all data files used as input for demography scripts in GNU R
    """
    #Initialize all file headers
    #FILE: Survival data for all users (including editors out of MAIN)
    f=open("wkp_surv_all.dat", 'w')
    f.write("Project,rev_user,min_ts,max_ts\n")
    f.close()
    #FILE: Survival data for all logged users who edited in MAIN
    f=open("wkp_surv_main_all.dat", 'w')
    f.write("Project,rev_user,min_ts,max_ts\n")
    f.close()
    #FILE: Survival data for all logged editors until they join the core (activity)
    f=open("wkp_surv_join_core_all.dat", 'w')
    f.write("Project,rev_user,min_ts,min_ts_core\n")
    f.close()
    #FILE: Survival data for logged editors since they join the core until they leave it (activity)
    f=open("wkp_surv_in_core_all.dat", 'w')
    f.write("Project,rev_user,min_ts_core,max_ts_core\n")
    f.close()
    #FILE: Survival data for logged editors since they leave the core until death (activity)
    f=open("wkp_surv_core_to_max_ts_all.dat", 'w')
    f.write("Project,rev_user,max_ts_core,max_ts\n")
    f.close()
    #FILE: Survival data for all logged editors until they join the core (revisions)
    f=open("wkp_surv_join_core_rev_all.dat", 'w')
    f.write("Project,rev_user,min_ts,min_ts_core\n")
    f.close()
    #FILE: Survival data for logged editors since they join the core until they leave it (revisions)
    f=open("wkp_surv_in_core_rev_all.dat", 'w')
    f.write("Project,rev_user,min_ts_core,max_ts_core\n")
    f.close()
    #FILE: Survival data for logged editors since they leave the core until death (revisions)
    f=open("wkp_surv_core_rev_to_max_ts_all.dat", 'w')
    f.write("Project,rev_user,max_ts_core,max_ts\n")
    f.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        print "Starting language "+self.language+"\n"
        ##IN SYSTEM
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        results=dbaccess.raw_query_SQL(self.access[1], "SELECT rev_user, date(min_ts), date(max_ts) from time_range_authors "+\
            "where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
        f=open("wkp_surv_all.dat", 'a')
        for result in results:
            f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
        f.close()
        ##IN MAIN
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        results=dbaccess.raw_query_SQL(self.access[1], "SELECT rev_user, date(min_ts), date(max_ts) from time_range_users")
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
        f=open("wkp_surv_main_all.dat", 'a')
        for result in results:
            f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
        f.close()
        ##CORE
        ##JOIN CORE
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        results=dbaccess.raw_query_SQL(self.access[1], "SELECT rev_user, date(min_ts), date(min_ts_core) from users_core")
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
        f=open("wkp_surv_join_core_all.dat", 'a')
        for result in results:
            f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
        f.close()
        ##IN CORE
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        results=dbaccess.raw_query_SQL(self.access[1], "SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_core")
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
        f=open("wkp_surv_in_core_all.dat", 'a')
        for result in results:
            f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
        f.close()
        ##CORE TO DEATH
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        results=dbaccess.raw_query_SQL(self.access[1], "SELECT rev_user, date(max_ts_core), date(max_ts) from users_core")
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
        f=open("wkp_surv_core_to_max_ts_all.dat", 'a')
        for result in results:
            f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
        f.close()
        print "Finished core users by activity for language "+self.language+"\n"
        ###########################
        ##REV CORE
        ##JOIN CORE
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        results=dbaccess.raw_query_SQL(self.access[1], "SELECT rev_user, date(min_ts), date(min_ts_core) from users_rev_core")
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
        f=open("wkp_surv_join_core_rev_all.dat", 'a')
        for result in results:
            f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
        f.close()
        ##IN CORE
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        results=dbaccess.raw_query_SQL(self.access[1], "SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_rev_core")
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
        f=open("wkp_surv_in_core_rev_all.dat", 'a')
        for result in results:
            f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
        f.close()
        ##CORE TO DEATH
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        results=dbaccess.raw_query_SQL(self.access[1], "SELECT rev_user, date(max_ts_core), date(max_ts) from users_rev_core")
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
        f=open("wkp_surv_core_rev_to_max_ts_all.dat", 'a')
        for result in results:
            f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
        f.close()
        print "Finished all surv_file tasks for "+self.language+"\n"
def core_prepro(self):
    """
    Creates intermediate tables with info about core members (by activity and
    by top % of the total number of revisions)
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Obtain the list of years and months, with the total num. of revisions and the total num. of logged users
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists core_limits_monthly")
        dbaccess.raw_query_SQL(self.access[1],"create table core_limits_monthly as "+\
        "(select year(rev_timestamp) as year, month(rev_timestamp) as month, "+\
        "count(distinct(rev_user)) num_users, count(*) num_revs from revision_logged group by year, month "+\
        "order by year, month)")
        print "Created table core_limits_monthly "+self.language+"\n"
        date_range=dbaccess.raw_query_SQL(self.access[1],"select * from core_limits_monthly "+\
        "order by year, month")
        #Core users: top-10% of the total number of authors in that month
        #Core users: authors accumulating the top-10% of the total number of revisions in that month
        need_create=True
        #LOOP FOR EACH MONTH IN LANG
        for adate in date_range:
            print "Processing year "+str(adate[0])+" month "+str(adate[1])+"\n"
            total_users=adate[2]  #Total number of authors in that month
            total_revs=adate[3]   #Total number of revisions in that month
            #To take the core of the top-10% most active authors in that month
            limit_auth=int(round(total_users*0.1))+1
            #To take the core of authors responsible for the top-10% of the total num. of revs in that month
            limit_revs=int(round(total_revs*0.1))
            count_users=0
            count_revs=0
            insert_users=True
            insert_revs=True
            #Get the list of active logged users for that month (descending order!)
            ##IMPORTANT NOTE: FIRST APPLY A SUBQUERY TO FILTER ALL REVISIONS IN THIS MONTH,
            ##THEN APPLY THE GROUP AND ORDER CLAUSES ON THAT SUBQUERY.
            ##THIS WAY, WE SAVE **A LOT** OF TIME DURING THIS PREPROCESSING STAGE
            month_users=dbaccess.raw_query_SQL(self.access[1],"select rev_user, count(*) num_revs_month from "+\
            "(select rev_user, rev_timestamp from revision_logged where "+\
            "year(rev_timestamp)="+str(int(adate[0]))+" and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
            "order by num_revs_month desc")
            #Calculate the number of authors accumulating the top-10% of revs in that month
            for auser in month_users:
                count_revs=count_revs+int(auser[1])
                count_users=count_users+1
                if (count_revs>limit_revs):
                    break
            if (need_create):
                #TABLE: Monthly info for users in core (by activity)
                dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_core_monthly")
                dbaccess.raw_query_SQL(self.access[1],"create table users_core_monthly as (select "+\
                "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                "order by num_revs_month desc limit "+str(limit_auth)+")")
                #TABLE: Monthly info for users in core (by revisions)
                dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_rev_core_monthly")
                dbaccess.raw_query_SQL(self.access[1],"create table users_rev_core_monthly as (select "+\
                "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                "order by num_revs_month desc limit "+str(count_users)+")")
                print "Created tables with monthly data for "+self.language+"\n"
                need_create=False
            else:
                #Insert info into the table with monthly info for users in core (by activity)
                dbaccess.raw_query_SQL(self.access[1],"insert into users_core_monthly (select "+\
                "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                "order by num_revs_month desc limit "+str(limit_auth)+")")
                #Insert info into the table with monthly info for users in core (by revisions)
                dbaccess.raw_query_SQL(self.access[1],"insert into users_rev_core_monthly (select "+\
                "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                "order by num_revs_month desc limit "+str(count_users)+")")
                print "Inserted monthly data for "+self.language+"\n"
        ####NOTE: WE ASSUME THAT USERS DO NOT LEAVE THE CORE ONLY TO COME BACK LATER, i.e.
        ####ONCE THEY JOIN THE CORE, WE ASSUME THAT THEY DEFINITIVELY LEAVE IT AT max_ts_core.
        ####FOR THE MOMENT, WE STICK TO THIS ASSUMPTION. LATER ON, WE CAN SEE HOW TO IDENTIFY BLANK PERIODS
        #Insert values into the tables of core users
        #users_core = top-10% most active authors in each month
        #users_rev_core = authors accumulating the top-10% of the tot. num. of revs. in that month
        #TABLE: Users in core by activity (user, min_ts, max_ts, min_ts_core, max_ts_core)
        print "Creating table users_core for "+self.language+"\n"
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_core")
        dbaccess.raw_query_SQL(self.access[1],"create table users_core as (select x.*, "+\
        "(select min_ts from time_range_users r where r.rev_user=x.rev_user) min_ts, (select max_ts from "+\
        "time_range_users s where s.rev_user=x.rev_user) max_ts "+\
        "from (select rev_user, min(lower_ts_month) min_ts_core, "+\
        "max(upper_ts_month) max_ts_core from users_core_monthly group by rev_user) x)")
        #TABLE: Users in core by revisions (user, min_ts, max_ts, min_ts_core, max_ts_core)
        print "Creating table users_rev_core for "+self.language+"\n"
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_rev_core")
        dbaccess.raw_query_SQL(self.access[1],"create table users_rev_core as (select x.*, "+\
        "(select min_ts from time_range_users r where r.rev_user=x.rev_user) min_ts, (select max_ts from "+\
        "time_range_users s where s.rev_user=x.rev_user) max_ts "+\
        "from (select rev_user, min(lower_ts_month) min_ts_core, "+\
        "max(upper_ts_month) max_ts_core from users_rev_core_monthly group by rev_user) x)")
        print "All core_prepro tasks finished for "+self.language+"\n"
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
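## Illustrative sketch (not part of the original pipeline): the two core
## definitions computed above, restated in plain Python over a list of
## (rev_user, num_revs_month) tuples already sorted in descending order.
## The helper name core_thresholds and its inputs are hypothetical.
def core_thresholds(month_users, total_users, total_revs):
    """Return (limit_auth, count_users): the size of the top-10% activity core
    and the number of authors accumulating the top-10% of monthly revisions."""
    limit_auth = int(round(total_users * 0.1)) + 1   #top-10% most active authors
    limit_revs = int(round(total_revs * 0.1))        #top-10% of the monthly revisions
    count_revs = 0
    count_users = 0
    for user, num_revs in month_users:
        count_revs += num_revs
        count_users += 1
        if count_revs > limit_revs:
            break
    return limit_auth, count_users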
"CREATE TABLE page_FAs (page_id int(10) unsigned NOT NULL, page_title varchar(255), in_cover integer(1), PRIMARY KEY page_id(page_id))") ## dbaccess.raw_query_SQL(acceso[1], "DROP TABLE IF EXISTS page_talk_FAs") ## dbaccess.raw_query_SQL(acceso[1],\ ## "CREATE TABLE page_talk_FAs (page_id int(10) unsigned NOT NULL, page_title varchar(255), PRIMARY KEY page_id(page_id))") for element in listFA: print "Quering for ---> "+element[0].decode('utf_8')+" \n" try: dbaccess.raw_query_SQL(acceso[1], "INSERT INTO page_FAs (SELECT page_id, page_title, "+str(element[1])+" FROM page WHERE page_title='"+\ element[0].replace("'","\\'").replace('"', '\\"')+"')") ## print "Quering for ---> Discusión:"+element[0].decode('utf_8')+" \n" ## try: ## dbaccess.raw_query_SQL(acceso[1], "INSERT INTO page_talk_FAs (SELECT page_id, page_title FROM page WHERE page_title='Discusión:"+\ ## element[0].replace("'","\\'").replace('"', '\\"')+"' and page_namespace=1)") except (Exception), e: print "Ehhhhh, an exception ocurred..."+str(e)+"\n" #if len(result)>0: #fileID.write(str(result[0][0])+",") #listID.append(str(result[0][0])) #print "ID "+str(result[0][0])+"\n" #fileID.close() dbaccess.close_Connection(acceso[0]) ## for element in listFA: ## print "--> '"+element[0].decode('utf_8')+"'\n" ## print '\n' ## count=0 ## for element in listFA: ## if element[1]==True: ## count+=1 ## print count
def decompress(self):
    """
    Decompresses the DB dumps into MySQL
    """
    ##TODO: ad-hoc hack, remember to remove this, FOR GOD'S SAKE!!
    ##self.filename="mtwiki-latest-pages-meta-history.xml.7z"
    if self.dumptype=="research":
        program="dump_sax_research.py"
    elif self.dumptype=="standard":
        program="dump_sax.py"
    else:
        print "Error! Unexpected type of dump received"
        return -1
    self.filename=self.filenameTemplate.safe_substitute(language=self.language, file=self.files[0])
    #Then we call our parser (dump_sax_research.py or dump_sax.py) to load the data into MySQL
    command_7z="7za e -so dumps/"+self.filename+" | "+"python "+program+\
    " -u "+self.msqlu+" -p "+self.msqlp+" -d "+"wx_"+self.language+"_"+self.dumptype+\
    " --log "+self.language+".log"
    success=os.system(command_7z)
    if success == 0:
        print "DB wx_"+self.language+"_"+self.dumptype+" successfully decompressed...\n\n"
    else:
        print "Error! There was an error trying to decompress database --> "+\
        "wx_"+self.language+"_"+self.dumptype
        return -1
    #Load into MySQL other interesting tables directly provided in SQL format
    #The SQL code to generate the tables is embedded in the SQL file itself
    for index in range(1, len(self.files)):
        self.filename=self.filenameTemplate.safe_substitute(language=self.language, file=self.files[index])
        #NOTE: str.strip(".gz") removes *characters* from both ends, not the suffix,
        #so we slice off the ".gz" extension instead
        sqlfile=self.filename[:-len(".gz")]
        command_gzip="gzip -d dumps/"+self.filename
        command_mysql="mysql -u "+self.msqlu+" -p"+self.msqlp+\
        " wx_"+self.language+"_"+self.dumptype+" < dumps/"+sqlfile
        command_comp="gzip dumps/"+sqlfile
        print "Decompressing "+self.filename+"..."
        success=os.system(command_gzip)
        if success==0:
            print "Loading "+sqlfile+" into MySQL database..."
            success=os.system(command_mysql)
            if success==0:
                print "Compressing again "+sqlfile+"..."
                success=os.system(command_comp)
                if success!=0:
                    print "Error compressing again "+sqlfile
                    return -1
            else:
                print "Error loading "+sqlfile
                return -1
        else:
            print "Error decompressing "+self.filename
            return -1
    print "Generating indexes for tables page and revision...\n"
    print "Depending on the dump size, this may take a while...\n"
    acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu,\
    self.msqlp, "wx_"+self.language+"_"+self.dumptype)
    #Generate adequate indexes and keys in tables page and revision
    print "Generating index for page_len...\n"
    dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE page ADD INDEX (page_len)")
    print "Modifying rev_timestamp to support DATETIME and creating index...\n"
    dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision MODIFY rev_timestamp DATETIME")
    dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX timestamp (rev_timestamp)")
    print "Generating index for rev_page and rev_timestamp...\n"
    dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX page_timestamp(rev_page, rev_timestamp)")
    print "Generating index for rev_user and rev_timestamp...\n"
    dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)")
    print "Generating index for rev_user_text and rev_timestamp...\n"
    dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)")
    dbaccess.close_Connection(acceso[0])
    print "Database ready for quantitative analysis...\n"
    print "Let's go on... Cross your fingers... ;-) \n\n\n"
    return success
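## Illustrative sketch (not the original implementation): the os.system()
## pipeline above can be expressed with subprocess, so the 7za and parser exit
## codes are checked separately and no shell quoting is involved. The helper
## name decompress_pipe is hypothetical; the flags match command_7z above.
import subprocess

def decompress_pipe(filename, program, user, passwd, dbname, logname):
    """Stream 'dumps/<filename>' through 7za into the SAX parser."""
    p7z = subprocess.Popen(["7za", "e", "-so", "dumps/" + filename],
                           stdout=subprocess.PIPE)
    parser = subprocess.Popen(["python", program, "-u", user, "-p", passwd,
                               "-d", dbname, "--log", logname],
                              stdin=p7z.stdout)
    p7z.stdout.close()  # let 7za receive SIGPIPE if the parser dies early
    rc_parser = parser.wait()
    rc_7z = p7z.wait()
    return rc_parser or rc_7z  # 0 only if both stages succeeded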
def __init__(self, conf, language="furwiki"):
    """
    Creates multiple views providing a convenient interface to access quantitative data.
    It also generates the tables and views needed to store intermediate results, so that
    other methods can later store data directly.
    """
    self.conf = conf
    self.language = language
    ##List of namespaces to analyse. We have added new special namespaces (e.g. subsets of main)
    self.nspaces=["all","ns0","articles","redirects","cur_redirects","cur_stubs","stubs","talk",\
    "pageUser", "userTalk","meta", "metaTalk", "image", "imageTalk", "mediawiki",\
    "mediawikiTalk", "template", "templateTalk", "help", "helpTalk", "category", "categoryTalk"]
    ##Some handy dictionaries to work with time intervals in the private methods below
    self.type_interval_columns={"days":"day, year", "weeks":"week, year", "months":"month, year",\
    "quarters":"quarter, year", "years":"year"}
    self.type_interval_select={"days":"DAYOFYEAR(rev_timestamp) AS day, YEAR(rev_timestamp) AS year ",\
    "weeks":"WEEK(rev_timestamp,1) AS week, YEAR(rev_timestamp) AS year ",\
    "months":"MONTH(rev_timestamp) AS month, YEAR(rev_timestamp) AS year ",\
    "quarters":"QUARTER(rev_timestamp) AS quarter, YEAR(rev_timestamp) AS year ",\
    "years":"YEAR(rev_timestamp) AS year "}
    self.type_interval_group={"days":"year, day", "weeks":"year, week", "months":"year, month",\
    "quarters":"year, quarter", "years":"year"}
    ## Get new DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
    "wx_"+self.language+"_"+self.conf.dumptype)
    ## Delete previous versions of the views
    for nspace in self.nspaces:
        dbaccess.dropView(self.acceso[1], nspace + "_" + self.language)
    ## Create updated versions of the views from the revision table
    #View summarizing all info for every revision (linking with info from table page)
    dbaccess.createView(self.acceso[1], view="all_"+self.language,\
    columns="rev_id, page_id, rev_len, page_ns, page_len, is_redirect, author, author_text,"+\
    " rev_timestamp, rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_namespace, page_len, rev_is_redirect,"+\
    " rev_user, rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id")
    #View summarizing info regarding pages in namespace=0 (including articles, stubs and redirects)
    dbaccess.createView(self.acceso[1], view="ns0_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, is_redirect, author, author_text,"+\
    " rev_timestamp, rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, page_is_redirect, rev_user,"+\
    " rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id"+\
    " AND page_namespace=0")
    #View summarizing info for articles (excluding pages that currently are redirects or stubs)
    dbaccess.createView(self.acceso[1], view="articles_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
    "rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text,"+\
    " rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND page_namespace=0 AND "+\
    "page_is_redirect=0 AND page_is_stub=0")
    #View with info only for redirects (pages that were redirects when that revision was made)
    dbaccess.createView(self.acceso[1], view="redirects_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
    "rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
    "page_namespace=0 AND rev_is_redirect=1")
    #View with info only for current redirects
    dbaccess.createView(self.acceso[1], view="cur_redirects_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
    "rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
    "page_namespace=0 AND page_is_redirect=1")
    #View with info only for revisions of stub pages (pages that were stubs when that revision was made)
    dbaccess.createView(self.acceso[1], view="stubs_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
    " rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
    " page_namespace=0 AND rev_is_stub=1")
    #View with info only for revisions of current stub pages
    dbaccess.createView(self.acceso[1], view="cur_stubs_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
    " rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
    " page_namespace=0 AND page_is_stub=1")
    #From this point on, automatically create views for the pages in the remaining MediaWiki namespaces
    for nspace, nsnum in zip(self.nspaces[7:], range(1, 16)):
        dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language,\
        columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"+\
        " rev_parent_id",\
        query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
        "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
        " page_namespace="+str(nsnum))
    #View summarizing the distribution of pages among namespaces
    dbaccess.dropView(self.acceso[1], "nspaces_" + self.language)
    dbaccess.createView(self.acceso[1], view="nspaces_"+self.language, columns="namespace, pages_in_nspace",\
    query="SELECT page_namespace, COUNT(*) FROM page GROUP BY page_namespace")
    ## Intermediate views for the minimum timestamp of every page [anonymous and logged users],
    ## and other useful intermediate views regarding page evolution
    for nspace in self.nspaces:
        dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
        "_page_min_timestamp_logged")
        dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
        "_page_min_timestamp_logged", columns="page_id, rev_id, author, rev_timestamp",\
        query="SELECT page_id, rev_id, author, MIN(rev_timestamp) FROM "+\
        nspace+"_"+self.language+" WHERE author!=0 GROUP BY page_id")
        dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
        "_page_min_timestamp_annons")
        dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
        "_page_min_timestamp_annons",\
        columns="page_id, rev_id, author_text, rev_timestamp",\
        query="SELECT page_id, rev_id, author_text, MIN(rev_timestamp) FROM "+\
        nspace+"_"+self.language+" WHERE author=0 GROUP BY page_id")
        dbaccess.dropView(self.acceso[1], nspace + "_" + self.language + "_list_months")
        dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+"_list_months",\
        columns="month, year", query="SELECT MONTH(rev_timestamp) as month, "+\
        "YEAR(rev_timestamp) as year"+\
        " FROM "+nspace+"_"+self.language+" GROUP BY year, month ORDER BY year, month")
        dbaccess.dropView(self.acceso[1], nspace + "_" + self.language + "_list_quarters")
        dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+"_list_quarters",\
        columns="quarter, year", query="SELECT QUARTER(rev_timestamp) as quarter, "+\
        "YEAR(rev_timestamp) as year FROM "+nspace+"_"+self.language+" GROUP BY year,"+\
        " quarter ORDER BY year, quarter")
    ## Close DB connection
    dbaccess.close_Connection(self.acceso[0])
def summary_evol(idiomas):
    """
    Creates some graphs summarizing the evolution in time of critical quantitative
    parameters for each language version to explore

    @type idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    ## WARNING! Please be careful when selecting values from tables storing the evolution
    ## in time of the number of articles, size, etc.
    ## You must always use a GROUP BY(pageCount, limitDate) clause, due to
    ## periods of inactivity that could generate duplicate entries in the graphics
    filenames=["page_dates.data", "page_Count_evol.data", "page_Len_Sum_log.data",
    "contribs_evol.data", "nspaces.data", "nspace_distrib.data", "diffArticles.data",
    "authors.data", "diff_authors_x_article.data", "authors_authors_per_pagelen.data",
    "pagelen_authors_per_pagelen.data"]
    filenames_out=["Tot_num_articles_absx_absy.png", "Tot_num_articles_absx_logy.png",
    "Tot_num_articles_logx_logy.png", "Tot_pagelensum_absx_absy.png",
    "Tot_pagelensum_absx_logy.png", "Tot_pagelensum_logx_logy.png",
    "Tot_contribs_absx_absy.png", "Tot_contribs_absx_logy.png", "Tot_contribs_logx_logy.png",
    "Diffs_articles_per_author.png", "Diffs_authors_per_article.png",
    "Diff_authors_against_page_len.png"]
    for idioma in idiomas:
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        #acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_pages")
        result=dbaccess.query_SQL(acceso[1], "pageCount, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        result2=dbaccess.query_SQL(acceso[1], "pageLenSum, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        result3=dbaccess.query_SQL(acceso[1], "contribs, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        resultnspace=dbaccess.query_SQL(acceso[1], "pages_nspace, namespace", "stats_nspace_"+idioma)
        diffArticlesNoann=dbaccess.query_SQL(acceso[1], "author, theCount", "stats_Article_NoAnnons_author_"+idioma)
        diffInitNoann=dbaccess.query_SQL(acceso[1], "author, theCount", "stats_Article_Init_NoAnnons_author_"+idioma)
        totRevperArticle=dbaccess.query_SQL(acceso[1], "page_id, theCount", "stats_Contrib_NoAnnons_page_id_"+idioma)
        diffAuthorperArticle=dbaccess.query_SQL(acceso[1], "page_id, theCount", "stats_Article_NoAnnons_page_id_"+idioma)
        dautxplen=dbaccess.query_SQL(acceso[1], "page_len, authors", "stats_pagelen_difauthors_"+idioma)
        dbaccess.close_Connection(acceso[0])
        data=__tup_to_list(result, 1)
        dates_x=data.pop()
        page_Count=data.pop()
##        if idioma=="frwiki":
        data2=__tup_to_list(result2, 2)
        dates_x=data2.pop()
        dates_x.pop(0)
        dates_x.pop(0)
        page_Len_Sum=data2.pop()
        page_Len_Sum.pop(0)
        page_Len_Sum.pop(0)
##        else:
##            data2=__tup_to_list(result2, 1)
##            dates_x=data2.pop()
##            page_Len_Sum=data2.pop()
        data3=__tup_to_list(result3, 1)
        dates_x=data3.pop()
        contribs=data3.pop()
        datanspace=__tup_to_list(resultnspace)
        namespaces=datanspace.pop()
        pages_nspace=datanspace.pop()
        dataDiffArticlesNoann=__tup_to_list(diffArticlesNoann)
        diffArticles=dataDiffArticlesNoann.pop()
        authors=dataDiffArticlesNoann.pop()
        dataDiffInitNoann=__tup_to_list(diffInitNoann)
        diffInitArticles=dataDiffInitNoann.pop()
        authors=dataDiffInitNoann.pop()
        datatotRevperArticle=__tup_to_list(totRevperArticle)
        totalRev=datatotRevperArticle.pop()
        article=datatotRevperArticle.pop()
        datadiffAuthorperArticle=__tup_to_list(diffAuthorperArticle)
        diffAuthors=datadiffAuthorperArticle.pop()
        article=datadiffAuthorperArticle.pop()
        datadautxplen=__tup_to_list(dautxplen)
        autxplen=datadautxplen.pop()
        lenautxplen=datadautxplen.pop()
        ## Put the query results into the data list in the proper order,
        ## matching the file names we pass to the GNU R script summary_evol.R
        for i in range(len(page_Len_Sum)):
            if page_Len_Sum[i]!=0:
                page_Len_Sum[i]=math.log10(page_Len_Sum[i])
        dataList=[dates_x, page_Count, page_Len_Sum, contribs, namespaces, pages_nspace,
        diffArticles, authors, diffAuthors, autxplen, lenautxplen]
        for filename, data in zip(filenames, dataList):
            if(filename.find('date')!=-1):
                __makeDatesFile(idioma, filename, data)
            else:
                __makeDataFile(idioma, filename, data)
        ######################################
        #Pass the data filenames to the GNU R script through a file
        f=open("./data/summary_files_names.data",'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/data/"+line+"\n")
        f.close()
        #Idem with the graphic output filenames
        f=open("./data/summary_files_out.data",'w')
        for line in filenames_out:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
        #CALL THE GNU R SCRIPT summary_evol.R
        succ=os.system("R --vanilla < ./summary_evol.R > debug_R")
        if succ==0:
            print "Function summary_evol successfully executed for language... "+idioma
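## Illustrative sketch (an inference from the call sites above, not the actual
## helper): __tup_to_list() appears to transpose the rows returned by query_SQL
## into per-column lists, so callers can .pop() columns off the end; its numeric
## second argument is not reproduced here. _transpose_rows is a hypothetical
## stand-in for illustration only.
def _transpose_rows(rows):
    """[(a1, b1), (a2, b2), ...] -> [[a1, a2, ...], [b1, b2, ...]]"""
    return [list(col) for col in zip(*rows)]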
def measuring(idiomas):
    """
    Creates some graphs following the research presented by Jakob Voss in his paper
    Measuring Wikipedia (ISSI 2005)

    @type idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    ## Generates some graphics reproducing those in the Measuring Wikipedia article
    filenames=["total_edits.data", "noannons_edits.data", "annon_edits.data",
    "authors_per_article_desc.data", "articles_per_logged_author_desc.data",
    "articles_per_anonymous_author_desc.data"]
    filenames_out=["total_edits_per_author.png", "total_edits_per_noannon_author.png",
    "total_edits_per_annon_author.png", "diff_authors_per_article_descending.png",
    "diff_articles_per_logged_author_descending.png",
    "diff_articles_per_anonymous_author_descending.png"]
    for idioma in idiomas:
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
##        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_pages")
        #Combined evolution graphics
        #ALL THESE GRAPHICS ARE ALREADY GENERATED BY ERIK ZACHTE'S OFFICIAL PERL SCRIPTS
        #Database size
        #Total number of words
        #Total number of internal links
        #Number of articles (including redirects)
        #Number of active wikipedians (more than 5 contributions in a given month)
        #Number of very active wikipedians (more than 100 contributions in a given month)
        #Namespace size
        #OK, it is generated in the summary_evol() method
        #Evolution in time of article size (histogram)
        #IDEA: Download page.sql files for a language for each semester period
        #Number of distinct authors per article (descending sorted graphic)
        #Already generated in summary_evol; WE ONLY NEED TO SORT AND ADJUST IN GNU R
        diffAuthorperArticle=dbaccess.query_SQL(acceso[1], "page_id, theCount", "stats_Article_NoAnnons_page_id_"+idioma)
        #Number of distinct articles per author (descending sorted graphic)
        #Idem as in the previous case
        diffArticlesNoann=dbaccess.query_SQL(acceso[1], "author, theCount", "stats_Article_NoAnnons_author_"+idioma)
        diffArticlesAnn=dbaccess.query_SQL(acceso[1], "author_text, theCount", "stats_Article_Annons_author_text_"+idioma)
        data=__tup_to_list(diffAuthorperArticle)
        lisdiffauthorartic=data.pop()
        data=__tup_to_list(diffArticlesNoann)
        lisdiffarticleaut=data.pop()
        data=__tup_to_list(diffArticlesAnn, 2)
        lisdiffarticleannon=data.pop()
        ## Sort the results so they can be fitted to a power law
        lisdiffauthorartic.sort(reverse=True)
        lisdiffarticleaut.sort(reverse=True)
        lisdiffarticleannon.sort(reverse=True)
        #Number of edits per author
        #Retrieve results from the database
        #We have already created GINI graphics for this parameter
        #DATABASE TABLES WITH THE EVOLUTION IN TIME OF THIS PARAMETER ARE ALSO AVAILABLE
        tcnoann=dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_NoAnnons_author_"+idioma)
        tcauthor=dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_author_"+idioma)
        tc_ann=dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_Annons_author_text_"+idioma)
        data=__tup_to_list(tcnoann)
        listcnoann=data.pop()
        data=__tup_to_list(tcauthor)
        listcauthors=data.pop()
        #BTW, we are also obtaining (but not using) the IP addresses of anonymous users
        data=__tup_to_list(tc_ann, 2)
        listcann=data.pop()
        ## Arrange the results in decreasing order to fit them to a power law
        listcnoann.sort(reverse=True)
        listcauthors.sort(reverse=True)
        listcann.sort(reverse=True)
        #Ingoing and outgoing number of links per article
        #STILL TO BE DEVELOPED
        #WE FIRST NEED TO IDENTIFY THE LINKS FOR A GIVEN ARTICLE IN THE DATABASE
        #THE LINKS TABLES MAY HELP, but in these dump versions they are all empty!!!
        #BROKEN LINKS also need to be considered
        dbaccess.close_Connection(acceso[0])
        dataList=[listcauthors, listcnoann, listcann, lisdiffauthorartic, lisdiffarticleaut, lisdiffarticleannon]
        for filename, data in zip(filenames, dataList):
            if(filename.find('date')!=-1):
                __makeDatesFile(idioma, filename, data)
            else:
                __makeDataFile(idioma, filename, data)
        #Pass the data filenames to the GNU R script through a file
        f=open("./data/measuring_files_names.data",'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/data/"+line+"\n")
        f.close()
        #Idem with the graphic output filenames
        f=open("./data/measuring_files_out.data",'w')
        for line in filenames_out:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
        #CALL THE GNU R SCRIPT measuring_Wiki.R
        succ=os.system("R --vanilla < ./measuring_Wiki.R > debug_R")
        if succ==0:
            print "Function measuring_Wiki successfully executed for language... "+idioma
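## Illustrative sketch: the descending sorts above prepare rank-frequency data for
## the power-law fits performed in GNU R. Expressed in Python: if a power law holds,
## log10(rank) vs. log10(value) is roughly linear. The helper name is hypothetical.
import math

def rank_frequency(values):
    """Return (log10 rank, log10 value) pairs for positive values, sorted descending."""
    ordered=sorted(values, reverse=True)
    return [(math.log10(rank), math.log10(val))
            for rank, val in enumerate(ordered, 1) if val > 0]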
def bots(self):
    """
    Preprocessing actions with bots data
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #TABLE revs_bots (revisions made by officially identified bots, by year, month)
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists revs_bots")
        dbaccess.raw_query_SQL(self.access[1],"create table revs_bots as select year(rev_timestamp) "+\
        "theyear, month(rev_timestamp) themonth, count(*) num_revs from revision where rev_user!=0 "+\
        "and rev_user in (select ug_user from user_groups where ug_group='bot') group by "+\
        "year(rev_timestamp), month(rev_timestamp) order by year(rev_timestamp), month(rev_timestamp)")
        #TABLE revs_logged (revisions made by logged authors, by year, month)
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists revs_logged")
        dbaccess.raw_query_SQL(self.access[1],"create table revs_logged as select year(rev_timestamp) "+\
        "theyear, month(rev_timestamp) themonth, count(*) num_revs from revision where rev_user!=0 "+\
        "group by year(rev_timestamp), month(rev_timestamp) order by year(rev_timestamp), month(rev_timestamp)")
        #TABLE revs_all (revisions made by all authors, by year, month)
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists revs_all")
        dbaccess.raw_query_SQL(self.access[1],"create table revs_all as select year(rev_timestamp) "+\
        "theyear, month(rev_timestamp) themonth, count(*) num_revs from revision "+\
        "group by year(rev_timestamp), month(rev_timestamp) order by year(rev_timestamp), month(rev_timestamp)")
        dbaccess.close_Connection(self.access[0])
    #FILE perc-bots-all-revs.dat: % of all revisions due to bots
    file=open("overall/data/perc-bots-all-revs.dat",'w')
    file.write("year\tmonth\tperc_revs\tlang\n")
    file.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Obtain the % of the total no. of revs due to bots, by year, month
        self.perc_revs=dbaccess.raw_query_SQL(self.access[1], "select bot.theyear, bot.themonth, "+\
        "(bot.num_revs/tot.num_revs)*100 perc_revs from revs_bots as bot, revs_all as tot "+\
        "where bot.theyear=tot.theyear and bot.themonth=tot.themonth;")
        dbaccess.close_Connection(self.access[0])
        #Write data to file
        file=open("overall/data/perc-bots-all-revs.dat",'a')
        for item in self.perc_revs:
            file.write(str(int(item[0]))+"\t"+str(int(item[1]))+"\t"+\
            str(float(item[2]))+"\t"+self.language+"\n")
        file.close()
    #FILE perc-bots-logged-revs.dat: % of revisions by logged users due to bots
    file=open("overall/data/perc-bots-logged-revs.dat",'w')
    file.write("year\tmonth\tperc_revs\tlang\n")
    file.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Obtain the % of the no. of revs by logged editors due to bots, by year, month
        self.perc_revs=dbaccess.raw_query_SQL(self.access[1], "select bot.theyear, bot.themonth, "+\
        "(bot.num_revs/logged.num_revs)*100 perc_logged_revs from revs_bots as bot, "+\
        "revs_logged as logged where bot.theyear=logged.theyear and bot.themonth=logged.themonth;")
        dbaccess.close_Connection(self.access[0])
        #Write data to file
        file=open("overall/data/perc-bots-logged-revs.dat",'a')
        for item in self.perc_revs:
            file.write(str(int(item[0]))+"\t"+str(int(item[1]))+"\t"+\
            str(float(item[2]))+"\t"+self.language+"\n")
        file.close()
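## Note (a possible refinement, not in the original): the implicit inner join above
## drops months in which bots made no revisions, so those months are absent from the
## .dat files rather than appearing as 0%. A LEFT JOIN variant (a sketch, assuming
## the same revs_all/revs_bots tables) would keep them:
perc_bots_left_join=("select tot.theyear, tot.themonth, "
                     "coalesce((bot.num_revs/tot.num_revs)*100, 0) perc_revs "
                     "from revs_all tot left join revs_bots bot "
                     "on bot.theyear=tot.theyear and bot.themonth=tot.themonth")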
            self.pageinsert+=";"
            print self.pageinsert.encode('utf-8')
        elif self.options.monitor:
            #Retry the batch insert until it succeeds
            while 1:
                try:
                    dbaccess.raw_query_SQL(self.acceso[1], self.pageinsert.encode('utf-8'))
                except (Exception), e:
                    print e
                else:
                    break
            #Reset status vars
            self.pageinsertrows=0
            self.pageinsertsize=0
        ########IF WE USE MONITOR MODE, CLOSE THE DB CONNECTION
        if self.options.monitor and not self.options.fileout and not self.options.streamout:
            dbaccess.close_Connection(self.acceso[0])
        ################################################
        #Compute the total time consumed and display the end message
        self.timeCheck=datetime.datetime.now()
        self.timeDelta=self.timeCheck-self.start
        print >> sys.stderr, "\n"
        print >> sys.stderr, "File successfully parsed..."
        print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)" % (self.page_num,\
        float(self.page_num)/self.timeDelta.seconds, self.rev_num, float(self.rev_num)/self.timeDelta.seconds)

##Main zone
if __name__ == '__main__':
    usage = "usage: %prog [options]"
    parserc = OptionParser(usage)
    parserc.add_option("-t", "--stubth", dest="stubth", type="int", metavar="STUBTH", default=256,\
    help="Max. size in bytes to consider an article as a stub [default: %default]")
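## Usage sketch (flags taken from the -t option declared above and from the
## pipeline built in decompress(); the dump file name follows the template used
## there and is illustrative only):
##
##   7za e -so dumps/furwiki-latest-pages-meta-history.xml.7z | \
##       python dump_sax_research.py -u <user> -p <passwd> -d wx_furwiki_research \
##       --log furwiki.log -t 256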