def __init__(self, language="furwiki", dumptype="research", msqlu="", msqlp=""): """ It receives the language and dumptype to download It returns an int =0 if the DB was successfully set up, =-1 if there was an error """ self.language=language #language to download self.dumptype=dumptype #type of dump self.files=["pages-meta-history.xml.7z", "redirect.sql.gz","page_restrictions.sql.gz",\ "user_groups.sql.gz", "logging.sql.gz", "interwiki.sql.gz", "langlinks.sql.gz", "externallinks.sql.gz",\ "templatelinks.sql.gz", "imagelinks.sql.gz", "categorylinks.sql.gz", "pagelinks.sql.gz", "oldimage.sql.gz",\ "image.sql.gz"] self.filename="" self.filenameTemplate=string.Template("""$language-latest-$file""") #dump's filename in Wikimedia's server #URL to download the file self.urld="" self.urldTemplate=string.Template("""http://download.wikimedia.org/$language/latest/$language-latest-$file""") if (msqlu=="" or msqlp==""): print "Error initializing DB dump object. You must provide a valid MySQL username and password" else: self.msqlu=msqlu #MySQL username for accessing and editing the DB self.msqlp=msqlp #MySQL password #We can manage two different types of dumps, stubs (without the text of every revision) and pages #(containing the text of every revision) #self.urld="http://download.wikimedia.org/"+self.language+"/latest/"+\ #self.language+"-latest-pages-meta-history.xml.7z" #File to download #patterns for files #http://download.wikimedia.org/furwiki/20060921/furwiki-20060921-pages-meta-history.xml.7z #http://download.wikimedia.org/amwiki/20061014/amwiki-20061014-stub-meta-history.xml.gz #Create /dumps directory if it does not exist yet directories=os.listdir("./") if ("dumps" not in directories): os.makedirs("./dumps") ## Initialize DB in MySQL: create DB and tables definitions print "Initializing DB for --> "+ self.language +"\n" #Retrieving connection and cursor to access the DB acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp,"mysql") 
dbaccess.createDB_SQL(acceso[1],"wx_"+self.language+"_"+self.dumptype) if self.dumptype=="research": command="mysql -u "+self.msqlu+" -p"+self.msqlp+" " +\ "wx_"+self.language+"_"+self.dumptype+" < tables_research.sql > debug_mysql.log" elif self.dumptype=="standard": command="mysql -u "+self.msqlu+" -p"+self.msqlp+" " +\ "wx_"+self.language+"_"+self.dumptype+" < tables_standard.sql > debug_mysql.log" ok=os.system(command) if ok == 0: acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp,\ "wx_"+self.language+"_"+self.dumptype) dbaccess.raw_query_SQL(acceso[1], "alter table page max_rows = 200000000000 avg_row_length = 50") dbaccess.raw_query_SQL(acceso[1], "alter table revision max_rows = 200000000000 avg_row_length = 50") if self.dumptype=="standard": dbaccess.raw_query_SQL(acceso[1], "alter table text max_rows = 200000000000 avg_row_length = 50") dbaccess.close_Connection(acceso[0]) else: print "Error! There was a problem initializing definitions for DB tables" dbaccess.close_Connection(acceso[0])
def __init__(self, options):
    """
    State holder for parsing a full-history dump with text mining.

    Sets up per-page/per-revision bookkeeping, the regexps used to
    extract wiki markup features from revision text (links, sections,
    highlighted words, translations), and the buffers/counters used to
    build extended SQL insert statements.
    """
    self.options=options
    #Open a DB connection only when monitoring and neither file nor stream output was requested
    if self.options.monitor and (not self.options.fileout and not self.options.streamout):
        self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,\
        self.options.user, self.options.passwd, self.options.database)
    self.nspace_dict={}     #presumably namespace code -> title mapping from the dump header -- verify
    self.codens=''          #code of the namespace currently being handled
    self.page_dict={}       #attributes of the page being parsed
    self.rev_dict = {}      #attributes of the revision being parsed
    self.stack=[]           #stack of currently open XML elements
    self.current_text = ''  #character data accumulated for the current element
    self.current_elem=None  #element currently being processed
    self.revfile=None       #output file for revision rows (file output mode)
    self.pagefile=None      #output file for page rows (file output mode)
    self.page_num = 0       #pages processed so far
    self.rev_num=0          #revisions processed so far
    self.last_page_len=0    #length of the last page seen
    self.rev_count=0        #revision counter within the current page
    self.prior_rev_id='NULL'    #id of the previous revision ('NULL' literal for SQL)
    self.isRedirect='0'     #SQL flag: current page is a redirect
    self.isStub='0'         #SQL flag: current page is a stub
    self.isMinor='0'        #SQL flag: current revision is minor
    self.inlinks=None # internal links
    self.outlinks=None # external links
    self.trans=None # translations to other language editions
    self.sections=None # sections (no matter their level)
    self.highwords=None #highlighted words (bold/italics/bold+italics)
    self.special=None #rev_text, special links filtered out
    ########################################
    ##REGEXPS
    ########################################
    self.pathighlight=r"\'\'+" #Regexp matching bold/italics/bold+italics wikitags
    self.pathighwords=r"\'\'+.*\'\'+" #Regexp for highlighted words
    self.pathtml=r"\<[^\>]+\>" #Regexp matching HTML tags
    self.patunicode=r"\&\w+\;|\&\#\d+\;|[\xc0-\xf7][\x80-\xbf]+" #Regexp matching unicode chars
    self.patspecial=r"\[\[[^\:\]]+\:[^\]]*\]\]" #Regexp matching special inlinks (image/category/interwiki)
    self.patinlink=r"\[\[.*\]\]" #Regexp matching inlinks (after filtering image/category/interwiki links)
    self.patoutlink=r"\s\[[^\[\]]*\]|http[s]?://" #Regexp matching outlinks
    self.patsection=r"\=\=+[\s]*[^\=]*[\s]*\=\=+" #Regexp matching section titles
    self.pattrans=r"\[\[..[.]?:" #Regexp matching translation links
    self.patitemize=r"\n\**" #Regexp matching itemize bullets and line branches
    self.patdumb=r"\)\(" #A rapid solution to concatenate tuples in special instert strings
    self.fileErrPath="./errors.log" #log file for parsing errors
    #TODO: Solve lookup in global scope if the special item did not show up in any previous revision
    #of the whole dump (maybe lookup query to DB??)
    #Caches mapping already-seen items to their IDs, plus per-type ID counters
    self.highwords_dict={}; self.special_dict={}; self.inlinks_dict={}; self.outlinks_dict={}; self.trans_dict={}
    self.highwords_id=1; self.special_id=1; self.inlinks_id=1; self.outlinks_id=1
    #Pending rows for the per-revision relation tables
    self.trans_id=1; self.highwords_rev_insert=[]; self.special_rev_insert=[]; self.inlinks_rev_insert=[]
    self.outlinks_rev_insert=[]; self.trans_rev_insert=[]; self.revinsert=''
    self.pageinsert=''      #extended insert statement under construction for page rows
    self.revinsertrows=0    #rows accumulated in the revision extended insert
    self.revinsertsize=0    #size of the revision extended insert
    self.pageinsertrows=0   #rows accumulated in the page extended insert
    self.pageinsertsize=0   #size of the page extended insert
    self.start=datetime.datetime.now()  #start timestamp for progress monitoring
    self.timeCheck=None
    self.timeDelta=None
def __init__(self, options):
    """
    Initialize the parser state for a page/revision/text dump run.

    A DB connection is opened and stored in self.acceso only when
    monitoring is requested without file or stream output.
    """
    self.options = options
    self.fileErrPath = "./errors.log"
    # Connect to the DB only in monitor mode with no file/stream output
    if options.monitor and not (options.fileout or options.streamout):
        self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,
                                              self.options.user, self.options.passwd,
                                              self.options.database)
    # Parsing state containers
    self.nspace_dict = {}
    self.page_dict = {}
    self.rev_dict = {}
    self.stack = []
    self.codens = ''
    self.current_text = ''
    self.current_elem = None
    self.revfile = None
    self.pagefile = None
    # Numeric counters, all starting at zero
    for counter in ('page_num', 'rev_num', 'last_page_len', 'rev_count',
                    'revinsertrows', 'revinsertsize', 'pageinsertrows',
                    'pageinsertsize', 'textinsertrows', 'textinsertsize'):
        setattr(self, counter, 0)
    # Per-revision SQL flags
    self.prior_rev_id = 'NULL'
    self.isRedirect = self.isStub = self.isMinor = '0'
    # Extended-insert buffers
    self.revinsert = self.pageinsert = self.textinsert = ''
    # Timing for progress reports
    self.start = datetime.datetime.now()
    self.timeCheck = None
    self.timeDelta = None
def overall(self):
    """
    Preprocessing views/tables for redirect and talk page activity.

    NOTE(review): the previous docstring ("evolution of page length
    over time") did not match the code, which builds redirect/talk
    views and yearly max_rev_talk_YYYY tables per language.
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #VIEW page_redirect: redirect pages in the main namespace
        dbaccess.raw_query_SQL(self.access[1], "create or replace view page_redirect as "+\
        "(select page_id from page where page_namespace=0 and page_is_redirect=1)")
        #VIEW rev_redirect: revisions belonging to redirect pages
        dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_redirect as ("+\
        "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
        "(select page_id from page_redirect))")
        #VIEW page_talk: pages in the talk namespace
        dbaccess.raw_query_SQL(self.access[1], "create or replace view page_talk as "+\
        "(select page_id from page where page_namespace=1)")
        #VIEW rev_talk: revisions belonging to talk pages
        dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_talk as ("+\
        "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
        "(select page_id from page_talk))")
        #First year with any revision in this wiki
        self.minyear=dbaccess.raw_query_SQL(self.access[1],"select min(year(rev_timestamp)) from revision")
        #NOTE(review): the upper bound 2009 is hardcoded
        self.years=range(int(self.minyear[0][0])+1, 2009)
        #TABLES max_rev_talk_YYYY: latest talk revision of each page prior to year YYYY
        for self.year in self.years:
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists max_rev_talk_"+str(self.year))
            dbaccess.raw_query_SQL(self.access[1],"create table max_rev_talk_"+str(self.year)+\
            " as (select max(rev_id) as max_id, rev_page from rev_talk "+\
            "where year(rev_timestamp)<"+str(self.year)+" group by rev_page)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add primary key (max_id)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add index (rev_page)")
        dbaccess.close_Connection(self.access[0])
def performAnalysis(self):
    """
    Run the analysis over this language's stub DB and plot results.

    Opens a connection, builds the plotting helper objects, draws the
    authors-vs-Gini graphic from hardcoded per-language data points,
    and closes the connection.
    NOTE(review): MySQL credentials ("root"/"phoenix") are hardcoded.
    """
    ## Get DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", self.language + "_stub")
    ## Singleton objects to plot graphics in the class methods
    self.simpleGraph = graphic2D(self.filePath)
    ## self.multiGraph=graphic2Dmulti(self.filePath)
    ## self.giniGraph=graphicGini(self.filePath)
    ## self.splitHistGraph=graphicSplitHist(self.filePath, self.dataPath)
    self.graph3D = graphic3D(self.filePath, self.dataPath)
    print "Starting analysis on DB " + self.language + "_stub\n"
    ## self.UserNumContribsGroup(self.acceso[1])
    ## self.UserNumContribsGenerations()
    #Hardcoded (Gini coeff. %, log10 authors) samples for the top-ten Wikipedias;
    #sorted so the curve is drawn in ascending Gini order
    authorsGini = [( 95.9677, 4.046, ), (95.7015, 4.304), (96.2223, 4.363), (95.7104, 4.395),
    (96.3844, 4.407), (92.4691, 4.528), (95.0077, 4.603), (95.0071, 4.7298),
    (93.785, 5.051), (93.6076, 5.888)]
    authorsGini.sort()
    ##authorsGini=[(4.046,95.9677),(4.304,95.7015),(4.363,96.2223),(4.395,95.7104),(4.407,96.3844),(4.528,92.4691),(4.603,95.0077),(4.7298,95.0071),(5.051,93.785),(5.888,93.6076)]
    self.simpleGraph.createGraphic(
        "authors-Gini",
        (authorsGini, ),
        "Gini coeff. (%)",
        "Number of different authors (log)",
        "Gini coeff. vs. number of registered authors in the top-ten Wikipedias."
    )
    ## Close DB connection
    dbaccess.close_Connection(self.acceso[0])
    print "This is finished"
def prepro_pagelen(self):
    """
    Preprocessing tables for evolution of page length over time
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #VIEW page_main_nored (pages in main nspace excluding redirects)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view page_main_nored as "+\
        "(select page_id from page where page_namespace=0 and page_is_redirect=0)")
        #VIEW rev_main_nored (revisions in main nspace in all pages, excluding redirects)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_main_nored as ("+\
        "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
        "(select page_id from page_main_nored))")
        #TABLES max_rev_YYYY (latest revision for each page in main nspace, up to year YYYY)
        self.minyear=dbaccess.raw_query_SQL(self.access[1],"select min(year(rev_timestamp)) from revision")
        #NOTE(review): the upper bound 2009 is hardcoded
        self.years=range(int(self.minyear[0][0])+1, 2009)
        for self.year in self.years:
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists max_rev_"+str(self.year))
            dbaccess.raw_query_SQL(self.access[1],"create table max_rev_"+str(self.year)+\
            " as (select max(rev_id) as max_id, rev_page from rev_main_nored "+\
            "where year(rev_timestamp)<"+str(self.year)+" group by rev_page)")
            #Index the per-year table for fast joins on max_id and rev_page
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_"+str(self.year)+" add primary key (max_id)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_"+str(self.year)+" add index (rev_page)")
        dbaccess.close_Connection(self.access[0])
def __init__(self, options):
    """
    Initialize the state used while parsing a logging dump.

    When monitoring is enabled and no file or stream output was
    requested, a DB connection is opened and kept in self.acceso.
    """
    self.options = options
    # Connect to the DB only in monitor mode with no file/stream output
    if self.options.monitor and not (self.options.fileout or self.options.streamout):
        self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,
                                              self.options.user, self.options.passwd,
                                              self.options.database)
    # Parsing state
    self.log_dict = {}
    self.nspace_dict = {}
    self.stack = []
    self.codens = ''
    self.current_text = ''
    self.current_elem = None
    #self.log_count=0
    self.log_num = 0
    # Error log file named after the target database
    self.fileErrPath = "./errors" + self.options.database + ".log"
    # Extended-insert buffer for the logging table, with its counters
    self.loginsert = ''
    self.loginsertrows = 0
    self.loginsertsize = 0
    # Timing for progress reports
    self.start = datetime.datetime.now()
    self.timeCheck = None
    self.timeDelta = None
def generalStatistics(self):
    """
    Build the views with general statistics and overall information.
    """
    ## Computes the views containing general statistics and overall information:
    ## For all namespaces (official and artificial):
    ################
    ## View _overall_statistics1_months, which includes
    ## Total num of pages with at least one edit in that month, total number of contribs,
    ## total num of users who made at least 1 edit in that month (alive_users)
    ####################################
    ## Parameters from Wikistats by Erik Zachte
    ####################################
    ## Wikipedians: contributors, active wikipedians, very active wikipedians
    ## Articles: (WARNING: readable contents are not being filtered out yet)
    ## new articles per day, edits per article, bytes per article, % of articles over 0.5k,
    ## % of articles over 2k
    ## Total size of contribs per month
    ## Size of pages and number of different authors who have edited them
    ####################################
    ## Get new DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
    "wx_"+self.language+"_"+self.conf.dumptype)
    ## General statistics: one pass per namespace table
    for nspace in self.nspaces:
        self.__gral_stats(self.acceso[1], nspace+"_"+self.language)
    ## Close DB connection
    dbaccess.close_Connection(self.acceso[0])
def infoPages(self):
    """
    Generate per-article statistics for every namespace.

    Computes total revisions per page, distinct editors per page, and
    both measures broken down by month, quarter and week.
    """
    ## Open a fresh connection to this language's dump DB
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,
                                          "wx_" + self.language + "_" + self.conf.dumptype)
    cursor = self.acceso[1]
    # Per-page statistics are keyed on page_id
    stats_target = "page_id"
    # Time slices of interest (days/years also supported upstream)
    time_slices = ["months", "quarters", "weeks"]
    # Total num of revisions per page
    for ns in self.nspaces:
        self.__total_rev(cursor, ns + "_" + self.language, stats_target)
    # Total number of different editors per page
    for ns in self.nspaces:
        self.__total_rev_diff(cursor, ns + "_" + self.language, stats_target)
    # Revisions per page for each time slice
    for ns in self.nspaces:
        for slice_name in time_slices:
            self.__total_rev_time(cursor, slice_name, ns + "_" + self.language, stats_target)
    # Distinct editors per page for each time slice
    for ns in self.nspaces:
        for slice_name in time_slices:
            self.__total_rev_diff_time(cursor, slice_name, ns + "_" + self.language, stats_target)
    # Release the connection
    dbaccess.close_Connection(self.acceso[0])
def generalStatistics(self):
    """
    Build the views with general statistics and overall information.

    For every namespace (official and artificial) this runs the general
    statistics pass, producing the _overall_statistics1_months view:
    total pages with at least one edit in each month, total number of
    contributions, and users with at least one edit in that month
    (alive users). Also covers Wikistats-style parameters (contributors,
    active/very active wikipedians, new articles per day, edits and
    bytes per article, share of articles over 0.5k/2k, total size of
    contributions per month, page sizes and distinct-author counts).
    NOTE(review): readable contents are not being filtered out yet.
    """
    ## Open a fresh connection to this language's dump DB
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,
                                          "wx_" + self.language + "_" + self.conf.dumptype)
    cursor = self.acceso[1]
    ## One statistics pass per namespace table
    for ns in self.nspaces:
        self.__gral_stats(cursor, ns + "_" + self.language)
    ## Release the connection
    dbaccess.close_Connection(self.acceso[0])
def contributions(idiomas):
    """
    Create some graphs and files with statistical results about authors contributions
    @type idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    for idioma in idiomas:
        #NOTE(review): MySQL credentials are hardcoded here
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        #acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_pages")
        #dbaccess.query_SQL(acceso[1], "page_id, page_namespace", "page", where="page_namespace=0", create="pag_namespace")
        #Contribution stats excluding anonymous authors
        tcnoann=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_NoAnnons_author_"+idioma)
        #Contribution stats for all authors
        tcauthor=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_author_"+idioma)
        #tc_ann=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_Annons_author_text_"+idioma)
        dbaccess.close_Connection(acceso[0])
        #__tup_to_list presumably returns [x values, y values]; pop order matters -- verify
        data=__tup_to_list(tcnoann)
        listay_tcnoann=data.pop()
        listax=data.pop()
        data=__tup_to_list(tcauthor)
        listay_tcauthor=data.pop()
        #NOTE(review): listax is overwritten here and never used afterwards
        listax=data.pop()
        #data=__tup_to_list(tc_ann)
        #listay_tc_ann=data.pop()
        #listax=data.pop()
        #Lorenz curves for the Gini graphics, rendered through R (rpy)
        #NOTE(review): no r.dev_off() here -- presumably handled inside __lorenz_Curve; verify
        r.png("graphics/"+idioma+"/gini_TContrib_NoAnn_"+idioma+".png")
        __lorenz_Curve(listay_tcnoann)
        r.png("graphics/"+idioma+"/gini_TContrib_"+idioma+".png")
        __lorenz_Curve(listay_tcauthor)
def infoAuthors(self):
    """
    Generate per-author statistics for every namespace.

    Computes, per namespace table: total revisions per author, distinct
    pages edited per author, pages started per author (the first
    revision of a page marks its start), and the same measures broken
    down by month, quarter and week.
    """
    ## Open a connection to this language's dump DB
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,
                                          "wx_" + self.language + "_" + self.conf.dumptype)
    cursor = self.acceso[1]
    ## Statistics are aggregated per author here
    stats_target = "author"
    ## Time slices of interest (days and years also possible upstream)
    time_slices = ["months", "quarters", "weeks"]
    # Number of total revisions per author ID
    for ns in self.nspaces:
        self.__total_rev(cursor, ns + "_" + self.language, stats_target)
    # Different articles edited per user
    for ns in self.nspaces:
        self.__total_rev_diff(cursor, ns + "_" + self.language, stats_target)
    # Total num of articles started per author
    for ns in self.nspaces:
        self.__total_page_init_author(cursor, ns + "_" + self.language)
    # Revisions per author over each time slice
    for ns in self.nspaces:
        for slice_name in time_slices:
            self.__total_rev_time(cursor, slice_name, ns + "_" + self.language, stats_target)
    # Distinct articles revised per author over each time slice
    for ns in self.nspaces:
        for slice_name in time_slices:
            self.__total_rev_diff_time(cursor, slice_name, ns + "_" + self.language, stats_target)
    # Articles initiated per author over each time slice
    for ns in self.nspaces:
        for slice_name in time_slices:
            self.__total_page_init_author_time(cursor, slice_name, ns + "_" + self.language)
    ############################
    # BIRTH AND DEATH ANALYSIS FOR THE AUTHOR COMMUNITY
    ############################
    # Release the connection
    dbaccess.close_Connection(self.acceso[0])
def ratios(self):
    """
    Write .dat files with descriptive ratios across languages.

    Produces two tab-separated files:
      * overall/data/author-pages.dat: number of logged editors, number
        of user pages (namespace 2) and their ratio, per language.
      * overall/data/articles-talk-ratio.dat: number of articles
        (namespace 0, redirects excluded), number of talk pages
        (namespace 1) and their ratio, per language.
    """
    #FILE author-pages.dat: ratio no. logged editors/no. user pages
    #FIX: the header used to be written to editors-userpages.dat while
    #the data rows were appended to author-pages.dat; both now target
    #the same file (renamed 'file' -> 'f' to stop shadowing the builtin)
    f=open("overall/data/author-pages.dat",'w')
    f.write("logged_authors\tuser_pages\tratio\tlang\n")
    f.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Obtain number of different logged authors (rev_user=0 is anonymous)
        self.logged_authors=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(rev_user)) from "+\
        "revision where rev_user!=0")
        #Obtain number of different user pages (nspace =2)
        self.user_pages=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
        "page where page_namespace=2")
        dbaccess.close_Connection(self.access[0])
        #Writing data to file (raises ZeroDivisionError if a wiki has no logged authors)
        f=open("overall/data/author-pages.dat",'a')
        f.write(str(int(self.logged_authors[0][0]))+"\t"+str(int(self.user_pages[0][0]))+"\t"+\
        str(float(self.user_pages[0][0])/float(self.logged_authors[0][0]))+"\t"+self.language+"\n")
        f.close()
        #print "Completed lang "+self.language+"\n"
    #FILE articles-talk-ratio.dat ratio of no. articles/no. talk pages (excluding redirects)
    f=open("overall/data/articles-talk-ratio.dat",'w')
    f.write("articles\ttalk\tratio\tlang\n")
    f.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Obtain number of articles excluding redirects
        self.articles=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
        "page where page_namespace=0 and page_is_redirect=0")
        #Obtain number of talk pages
        self.talk=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
        "page where page_namespace=1")
        dbaccess.close_Connection(self.access[0])
        #Writing data to file
        f=open("overall/data/articles-talk-ratio.dat",'a')
        f.write(str(int(self.articles[0][0]))+"\t"+str(int(self.talk[0][0]))+"\t"+\
        str(float(self.talk[0][0])/float(self.articles[0][0]))+"\t"+self.language+"\n")
        f.close()
def calculate(self): self.access = dbaccess.get_Connection("localhost", 3306, self.user,\ self.passw, "wx_"+self.language+"wiki_"+self.dumptype) try: print "Creating table for logged users..." users=dbaccess.raw_query_SQL(self.access[1],"create table lag_info (rev_user INT(10) UNSIGNED NOT NULL,"+\ "fecha1 datetime not null, fecha2 datetime not null)") except Exception, e: print "An exception ocurred, the problem was the following:\n" print e print "*************\n\n"
def infoAuthors(self):
    """
    Generate per-author statistics for every namespace.
    """
    ## Generates statistics per user
    ## Get DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
    "wx_"+self.language+"_"+self.conf.dumptype)
    ## local configuration: retrieving info for authors
    target="author"
    ## intervals might be days, weeks, months, quarters, years
    intervals=["months", "quarters","weeks"]
    ############################
    #Number of total revisions per author ID
    for nspace in self.nspaces:
        self.__total_rev(self.acceso[1], nspace+"_"+self.language, target)
    ############################
    #Different articles edited per user
    for nspace in self.nspaces:
        self.__total_rev_diff(self.acceso[1], nspace+"_"+self.language, target)
    ############################
    #Total num of articles started per author
    #We consider as the beginning of an article the first revision of that article
    for nspace in self.nspaces:
        self.__total_page_init_author(self.acceso[1], nspace+"_"+self.language)
    ############################
    #Total number of revisions per author for several time intervals
    #Currently, we are only interested in data per months, quarters and weeks
    for nspace in self.nspaces:
        for interval in intervals:
            self.__total_rev_time(self.acceso[1], interval,nspace+"_"+self.language, target)
    ############################
    #Num of different articles revised per author for several time intervals
    for nspace in self.nspaces:
        for interval in intervals:
            self.__total_rev_diff_time(self.acceso[1], interval,nspace+"_"+self.language, target)
    ############################
    #Num of different articles initiated per author
    for nspace in self.nspaces:
        for interval in intervals:
            self.__total_page_init_author_time(self.acceso[1], interval,nspace+"_"+self.language)
    ############################
    # BIRTH AND DEATH ANALYSIS FOR THE AUTHOR COMMUNITY
    ############################
    #Close DB connection
    dbaccess.close_Connection(self.acceso[0])
def infoContents(self):
    """
    Contents analysis: evolution of contribution and page lengths.
    """
    ###########################
    #Contents analysis
    ###########################
    ## Get DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
    "wx_"+self.language+"_"+self.conf.dumptype)
    ## For all namespaces (official and artificial):
    ## Evolution in time of the length of user contributions (per month; per quarter)
    ## Evolution in time of the length of pages (per month; per quarter supported but commented)
    for nspace in self.nspaces:
        self.__content_evolution(self.acceso[1], nspace+"_"+self.language)
    dbaccess.close_Connection(self.acceso[0])
def __init__(self, options):
    """
    Initialize parser state for processing page/revision/text records.

    A DB connection is opened and stored in self.acceso only when
    monitoring is requested without file or stream output.
    """
    self.options = options
    self.fileErrPath = "./errors.log"
    # Connect to the DB only in monitor mode with no file/stream output
    if self.options.monitor and not (self.options.fileout or self.options.streamout):
        self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,
                                              self.options.user, self.options.passwd,
                                              self.options.database)
    # Dump parsing state
    self.nspace_dict = {}
    self.codens = ''
    self.page_dict = {}
    self.rev_dict = {}
    self.stack = []
    self.current_text = ''
    self.current_elem = None
    self.revfile = None
    self.pagefile = None
    # Progress counters
    self.page_num = 0
    self.rev_num = 0
    self.last_page_len = 0
    self.rev_count = 0
    # Per-revision SQL flags
    self.prior_rev_id = 'NULL'
    self.isRedirect = '0'
    self.isStub = '0'
    self.isMinor = '0'
    # Extended-insert buffers and their counters
    self.revinsert = ''
    self.pageinsert = ''
    self.textinsert = ''
    self.revinsertrows = 0
    self.revinsertsize = 0
    self.pageinsertrows = 0
    self.pageinsertsize = 0
    self.textinsertrows = 0
    self.textinsertsize = 0
    # Timing for progress reports
    self.start = datetime.datetime.now()
    self.timeCheck = None
    self.timeDelta = None
def infoContents(self):
    """
    Run the contents analysis for every namespace.

    For each namespace (official and artificial) this tracks the
    evolution over time of the length of user contributions (per
    month/quarter) and of page length (per month; quarterly support
    exists but is commented out downstream).
    """
    ## Open a connection to this language's dump DB
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,
                                          "wx_" + self.language + "_" + self.conf.dumptype)
    cursor = self.acceso[1]
    ## One content-evolution pass per namespace table
    for ns in self.nspaces:
        self.__content_evolution(cursor, ns + "_" + self.language)
    ## Release the connection
    dbaccess.close_Connection(self.acceso[0])
def overall(self):
    """
    Write author-pages.dat and articles-talk-ratio.dat ratio files.

    NOTE(review): the previous docstring claimed this preprocessed page
    length evolution; the code actually computes per-language ratios of
    logged editors to user pages and of articles to talk pages (an
    earlier variant of ratios(), writing to the current directory).
    """
    #FILE author-pages.dat: logged editors vs. user pages
    file=open("author-pages.dat",'w')   #NOTE(review): 'file' shadows the builtin
    file.write("logged_authors\tuser_pages\tratio\tlang\n")
    file.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Different logged authors (rev_user=0 is anonymous)
        self.logged_authors=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(rev_user)) from "+\
        "revision where rev_user!=0")
        #Different user pages (namespace 2)
        self.user_pages=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
        "page where page_namespace=2")
        dbaccess.close_Connection(self.access[0])
        #Append one row per language; raises ZeroDivisionError if there are no logged authors
        file=open("author-pages.dat",'a')
        file.write(str(int(self.logged_authors[0][0]))+"\t"+str(int(self.user_pages[0][0]))+"\t"+\
        str(float(self.user_pages[0][0])/float(self.logged_authors[0][0]))+"\t"+self.language+"\n")
        file.close()
        print "Completed lang "+self.language+"\n"
    #FILE articles-talk-ratio.dat: articles vs. talk pages
    file=open("articles-talk-ratio.dat",'w')
    file.write("articles\ttalk\tratio\tlang\n")
    file.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Articles in the main namespace, excluding redirects
        self.articles=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
        "page where page_namespace=0 and page_is_redirect=0")
        #Talk pages (namespace 1)
        self.talk=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
        "page where page_namespace=1")
        dbaccess.close_Connection(self.access[0])
        file=open("articles-talk-ratio.dat",'a')
        file.write(str(int(self.articles[0][0]))+"\t"+str(int(self.talk[0][0]))+"\t"+\
        str(float(self.talk[0][0])/float(self.articles[0][0]))+"\t"+self.language+"\n")
        file.close()
        print "Completed lang "+self.language+"\n"
def cox_prop(self):
    """
    Creates intermediate files and tables for Cox-prop hazards analysis
    """
    #Initialize file header (CSV consumed by the survival analysis in R)
    f=open("wkp_cox_prop_all.dat",'w')
    f.write("Project,rev_user,min_ts,max_ts,in_talk,in_FAs\n")
    f.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        print "Starting language "+self.language+"\n"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        ##TABLE: Create table of users in talk pages
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_in_talk")
        dbaccess.raw_query_SQL(self.access[1],"create table users_in_talk as (select distinct(rev_user) from revision "+\
        "where rev_page in (select page_id from page where page_namespace=1))")
        dbaccess.raw_query_SQL(self.access[1],"alter table users_in_talk add primary key (rev_user)")
        ##TABLE: Create table of users in FAs
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_in_FAs")
        dbaccess.raw_query_SQL(self.access[1],"create table users_in_FAs as (select distinct(rev_user) from revision_FAs)")
        dbaccess.raw_query_SQL(self.access[1],"alter table users_in_FAs add primary key (rev_user)")
        ##TABLE: MIX previous info with time_range_authors --> save result in new table time_range_cox
        #in_talk/in_FAs become 0/1 covariates for the Cox model
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists time_range_cox")
        dbaccess.raw_query_SQL(self.access[1],"create table time_range_cox as (select rev_user, "+\
        "date(min_ts) as min_ts, date(max_ts) as max_ts, "+\
        "case when rev_user in (select rev_user from users_in_talk) then 1 else 0 end as in_talk, "+\
        "case when rev_user in (select rev_user from users_in_FAs) then 1 else 0 end as in_FAs "+\
        "from time_range_authors)")
        ##IN SYSTEM
        print "Interm. tables created proceeding to write out data..."+self.language+"\n"
        #Export every non-anonymous, non-bot user's activity range and covariates
        results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, min_ts, max_ts, in_talk, in_FAs "+\
        "from time_range_cox "+\
        " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
        #Append one CSV row per user, with quoted dates
        f=open("wkp_cox_prop_all.dat",'a')
        for result in results:
            f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\","+\
            str(int(result[3]))+","+str(int(result[4]))+"\n")
        f.close()
        print "Finished all cox-prop tasks for "+self.language+"\n"
def test_funciones(self):
    """
    Ad-hoc driver to exercise the private statistics methods against a
    single table. Not part of the production analysis flow.
    """
    #FIX: added the missing "_" separator in the DB name, consistent
    #with every other method ("wx_"+language+"_"+dumptype)
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
    "wx_"+self.language+"_"+self.conf.dumptype)
    #FIX: the bare call __total_rev(...) raised NameError at runtime:
    #inside a class body the identifier is mangled to
    #_ClassName__total_rev, so the private method must be reached
    #through self (as every sibling method does)
    self.__total_rev(self.acceso[1], table="stats_nlwiki", target="author")
    dbaccess.close_Connection(self.acceso[0])
def make_indexes(self): self.access = dbaccess.get_Connection( "localhost", 3306, self.user, self.passw, "wx_" + self.language + "wiki_" + self.dumptype ) # Generate adequate indexes and keys in tables page and revision # try: # print "Generating index for page_len...\n" # dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE page ADD INDEX page_len(page_len)") # except Exception, e: # print "An exception ocurred, the problem was the following:\n" # print e # print "*************\n\n" try: print "Creating index for rev_timestamp" dbaccess.raw_query_SQL(self.access[1], "ALTER TABLE revision ADD INDEX timestamp(rev_timestamp)") except Exception, e: print "An exception ocurred, the problem was the following:\n" print e print "*************\n\n"
def time_range(self):
    """
    Creates intermediate tables with time frame of editors activity.

    For every language in self.languages it builds, in the corresponding
    wx_<lang>wiki_research database:
      - user_revs:          revision count per logged, non-bot author
      - time_range_authors: first/last edit timestamp + revision count per
                            author, over ALL namespaces
      - time_range_users:   first/last edit timestamp per logged author,
                            restricted to MAIN (via the rev_main_nored view)
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        ##### TIME RANGE FOR AUTHORS IN ALL NAMESPACES
        #TABLE: Total no. of revisions made by every logged author
        #(anonymous edits rev_user=0 and accounts in the 'bot' group excluded)
        dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS user_revs AS "+\
        "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND "+\
        "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
        dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")
        print "Created table user_revs for "+self.language+"wiki...\n"
        #TABLE: Min and max timestamp for every logged author + total num_revs
        #NOTE(review): the inner SELECT here does not repeat the bot filter;
        #bot rows simply get a NULL num_revs from the correlated subquery.
        dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS time_range_authors AS "+\
        "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM "+\
        "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x "+\
        "ORDER BY min_ts)")
        print "Created table time_range_authors for "+self.language+"wiki...\n"
        ##### TIME RANGE FOR AUTHORS IN MAIN ONLY
        print "Processing language "+self.language+"\n"
        #VIEW: Create view for filtering annons and bots
        #Filter from rev_main_nored revisions from logged authors only
        dbaccess.raw_query_SQL(self.access[1],"create or replace view revision_logged as (select * from rev_main_nored "+\
        " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot') )")
        #Rebuild time_range_users from scratch on every run
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists time_range_users")
        #TABLE: Intermediate table, storing for each logged author the min and max ts in the system
        dbaccess.raw_query_SQL(self.access[1],"create table time_range_users as (SELECT rev_user, "+\
        "min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision_logged group by rev_user)")
        dbaccess.raw_query_SQL(self.access[1],"alter table time_range_users add primary key (rev_user)")
        print "Created time_range_users for "+self.language +"\n"
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
def comparative_contributions(): listaidiomas=["dewiki", "jawiki", "frwiki", "plwiki", "nlwiki", "itwiki", "ptwiki", "eswiki", "svwiki"] ## lista=["eswiki", "svwiki"] r.png("graphics/AAA/gini_comparative_top10.png") flag=0 for idioma in listaidiomas: print "Starting comparative Gini analysis for language..."+idioma+"\n" acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub") tcnoann=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_NoAnnons_author_"+idioma) dbaccess.close_Connection(acceso[0]) data=__tup_to_list(tcnoann) listay_tcnoann=data.pop() listax=data.pop() if flag==0: _lorenz_Comp_Curves(listay_tcnoann,flag) flag=1 else: _lorenz_Comp_Curves(listay_tcnoann,flag) r.dev_off() print "Comparative graphic for Gini curves finished!!"
def infoPages(self):
    """
    Generates statistics per article: total revisions, distinct editors, and
    both metrics broken down by months, quarters and weeks, for every
    namespace view of the current language.
    """
    ## Open a DB connection for this run
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
    "wx_"+self.language+"_"+self.conf.dumptype)
    cursor = self.acceso[1]
    #Local configuration
    target = "page_id"
    periodos = ["months", "quarters", "weeks"]
    #Pre-build the per-namespace view names once
    tablas = []
    for ns in self.nspaces:
        tablas.append(ns + "_" + self.language)
    #Total num of revisions per page
    for tabla in tablas:
        self.__total_rev(cursor, tabla, target)
    #Total number of different editors per page
    for tabla in tablas:
        self.__total_rev_diff(cursor, tabla, target)
    #Total number of revisions per page, for each supported time interval
    for tabla in tablas:
        for periodo in periodos:
            self.__total_rev_time(cursor, periodo, tabla, target)
    #Total number of different editors per page; per month, quarter and week
    for tabla in tablas:
        for periodo in periodos:
            self.__total_rev_diff_time(cursor, periodo, tabla, target)
    #Close DB connection
    dbaccess.close_Connection(self.acceso[0])
def histogram(idiomas): """ Create histograms depicting article size distribution for a certain language version @type idiomas: list of strings @param idiomas: list of strings indicating the language versions to process """ filenames=["boxplot_log.png", "histogram_log.png", "histogram_log_low.png", "histogram_log_high.png", "ecdf_log_low.png", "ecdf_log_high.png", "data/page_len_log.data", "/data/histograms.info", "ecdf_total.png"] for idioma in idiomas: print "Creando histogramas para el idioma ... "+idioma #Print to another file the names of graphics files, following the order in the GNU R script histogram.R f=open("./data/hist_files_names.data",'w') for line in filenames: f.write("./graphics/"+idioma+"/"+line+"\n") f.close() acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub") #Considering only database pages corresponding to articles, with NAMESPACE=MAIN=0 #dbaccess.dropTab_SQL(acceso[1], "aux") #dbaccess.query_SQL(acceso[1],"page_id, page_len","page", where="page_namespace=0", order="page_len", create="aux") result=dbaccess.query_SQL(acceso[1], "page_id, page_len", "aux") dbaccess.close_Connection(acceso[0]) data=__tup_to_list(result) page_len=data.pop() for i in range(len(page_len)): if page_len[i]!=0: page_len[i]=math.log10(page_len[i]) #Print to another file a list with article sizes to plot histograms f=open("./graphics/"+idioma+"/data/page_len_log.data", 'w') for value in page_len: f.writelines(str(value)+"\n") f.close() #CALL THE GNU R SCRIPT Histogram.R succ=os.system("R --vanilla < ./histogram.R > debug_R") if succ==0: print "Funcion histogram ejecutada con exito para el lenguage... "+idioma
def performAnalysis(self): ## Get DB connection self.acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", self.language + "_stub") ## Singleton objects to plot graphics in the class methods self.simpleGraph = graphic2D(self.filePath) ## self.multiGraph=graphic2Dmulti(self.filePath) ## self.giniGraph=graphicGini(self.filePath) ## self.splitHistGraph=graphicSplitHist(self.filePath, self.dataPath) self.graph3D = graphic3D(self.filePath, self.dataPath) print "Starting analysis on DB " + self.language + "_stub\n" ## self.UserNumContribsGroup(self.acceso[1]) ## self.UserNumContribsGenerations() authorsGini = [ (95.9677, 4.046), (95.7015, 4.304), (96.2223, 4.363), (95.7104, 4.395), (96.3844, 4.407), (92.4691, 4.528), (95.0077, 4.603), (95.0071, 4.7298), (93.785, 5.051), (93.6076, 5.888), ] authorsGini.sort() ##authorsGini=[(4.046,95.9677),(4.304,95.7015),(4.363,96.2223),(4.395,95.7104),(4.407,96.3844),(4.528,92.4691),(4.603,95.0077),(4.7298,95.0071),(5.051,93.785),(5.888,93.6076)] self.simpleGraph.createGraphic( "authors-Gini", (authorsGini,), "Gini coeff. (%)", "Number of different authors (log)", "Gini coeff. vs. number of registered authors in the top-ten Wikipedias.", ) ## Close DB connection dbaccess.close_Connection(self.acceso[0]) print "This is finished"
def prepro_red_talk(self):
    """
    Data and evolution for redirects and talk pages.

    For every language it creates views isolating redirect pages/revisions
    and talk pages/revisions, then one table max_rev_talk_YYYY per year
    holding, for each talk page, the id of its latest revision made strictly
    before year YYYY.
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #VIEW page_redirect (pages with redirect flag activated)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view page_redirect as "+\
        "(select page_id from page where page_namespace=0 and page_is_redirect=1)")
        #VIEW rev_redirect (revisions corresponding to redirect pages)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_redirect as ("+\
        "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
        "(select page_id from page_redirect))")
        #VIEW page_talk (pages in talk nspace)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view page_talk as "+\
        "(select page_id from page where page_namespace=1)")
        #VIEW rev_talk (revisions corresponding to talk pages)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_talk as ("+\
        "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
        "(select page_id from page_talk))")
        #TABLES max_rev_talk_YYYY (latest revision for each page in talk nspace,
        #made strictly BEFORE year YYYY -- note the < comparison below)
        self.minyear=dbaccess.raw_query_SQL(self.access[1],"select min(year(rev_timestamp)) from revision")
        #NOTE(review): the end year 2009 is hard-coded, so activity from 2009
        #onwards is never materialised -- confirm this is still intended.
        self.years=range(int(self.minyear[0][0])+1, 2009)
        for self.year in self.years:
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists max_rev_talk_"+str(self.year))
            dbaccess.raw_query_SQL(self.access[1],"create table max_rev_talk_"+str(self.year)+\
            " as (select max(rev_id) as max_id, rev_page from rev_talk "+\
            "where year(rev_timestamp)<"+str(self.year)+" group by rev_page)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add primary key (max_id)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add index (rev_page)")
        #Close DB connection for this language
        dbaccess.close_Connection(self.access[0])
def general_stats(self): """ Preprocessing actions for general statistics scripts """ #FILE page_len.dat, with info about length of pages self.f=open("overall/data/page_len.dat", 'w') self.f.write("page_len\tns\tis_redirect\tis_stub\tis_new\tlang\n") self.f.close() for self.language in self.languages: self.dbname="wx_"+self.language+"wiki_research" self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) print "Retrieving info from "+self.language+"\n" results=dbaccess.raw_query_SQL(self.access[1], "SELECT page_len, page_namespace, page_is_redirect, page_is_stub, "+\ "page_is_new FROM page") print "Updating page_len info file with "+self.language+"\n" self.f=open("overall/data/page_len.dat", 'a') for result in results: self.f.write(str(int(result[0]))+"\t"+str(int(result[1]))+"\t"+str(int(result[2]))+"\t"+\ str(int(result[3]))+"\t"+str(int(result[4]))+"\t"+self.language+"\n") self.f.close() results=None dbaccess.close_Connection(self.access[0])
def analyze(self):
    """
    Builds, for every configured language, the summary tables:
      - user_revs:        revision count per logged, non-bot author
      - time_range_allns: first/last edit timestamp + revision count per
                          author, over all namespaces
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Total no. of revisions made by every logged author
        #(anonymous edits rev_user=0 and accounts in the 'bot' group excluded)
        dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS user_revs AS "+\
        "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND "+\
        "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
        dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")
        print "Created table user_revs for "+self.language+"wiki...\n"
        #Min and max timestamp for every logged author + total num_revs
        #NOTE(review): unlike user_revs, the inner SELECT does not filter bots;
        #bot rows get a NULL num_revs from the correlated subquery.
        dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS time_range_allns AS "+\
        "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM "+\
        "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x "+\
        "ORDER BY min_ts)")
        print "Created table time_range_allns for "+self.language+"wiki...\n"
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
def surv_files(self): """ Creates all data files used as input for demography scripts in GNU R """ #Initialize all files headers #FILE: Survival data for all users (including editors out of MAIN) f=open("wkp_surv_all.dat",'w') f.write("Project,rev_user,min_ts,max_ts\n") f.close() #FILE: Survival data for all logged users who edited in MAIN f=open("wkp_surv_main_all.dat",'w') f.write("Project,rev_user,min_ts,max_ts\n") f.close() #FILE: Survival data for all logged editors until they join the core (activity) f=open("wkp_surv_join_core_all.dat",'w') f.write("Project,rev_user,min_ts,min_ts_core\n") f.close() #FILE: Survival data for logged editors since they join the core until they leave it (activity) f=open("wkp_surv_in_core_all.dat",'w') f.write("Project,rev_user,min_ts_core,max_ts_core\n") f.close() #FILE: Survival data for loged editors since they leave the core until death (activity) f=open("wkp_surv_core_to_max_ts_all.dat",'w') f.write("Project,rev_user,max_ts_core,max_ts\n") f.close() #FILE: Survival data for all logged editors until they join the core (revisions) f=open("wkp_surv_join_core_rev_all.dat",'w') f.write("Project,rev_user,min_ts,min_ts_core\n") f.close() #FILE: Survival data for logged editors since they join the core until they leave it (revisions) f=open("wkp_surv_in_core_rev_all.dat",'w') f.write("Project,rev_user,min_ts_core,max_ts_core\n") f.close() #FILE: Survival data for loged editors since they leave the core until death (revisions) f=open("wkp_surv_core_rev_to_max_ts_all.dat",'w') f.write("Project,rev_user,max_ts_core,max_ts\n") f.close() for self.language in self.languages: self.dbname="wx_"+self.language+"wiki_research" print "Starting language "+self.language+"\n" ##IN SYSTEM self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_authors "+\ " where rev_user!=0 and rev_user not in (select 
ug_user from user_groups where ug_group='bot')") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##IN MAIN self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_users ") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_main_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##CORE ##JOIN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_join_core_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##IN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_in_core_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##CORE TO DEATH self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_core_to_max_ts_all.dat",'a') for 
result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() print "Finished core users by activity for language "+self.language+"\n" ########################### ##REV CORE ##JOIN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_rev_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_join_core_rev_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##IN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_rev_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_in_core_rev_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##CORE TO DEATH self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_rev_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_core_rev_to_max_ts_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() print "Finished all surv_file tasks for "+self.language+"\n"
def __init__(self, conf, language="furwiki"):
    """
    Creates multiple views to create a convenient interface to access quantitative data
    It also generates necessary tables and views to store intermidiate results, so that
    other methods can later store data directly.

    @param conf: configuration object; msqlu, msqlp and dumptype are read from it
    @param language: language edition to analyse (e.g. "furwiki")
    """
    self.conf=conf
    self.language=language
    ##List of namespaces to analyse. We have added new special namespaces (e.g. subsets of main)
    #NOTE: entries from index 7 onwards must stay aligned with MediaWiki
    #namespace numbers 1..15 (see the zip with range(1,16) below)
    self.nspaces=["all","ns0","articles","redirects","cur_redirects","cur_stubs","stubs","talk",\
    "pageUser", "userTalk","meta", "metaTalk", "image", "imageTalk", "mediawiki",\
    "mediawikiTalk", "template", "templateTalk", "help", "helpTalk", "category", "categoryTalk"]
    ##Some fancy lists to work with time intervals in some private methods following
    #Column lists, SELECT fragments and GROUP BY fragments keyed by interval granularity
    self.type_interval_columns={"days":"day, year", "weeks":"week, year", "months":"month, year",\
    "quarters":"quarter, year", "years":"year"}
    self.type_interval_select={"days":"DAYOFYEAR(rev_timestamp) AS day, YEAR(rev_timestamp) AS year ",\
    "weeks":"WEEK(rev_timestamp,1) AS week, YEAR(rev_timestamp) AS year ",\
    "months":"MONTH(rev_timestamp) AS month, YEAR(rev_timestamp) AS year ",\
    "quarters":"QUARTER(rev_timestamp) AS quarter, YEAR(rev_timestamp) AS year ",\
    "years":"YEAR(rev_timestamp) AS year "}
    self.type_interval_group={"days":"year, day", "weeks":"year, week", "months":"year, month",\
    "quarters":"year, quarter", "years":"year"}
    ## Get new DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
    "wx_"+self.language+"_"+self.conf.dumptype)
    ## Delete previous versions of views
    for nspace in self.nspaces:
        dbaccess.dropView(self.acceso[1], nspace+"_"+self.language)
    ## Create updated versions for views from revision table
    #View sumarizing all info for every revision (linking with info from table page)
    dbaccess.createView(self.acceso[1], view="all_"+self.language,\
    columns="rev_id, page_id, rev_len, page_ns, page_len, is_redirect, author, author_text,"+\
    " rev_timestamp, rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_namespace, page_len, rev_is_redirect,"+\
    " rev_user, rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id")
    #View sumarizing info regarding pages in namespace=0 (including articles, stubs and redirects)
    dbaccess.createView(self.acceso[1], view="ns0_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, is_redirect, author, author_text,"+\
    " rev_timestamp, rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, page_is_redirect, rev_user,"+\
    " rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id"+\
    " AND page_namespace=0")
    #View sumarizing info for articles (excluding pages that currently are redirects and stubs)
    dbaccess.createView(self.acceso[1], view="articles_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
    "rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text,"+\
    " rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND page_namespace=0 AND "+\
    "page_is_redirect=0 AND page_is_stub=0")
    #View with info only for redirects (pages that were redirects when that revision was made)
    dbaccess.createView(self.acceso[1], view="redirects_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
    "rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
    "page_namespace=0 AND rev_is_redirect=1")
    #View with info only for current redirects (page_is_redirect reflects the present state)
    dbaccess.createView(self.acceso[1], view="cur_redirects_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
    "rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
    "page_namespace=0 AND page_is_redirect=1")
    #View with info only for revisions of stub pages (pages that were stubs when that revision was made)
    dbaccess.createView(self.acceso[1], view="stubs_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
    " rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
    " page_namespace=0 AND rev_is_stub=1")
    #View with info only for revisions of current stub pages
    dbaccess.createView(self.acceso[1], view="cur_stubs_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
    " rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
    " page_namespace=0 AND page_is_stub=1")
    #From this point on, automatically create views for the set of pages included in the remaining namespaces in MediaWiki
    #nspaces[7:] holds 15 names matching namespace numbers 1..15
    for nspace, nsnum in zip(self.nspaces[7:], range(1,16)):
        dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language,\
        columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"+\
        " rev_parent_id",\
        query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
        "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
        " page_namespace="+str(nsnum))
    #View sumarizing the distribution of pages among namespaces
    dbaccess.dropView(self.acceso[1], "nspaces_"+self.language)
    dbaccess.createView(self.acceso[1],view="nspaces_"+self.language, columns="namespace, pages_in_nspace",\
    query="SELECT page_namespace, COUNT(*) FROM page GROUP BY page_namespace")
    ## Intermidiate views for the minimun timestamp of every page [annons, and logged users]
    ## And other useful intermediate views regarding page evolution
    for nspace in self.nspaces:
        #First revision per page by a logged author (author!=0)
        dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
        "_page_min_timestamp_logged")
        dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
        "_page_min_timestamp_logged", columns="page_id, rev_id, author, rev_timestamp",\
        query="SELECT page_id, rev_id,author, MIN(rev_timestamp) FROM "+\
        nspace+"_"+self.language+" WHERE author!=0 GROUP BY page_id")
        #First revision per page by an anonymous author (author=0)
        dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
        "_page_min_timestamp_annons")
        dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
        "_page_min_timestamp_annons",\
        columns="page_id, rev_id, author_text, rev_timestamp",\
        query="SELECT page_id,rev_id,author_text, MIN(rev_timestamp) FROM "+\
        nspace+"_"+self.language+" WHERE author=0 GROUP BY page_id")
        #Ordered list of (month, year) pairs with at least one revision
        dbaccess.dropView(self.acceso[1],nspace+"_"+self.language+"_list_months")
        dbaccess.createView(self.acceso[1],view=nspace+"_"+self.language+"_list_months",\
        columns="month, year",query="SELECT MONTH(rev_timestamp) as month, "+\
        "YEAR(rev_timestamp) as year"+\
        " FROM "+nspace+"_"+self.language+" GROUP BY year, month ORDER BY year, month")
        #Ordered list of (quarter, year) pairs with at least one revision
        dbaccess.dropView(self.acceso[1],nspace+"_"+self.language+"_list_quarters")
        dbaccess.createView(self.acceso[1],view=nspace+"_"+self.language+"_list_quarters",\
        columns="quarter, year",query="SELECT QUARTER(rev_timestamp) as quarter, "+\
        "YEAR(rev_timestamp) as year FROM "+nspace+"_"+self.language+" GROUP BY year,"+\
        " quarter ORDER BY year, quarter")
    ## Close DB connection
    dbaccess.close_Connection(self.acceso[0])
def __init__(self, conf, language="furwiki"):
    """
    Creates multiple views to create a convenient interface to access quantitative data
    It also generates necessary tables and views to store intermidiate results, so that
    other methods can later store data directly.

    @param conf: configuration object; msqlu, msqlp and dumptype are read from it
    @param language: language edition to analyse (e.g. "furwiki")
    """
    self.conf = conf
    self.language = language
    ##List of namespaces to analyse. We have added new special namespaces (e.g. subsets of main)
    #The entries from index 7 onwards map onto MediaWiki namespace numbers 1..15
    #in the zip with range(1, 16) further below
    self.nspaces=["all","ns0","articles","redirects","cur_redirects","cur_stubs","stubs","talk",\
    "pageUser", "userTalk","meta", "metaTalk", "image", "imageTalk", "mediawiki",\
    "mediawikiTalk", "template", "templateTalk", "help", "helpTalk", "category", "categoryTalk"]
    ##Some fancy lists to work with time intervals in some private methods following
    #Per-granularity column lists, SELECT fragments and GROUP BY fragments
    self.type_interval_columns={"days":"day, year", "weeks":"week, year", "months":"month, year",\
    "quarters":"quarter, year", "years":"year"}
    self.type_interval_select={"days":"DAYOFYEAR(rev_timestamp) AS day, YEAR(rev_timestamp) AS year ",\
    "weeks":"WEEK(rev_timestamp,1) AS week, YEAR(rev_timestamp) AS year ",\
    "months":"MONTH(rev_timestamp) AS month, YEAR(rev_timestamp) AS year ",\
    "quarters":"QUARTER(rev_timestamp) AS quarter, YEAR(rev_timestamp) AS year ",\
    "years":"YEAR(rev_timestamp) AS year "}
    self.type_interval_group={"days":"year, day", "weeks":"year, week", "months":"year, month",\
    "quarters":"year, quarter", "years":"year"}
    ## Get new DB connection
    self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
    "wx_"+self.language+"_"+self.conf.dumptype)
    ## Delete previous versions of views
    for nspace in self.nspaces:
        dbaccess.dropView(self.acceso[1], nspace + "_" + self.language)
    ## Create updated versions for views from revision table
    #View sumarizing all info for every revision (linking with info from table page)
    dbaccess.createView(self.acceso[1], view="all_"+self.language,\
    columns="rev_id, page_id, rev_len, page_ns, page_len, is_redirect, author, author_text,"+\
    " rev_timestamp, rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_namespace, page_len, rev_is_redirect,"+\
    " rev_user, rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id")
    #View sumarizing info regarding pages in namespace=0 (including articles, stubs and redirects)
    dbaccess.createView(self.acceso[1], view="ns0_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, is_redirect, author, author_text,"+\
    " rev_timestamp, rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, page_is_redirect, rev_user,"+\
    " rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id"+\
    " AND page_namespace=0")
    #View sumarizing info for articles (excluding pages that currently are redirects and stubs)
    dbaccess.createView(self.acceso[1], view="articles_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
    "rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text,"+\
    " rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND page_namespace=0 AND "+\
    "page_is_redirect=0 AND page_is_stub=0")
    #View with info only for redirects (pages that were redirects when that revision was made)
    dbaccess.createView(self.acceso[1], view="redirects_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
    "rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
    "page_namespace=0 AND rev_is_redirect=1")
    #View with info only for current redirects (present page_is_redirect flag)
    dbaccess.createView(self.acceso[1], view="cur_redirects_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
    "rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
    "page_namespace=0 AND page_is_redirect=1")
    #View with info only for revisions of stub pages (pages that were stubs when that revision was made)
    dbaccess.createView(self.acceso[1], view="stubs_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
    " rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
    " page_namespace=0 AND rev_is_stub=1")
    #View with info only for revisions of current stub pages
    dbaccess.createView(self.acceso[1], view="cur_stubs_"+self.language,\
    columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
    " rev_parent_id",\
    query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
    "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
    " page_namespace=0 AND page_is_stub=1")
    #From this point on, automatically create views for the set of pages included in the remaining namespaces in MediaWiki
    for nspace, nsnum in zip(self.nspaces[7:], range(1, 16)):
        dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language,\
        columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"+\
        " rev_parent_id",\
        query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
        "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
        " page_namespace="+str(nsnum))
    #View sumarizing the distribution of pages among namespaces
    dbaccess.dropView(self.acceso[1], "nspaces_" + self.language)
    dbaccess.createView(self.acceso[1],view="nspaces_"+self.language, columns="namespace, pages_in_nspace",\
    query="SELECT page_namespace, COUNT(*) FROM page GROUP BY page_namespace")
    ## Intermidiate views for the minimun timestamp of every page [annons, and logged users]
    ## And other useful intermediate views regarding page evolution
    for nspace in self.nspaces:
        #First revision per page made by a logged author (author!=0)
        dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
        "_page_min_timestamp_logged")
        dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
        "_page_min_timestamp_logged", columns="page_id, rev_id, author, rev_timestamp",\
        query="SELECT page_id, rev_id,author, MIN(rev_timestamp) FROM "+\
        nspace+"_"+self.language+" WHERE author!=0 GROUP BY page_id")
        #First revision per page made by an anonymous author (author=0)
        dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
        "_page_min_timestamp_annons")
        dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
        "_page_min_timestamp_annons",\
        columns="page_id, rev_id, author_text, rev_timestamp",\
        query="SELECT page_id,rev_id,author_text, MIN(rev_timestamp) FROM "+\
        nspace+"_"+self.language+" WHERE author=0 GROUP BY page_id")
        #Ordered list of (month, year) pairs that contain at least one revision
        dbaccess.dropView(self.acceso[1], nspace + "_" + self.language + "_list_months")
        dbaccess.createView(self.acceso[1],view=nspace+"_"+self.language+"_list_months",\
        columns="month, year",query="SELECT MONTH(rev_timestamp) as month, "+\
        "YEAR(rev_timestamp) as year"+\
        " FROM "+nspace+"_"+self.language+" GROUP BY year, month ORDER BY year, month")
        #Ordered list of (quarter, year) pairs that contain at least one revision
        dbaccess.dropView(self.acceso[1], nspace + "_" + self.language + "_list_quarters")
        dbaccess.createView(self.acceso[1],view=nspace+"_"+self.language+"_list_quarters",\
        columns="quarter, year",query="SELECT QUARTER(rev_timestamp) as quarter, "+\
        "YEAR(rev_timestamp) as year FROM "+nspace+"_"+self.language+" GROUP BY year,"+\
        " quarter ORDER BY year, quarter")
    ## Close DB connection
    dbaccess.close_Connection(self.acceso[0])
def analyze(self): #Initialize all files headers #Survival data for all users (including editors out of MAIN) f=open("wkp_surv_all.dat",'w') f.write("Project,rev_user,min_ts,max_ts\n") f.close() #Survival data for all logged users who edited in MAIN f=open("wkp_surv_main_all.dat",'w') f.write("Project,rev_user,min_ts,max_ts\n") f.close() f=open("wkp_surv_join_core_all.dat",'w') f.write("Project,rev_user,min_ts,min_ts_core\n") f.close() f=open("wkp_surv_in_core_all.dat",'w') f.write("Project,rev_user,min_ts_core,max_ts_core\n") f.close() f=open("wkp_surv_core_to_max_ts_all.dat",'w') f.write("Project,rev_user,max_ts_core,max_ts\n") f.close() f=open("wkp_surv_join_core_rev_all.dat",'w') f.write("Project,rev_user,min_ts,min_ts_core\n") f.close() f=open("wkp_surv_in_core_rev_all.dat",'w') f.write("Project,rev_user,min_ts_core,max_ts_core\n") f.close() f=open("wkp_surv_core_rev_to_max_ts_all.dat",'w') f.write("Project,rev_user,max_ts_core,max_ts\n") f.close() for self.language in self.languages: self.dbname="wx_"+self.language+"wiki_research" print "Starting language "+self.language+"\n" ##IN SYSTEM self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_authors "+\ "where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##IN MAIN self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_users ") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_main_all.dat",'a') for result in results: 
f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##CORE ##JOIN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_join_core_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##IN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_in_core_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##CORE TO DEATH self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_core_to_max_ts_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() print "Finished core users for language "+self.language+"\n" ########################### ##REV CORE ##JOIN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_rev_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_join_core_rev_all.dat",'a') for result in results: 
f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##IN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_rev_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_in_core_rev_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##CORE TO DEATH self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_rev_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_core_rev_to_max_ts_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() print "Finished all tasks for "+self.language+"\n"
import dbaccess import csv from pprint import pprint import re msqlu = 'root' msqlp = 'qhshfl27' msqldb = 'setags_ux' msqlh = 'localhost' acceso = dbaccess.get_Connection(msqlh, 3306, msqlu, msqlp, msqldb) def getTags(rawtagstr): # split tags mlist = re.finditer("\<(?P<tag>[A-Za-z0-9\_\-]+)\>", rawtagstr) tags = [] for m in mlist: tags.append(m.group('tag')) return tags def createTagChangeDB(filename, tablename1, tablename2): # read in csv data reader = csv.reader(open(filename, "rU")) header = reader.next() for (i, row) in enumerate(reader):
def summary_evol(idiomas):
    """
    Create some graphs summarizing the evolution in time of critical quantitative parameters for each language version to explore

    @type idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    ## ¡¡WARNING!! Please be careful when selecting values from tables storing evolution in time of number of articles, size etc.
    ## You must always use a GROUP BY(pageCount, limitDate) clause, due to
    ## periods of inactivity that could generate duplicate entries in the graphics
    # Data files handed over to the GNU R script summary_evol.R; their order
    # must match the order of the lists collected in dataList below.
    filenames=["page_dates.data", "page_Count_evol.data", "page_Len_Sum_log.data", "contribs_evol.data", "nspaces.data", "nspace_distrib.data", "diffArticles.data", "authors.data", "diff_authors_x_article.data", "authors_authors_per_pagelen.data", "pagelen_authors_per_pagelen.data"]
    # Graphic files that summary_evol.R will generate from those data files.
    filenames_out=["Tot_num_articles_absx_absy.png", "Tot_num_articles_absx_logy.png", "Tot_num_articles_logx_logy.png", "Tot_pagelensum_absx_absy.png", "Tot_pagelensum_absx_logy.png", "Tot_pagelensum_logx_logy.png", "Tot_contribs_absx_absy.png", "Tot_contribs_absx_logy.png", "Tot_contribs_logx_logy.png", "Diffs_articles_per_author.png", "Diffs_authors_per_article.png", "Diff_authors_against_page_len.png"]
    for idioma in idiomas:
        # NOTE(review): MySQL credentials are hard-coded here; consider moving
        # them to configuration.
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        #acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_pages")
        # Monthly evolution of page count, accumulated page size and contributions.
        result=dbaccess.query_SQL(acceso[1], "pageCount, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        result2=dbaccess.query_SQL(acceso[1], "pageLenSum, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        result3=dbaccess.query_SQL(acceso[1], "contribs, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        # Distribution of pages among namespaces.
        resultnspace=dbaccess.query_SQL(acceso[1], "pages_nspace, namespace", "stats_nspace_"+idioma)
        # Per-author and per-page activity summaries (logged users only).
        diffArticlesNoann=dbaccess.query_SQL(acceso[1], "author, theCount",
        "stats_Article_NoAnnons_author_"+idioma)
        diffInitNoann=dbaccess.query_SQL(acceso[1], "author, theCount", "stats_Article_Init_NoAnnons_author_"+idioma)
        totRevperArticle=dbaccess.query_SQL(acceso[1], "page_id, theCount", "stats_Contrib_NoAnnons_page_id_"+idioma)
        diffAuthorperArticle=dbaccess.query_SQL(acceso[1], "page_id, theCount", "stats_Article_NoAnnons_page_id_"+idioma)
        # Number of different authors plotted against page length.
        dautxplen=dbaccess.query_SQL(acceso[1], "page_len, authors", "stats_pagelen_difauthors_"+idioma)
        dbaccess.close_Connection(acceso[0])
        # __tup_to_list presumably returns the result columns as a list of
        # lists; the successive pop() calls below take them from the last
        # column backwards -- TODO confirm against its definition.
        data=__tup_to_list(result, 1)
        dates_x=data.pop()
        page_Count=data.pop()
        # NOTE(review): the double pop(0) pruning of the first two entries
        # looks like a workaround originally meant only for frwiki (see the
        # commented-out conditional just below); it currently runs for EVERY
        # language -- confirm this is intended.
##        if idioma=="frwiki":
        data2=__tup_to_list(result2, 2)
        dates_x=data2.pop()
        dates_x.pop(0)
        dates_x.pop(0)
        page_Len_Sum=data2.pop()
        page_Len_Sum.pop(0)
        page_Len_Sum.pop(0)
##        else:
##            data2=__tup_to_list(result2, 1)
##            dates_x=data2.pop()
##            page_Len_Sum=data2.pop()
        data3=__tup_to_list(result3, 1)
        dates_x=data3.pop()  # dates_x is overwritten again; the last one wins
        contribs=data3.pop()
        datanspace=__tup_to_list(resultnspace)
        namespaces=datanspace.pop()
        pages_nspace=datanspace.pop()
        dataDiffArticlesNoann=__tup_to_list(diffArticlesNoann)
        diffArticles=dataDiffArticlesNoann.pop()
        authors=dataDiffArticlesNoann.pop()
        dataDiffInitNoann=__tup_to_list(diffInitNoann)
        diffInitArticles=dataDiffInitNoann.pop()
        authors=dataDiffInitNoann.pop()  # overwrites the previous authors list
        datatotRevperArticle=__tup_to_list(totRevperArticle)
        totalRev=datatotRevperArticle.pop()
        article=datatotRevperArticle.pop()
        datadiffAuthorperArticle=__tup_to_list(diffAuthorperArticle)
        diffAuthors=datadiffAuthorperArticle.pop()
        article=datadiffAuthorperArticle.pop()
        datadautxplen=__tup_to_list(dautxplen)
        autxplen=datadautxplen.pop()
        lenautxplen=datadautxplen.pop()
        ## Introduce in data list results form queries in the proper order
        ## corresponding with the name files we pass to the GNU R script summary_evol.R
        # Page size sums are plotted on a log10 scale; zeroes are left as-is
        # to avoid math domain errors.
        for i in range(len(page_Len_Sum)):
            if page_Len_Sum[i]!=0:
                page_Len_Sum[i]=math.log10(page_Len_Sum[i])
        dataList=[dates_x, page_Count, page_Len_Sum, contribs, namespaces, pages_nspace, diffArticles, authors, diffAuthors, autxplen, lenautxplen]
        # Dump every list to its data file; files with 'date' in the name get
        # the special date formatting.
        for filename, data in zip (filenames, dataList):
            if(filename.find('date')!=-1):
                __makeDatesFile(idioma, filename, data)
            else:
                __makeDataFile(idioma, filename, data)
        ######################################
        #Pass data filenames to the GNU R script with a file
        f=open("./data/summary_files_names.data",'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/data/"+line+"\n")
        f.close()
        #Idem with graphic output filenames
        f=open("./data/summary_files_out.data",'w')
        for line in filenames_out:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
        #CALL THE GNU R SCRIPT summary_evol.R
        succ=os.system("R --vanilla < ./summary_evol.R > debug_R")
        if succ==0:
            print "Funcion summary_evol ejecutada con exito para el lenguage... "+idioma
def measuring(idiomas): """ Create some graphs following the research presented by Jakob Voss in his paper Mesuring Wikipedia (ISSI 2005) @type idiomas: list of strings @param idiomas: list of strings indicating the language versions to process """ ## Generates some graphics reproducing those in Measuring Wikipedia article filenames=["total_edits.data", "noannons_edits.data", "annon_edits.data", "authors_per_article_desc.data", "articles_per_logged_author_desc.data", "articles_per_anonymous_author_desc.data"] filenames_out=["total_edits_per_author.png", "total_edits_per_noannon_author.png", "total_edits_per_annon_author.png", "diff_authors_per_article_descending.png", "diff_articles_per_logged_author_descending.png", "diff_articles_per_anonymous_author_descending.png"] for idioma in idiomas: acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub") ## acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_pages") #Combined evolution graphics #ALL THESE GRAPHICS ARE ALREADY GENERATED BY ERIK ZATCHE'S OFFICIAL PERL SCRIPTS #Database size #Total number of words #Total number of internal links #Number of articles (including redirects) #Number of active wikipedians (more than 5 contributions in a given month) #Number of very active wikipedians (more than 100 contributions in a given month) #Namespace size #OK, it is generated in summary_evol() method #Evolution in time of article size (histogram) #IDEA: Download page.sql files for a language for each semester period #Number of distinct authors per article (descending sorted graphic) #Already generated in summary_evol, ONLY NEED TO SORT AND ADJUST IN GNU R diffAuthorperArticle=dbaccess.query_SQL(acceso[1], "page_id, theCount", "stats_Article_NoAnnons_page_id_"+idioma) #Number of distinct articles per author (descending sorted graphic) #Idem as in the previous case diffArticlesNoann=dbaccess.query_SQL(acceso[1], "author, theCount", 
"stats_Article_NoAnnons_author_"+idioma) diffArticlesAnn=dbaccess.query_SQL(acceso[1], "author_text, theCount", "stats_Article_Annons_author_text_"+idioma) data=__tup_to_list(diffAuthorperArticle) lisdiffauthorartic=data.pop() data=__tup_to_list(diffArticlesNoann) lisdiffarticleaut=data.pop() data=__tup_to_list(diffArticlesAnn,2) lisdiffarticleannon=data.pop() ## Ordenamos los resultados para que se puedan ajustar a una Power Law lisdiffauthorartic.sort(reverse=True) lisdiffarticleaut.sort(reverse=True) lisdiffarticleannon.sort(reverse=True) #Number of edtis per author #Retrieve results from database #We have already created GINI graphics for this parameter #ALSO AVAILABLE DATABASE TABLES WITH EVOLUTION IN TIME OF THIS PARAMETER tcnoann=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_NoAnnons_author_"+idioma) tcauthor=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_author_"+idioma) tc_ann=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_Annons_author_text_"+idioma) data=__tup_to_list(tcnoann) listcnoann=data.pop() data=__tup_to_list(tcauthor) listcauthors=data.pop() #BTW, we are also obtaining but not using the IP adresses of annon users data=__tup_to_list(tc_ann,2) listcann=data.pop() ## Arranging results in a decreasing way to adjust them to a power law listcnoann.sort(reverse=True) listcauthors.sort(reverse=True) listcann.sort(reverse=True) #Ingoing and outgoing number of links per article #STILL TO BE DEVELOPED #NEED TO FIRST IDENTIFY LINKS FOR A GIVEN ARTICLE IN THE DATABASE #LINKS TABLES MAY HELP, but in these dump versions they are all empty!!! 
#BROKEN LINKS also need to be considered dbaccess.close_Connection(acceso[0]) dataList=[listcauthors, listcnoann, listcann, lisdiffauthorartic, lisdiffarticleaut, lisdiffarticleannon] for filename, data in zip (filenames, dataList): if(filename.find('date')!=-1): __makeDatesFile(idioma, filename, data) else: __makeDataFile(idioma, filename, data) #Pass data filenames to the GNU R script with a file f=open("./data/measuring_files_names.data",'w') for line in filenames: f.write("./graphics/"+idioma+"/data/"+line+"\n") f.close() #Idem with graphic output filenames f=open("./data/measuring_files_out.data",'w') for line in filenames_out: f.write("./graphics/"+idioma+"/"+line+"\n") f.close() #CALL GNU R SCRIPT measuring_Wiki.R succ=os.system("R --vanilla < ./measuring_Wiki.R > debug_R") if succ==0: print "Funcion measuring_Wiki.R ejecutada con exito para el lenguage... "+idioma
def decompress (self): """ Decompress the DB dumps into MySQL """ ##TODO: Ad-hoc acuerdate de quitar esto POR DIOSSS ##self.filename="mtwiki-latest-pages-meta-history.xml.7z" if self.dumptype=="research": program="dump_sax_research.py" elif self.dumptype=="standard": program="dump_sax.py" else: print "Error! Unexpected type of dump received" return -1 self.filename=self.filenameTemplate.safe_substitute(language=self.language,file=self.files[0]) #Then we call our parser "dump_sax_research.py" to load data into MySQL command_7z="7za e -so dumps/"+self.filename+" | "+"python "+program+\ " -u "+self.msqlu+" -p "+self.msqlp+" -d "+"wx_"+self.language+"_"+self.dumptype+\ " --log "+self.language+".log" success=os.system(command_7z) if success == 0: print "DB "+"wx_"+self.language+\ self.dumptype+" successfully decompressed...\n\n" else: print "Error! There was an error trying to decompress database --> "+\ "wx_"+self.language+self.dumptype return -1 #Loading into MySQL other interesting tables directly provided in SQL format #SQL code to generate the tables is embedded in the SQL file itself ## for index in range(1,len(self.files)): ## self.filename=self.filenameTemplate.safe_substitute(language=self.language, file=self.files[index]) ## command_gzip="gzip -d dumps/"+self.filename ## command_mysql="mysql -u "+self.msqlu+" -p"+self.msqlp+\ ## " wx_"+self.language+"_"+self.dumptype+\ ## " < dumps/"+self.filename.rstrip(".gz") ## command_comp="gzip dumps/"+self.filename.rstrip(".gz") ## print "Decompressing "+self.filename+"..." ## success=os.system(command_gzip) ## if success==0: ## print "Loading "+self.filename.rstrip(".gz")+" into MySQL database..." ## success=os.system(command_mysql) ## if success==0: ## print "Compressing again "+self.filename.rstrip(".gz")+"..." 
## success=os.system(command_comp) ## if success!=0: ## print "Error compressing again "+self.filename.rstrip(".gz") ## return -1 ## else: ## print "Error loading "+self.filename.rstrip(".gz") ## return -1 ## else: ## print "Error decompressing "+self.filename ## return -1 print "Generating indexes for tables page and revision...\n" print "Depending on the dump size this may take a while...\n" acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu,\ self.msqlp, "wx_"+self.language+"_"+self.dumptype) #Generate adequate indexes and keys in tables page and revision print "Generating index for page_len...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE page ADD INDEX (page_len)") print "Modifying rev_timestamp to support DATETIME and creating index...\n" #dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision MODIFY rev_timestamp DATETIME") dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX timestamp (rev_timestamp)") print "Generating index for rev_page and rev_timestamp...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX page_timestamp(rev_page, rev_timestamp)") print "Generating index for rev_user and rev_timestamp...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)") print "Generating index for rev_user_text and timestamp...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)") dbaccess.close_Connection(acceso[0]) print "Database ready for quantitative analysis...\n" print "Let's go on... Cross your fingers... ;-) \n\n\n" return success
def community_contrib(idiomas): for idioma in idiomas: list_admins=test_admins.process_admins(idioma) num_admins=list_admins.pop() where_clause1=list_admins.pop() acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub") admins_ids=dbaccess.raw_query_SQL(acceso[1], "SELECT DISTINCT(author) FROM stats_"+idioma+" WHERE "+where_clause1+" LIMIT "+str(num_admins)) ## MONTAR WHERE CLAUSE CON ADMINS IDS list_admins_ids=[] for item in list_admins_ids: list_admins_ids.append(int(item[0])) where_clause2=test_admins.process_users_ids(list_admins_ids,idioma) edits_admin_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_NoAnnons_months_author_"+idioma+" ", where=where_clause2, group="year, month ", order="year, month") dates_admins=[] admins_contribs=[] for element in edits_admin_month: dates_admins.append(list(element[0:2])) admins_contribs.append(int(element[2])) ## PASAR A UN ARCHIVO PARA PLOT (FIG 2) ## RECUPERAMOS CONTRIBUCIONES TOTALES POR MESES total_edits_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, group="year, month ") dates_contribs=[] total_contribs=[] for element in total_edits_month: dates_contribs.append(list(element[0:2])) total_contribs.append(int(element[2])) ## DIVIDIR LA PRIMERA LISTA POR LA SEGUNDA perc_contribs_admins=[] for admin_contrib, total_contrib in zip(admins_contribs, total_contribs): perc_contribs_admins.append((float(admin_contrib)/total_contrib)) ## PASAR A UN ARCHIVO PARA PLOT (FIG 1) ## FIG 4 TOTAL EDITS MADE BY USERS WITH DIFFERENT EDIT LEVELS ## CREATE CLUSTER OF USERS IDENTIFIED BY CONTRIBUTIONS LEVEL ## 5 LEVELS: <100, 100-1K, 1K-5K, 5K-10K, >10K users_level1=[] users_level2=[] users_level3=[] users_level4=[] users_level5=[] level1=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount<=100") for userid in level1: 
users_level1.append(int(userid[0])) level2=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>100 AND theCount<=1000") for userid in level2: users_level2.append(int(userid[0])) level3=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>1000 AND theCount<=5000") for userid in level3: users_level3.append(int(userid[0])) level4=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>5000 AND theCount<=10000") for userid in level4: users_level4.append(int(userid[0])) level5=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>10000") for userid in level5: users_level5.append(int(userid[0])) where_clause_level1=test_admins.process_users_ids(users_level1,idioma) where_clause_level2=test_admins.process_users_ids(users_level2,idioma) where_clause_level3=test_admins.process_users_ids(users_level3,idioma) where_clause_level4=test_admins.process_users_ids(users_level4,idioma) where_clause_level5=test_admins.process_users_ids(users_level5,idioma) contribs_level1_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level1, group="year, month") contribs_level2_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level2, group="year, month") contribs_level3_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level3, group="year, month") contribs_level4_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level4, group="year, month") contribs_level5_month=dbaccess.query_SQL(acceso[1], select="year, month, 
SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level5, group="year, month") list_level1=__process_contribs(contribs_level1_month, total_contribs) perc_contribs_level1=list_level1.pop() contribs_level1=list_level1.pop() dates_level1=list_level1.pop() list_level2=__process_contribs(contribs_level2_month, total_contribs) perc_contribs_level2=list_level2.pop() contribs_level2=list_level2.pop() dates_level2=list_level2.pop() list_level3=__process_contribs(contribs_level3_month, total_contribs) perc_contribs_level3=list_level3.pop() contribs_level3=list_level3.pop() dates_level3=list_level1.pop() list_level4=__process_contribs(contribs_level4_month, total_contribs) perc_contribs_level4=list_level4.pop() contribs_level4=list_level4.pop() dates_level4=list_level4.pop() list_level5=__process_contribs(contribs_level5_month, total_contribs) perc_contribs_level5=list_level5.pop() contribs_level5=list_level5.pop() dates_level5=list_level5.pop() ## FIG 5 PLOT 4b ## FIG 6 AVERAGE NUMBER OF EDITS PER USER PER MONTH FOR EACH LEVEL ## RETRIEVE NUM USERS FOR EACH MONTH IN EACH LEVEL WHO HAVE MADE AT LEAST ONE CONTRIB num_users_1_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level1, group="year, month") num_users_2_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level2, group="year, month") num_users_3_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level3, group="year, month") num_users_4_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level4, group="year, month") num_users_5_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, 
where=where_clause_level5, group="year, month") list_users_1_month=[] for element in num_users_1_month: list_users_1_month.append(int(element[0])) list_users_2_month=[] for element in num_users_2_month: list_users_2_month.append(int(element[0])) list_users_3_month=[] for element in num_users_3_month: list_users_3_month.append(int(element[0])) list_users_4_month=[] for element in num_users_4_month: list_users_4_month.append(int(element[0])) list_users_5_month=[] for element in num_users_5_month: list_users_5_month.append(int(element[0])) ## DIVIDE TOT NUM CONTRIBS PER LEVEL PER MONTH BY THE NUM USERS FOR EACH MONTH IN EACH LEVEL avg_contribs_user_1_month=[] for contribmonth, usermonth in zip(contribs_level1, list_users_1_month): avg_contribs_user_1_month.append(float(contribmonth)/usermonth) avg_contribs_user_2_month=[] for contribmonth, usermonth in zip(contribs_level2, list_users_2_month): avg_contribs_user_2_month.append(float(contribmonth)/usermonth) avg_contribs_user_3_month=[] for contribmonth, usermonth in zip(contribs_level3, list_users_3_month): avg_contribs_user_3_month.append(float(contribmonth)/usermonth) avg_contribs_user_4_month=[] for contribmonth, usermonth in zip(contribs_level4, list_users_4_month): avg_contribs_user_4_month.append(float(contribmonth)/usermonth) avg_contribs_user_5_month=[] for contribmonth, usermonth in zip(contribs_level5, list_users_5_month): avg_contribs_user_5_month.append(float(contribmonth)/usermonth) ## FIG 7 POPULATION GROWTH FOR EACH USER GROUP ## SIMPLY RETRIEVE list_users_X_month ## FIG 8 % OF TOTAL POPULATION OF EACH USER GROUP perc_users_1_months=[] perc_users_2_months=[] perc_users_3_months=[] perc_users_4_months=[] perc_users_5_months=[] for e1, e2, e3, e4, e5 in zip(list_users_1_month,list_users_2_month,list_users_3_month,list_users_4_month,list_users_5_month): total_users_month=e1+e2+e3+e4+e5 perc_users_1_months.append((float(e1)/total_users_month)) perc_users_2_months.append((float(e2)/total_users_month)) 
perc_users_3_months.append((float(e3)/total_users_month)) perc_users_4_months.append((float(e4)/total_users_month)) perc_users_5_months.append((float(e5)/total_users_month)) ############################### ## FINAL DUTIES, TRANSFER DATA AND EXECUTE R SCRIPT filenames=["dates_admin_contrib.data","contribs_admins_months.data", "perc_contribs_months.data","dates_level1_contrib.data", "contribs_level1_months.data", "perc_contribs_level1_months.data", "dates_level2_contrib.data", "contribs_level2_months.data", "perc_contribs_level2_months.data","dates_level3_contrib.data", "contribs_level3_months.data", "perc_contribs_level3_months.data","dates_level4_contrib.data", "contribs_level4_months.data", "perc_contribs_level4_months.data","dates_level5_contrib.data" ,"contribs_level5_months.data", "perc_contribs_level5_months.data", "avg_contribs_user_1_month.data", "avg_contribs_user_2_month.data", "avg_contribs_user_3_month.data", "avg_contribs_user_4_month.data", "avg_contribs_user_5_month.data", "users_1_month.data", "users_2_month.data", "users_3_month.data", "users_4_month.data", "users_5_month.data", "perc_users_1_months.data","perc_users_2_months.data", "perc_users_3_months.data", "perc_users_4_months.data", "perc_users_5_months.data"] filenames_out=["Figure1.png", "Figure_2.png", "Figure4.png", "Figure5.png", "Figure6.png", "Figure7.png", "Figure8.png"] dataList=[dates_contribs, admins_contribs, perc_contribs_admins, dates_level1, contribs_level1, perc_contribs_level1,dates_level2, contribs_level2, perc_contribs_level2,dates_level3, contribs_level3, perc_contribs_level3, dates_level4, contribs_level4, perc_contribs_level4,dates_level5, contribs_level5, perc_contribs_level5, avg_contribs_user_1_month, avg_contribs_user_2_month, avg_contribs_user_3_month, avg_contribs_user_4_month, avg_contribs_user_5_month, list_users_1_month, list_users_2_month, list_users_3_month, list_users_4_month, list_users_5_month, perc_users_1_months, perc_users_2_months, perc_users_3_months, 
perc_users_4_months, perc_users_5_months] for filename, data in zip (filenames, dataList): if(filename.find('date')!=-1): f=open("./graphics/"+idioma+"/data/"+filename, 'w') for adate in data: f.writelines(str(adate)+"\n") f.close() else: __makeDataFile(idioma, filename, data) #Pass data filenames to the GNU R script with a file f=open("./data/community_contrib_files_names.data",'w') for line in filenames: f.write("./graphics/"+idioma+"/data/"+line+"\n") f.close() #Idem with graphic output filenames f=open("./data/community_contrib_files_out.data",'w') for line in filenames_out: f.write("./graphics/"+idioma+"/"+line+"\n") f.close() #CALL GNU R SCRIPT measuring_Wiki.R succ=os.system("R --vanilla < ./community_contrib.R > debug_R") if succ==0: print "Funcion community_contrib.R ejecutada con exito para el lenguage... "+idioma