Example #1
0
    def performAnalysis(self):
        """
        Plot Gini coefficient vs. number of registered authors.

        Opens a connection to the "[language]_stub" database, creates the 2D
        and 3D plotting helpers and draws the "authors-Gini" graphic from a
        hard-coded series of ten value pairs (sorted in ascending order).
        The connection is closed when plotting ends, whether or not it
        succeeded.
        """
        db_name = self.language + "_stub"
        # NOTE(review): host, user and password are hard-coded here; they
        # probably belong in configuration.
        self.acceso = dbaccess.get_Connection("localhost", 3306, "root",
                                              "phoenix", db_name)
        try:
            # Plotting helpers are kept on the instance so that the other
            # methods of the class can use them.
            self.simpleGraph = graphic2D(self.filePath)
            self.graph3D = graphic3D(self.filePath, self.dataPath)
            print("Starting analysis on DB " + db_name + "\n")

            # One pair per Wikipedia; the labels passed below suggest the
            # first value is the Gini coefficient (%) and the second the
            # (log) number of authors -- TODO confirm against createGraphic.
            authorsGini = sorted([
                (95.9677, 4.046), (95.7015, 4.304), (96.2223, 4.363),
                (95.7104, 4.395), (96.3844, 4.407), (92.4691, 4.528),
                (95.0077, 4.603), (95.0071, 4.7298), (93.785, 5.051),
                (93.6076, 5.888),
            ])

            self.simpleGraph.createGraphic(
                "authors-Gini", (authorsGini, ), "Gini coeff. (%)",
                "Number of different authors (log)",
                "Gini coeff. vs. number of registered authors in the top-ten Wikipedias."
            )
        finally:
            # Release the connection even if one of the plotting calls fails.
            dbaccess.close_Connection(self.acceso[0])
        print("This is finished")
Example #2
0
 def infoPages(self):
     """
     Generate the statistics per article (page).

     For every namespace in self.nspaces the table "[nspace]_[language]" is
     processed, in this order:
       1. total number of revisions per page;
       2. total number of different editors per page;
       3. total number of revisions per page per month, quarter and week;
       4. total number of different editors per page per month, quarter
          and week.

     The connection is stored in self.acceso and is always closed on exit,
     even when one of the helpers raises.
     """
     self.acceso = dbaccess.get_Connection(
         "localhost", 3306, self.conf.msqlu, self.conf.msqlp,
         "wx_" + self.language + "_" + self.conf.dumptype)
     try:
         cursor = self.acceso[1]

         # Local configuration
         target = "page_id"
         # Currently we are only interested in months, quarters and weeks
         intervals = ["months", "quarters", "weeks"]
         tables = [nspace + "_" + self.language for nspace in self.nspaces]

         # Total num of revisions per page
         for table in tables:
             self.__total_rev(cursor, table, target)

         # Total number of different editors per page
         for table in tables:
             self.__total_rev_diff(cursor, table, target)

         # Total number of revisions per page for several time intervals
         for table in tables:
             for interval in intervals:
                 self.__total_rev_time(cursor, interval, table, target)

         # Total number of different editors per page, per time interval
         for table in tables:
             for interval in intervals:
                 self.__total_rev_diff_time(cursor, interval, table, target)
     finally:
         # Close DB connection, also on failure
         dbaccess.close_Connection(self.acceso[0])
Example #3
0
 def generalStatistics(self):
     """
     Compute the views with general statistics and overall information.

     For all namespaces (official and artificial) the statistics helper is
     run on the table "[nspace]_[language]". According to the original
     notes, the resulting views cover:
       * _overall_statistics1_months: total num of pages with at least one
         edit in that month, total number of contribs and total num of
         users who made at least 1 edit in that month (alive_users);
       * parameters from Wikistats by Erik Zachte -- wikipedians
         (contributors, active and very active wikipedians) and articles
         (new articles per day, edits per article, bytes per article,
         % of articles over 0.5k and over 2k).
         WARNING: readable contents are not being filtered out yet;
       * total size of contribs per month;
       * size of pages and number of different authors who edited them.

     The connection is stored in self.acceso and is always closed on exit.
     """
     # Get new DB connection
     self.acceso = dbaccess.get_Connection(
         "localhost", 3306, self.conf.msqlu, self.conf.msqlp,
         "wx_" + self.language + "_" + self.conf.dumptype)
     try:
         cursor = self.acceso[1]
         # General statistics, one namespace table at a time
         for nspace in self.nspaces:
             self.__gral_stats(cursor, nspace + "_" + self.language)
     finally:
         # Close DB connection, even if a statistics query failed
         dbaccess.close_Connection(self.acceso[0])
Example #4
0
    def prepro_pagelen(self):
        """
        Preprocessing tables for evolution of page length over time.

        For every language in self.languages (database
        "wx_[language]wiki_research") this creates:
          * view page_main_nored: pages in main namespace excluding redirects;
          * view rev_main_nored: revisions of those pages;
          * tables max_rev_YYYY: latest revision id of each such page among
            the revisions made before year YYYY, for every YYYY from the
            year after the first revision up to 2008.

        self.language, self.dbname, self.access, self.minyear, self.years and
        self.year are overwritten while iterating. The connection of each
        language is closed even if one of its statements fails.
        """
        for self.language in self.languages:
            self.dbname = "wx_" + self.language + "wiki_research"
            self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            try:
                cursor = self.access[1]

                # VIEW page_main_nored (pages in main nspace excluding redirects)
                dbaccess.raw_query_SQL(
                    cursor,
                    "create or replace view page_main_nored as "
                    "(select page_id from page where page_namespace=0 and page_is_redirect=0)")

                # VIEW rev_main_nored (revisions in main nspace in all pages, excluding redirects)
                dbaccess.raw_query_SQL(
                    cursor,
                    "create or replace view rev_main_nored as ("
                    "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "
                    "(select page_id from page_main_nored))")

                # TABLES max_rev_YYYY (latest revision for each page in main nspace, up to year YYYY)
                self.minyear = dbaccess.raw_query_SQL(cursor, "select min(year(rev_timestamp)) from revision")
                # NOTE(review): the upper bound 2009 is hard-coded.
                self.years = range(int(self.minyear[0][0]) + 1, 2009)
                for self.year in self.years:
                    year = str(self.year)
                    table = "max_rev_" + year
                    dbaccess.raw_query_SQL(cursor, "drop table if exists " + table)
                    dbaccess.raw_query_SQL(
                        cursor,
                        "create table " + table +
                        " as (select max(rev_id) as max_id, rev_page from rev_main_nored "
                        "where year(rev_timestamp)<" + year + " group by rev_page)")
                    dbaccess.raw_query_SQL(cursor, "alter table " + table + " add primary key (max_id)")
                    dbaccess.raw_query_SQL(cursor, "alter table " + table + " add index (rev_page)")
            finally:
                # Do not leak the connection when a statement fails
                dbaccess.close_Connection(self.access[0])
Example #5
0
 def generalStatistics(self):
     """
     Compute the views with general statistics and overall information.

     For all namespaces (official and artificial) the statistics helper is
     run on the table "[nspace]_[language]". According to the original
     notes, the resulting views cover:
       * _overall_statistics1_months: total num of pages with at least one
         edit in that month, total number of contribs and total num of
         users who made at least 1 edit in that month (alive_users);
       * parameters from Wikistats by Erik Zachte -- wikipedians
         (contributors, active and very active wikipedians) and articles
         (new articles per day, edits per article, bytes per article,
         % of articles over 0.5k and over 2k).
         WARNING: readable contents are not being filtered out yet;
       * total size of contribs per month;
       * size of pages and number of different authors who edited them.

     The connection is stored in self.acceso and is always closed on exit.
     """
     # Get new DB connection
     self.acceso = dbaccess.get_Connection(
         "localhost", 3306, self.conf.msqlu, self.conf.msqlp,
         "wx_" + self.language + "_" + self.conf.dumptype)
     try:
         cursor = self.acceso[1]
         # General statistics, one namespace table at a time
         for nspace in self.nspaces:
             self.__gral_stats(cursor, nspace + "_" + self.language)
     finally:
         # Close DB connection, even if a statistics query failed
         dbaccess.close_Connection(self.acceso[0])
Example #6
0
def contributions(idiomas):
    """
    Create some graphs and files with statistical results about authors contributions

    For each language two tables are read from the "[idioma]_stub" database
    (stats_Contrib_NoAnnons_author_[idioma] and stats_Contrib_author_[idioma])
    and a Lorenz curve is drawn for each into
    graphics/[idioma]/gini_TContrib_NoAnn_[idioma].png and
    graphics/[idioma]/gini_TContrib_[idioma].png.

    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    for idioma in idiomas:
        # NOTE(review): host and credentials are hard-coded.
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma + "_stub")
        try:
            tcnoann = dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_NoAnnons_author_" + idioma)
            tcauthor = dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_author_" + idioma)
        finally:
            # Release the connection even if one of the queries fails
            dbaccess.close_Connection(acceso[0])

        # Only the last list returned by __tup_to_list is plotted
        # (presumably the y values -- TODO confirm against __tup_to_list).
        listay_tcnoann = __tup_to_list(tcnoann).pop()
        listay_tcauthor = __tup_to_list(tcauthor).pop()

        carpeta = "graphics/" + idioma + "/"
        r.png(carpeta + "gini_TContrib_NoAnn_" + idioma + ".png")
        __lorenz_Curve(listay_tcnoann)
        r.png(carpeta + "gini_TContrib_" + idioma + ".png")
        __lorenz_Curve(listay_tcauthor)
Example #7
0
 def overall(self):
     """
     Preprocessing views and tables for redirects and talk pages.

     For every language in self.languages (database
     "wx_[language]wiki_research") this creates:
       * views page_redirect / rev_redirect: redirect pages of the main
         namespace and their revisions;
       * views page_talk / rev_talk: pages of namespace 1 and their
         revisions;
       * tables max_rev_talk_YYYY: latest revision id of each talk page among
         the revisions made before year YYYY, for every YYYY from the year
         after the first revision up to 2008.

     self.language, self.dbname, self.access, self.minyear, self.years and
     self.year are overwritten while iterating. The connection of each
     language is closed even if one of its statements fails.
     """
     columns = "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "
     for self.language in self.languages:
         self.dbname = "wx_" + self.language + "wiki_research"
         self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         try:
             cursor = self.access[1]

             # Redirects in main namespace and their revisions
             dbaccess.raw_query_SQL(
                 cursor,
                 "create or replace view page_redirect as "
                 "(select page_id from page where page_namespace=0 and page_is_redirect=1)")
             dbaccess.raw_query_SQL(
                 cursor,
                 "create or replace view rev_redirect as (" + columns +
                 "(select page_id from page_redirect))")

             # Talk pages (namespace 1) and their revisions
             dbaccess.raw_query_SQL(
                 cursor,
                 "create or replace view page_talk as "
                 "(select page_id from page where page_namespace=1)")
             dbaccess.raw_query_SQL(
                 cursor,
                 "create or replace view rev_talk as (" + columns +
                 "(select page_id from page_talk))")

             # One max_rev_talk_YYYY table per year
             self.minyear = dbaccess.raw_query_SQL(cursor, "select min(year(rev_timestamp)) from revision")
             # NOTE(review): the upper bound 2009 is hard-coded.
             self.years = range(int(self.minyear[0][0]) + 1, 2009)
             for self.year in self.years:
                 year = str(self.year)
                 table = "max_rev_talk_" + year
                 dbaccess.raw_query_SQL(cursor, "drop table if exists " + table)
                 dbaccess.raw_query_SQL(
                     cursor,
                     "create table " + table +
                     " as (select max(rev_id) as max_id, rev_page from rev_talk "
                     "where year(rev_timestamp)<" + year + " group by rev_page)")
                 dbaccess.raw_query_SQL(cursor, "alter table " + table + " add primary key (max_id)")
                 dbaccess.raw_query_SQL(cursor, "alter table " + table + " add index (rev_page)")
         finally:
             # Do not leak the connection when a statement fails
             dbaccess.close_Connection(self.access[0])
Example #8
0
    def infoAuthors(self):
        """
        Generate the statistics per user (author).

        For every namespace in self.nspaces the table "[nspace]_[language]"
        is processed, in this order:
          1. number of total revisions per author ID;
          2. different articles edited per user;
          3. total num of articles started per author (the first revision of
             an article is considered its beginning);
          4. total number of revisions per author per month, quarter, week;
          5. num of different articles revised per author per month,
             quarter, week;
          6. num of different articles initiated per author per month,
             quarter, week.

        The connection is stored in self.acceso and is always closed on exit,
        even when one of the helpers raises.
        """
        self.acceso = dbaccess.get_Connection(
            "localhost", 3306, self.conf.msqlu, self.conf.msqlp,
            "wx_" + self.language + "_" + self.conf.dumptype)
        try:
            cursor = self.acceso[1]

            # Local configuration: retrieving info for authors
            target = "author"
            # Intervals might be days, weeks, months, quarters, years;
            # currently only months, quarters and weeks are of interest.
            intervals = ["months", "quarters", "weeks"]
            tables = [nspace + "_" + self.language for nspace in self.nspaces]

            # Number of total revisions per author ID
            for table in tables:
                self.__total_rev(cursor, table, target)

            # Different articles edited per user
            for table in tables:
                self.__total_rev_diff(cursor, table, target)

            # Total num of articles started per author
            for table in tables:
                self.__total_page_init_author(cursor, table)

            # Total number of revisions per author for several time intervals
            for table in tables:
                for interval in intervals:
                    self.__total_rev_time(cursor, interval, table, target)

            # Num of different articles revised per author, per time interval
            for table in tables:
                for interval in intervals:
                    self.__total_rev_diff_time(cursor, interval, table, target)

            # Num of different articles initiated per author, per time interval
            for table in tables:
                for interval in intervals:
                    self.__total_page_init_author_time(cursor, interval, table)

            # (Birth and death analysis for the author community: not
            # implemented here.)
        finally:
            # Close DB connection, also on failure
            dbaccess.close_Connection(self.acceso[0])
Example #9
0
 def __init__(self, language="furwiki", dumptype="research", msqlu="", msqlp=""):
     """
     Prepare the local MySQL database that will receive a Wikipedia dump.

     Creates the ./dumps directory if it does not exist, creates the
     database "wx_[language]_[dumptype]", loads the table definitions from
     tables_[dumptype].sql through the mysql command-line client and, if
     that succeeded, sets the max_rows / avg_row_length options of the page
     and revision tables (and of the text table for "standard" dumps).

     language -- language version to download (e.g. "furwiki")
     dumptype -- type of dump, "research" or "standard"
     msqlu    -- MySQL username for accessing and editing the DB
     msqlp    -- MySQL password

     Raises ValueError when the username or password is empty or when
     dumptype is not one of the two supported values. A failure of the
     mysql client is only reported on stdout.
     """
     self.language = language    # language to download
     self.dumptype = dumptype    # type of dump
     self.files = ["pages-meta-history.xml.7z", "redirect.sql.gz", "page_restrictions.sql.gz",
                   "user_groups.sql.gz", "logging.sql.gz", "interwiki.sql.gz", "langlinks.sql.gz",
                   "externallinks.sql.gz", "templatelinks.sql.gz", "imagelinks.sql.gz",
                   "categorylinks.sql.gz", "pagelinks.sql.gz", "oldimage.sql.gz", "image.sql.gz"]
     self.filename = ""
     # Dump's filename in Wikimedia's server
     self.filenameTemplate = string.Template("""$language-latest-$file""")
     # URL to download the file
     self.urld = ""
     self.urldTemplate = string.Template("""http://download.wikimedia.org/$language/latest/$language-latest-$file""")

     # Fail early with a clear error: continuing without credentials used to
     # end in an AttributeError on self.msqlu a few lines below.
     if msqlu == "" or msqlp == "":
         raise ValueError("Error initializing DB dump object. "
                          "You must provide a valid MySQL username and password")
     # Any other dumptype left the shell command undefined further down.
     if dumptype not in ("research", "standard"):
         raise ValueError("Unsupported dumptype: " + str(dumptype))
     self.msqlu = msqlu    # MySQL username for accessing and editing the DB
     self.msqlp = msqlp    # MySQL password

     # Create /dumps directory if it does not exist yet
     if "dumps" not in os.listdir("./"):
         os.makedirs("./dumps")

     # Initialize DB in MySQL: create DB and tables definitions
     print("Initializing DB for --> " + self.language + "\n")
     db_name = "wx_" + self.language + "_" + self.dumptype
     acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp, "mysql")
     try:
         dbaccess.createDB_SQL(acceso[1], db_name)
     finally:
         # The administrative connection is not needed any more; closing it
         # here avoids leaking it on the success path.
         dbaccess.close_Connection(acceso[0])

     # NOTE(review): the password is passed on the command line and the
     # command goes through the shell; values are not escaped.
     command = ("mysql -u " + self.msqlu + " -p" + self.msqlp + " " + db_name +
                " < tables_" + self.dumptype + ".sql > debug_mysql.log")
     if os.system(command) != 0:
         print("Error! There was a problem initializing definitions for DB tables")
         return

     acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp, db_name)
     try:
         tables = ["page", "revision"]
         if self.dumptype == "standard":
             # Only standard dumps get the text table altered
             tables.append("text")
         for table in tables:
             dbaccess.raw_query_SQL(
                 acceso[1],
                 "alter table " + table + " max_rows = 200000000000 avg_row_length = 50")
     finally:
         dbaccess.close_Connection(acceso[0])
Example #10
0
 def __init__(self, language="furwiki", dumptype="research", msqlu="", msqlp=""):
     """
     Prepare the local MySQL database that will receive a Wikipedia dump.

     Creates the ./dumps directory if it does not exist, creates the
     database "wx_[language]_[dumptype]", loads the table definitions from
     tables_[dumptype].sql through the mysql command-line client and, if
     that succeeded, sets the max_rows / avg_row_length options of the page
     and revision tables (and of the text table for "standard" dumps).

     language -- language version to download (e.g. "furwiki")
     dumptype -- type of dump, "research" or "standard"
     msqlu    -- MySQL username for accessing and editing the DB
     msqlp    -- MySQL password

     Raises ValueError when the username or password is empty or when
     dumptype is not one of the two supported values. A failure of the
     mysql client is only reported on stdout.
     """
     self.language = language    # language to download
     self.dumptype = dumptype    # type of dump
     self.files = ["pages-meta-history.xml.7z", "redirect.sql.gz", "page_restrictions.sql.gz",
                   "user_groups.sql.gz", "logging.sql.gz", "interwiki.sql.gz", "langlinks.sql.gz",
                   "externallinks.sql.gz", "templatelinks.sql.gz", "imagelinks.sql.gz",
                   "categorylinks.sql.gz", "pagelinks.sql.gz", "oldimage.sql.gz", "image.sql.gz"]
     self.filename = ""
     # Dump's filename in Wikimedia's server
     self.filenameTemplate = string.Template("""$language-latest-$file""")
     # URL to download the file
     self.urld = ""
     self.urldTemplate = string.Template("""http://download.wikimedia.org/$language/latest/$language-latest-$file""")

     # Fail early with a clear error: continuing without credentials used to
     # end in an AttributeError on self.msqlu a few lines below.
     if msqlu == "" or msqlp == "":
         raise ValueError("Error initializing DB dump object. "
                          "You must provide a valid MySQL username and password")
     # Any other dumptype left the shell command undefined further down.
     if dumptype not in ("research", "standard"):
         raise ValueError("Unsupported dumptype: " + str(dumptype))
     self.msqlu = msqlu    # MySQL username for accessing and editing the DB
     self.msqlp = msqlp    # MySQL password

     # Create /dumps directory if it does not exist yet
     if "dumps" not in os.listdir("./"):
         os.makedirs("./dumps")

     # Initialize DB in MySQL: create DB and tables definitions
     print("Initializing DB for --> " + self.language + "\n")
     db_name = "wx_" + self.language + "_" + self.dumptype
     acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp, "mysql")
     try:
         dbaccess.createDB_SQL(acceso[1], db_name)
     finally:
         # The administrative connection is not needed any more; closing it
         # here avoids leaking it on the success path.
         dbaccess.close_Connection(acceso[0])

     # NOTE(review): the password is passed on the command line and the
     # command goes through the shell; values are not escaped.
     command = ("mysql -u " + self.msqlu + " -p" + self.msqlp + " " + db_name +
                " < tables_" + self.dumptype + ".sql > debug_mysql.log")
     if os.system(command) != 0:
         print("Error! There was a problem initializing definitions for DB tables")
         return

     acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp, db_name)
     try:
         tables = ["page", "revision"]
         if self.dumptype == "standard":
             # Only standard dumps get the text table altered
             tables.append("text")
         for table in tables:
             dbaccess.raw_query_SQL(
                 acceso[1],
                 "alter table " + table + " max_rows = 200000000000 avg_row_length = 50")
     finally:
         dbaccess.close_Connection(acceso[0])
Example #11
0
 def infoAuthors(self):
     """
     Generate the statistics per user (author).

     For every namespace in self.nspaces the table "[nspace]_[language]"
     is processed, in this order:
       1. number of total revisions per author ID;
       2. different articles edited per user;
       3. total num of articles started per author (the first revision of
          an article is considered its beginning);
       4. total number of revisions per author per month, quarter, week;
       5. num of different articles revised per author per month,
          quarter, week;
       6. num of different articles initiated per author per month,
          quarter, week.

     The connection is stored in self.acceso and is always closed on exit,
     even when one of the helpers raises.
     """
     self.acceso = dbaccess.get_Connection(
         "localhost", 3306, self.conf.msqlu, self.conf.msqlp,
         "wx_" + self.language + "_" + self.conf.dumptype)
     try:
         cursor = self.acceso[1]

         # Local configuration: retrieving info for authors
         target = "author"
         # Intervals might be days, weeks, months, quarters, years;
         # currently only months, quarters and weeks are of interest.
         intervals = ["months", "quarters", "weeks"]
         tables = [nspace + "_" + self.language for nspace in self.nspaces]

         # Number of total revisions per author ID
         for table in tables:
             self.__total_rev(cursor, table, target)

         # Different articles edited per user
         for table in tables:
             self.__total_rev_diff(cursor, table, target)

         # Total num of articles started per author
         for table in tables:
             self.__total_page_init_author(cursor, table)

         # Total number of revisions per author for several time intervals
         for table in tables:
             for interval in intervals:
                 self.__total_rev_time(cursor, interval, table, target)

         # Num of different articles revised per author, per time interval
         for table in tables:
             for interval in intervals:
                 self.__total_rev_diff_time(cursor, interval, table, target)

         # Num of different articles initiated per author, per time interval
         for table in tables:
             for interval in intervals:
                 self.__total_page_init_author_time(cursor, interval, table)

         # (Birth and death analysis for the author community: not
         # implemented here.)
     finally:
         # Close DB connection, also on failure
         dbaccess.close_Connection(self.acceso[0])
Example #12
0
 def infoContents(self):
     """
     Contents analysis.

     For all namespaces (official and artificial) the content-evolution
     helper is run on the table "[nspace]_[language]". According to the
     original notes it covers:
       * evolution in time of the length of user contributions (per month;
         per quarter);
       * evolution in time of the length of pages (per month; per quarter
         supported but commented out).

     The connection is stored in self.acceso and is always closed on exit.
     """
     # Get DB connection
     self.acceso = dbaccess.get_Connection(
         "localhost", 3306, self.conf.msqlu, self.conf.msqlp,
         "wx_" + self.language + "_" + self.conf.dumptype)
     try:
         cursor = self.acceso[1]
         for nspace in self.nspaces:
             self.__content_evolution(cursor, nspace + "_" + self.language)
     finally:
         # Close the connection even if the analysis of a namespace failed
         dbaccess.close_Connection(self.acceso[0])
Example #13
0
 def infoContents(self):
     """
     Contents analysis.

     For all namespaces (official and artificial) the content-evolution
     helper is run on the table "[nspace]_[language]". According to the
     original notes it covers:
       * evolution in time of the length of user contributions (per month;
         per quarter);
       * evolution in time of the length of pages (per month; per quarter
         supported but commented out).

     The connection is stored in self.acceso and is always closed on exit.
     """
     # Get DB connection
     self.acceso = dbaccess.get_Connection(
         "localhost", 3306, self.conf.msqlu, self.conf.msqlp,
         "wx_" + self.language + "_" + self.conf.dumptype)
     try:
         cursor = self.acceso[1]
         for nspace in self.nspaces:
             self.__content_evolution(cursor, nspace + "_" + self.language)
     finally:
         # Close the connection even if the analysis of a namespace failed
         dbaccess.close_Connection(self.acceso[0])
Example #14
0
 def cox_prop(self):
     """
     Creates intermediate files and tables for Cox-prop hazards analysis

     wkp_cox_prop_all.dat is (re)created with a CSV header; then, for every
     language in self.languages (database "wx_[language]wiki_research"):
       * table users_in_talk: distinct users with revisions in namespace 1;
       * table users_in_FAs: distinct users found in revision_FAs;
       * table time_range_cox: time_range_authors plus the in_talk / in_FAs
         0-1 flags, with min_ts / max_ts reduced to dates;
       * one line per user of time_range_cox is appended to the .dat file,
         skipping rev_user 0 and the users of the 'bot' group.

     self.language, self.dbname and self.access are overwritten while
     iterating. Connections and file handles are released even on error.
     """
     dat_path = "wkp_cox_prop_all.dat"

     # Initialize file header
     out = open(dat_path, 'w')
     try:
         out.write("Project,rev_user,min_ts,max_ts,in_talk,in_FAs\n")
     finally:
         out.close()

     for self.language in self.languages:
         self.dbname = "wx_" + self.language + "wiki_research"

         print("Starting language " + self.language + "\n")
         self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         try:
             cursor = self.access[1]

             # TABLE: Create table of users in talk pages
             dbaccess.raw_query_SQL(cursor, "drop table if exists users_in_talk")
             dbaccess.raw_query_SQL(
                 cursor,
                 "create table users_in_talk as (select distinct(rev_user) from revision "
                 "where rev_page in (select page_id from page where page_namespace=1))")
             dbaccess.raw_query_SQL(cursor, "alter table users_in_talk add primary key (rev_user)")

             # TABLE: Create table of users in FAs
             dbaccess.raw_query_SQL(cursor, "drop table if exists users_in_FAs")
             dbaccess.raw_query_SQL(
                 cursor,
                 "create table users_in_FAs as (select distinct(rev_user) from revision_FAs)")
             dbaccess.raw_query_SQL(cursor, "alter table users_in_FAs add primary key (rev_user)")

             # TABLE: MIX previous info with time_range_authors --> save
             # result in new table time_range_cox
             dbaccess.raw_query_SQL(cursor, "drop table if exists time_range_cox")
             dbaccess.raw_query_SQL(
                 cursor,
                 "create table time_range_cox as (select rev_user, "
                 "date(min_ts) as min_ts, date(max_ts) as max_ts, "
                 "case when rev_user in (select rev_user from users_in_talk) then 1 else 0 end as in_talk, "
                 "case when rev_user in (select rev_user from users_in_FAs) then 1 else 0 end as in_FAs "
                 "from time_range_authors)")

             # IN SYSTEM
             print("Interm. tables created proceeding to write out data..." + self.language + "\n")
             results = dbaccess.raw_query_SQL(
                 cursor,
                 "SELECT rev_user, min_ts, max_ts, in_talk, in_FAs "
                 "from time_range_cox "
                 " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
         finally:
             # Close DB connection
             dbaccess.close_Connection(self.access[0])

         out = open(dat_path, 'a')
         try:
             for result in results:
                 # Project,rev_user,"min_ts","max_ts",in_talk,in_FAs
                 out.write(self.language + "," + str(int(result[0])) +
                           ",\"" + str(result[1]) + "\",\"" + str(result[2]) + "\"," +
                           str(int(result[3])) + "," + str(int(result[4])) + "\n")
         finally:
             out.close()
         print("Finished all cox-prop tasks for " + self.language + "\n")
Example #15
0
 def test_funciones(self):
     """
     Ad-hoc check of the statistics helpers.

     Currently runs only the total-revisions helper for target "author" on
     the table "stats_nlwiki", using the database
     "wx_[language]_[dumptype]".
     """
     # The database name carries the same "_" separator between language and
     # dump type that the other methods connecting with self.conf use.
     self.acceso = dbaccess.get_Connection(
         "localhost", 3306, self.conf.msqlu, self.conf.msqlp,
         "wx_" + self.language + "_" + self.conf.dumptype)
     try:
         # Must be called through self: a bare __total_rev inside a class
         # body is name-mangled and looked up as a global, which fails.
         # Arguments are positional (cursor, table, target) like the other
         # calls to this helper.
         self.__total_rev(self.acceso[1], "stats_nlwiki", "author")
     finally:
         dbaccess.close_Connection(self.acceso[0])
Example #16
0
    def test_funciones(self):
        """
        Ad-hoc check of the statistics helpers.

        Currently runs only the total-revisions helper for target "author" on
        the table "stats_nlwiki", using the database
        "wx_[language]_[dumptype]".
        """
        # The database name carries the same "_" separator between language
        # and dump type that the other methods connecting with self.conf use.
        self.acceso = dbaccess.get_Connection(
            "localhost", 3306, self.conf.msqlu, self.conf.msqlp,
            "wx_" + self.language + "_" + self.conf.dumptype)
        try:
            # Must be called through self: a bare __total_rev inside a class
            # body is name-mangled and looked up as a global, which fails.
            # Arguments are positional (cursor, table, target) like the other
            # calls to this helper.
            self.__total_rev(self.acceso[1], "stats_nlwiki", "author")
        finally:
            dbaccess.close_Connection(self.acceso[0])
Example #17
0
 def ratios(self):
     """
     .dat files showing interesting descriptive ratios

     overall/data/author-pages.dat
         logged_authors, user_pages, user_pages/logged_authors, lang
     overall/data/articles-talk-ratio.dat
         articles, talk, talk/articles, lang

     Each file is truncated and given a tab-separated header first; one data
     line per language in self.languages is then appended. self.language,
     self.dbname, self.access and the query results (self.logged_authors,
     self.user_pages, self.articles, self.talk) are overwritten while
     iterating. Connections and file handles are released even on error.
     """
     authors_path = "overall/data/author-pages.dat"
     talk_path = "overall/data/articles-talk-ratio.dat"

     # FILE author-pages.dat: no. user pages / no. logged editors.
     # Header and data lines must go to the same file, otherwise the data
     # file has no header and keeps growing across runs.
     out = open(authors_path, 'w')
     try:
         out.write("logged_authors\tuser_pages\tratio\tlang\n")
     finally:
         out.close()
     for self.language in self.languages:
         self.dbname = "wx_" + self.language + "wiki_research"
         self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         try:
             # Obtain number of different logged authors
             self.logged_authors = dbaccess.raw_query_SQL(
                 self.access[1],
                 "select count(distinct(rev_user)) from "
                 "revision where rev_user!=0")
             # Obtain number of different user pages (nspace =2)
             self.user_pages = dbaccess.raw_query_SQL(
                 self.access[1],
                 "select count(distinct(page_id)) from "
                 "page where page_namespace=2")
         finally:
             dbaccess.close_Connection(self.access[0])
         # Writing data to file
         authors = self.logged_authors[0][0]
         pages = self.user_pages[0][0]
         out = open(authors_path, 'a')
         try:
             out.write(str(int(authors)) + "\t" + str(int(pages)) + "\t" +
                       str(float(pages) / float(authors)) + "\t" + self.language + "\n")
         finally:
             out.close()

     # FILE articles-talk-ratio.dat: no. talk pages / no. articles
     # (articles exclude redirects)
     out = open(talk_path, 'w')
     try:
         out.write("articles\ttalk\tratio\tlang\n")
     finally:
         out.close()
     for self.language in self.languages:
         self.dbname = "wx_" + self.language + "wiki_research"
         self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         try:
             # Obtain number of articles excluding redirects
             self.articles = dbaccess.raw_query_SQL(
                 self.access[1],
                 "select count(distinct(page_id)) from "
                 "page where page_namespace=0 and page_is_redirect=0")
             # Obtain number of talk pages
             self.talk = dbaccess.raw_query_SQL(
                 self.access[1],
                 "select count(distinct(page_id)) from "
                 "page where page_namespace=1")
         finally:
             dbaccess.close_Connection(self.access[0])
         # Writing data to file
         articles = self.articles[0][0]
         talk = self.talk[0][0]
         out = open(talk_path, 'a')
         try:
             out.write(str(int(articles)) + "\t" + str(int(talk)) + "\t" +
                       str(float(talk) / float(articles)) + "\t" + self.language + "\n")
         finally:
             out.close()
Example #18
0
    def overall(self):
        """
        Write author-pages.dat: user pages per logged author, per language.

        The file is (re)created in the current directory with a
        tab-separated header; for every language in self.languages (database
        "wx_[language]wiki_research") one line is appended with the number of
        distinct logged authors, the number of distinct pages in namespace 2,
        the ratio user_pages/logged_authors and the language.

        self.language, self.dbname, self.access, self.logged_authors and
        self.user_pages are overwritten while iterating. Connections and file
        handles are released even on error.
        """
        dat_path = "author-pages.dat"

        out = open(dat_path, 'w')
        try:
            out.write("logged_authors\tuser_pages\tratio\tlang\n")
        finally:
            out.close()

        for self.language in self.languages:
            self.dbname = "wx_" + self.language + "wiki_research"
            self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            try:
                # rev_user 0 is excluded from the count of logged authors
                self.logged_authors = dbaccess.raw_query_SQL(
                    self.access[1],
                    "select count(distinct(rev_user)) from "
                    "revision where rev_user!=0")
                self.user_pages = dbaccess.raw_query_SQL(
                    self.access[1],
                    "select count(distinct(page_id)) from "
                    "page where page_namespace=2")
            finally:
                dbaccess.close_Connection(self.access[0])

            authors = self.logged_authors[0][0]
            pages = self.user_pages[0][0]
            out = open(dat_path, 'a')
            try:
                out.write(str(int(authors)) + "\t" + str(int(pages)) + "\t" +
                          str(float(pages) / float(authors)) + "\t" + self.language + "\n")
            finally:
                out.close()
            print("Completed lang " + self.language + "\n")
Example #19
0
    def time_range(self):
        """
        Creates intermediate tables with time frame of editors activity

        For every language in self.languages (database
        "wx_[language]wiki_research"):
          * table user_revs: number of revisions of every logged author that
            is not in the 'bot' group (all namespaces);
          * table time_range_authors: first and last revision timestamp of
            every rev_user, plus num_revs taken from user_revs;
          * view revision_logged: rows of rev_main_nored made by logged,
            non-bot authors;
          * table time_range_users: first and last timestamp of every author
            in revision_logged.

        self.language, self.dbname and self.access are overwritten while
        iterating. The connection of each language is closed even if one of
        its statements fails.
        """
        for self.language in self.languages:
            self.dbname = "wx_" + self.language + "wiki_research"
            self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            try:
                cursor = self.access[1]

                ##### TIME RANGE FOR AUTHORS IN ALL NAMESPACES
                # TABLE: Total no. of revisions made by every logged author
                # NOTE(review): when user_revs already exists the CREATE is
                # skipped but the ALTER below still runs -- confirm this is
                # safe on re-runs.
                dbaccess.raw_query_SQL(
                    cursor,
                    "CREATE TABLE IF NOT EXISTS user_revs AS "
                    "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND "
                    "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
                dbaccess.raw_query_SQL(cursor, "ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")

                print("Created table user_revs for " + self.language + "wiki...\n")

                # TABLE: Min and max timestamp for every logged author + total num_revs
                dbaccess.raw_query_SQL(
                    cursor,
                    "CREATE TABLE IF NOT EXISTS time_range_authors AS "
                    "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM "
                    "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x "
                    "ORDER BY min_ts)")

                print("Created table time_range_authors for " + self.language + "wiki...\n")

                ##### TIME RANGE FOR AUTHORS IN MAIN ONLY
                print("Processing language " + self.language + "\n")
                # VIEW: Create view for filtering annons and bots
                # Filter from rev_main_nored revisions from logged authors only
                dbaccess.raw_query_SQL(
                    cursor,
                    "create or replace view revision_logged as (select * from rev_main_nored "
                    " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot') )")
                dbaccess.raw_query_SQL(cursor, "drop table if exists time_range_users")
                # TABLE: Intermediate table, storing for each logged author
                # the min and max ts in the system
                dbaccess.raw_query_SQL(
                    cursor,
                    "create table time_range_users as (SELECT rev_user, "
                    "min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision_logged group by rev_user)")
                dbaccess.raw_query_SQL(cursor, "alter table time_range_users add primary key (rev_user)")

                print("Created time_range_users for " + self.language + "\n")
            finally:
                # Close DB connection, also when a statement failed
                dbaccess.close_Connection(self.access[0])
Example #20
0
def comparative_contributions():
    listaidiomas=["dewiki", "jawiki", "frwiki", "plwiki", "nlwiki", "itwiki", "ptwiki", "eswiki", "svwiki"]
##    lista=["eswiki", "svwiki"]
    
    r.png("graphics/AAA/gini_comparative_top10.png")
    flag=0
    for idioma in listaidiomas:
        print "Starting comparative Gini analysis for language..."+idioma+"\n"
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        tcnoann=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_NoAnnons_author_"+idioma)
        dbaccess.close_Connection(acceso[0])
        data=__tup_to_list(tcnoann)
        listay_tcnoann=data.pop()
        listax=data.pop()
        if flag==0:
            _lorenz_Comp_Curves(listay_tcnoann,flag)
            flag=1
        else:
            _lorenz_Comp_Curves(listay_tcnoann,flag)
    r.dev_off()
    print "Comparative graphic for Gini curves finished!!"
Example #21
0
    def infoPages(self):
        """
        Generate statistics per article (target column "page_id").

        Opens a connection to the database wx_<language>_<dumptype> and then
        makes four passes over self.nspaces, in this order:
          1. total number of revisions per page
          2. total number of different editors per page
          3. revisions per page for each interval (months, quarters, weeks)
          4. different editors per page for each interval
        Each pass is completed for all namespaces before the next one starts.
        The connection is closed at the end.
        """
        dbname = "_".join(("wx", self.language, self.conf.dumptype))
        self.acceso = dbaccess.get_Connection("localhost", 3306,
                                              self.conf.msqlu, self.conf.msqlp,
                                              dbname)
        handle = self.acceso[1]

        #Local configuration
        target = "page_id"
        intervals = ["months", "quarters", "weeks"]
        #One "<namespace>_<language>" string per namespace, reused by all passes
        ns_langs = [nspace + "_" + self.language for nspace in self.nspaces]

        #Pass 1: total num of revisions per page
        for ns_lang in ns_langs:
            self.__total_rev(handle, ns_lang, target)

        #Pass 2: total number of different editors per page
        for ns_lang in ns_langs:
            self.__total_rev_diff(handle, ns_lang, target)

        #Pass 3: total number of revisions per page for every time interval
        for ns_lang in ns_langs:
            for interval in intervals:
                self.__total_rev_time(handle, interval, ns_lang, target)

        #Pass 4: total number of different editors per page for every time interval
        for ns_lang in ns_langs:
            for interval in intervals:
                self.__total_rev_diff_time(handle, interval, ns_lang, target)

        #Close DB connection
        dbaccess.close_Connection(self.acceso[0])
Example #22
0
def histogram(idiomas):
    """
    Create histograms depicting article size distribution for a certain language version
    
    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    filenames=["boxplot_log.png", "histogram_log.png", "histogram_log_low.png", "histogram_log_high.png", "ecdf_log_low.png", "ecdf_log_high.png", "data/page_len_log.data", "/data/histograms.info", "ecdf_total.png"]
    
    for idioma in idiomas:
        print "Creando histogramas para el idioma ... "+idioma
        #Print to another file the names of graphics files, following the order in the GNU R script histogram.R
        f=open("./data/hist_files_names.data",'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
    
        #Considering only database pages corresponding to articles, with NAMESPACE=MAIN=0
        #dbaccess.dropTab_SQL(acceso[1], "aux")
        #dbaccess.query_SQL(acceso[1],"page_id, page_len","page", where="page_namespace=0", order="page_len", create="aux")
        result=dbaccess.query_SQL(acceso[1], "page_id, page_len", "aux")
        dbaccess.close_Connection(acceso[0])
        data=__tup_to_list(result)
        page_len=data.pop()
        for i in range(len(page_len)):
            if page_len[i]!=0:
                page_len[i]=math.log10(page_len[i])
        
        #Print to another file a list with article sizes to plot histograms
        f=open("./graphics/"+idioma+"/data/page_len_log.data", 'w')
        for value in page_len:
            f.writelines(str(value)+"\n")
        f.close()
        
        #CALL THE GNU R SCRIPT Histogram.R
        succ=os.system("R --vanilla < ./histogram.R > debug_R")
        if succ==0:
            print "Funcion histogram ejecutada con exito para el lenguage... "+idioma
Example #23
0
    def prepro_red_talk(self):
        """
        Data and evolution for redirects and talk pages

        For every language in self.languages, connects to the database
        wx_<language>wiki_research and:
          - (re)creates the views page_redirect, rev_redirect, page_talk
            and rev_talk
          - for every year from (first year found in revision)+1 up to 2008,
            rebuilds table max_rev_talk_<year>, holding for each talk page the
            highest rev_id among the revisions made before that year

        Sets self.language, self.dbname, self.access, self.minyear,
        self.years and self.year as it goes.
        """
        #Views are independent of the language, so their SQL is built only once
        view_statements = [
            #VIEW page_redirect (pages with redirect flag activated)
            "create or replace view page_redirect as "
            "(select page_id from page where page_namespace=0 and page_is_redirect=1)",
            #VIEW rev_redirect (revisions corresponding to redirect pages)
            "create or replace view rev_redirect as ("
            "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "
            "(select page_id from page_redirect))",
            #VIEW page_talk (pages in talk nspace)
            "create or replace view page_talk as "
            "(select page_id from page where page_namespace=1)",
            #VIEW rev_talk (revisions corresponding to talk pages)
            "create or replace view rev_talk as ("
            "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "
            "(select page_id from page_talk))",
        ]

        for self.language in self.languages:
            self.dbname = "wx_" + self.language + "wiki_research"
            self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser,
                                                  self.dbpassw, self.dbname)
            handle = self.access[1]

            for statement in view_statements:
                dbaccess.raw_query_SQL(handle, statement)

            #TABLES max_rev_talk_YYYY, one per year after the first year with revisions
            self.minyear = dbaccess.raw_query_SQL(handle, "select min(year(rev_timestamp)) from revision")
            first_year = int(self.minyear[0][0]) + 1
            self.years = range(first_year, 2009)
            for self.year in self.years:
                year_txt = str(self.year)
                table = "max_rev_talk_" + year_txt
                yearly_statements = (
                    "drop table if exists " + table,
                    "create table " + table +
                    " as (select max(rev_id) as max_id, rev_page from rev_talk "
                    "where year(rev_timestamp)<" + year_txt + " group by rev_page)",
                    "alter table " + table + " add primary key (max_id)",
                    "alter table " + table + " add index (rev_page)",
                )
                for statement in yearly_statements:
                    dbaccess.raw_query_SQL(handle, statement)

            dbaccess.close_Connection(self.access[0])
    def performAnalysis(self):

        ##        Get DB connection
        self.acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", self.language + "_stub")
        ##        Singleton objects to plot graphics in the class methods
        self.simpleGraph = graphic2D(self.filePath)
        ##        self.multiGraph=graphic2Dmulti(self.filePath)
        ##        self.giniGraph=graphicGini(self.filePath)
        ##        self.splitHistGraph=graphicSplitHist(self.filePath, self.dataPath)
        self.graph3D = graphic3D(self.filePath, self.dataPath)
        print "Starting analysis on DB " + self.language + "_stub\n"
        ##        self.UserNumContribsGroup(self.acceso[1])
        ##        self.UserNumContribsGenerations()
        authorsGini = [
            (95.9677, 4.046),
            (95.7015, 4.304),
            (96.2223, 4.363),
            (95.7104, 4.395),
            (96.3844, 4.407),
            (92.4691, 4.528),
            (95.0077, 4.603),
            (95.0071, 4.7298),
            (93.785, 5.051),
            (93.6076, 5.888),
        ]
        authorsGini.sort()
        ##authorsGini=[(4.046,95.9677),(4.304,95.7015),(4.363,96.2223),(4.395,95.7104),(4.407,96.3844),(4.528,92.4691),(4.603,95.0077),(4.7298,95.0071),(5.051,93.785),(5.888,93.6076)]

        self.simpleGraph.createGraphic(
            "authors-Gini",
            (authorsGini,),
            "Gini coeff. (%)",
            "Number of different authors (log)",
            "Gini coeff. vs. number of registered authors in the top-ten Wikipedias.",
        )
        ##            Close DB connection
        dbaccess.close_Connection(self.acceso[0])
        print "This is finished"
Example #25
0
 def general_stats(self):
     """
     Preprocessing actions for general statistics scripts
     """
     #FILE page_len.dat, with info about length of pages
     self.f=open("overall/data/page_len.dat", 'w')
     self.f.write("page_len\tns\tis_redirect\tis_stub\tis_new\tlang\n")
     self.f.close()
     for self.language in self.languages:
         self.dbname="wx_"+self.language+"wiki_research" 
         self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         print "Retrieving info from "+self.language+"\n"
         results=dbaccess.raw_query_SQL(self.access[1], "SELECT page_len, page_namespace, page_is_redirect, page_is_stub, "+\
         "page_is_new FROM page")
         print "Updating page_len info file with "+self.language+"\n"
             
         self.f=open("overall/data/page_len.dat", 'a')
         for result in results:
             self.f.write(str(int(result[0]))+"\t"+str(int(result[1]))+"\t"+str(int(result[2]))+"\t"+\
             str(int(result[3]))+"\t"+str(int(result[4]))+"\t"+self.language+"\n")
         self.f.close()
         results=None
         dbaccess.close_Connection(self.access[0])
    def analyze(self):

        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            
            #Total no. of revisions made by every logged author
            dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS user_revs AS "+\
            "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND "+\
            "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
            dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")
            
            print "Created table user_revs for "+self.language+"wiki...\n"
            
            #Min and max timestamp for every logged author + total num_revs
            dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS time_range_allns AS "+\
            "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM "+\
            "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x "+\
            "ORDER BY min_ts)")
            
            print "Created table time_range_allns for "+self.language+"wiki...\n"
                
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
Example #27
0
                self.access[1], "ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)"
            )
        except Exception, e:
            print "An exception ocurred, the problem was the following:\n"
            print e
            print "*************\n\n"
        # try:
        # print "Generating index for rev_user_text and timestamp...\n"
        # dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)")
        # except Exception, e:
        # print "An exception ocurred, the problem was the following:\n"
        # print e
        # print "*************\n\n"
        print "Database" + "wx_" + self.language + "wiki_" + self.dumptype + " ready for quantitative analysis...\n"
        ##Close connection to DB server
        dbaccess.close_Connection(self.access[0])


if __name__ == "__main__":

    # Settings shared by every run below
    db_user, db_passw, dump_type = "root", "phoenix", "research"
    languages = ["pt", "it", "nl", "ja", "pl", "fr", "de"]

    # Normal languages: build the indexes of one language after another
    for lang in languages:
        new_index = indexes(db_user, db_passw, lang, dump_type)
        new_index.make_indexes()

    # The stub dump for enwiki is handled separately and is currently disabled
    # index_english=indexes("root","phoenix","en","stub_research")
    # index_english.make_indexes()
Example #28
0
            print self.pageinsert.encode('utf_8')
        elif self.options.monitor:
            while 1:
                try:
                    dbaccess.raw_query_SQL(self.acceso[1],
                                           self.pageinsert.encode('utf_8'))
                except (Exception), e:
                    print e
                else:
                    break
        #Reset status vars
        self.pageinsertrows = 0
        self.pageinsertsize = 0
        ########IF WE USE MONITOR MODE, CLOSE DB CONNECTION
        if options.monitor and (not options.fileout and not options.streamout):
            dbaccess.close_Connection(self.acceso[1])
        ################################################
        #Checking out total time consumed and display end message
        self.timeCheck = datetime.datetime.now()
        self.timeDelta = self.timeCheck - self.start
        print >> sys.stderr, "\n"
        print >> sys.stderr, "File successfully parsed..."
        print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)" % (self.page_num,\
        float(self.page_num)/self.timeDelta.seconds, self.rev_num, float(self.rev_num)/self.timeDelta.seconds)


##Main zone
if __name__ == '__main__':
    usage = "usage: %prog [options]"
    parserc = OptionParser(usage)
    parserc.add_option(
Example #29
0
    def decompress (self):
        """
        Decompress the DB dumps into MySQL
        """
        ##TODO: Ad-hoc acuerdate de quitar esto POR DIOSSS
        ##self.filename="mtwiki-latest-pages-meta-history.xml.7z" 
        if self.dumptype=="research":
            program="dump_sax_research.py"
        elif self.dumptype=="standard":
            program="dump_sax.py"
        else:
            print "Error! Unexpected type of dump received"
            return -1
        self.filename=self.filenameTemplate.safe_substitute(language=self.language,file=self.files[0])
        #Then we call our parser "dump_sax_research.py" to load data into MySQL
        command_7z="7za e -so dumps/"+self.filename+" | "+"python "+program+\
        " -u "+self.msqlu+" -p "+self.msqlp+" -d "+"wx_"+self.language+"_"+self.dumptype+\
        " --log "+self.language+".log"
        success=os.system(command_7z)
        if success == 0:
            print "DB "+"wx_"+self.language+\
            self.dumptype+" successfully decompressed...\n\n"
        else:
            print "Error! There was an error trying to decompress database --> "+\
            "wx_"+self.language+self.dumptype
            return -1
        #Loading into MySQL other interesting tables directly provided in SQL format
        #SQL code to generate the tables is embedded in the SQL file itself
##        for index in range(1,len(self.files)):
##            self.filename=self.filenameTemplate.safe_substitute(language=self.language, file=self.files[index])
##            command_gzip="gzip -d dumps/"+self.filename
##            command_mysql="mysql -u "+self.msqlu+" -p"+self.msqlp+\
##            " wx_"+self.language+"_"+self.dumptype+\
##            " < dumps/"+self.filename.rstrip(".gz")
##            command_comp="gzip dumps/"+self.filename.rstrip(".gz")
##            print "Decompressing "+self.filename+"..."
##            success=os.system(command_gzip)
##            if success==0:
##                print "Loading "+self.filename.rstrip(".gz")+" into MySQL database..."
##                success=os.system(command_mysql)
##                if success==0:
##                    print "Compressing again "+self.filename.rstrip(".gz")+"..."
##                    success=os.system(command_comp)
##                    if success!=0:
##                        print "Error compressing again "+self.filename.rstrip(".gz")
##                        return -1
##                else:
##                    print "Error loading "+self.filename.rstrip(".gz")
##                    return -1
##            else:
##                print "Error decompressing "+self.filename
##                return -1
        print "Generating indexes for tables page and revision...\n"
        print "Depending on the dump size this may take a while...\n"
        acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu,\
        self.msqlp, "wx_"+self.language+"_"+self.dumptype)
        #Generate adequate indexes and keys in tables page and revision
        print "Generating index for page_len...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE page ADD INDEX (page_len)")
        print "Modifying rev_timestamp to support DATETIME and creating index...\n"
        #dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision MODIFY rev_timestamp DATETIME")
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX timestamp (rev_timestamp)")
        print "Generating index for rev_page and rev_timestamp...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX page_timestamp(rev_page, rev_timestamp)")
        print "Generating index for rev_user and rev_timestamp...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)")
        print "Generating index for rev_user_text and timestamp...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)")
        dbaccess.close_Connection(acceso[0])
        print "Database ready for quantitative analysis...\n"
        print "Let's go on... Cross your fingers... ;-) \n\n\n"
        return success
Example #30
0
    def analyze(self):
        #Initialize all files headers
        #Survival data for all users (including editors out of MAIN)
        f=open("wkp_surv_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        #Survival data for all logged users who edited in MAIN
        f=open("wkp_surv_main_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        f=open("wkp_surv_join_core_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        f=open("wkp_surv_in_core_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        f=open("wkp_surv_core_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
        f=open("wkp_surv_join_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        f=open("wkp_surv_in_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        f=open("wkp_surv_core_rev_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
            
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
	    
            print "Starting language "+self.language+"\n"
            ##IN SYSTEM
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_authors "+\
            "where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()

            ##IN MAIN
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_users ")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_main_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished core users for language "+self.language+"\n"
            ###########################
            ##REV CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_rev_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished all tasks for "+self.language+"\n"
Example #31
0
    def surv_files(self):
        """
        Creates all data files used as input for demography scripts in GNU R
        """
        #Initialize all files headers
        #FILE: Survival data for all users (including editors out of MAIN)
        f=open("wkp_surv_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        #FILE: Survival data for all logged users who edited in MAIN
        f=open("wkp_surv_main_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        #FILE: Survival data for all logged editors until they join the core (activity)
        f=open("wkp_surv_join_core_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        #FILE: Survival data for logged editors since they join the core until they leave it (activity)
        f=open("wkp_surv_in_core_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        #FILE: Survival data for loged editors since they leave the core until death (activity)
        f=open("wkp_surv_core_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
        #FILE: Survival data for all logged editors until they join the core (revisions)
        f=open("wkp_surv_join_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        #FILE: Survival data for logged editors since they join the core until they leave it (revisions)
        f=open("wkp_surv_in_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        #FILE: Survival data for loged editors since they leave the core until death (revisions)
        f=open("wkp_surv_core_rev_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
            
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
        
            print "Starting language "+self.language+"\n"
            ##IN SYSTEM
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_authors "+\
            " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()

            ##IN MAIN
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_users ")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_main_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished core users by activity for language "+self.language+"\n"

            ###########################
            ##REV CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_rev_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished all surv_file tasks for "+self.language+"\n"
Example #32
0
    def core_prepro(self):
        """
        Creates intermediate tables with info about core members (by activity
        and by top % of total number of revisions).

        For every language in self.languages the method connects to the
        database "wx_<language>wiki_research" and rebuilds these tables:

          - core_limits_monthly: one row per (year, month) found in
            revision_logged, with the number of distinct users and the
            number of revisions in that month.
          - users_core_monthly: for each month, the top-10% (+1) most active
            users of that month.
          - users_rev_core_monthly: for each month, the most active users,
            taken in descending order of revisions until their accumulated
            revisions exceed 10% of the revisions of that month.
          - users_core / users_rev_core: one row per user of the monthly
            tables, with the first/last timestamp inside the core plus the
            min_ts/max_ts values read from time_range_users.

        Side effects: self.language, self.dbname and self.access are
        overwritten on every iteration and keep the values of the last
        processed language. The connection is closed even if a query fails.

        NOTE: we assume that users do not leave the core and come back later,
        i.e. once they join the core they definitely leave it at max_ts_core.
        Blank periods are not identified.
        """
        for self.language in self.languages:
            self.dbname = "wx_" + self.language + "wiki_research"
            self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser,
                                                  self.dbpassw, self.dbname)
            # Everything that uses the connection lives inside the try block so
            # that a failing query cannot leave the connection open.
            try:
                # Second element of the pair is what raw_query_SQL expects,
                # the first one is what close_Connection expects.
                query_handle = self.access[1]

                # List of years and months, with total num. of revisions and
                # total num. of logged users
                dbaccess.raw_query_SQL(query_handle, "drop table if exists core_limits_monthly")
                dbaccess.raw_query_SQL(
                    query_handle,
                    "create table core_limits_monthly as "
                    "(select year(rev_timestamp) as year, month(rev_timestamp) as month, "
                    "count(distinct(rev_user)) num_users, count(*) num_revs from revision_logged group by year, month "
                    "order by year, month)")
                print("Created table core_limits_monthly " + self.language + "\n")

                date_range = dbaccess.raw_query_SQL(
                    query_handle,
                    "select * from core_limits_monthly order by year, month")

                # The monthly tables are created with the data of the first
                # month and filled with inserts for the remaining months.
                # NOTE: if date_range is empty this loop never runs, so the
                # monthly tables are neither dropped nor recreated.
                need_create = True
                for adate in date_range:
                    print("Processing year " + str(adate[0]) + " month " + str(adate[1]) + "\n")
                    total_users = adate[2]  # Total number of authors in that month
                    total_revs = adate[3]   # Total number of revisions in that month
                    # Size of the core of top-10% most active authors in that month
                    limit_auth = int(round(total_users * 0.1)) + 1
                    # Revisions threshold: 10% of tot. num. of revs. in that month
                    limit_revs = int(round(total_revs * 0.1))

                    # IMPORTANT: the subquery first filters the revisions of
                    # this month, and only then the GROUP/ORDER clauses are
                    # applied on it. This saves A LOT of time in this stage.
                    month_tail = (
                        "from (select rev_user, rev_timestamp from revision_logged where "
                        "year(rev_timestamp)=" + str(int(adate[0])) +
                        " and month(rev_timestamp)=" + str(int(adate[1])) +
                        ")x group by rev_user order by num_revs_month desc")

                    # Active logged users of the month (descending order!)
                    month_users = dbaccess.raw_query_SQL(
                        query_handle,
                        "select rev_user, count(*) num_revs_month " + month_tail)

                    # Number of authors accumulating the top-10% of revisions
                    count_users = 0
                    count_revs = 0
                    for auser in month_users:
                        count_revs += int(auser[1])
                        count_users += 1
                        if count_revs > limit_revs:
                            break

                    # Same SELECT for both monthly tables; only the target
                    # table and the LIMIT value change.
                    core_select = (
                        "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "
                        "count(*) as num_revs_month " + month_tail + " limit ")
                    monthly_targets = (
                        ("users_core_monthly", limit_auth),       # core by activity
                        ("users_rev_core_monthly", count_users),  # core by revisions
                    )
                    for table, limit in monthly_targets:
                        if need_create:
                            dbaccess.raw_query_SQL(query_handle, "drop table if exists " + table)
                            head = "create table " + table + " as (select "
                        else:
                            head = "insert into " + table + " (select "
                        dbaccess.raw_query_SQL(query_handle,
                                               head + core_select + str(limit) + ")")

                    if need_create:
                        print("Created tables monthly data for " + self.language + "\n")
                        need_create = False
                    print("Inserted monthly data for " + self.language + "\n")

                # users_core = top-10% most active authors in each month
                # users_rev_core = authors accumulating top-10% of tot. num.
                # of revs. in each month
                # Columns: rev_user, min_ts_core, max_ts_core, min_ts, max_ts
                summary_targets = (
                    ("users_core", "users_core_monthly"),          # by activity
                    ("users_rev_core", "users_rev_core_monthly"),  # by revisions
                )
                for table, monthly_table in summary_targets:
                    print("Creating table " + table + " for " + self.language + "\n")
                    dbaccess.raw_query_SQL(query_handle, "drop table if exists " + table)
                    dbaccess.raw_query_SQL(
                        query_handle,
                        "create table " + table + " as (select x.*, "
                        "(select min_ts from time_range_users r where r.rev_user=x.rev_user) min_ts, (select max_ts from "
                        "time_range_users s where s.rev_user=x.rev_user) max_ts "
                        "from (select rev_user, min(lower_ts_month) min_ts_core, "
                        "max(upper_ts_month) max_ts_core from " + monthly_table + " group by rev_user) x)")

                print("All core_prepro tasks finished for " + self.language + "\n")
            finally:
                # Close DB connection
                dbaccess.close_Connection(self.access[0])
Example #33
0
    "CREATE TABLE page_FAs (page_id int(10) unsigned NOT NULL, page_title varchar(255), in_cover integer(1), PRIMARY KEY page_id(page_id))")
##    dbaccess.raw_query_SQL(acceso[1], "DROP TABLE IF EXISTS page_talk_FAs")
##    dbaccess.raw_query_SQL(acceso[1],\
##    "CREATE TABLE page_talk_FAs (page_id int(10) unsigned NOT NULL, page_title varchar(255), PRIMARY KEY page_id(page_id))")
    
    for element in listFA:
        print "Quering for ---> "+element[0].decode('utf_8')+" \n"
        try:
            dbaccess.raw_query_SQL(acceso[1], "INSERT INTO page_FAs (SELECT page_id, page_title, "+str(element[1])+" FROM page WHERE page_title='"+\
            element[0].replace("'","\\'").replace('"', '\\"')+"')")
##        print "Quering for ---> Discusión:"+element[0].decode('utf_8')+" \n"
##        try:
##            dbaccess.raw_query_SQL(acceso[1], "INSERT INTO page_talk_FAs (SELECT page_id, page_title FROM page WHERE page_title='Discusión:"+\
##            element[0].replace("'","\\'").replace('"', '\\"')+"' and page_namespace=1)")
        except (Exception), e:
            print "Ehhhhh, an exception ocurred..."+str(e)+"\n"
        #if len(result)>0:
            #fileID.write(str(result[0][0])+",")
            #listID.append(str(result[0][0]))
            #print "ID "+str(result[0][0])+"\n"
    #fileID.close()
    dbaccess.close_Connection(acceso[0])
##    for element in listFA:
##        print "--> '"+element[0].decode('utf_8')+"'\n"
##    print '\n'
##    count=0
##    for element in listFA:
##        if element[1]==True:
##            count+=1
##    print count
Example #34
0
 def decompress(self):
     """
     Decompress the DB dumps into MySQL

     Steps:
       1. The first entry of self.files is extracted with 7za and piped into
          the parser script matching self.dumptype ("research" or
          "standard"), which loads the data into wx_<language>_<dumptype>.
       2. Every remaining entry of self.files is gunzipped, loaded with the
          mysql client and compressed again.
       3. Indexes are added to tables page and revision, and rev_timestamp
          is converted to DATETIME.

     self.filename is overwritten with the name of each processed file.

     Returns -1 as soon as one step fails (or if self.dumptype is not
     recognised); otherwise the exit status of the last external command
     (0 on success).

     NOTE: user, password, language and file names are concatenated unquoted
     into shell command lines, so they must not contain shell
     metacharacters.
     """
     ##TODO: ad-hoc override, remember to remove this
     ##self.filename="mtwiki-latest-pages-meta-history.xml.7z"
     if self.dumptype=="research":
         program="dump_sax_research.py"
     elif self.dumptype=="standard":
         program="dump_sax.py"
     else:
         print("Error! Unexpected type of dump received")
         return -1
     db_name="wx_"+self.language+"_"+self.dumptype
     self.filename=self.filenameTemplate.safe_substitute(language=self.language,file=self.files[0])
     #Then we call our parser to load data into MySQL
     command_7z="7za e -so dumps/"+self.filename+" | "+"python "+program+\
     " -u "+self.msqlu+" -p "+self.msqlp+" -d "+db_name+\
     " --log "+self.language+".log"
     success=os.system(command_7z)
     if success!=0:
         print("Error! There was an error trying to decompress database --> "+db_name)
         return -1
     print("DB "+db_name+" successfully decompressed...\n\n")
     #Loading into MySQL other interesting tables directly provided in SQL format
     #SQL code to generate the tables is embedded in the SQL file itself
     for dump_file in self.files[1:]:
         self.filename=self.filenameTemplate.safe_substitute(language=self.language, file=dump_file)
         #Remove only a trailing ".gz". str.strip(".gz") would remove any of the
         #characters '.', 'g' and 'z' from BOTH ends, corrupting names that
         #start with one of them (e.g. "zhwiki-...")
         if self.filename.endswith(".gz"):
             sql_name=self.filename[:-len(".gz")]
         else:
             sql_name=self.filename
         print("Decompressing "+self.filename+"...")
         success=os.system("gzip -d dumps/"+self.filename)
         if success!=0:
             print("Error decompressing "+self.filename)
             return -1
         print("Loading "+sql_name+" into MySQL database...")
         success=os.system("mysql -u "+self.msqlu+" -p"+self.msqlp+\
         " "+db_name+" < dumps/"+sql_name)
         if success!=0:
             print("Error loading "+sql_name)
             return -1
         print("Compressing again "+sql_name+"...")
         success=os.system("gzip dumps/"+sql_name)
         if success!=0:
             print("Error compressing again "+sql_name)
             return -1
     print("Generating indexes for tables page and revision...\n")
     print("Depending on the dump size this may take a while...\n")
     acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu,\
     self.msqlp, db_name)
     #The connection is closed in the finally clause, also when a statement fails
     try:
         #Generate adequate indexes and keys in tables page and revision
         print("Generating index for page_len...\n")
         dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE page ADD INDEX (page_len)")
         print("Modifying rev_timestamp to support DATETIME and creating index...\n")
         dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision MODIFY rev_timestamp DATETIME")
         dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX timestamp (rev_timestamp)")
         print("Generating index for rev_page and rev_timestamp...\n")
         dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX page_timestamp(rev_page, rev_timestamp)")
         print("Generating index for rev_user and rev_timestamp...\n")
         dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)")
         print("Generating index for rev_user_text and timestamp...\n")
         dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)")
     finally:
         dbaccess.close_Connection(acceso[0])
     print("Database ready for quantitative analysis...\n")
     print("Let's go on... Cross your fingers... ;-) \n\n\n")
     return success
Example #35
0
    def __init__(self, conf, language="furwiki"):
        """
        Creates multiple views to provide a convenient interface to access quantitative data.
        It also generates the intermediate views used by other methods.

        conf must provide msqlu, msqlp and dumptype; the views are created in
        the database "wx_<language>_<dumptype>". For every name in
        self.nspaces a view "<name>_<language>" is (re)created, plus the
        derived views "<name>_<language>_page_min_timestamp_logged",
        "..._page_min_timestamp_annons", "..._list_months" and
        "..._list_quarters", and the summary view "nspaces_<language>".

        The connection is stored in self.acceso and is closed before
        returning, also when one of the statements fails.
        """
        self.conf = conf
        self.language = language
        ##List of namespaces to analyse. We have added new special namespaces (e.g. subsets of main)
        self.nspaces = [
            "all", "ns0", "articles", "redirects", "cur_redirects", "cur_stubs", "stubs",
            "talk", "pageUser", "userTalk", "meta", "metaTalk", "image", "imageTalk",
            "mediawiki", "mediawikiTalk", "template", "templateTalk", "help", "helpTalk",
            "category", "categoryTalk"]

        ##Some fancy lists to work with time intervals in some private methods following
        self.type_interval_columns = {
            "days": "day, year",
            "weeks": "week, year",
            "months": "month, year",
            "quarters": "quarter, year",
            "years": "year"}
        self.type_interval_select = {
            "days": "DAYOFYEAR(rev_timestamp) AS day, YEAR(rev_timestamp) AS year ",
            "weeks": "WEEK(rev_timestamp,1) AS week, YEAR(rev_timestamp) AS year ",
            "months": "MONTH(rev_timestamp) AS month, YEAR(rev_timestamp) AS year ",
            "quarters": "QUARTER(rev_timestamp) AS quarter, YEAR(rev_timestamp) AS year ",
            "years": "YEAR(rev_timestamp) AS year "}
        self.type_interval_group = {
            "days": "year, day",
            "weeks": "year, week",
            "months": "year, month",
            "quarters": "year, quarter",
            "years": "year"}

        ##    Get new DB connection
        self.acceso = dbaccess.get_Connection(
            "localhost", 3306, self.conf.msqlu, self.conf.msqlp,
            "wx_" + self.language + "_" + self.conf.dumptype)
        # Everything using the connection runs inside try/finally so that a
        # failing statement cannot leave the connection open.
        try:
            handle = self.acceso[1]
            suffix = "_" + self.language

            ##    Delete previous versions of views
            for nspace in self.nspaces:
                dbaccess.dropView(handle, nspace + suffix)

            ##    Create updated versions for views from revision table
            #View summarizing all info for every revision (linking with info from table page)
            dbaccess.createView(
                handle, view="all" + suffix,
                columns="rev_id, page_id, rev_len, page_ns, page_len, is_redirect, author, "
                        "author_text, rev_timestamp, rev_parent_id",
                query="SELECT rev_id, rev_page, rev_len, page_namespace, page_len, rev_is_redirect, "
                      "rev_user, rev_user_text, rev_timestamp, rev_parent_id "
                      "FROM revision, page WHERE rev_page=page_id")
            #View summarizing info regarding pages in namespace=0 (including articles, stubs and redirects)
            dbaccess.createView(
                handle, view="ns0" + suffix,
                columns="rev_id, page_id, rev_len, page_len, page_title, is_redirect, author, "
                        "author_text, rev_timestamp, rev_parent_id",
                query="SELECT rev_id, rev_page, rev_len, page_len, page_title, page_is_redirect, "
                      "rev_user, rev_user_text, rev_timestamp, rev_parent_id "
                      "FROM revision, page WHERE rev_page=page_id AND page_namespace=0")

            # All the remaining per-namespace views share the same columns and
            # the same SELECT; only the condition on the namespace changes.
            page_columns = ("rev_id, page_id, rev_len, page_len, page_title, author, "
                            "author_text, rev_timestamp, rev_parent_id")
            page_query = ("SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, "
                          "rev_user_text, rev_timestamp, rev_parent_id "
                          "FROM revision, page WHERE rev_page=page_id AND page_namespace=")
            ns0_subsets = (
                #Articles (excluding pages that currently are redirects and stubs)
                ("articles", "0 AND page_is_redirect=0 AND page_is_stub=0"),
                #Redirects (pages that were redirects when that revision was made)
                ("redirects", "0 AND rev_is_redirect=1"),
                #Current redirects
                ("cur_redirects", "0 AND page_is_redirect=1"),
                #Revisions of stub pages (pages that were stubs when that revision was made)
                ("stubs", "0 AND rev_is_stub=1"),
                #Revisions of current stub pages
                ("cur_stubs", "0 AND page_is_stub=1"),
            )
            for subset, condition in ns0_subsets:
                dbaccess.createView(handle, view=subset + suffix,
                                    columns=page_columns, query=page_query + condition)

            #From this point on, automatically create views for the set of pages
            #included in the remaining namespaces in MediaWiki. The first seven
            #names of self.nspaces were handled above; the other fifteen are
            #paired, in order, with namespace numbers 1 to 15.
            for nspace, nsnum in zip(self.nspaces[7:], range(1, 16)):
                dbaccess.createView(handle, view=nspace + suffix,
                                    columns=page_columns, query=page_query + str(nsnum))

            #View summarizing the distribution of pages among namespaces
            dbaccess.dropView(handle, "nspaces" + suffix)
            dbaccess.createView(
                handle, view="nspaces" + suffix, columns="namespace, pages_in_nspace",
                query="SELECT page_namespace, COUNT(*) FROM page GROUP BY page_namespace")

            ##    Intermediate views for the minimum timestamp of every page [annons, and logged users]
            ## And other useful intermediate views regarding page evolution
            for nspace in self.nspaces:
                base = nspace + suffix

                dbaccess.dropView(handle, base + "_page_min_timestamp_logged")
                dbaccess.createView(
                    handle, view=base + "_page_min_timestamp_logged",
                    columns="page_id, rev_id, author, rev_timestamp",
                    query="SELECT page_id, rev_id,author, MIN(rev_timestamp) FROM " +
                          base + " WHERE author!=0 GROUP BY page_id")

                dbaccess.dropView(handle, base + "_page_min_timestamp_annons")
                dbaccess.createView(
                    handle, view=base + "_page_min_timestamp_annons",
                    columns="page_id, rev_id, author_text, rev_timestamp",
                    query="SELECT page_id,rev_id,author_text, MIN(rev_timestamp) FROM " +
                          base + " WHERE author=0 GROUP BY page_id")

                dbaccess.dropView(handle, base + "_list_months")
                dbaccess.createView(
                    handle, view=base + "_list_months", columns="month, year",
                    query="SELECT MONTH(rev_timestamp) as month, YEAR(rev_timestamp) as year FROM " +
                          base + " GROUP BY year, month ORDER BY year, month")

                dbaccess.dropView(handle, base + "_list_quarters")
                dbaccess.createView(
                    handle, view=base + "_list_quarters", columns="quarter, year",
                    query="SELECT QUARTER(rev_timestamp) as quarter, YEAR(rev_timestamp) as year FROM " +
                          base + " GROUP BY year, quarter ORDER BY year, quarter")
        finally:
            ##    Close DB connection
            dbaccess.close_Connection(self.acceso[0])
Example #36
0
def summary_evol(idiomas):
    """
    Create some graphs summarizing the evolution in time of critical quantitative
    parameters for each language version to explore

    For every language the statistics are read from the database
    "<language>_stub", written to data files through __makeDatesFile and
    __makeDataFile, and then the GNU R script summary_evol.R is run. The
    script receives the names of the data files and of the output graphics
    through ./data/summary_files_names.data and ./data/summary_files_out.data.

    The DB connection and the two name files are always closed, also when an
    error occurs while they are in use.

    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    ##  WARNING!! Please be careful when selecting values from tables storing
    ##  evolution in time of number of articles, size etc.
    ##  You must always use a GROUP BY(pageCount, limitDate) clause, due to
    ##  periods of inactivity that could generate duplicate entries in the graphics
    filenames=["page_dates.data", "page_Count_evol.data", "page_Len_Sum_log.data",
               "contribs_evol.data", "nspaces.data", "nspace_distrib.data",
               "diffArticles.data", "authors.data", "diff_authors_x_article.data",
               "authors_authors_per_pagelen.data", "pagelen_authors_per_pagelen.data"]

    filenames_out=["Tot_num_articles_absx_absy.png", "Tot_num_articles_absx_logy.png",
                   "Tot_num_articles_logx_logy.png", "Tot_pagelensum_absx_absy.png",
                   "Tot_pagelensum_absx_logy.png", "Tot_pagelensum_logx_logy.png",
                   "Tot_contribs_absx_absy.png", "Tot_contribs_absx_logy.png",
                   "Tot_contribs_logx_logy.png", "Diffs_articles_per_author.png",
                   "Diffs_authors_per_article.png", "Diff_authors_against_page_len.png"]

    for idioma in idiomas:
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        # All queries run inside try/finally so the connection is released
        # even if one of them fails.
        try:
            handle = acceso[1]
            evol_table = "stats_Evolution_Content_months_"+idioma
            result=dbaccess.query_SQL(handle, "pageCount, limitDate", evol_table, group="(limitDate)")
            result2=dbaccess.query_SQL(handle, "pageLenSum, limitDate", evol_table, group="(limitDate)")
            result3=dbaccess.query_SQL(handle, "contribs, limitDate", evol_table, group="(limitDate)")
            resultnspace=dbaccess.query_SQL(handle, "pages_nspace, namespace", "stats_nspace_"+idioma)
            diffArticlesNoann=dbaccess.query_SQL(handle, "author, theCount", "stats_Article_NoAnnons_author_"+idioma)
            diffInitNoann=dbaccess.query_SQL(handle, "author, theCount", "stats_Article_Init_NoAnnons_author_"+idioma)
            # The result of this query is not used below; the call is kept so
            # that the statements sent to the database do not change.
            dbaccess.query_SQL(handle, "page_id, theCount", "stats_Contrib_NoAnnons_page_id_"+idioma)
            diffAuthorperArticle=dbaccess.query_SQL(handle, "page_id, theCount", "stats_Article_NoAnnons_page_id_"+idioma)
            dautxplen=dbaccess.query_SQL(handle, "page_len, authors", "stats_pagelen_difauthors_"+idioma)
        finally:
            dbaccess.close_Connection(acceso[0])

        # NOTE(review): __tup_to_list presumably returns one list per selected
        # column; pop() therefore yields the columns from last to first.
        # Confirm against its definition.
        data=__tup_to_list(result, 1)
        data.pop()                      # limitDate column, not used (dates come from result3)
        page_Count=data.pop()

        data2=__tup_to_list(result2, 2)
        data2.pop()                     # limitDate column, not used
        page_Len_Sum=data2.pop()
        # The first two samples of the page length series are discarded
        page_Len_Sum.pop(0)
        page_Len_Sum.pop(0)

        data3=__tup_to_list(result3, 1)
        dates_x=data3.pop()
        contribs=data3.pop()

        datanspace=__tup_to_list(resultnspace)
        namespaces=datanspace.pop()
        pages_nspace=datanspace.pop()

        diffArticles=__tup_to_list(diffArticlesNoann).pop()

        # Only the author column of this result is used
        dataDiffInitNoann=__tup_to_list(diffInitNoann)
        dataDiffInitNoann.pop()         # theCount column, not used
        authors=dataDiffInitNoann.pop()

        diffAuthors=__tup_to_list(diffAuthorperArticle).pop()

        datadautxplen=__tup_to_list(dautxplen)
        autxplen=datadautxplen.pop()
        lenautxplen=datadautxplen.pop()

        # Total page length goes to its file in log10 scale; zero values are
        # left untouched because log10(0) is undefined
        for pos, length in enumerate(page_Len_Sum):
            if length!=0:
                page_Len_Sum[pos]=math.log10(length)

        ##  Put the query results in the data list in the proper order,
        ##  matching the file names we pass to the GNU R script summary_evol.R
        dataList=[dates_x, page_Count, page_Len_Sum, contribs, namespaces, pages_nspace,
                  diffArticles, authors, diffAuthors, autxplen, lenautxplen]

        for filename, series in zip(filenames, dataList):
            if 'date' in filename:
                __makeDatesFile(idioma, filename, series)
            else:
                __makeDataFile(idioma, filename, series)

        #Pass data filenames to the GNU R script with a file
        f=open("./data/summary_files_names.data",'w')
        try:
            for line in filenames:
                f.write("./graphics/"+idioma+"/data/"+line+"\n")
        finally:
            f.close()

        #Idem with graphic output filenames
        f=open("./data/summary_files_out.data",'w')
        try:
            for line in filenames_out:
                f.write("./graphics/"+idioma+"/"+line+"\n")
        finally:
            f.close()

        #CALL THE GNU R SCRIPT summary_evol.R
        succ=os.system("R --vanilla < ./summary_evol.R > debug_R")
        if succ==0:
            print("Funcion summary_evol ejecutada con exito para el lenguage... "+idioma)
        else:
            # A failing R run used to go unnoticed; report it (output is in debug_R)
            print("Error: summary_evol.R failed (exit status "+str(succ)+") for "+idioma)
Example #37
0
 def __init__(self, conf, language="furwiki"):
     """
     Creates multiple views to create a convenient interface to access quantitative data
     It also generates necessary tables and views to store intermidiate results, so that other methods
     can later store data directly. 
     """
     self.conf=conf
     self.language=language
     ##List of namespaces to analyse. We have added new special namespaces (e.g. subsets of main)
     self.nspaces=["all","ns0","articles","redirects","cur_redirects","cur_stubs","stubs","talk",\
     "pageUser", "userTalk","meta", "metaTalk", "image", "imageTalk", "mediawiki",\
     "mediawikiTalk", "template", "templateTalk", "help", "helpTalk", "category", "categoryTalk"]
     
     ##Some fancy lists to work with time intervals in some private methods following
     self.type_interval_columns={"days":"day, year", "weeks":"week, year", "months":"month, year",\
     "quarters":"quarter, year", "years":"year"}
     self.type_interval_select={"days":"DAYOFYEAR(rev_timestamp) AS day, YEAR(rev_timestamp) AS year ",\
     "weeks":"WEEK(rev_timestamp,1) AS week, YEAR(rev_timestamp) AS year ",\
     "months":"MONTH(rev_timestamp) AS month, YEAR(rev_timestamp) AS year ",\
     "quarters":"QUARTER(rev_timestamp) AS quarter, YEAR(rev_timestamp) AS year ",\
     "years":"YEAR(rev_timestamp) AS year "}
     self.type_interval_group={"days":"year, day", "weeks":"year, week", "months":"year, month",\
     "quarters":"year, quarter", "years":"year"}
     
     ##	Get new DB connection
     self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
     "wx_"+self.language+"_"+self.conf.dumptype)
     
     ##    Delete previous versions of views
     for nspace in self.nspaces:
         dbaccess.dropView(self.acceso[1], nspace+"_"+self.language)
     
     ##    Create updated versions for views from revision table
     #View sumarizing all info for every revision (linking with info from table page)
     dbaccess.createView(self.acceso[1], view="all_"+self.language,\
     columns="rev_id, page_id, rev_len, page_ns, page_len, is_redirect, author, author_text,"+\
     " rev_timestamp, rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_namespace, page_len, rev_is_redirect,"+\
     " rev_user, rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id")
     #View sumarizing info regarding pages in namespace=0 (including articles, stubs and redirects)
     dbaccess.createView(self.acceso[1], view="ns0_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, is_redirect, author, author_text,"+\
     " rev_timestamp, rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, page_is_redirect, rev_user,"+\
     " rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id"+\
     " AND page_namespace=0")
     #View sumarizing info for articles (excluding pages that currently are redirects and stubs)
     dbaccess.createView(self.acceso[1], view="articles_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
     "rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text,"+\
     " rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND page_namespace=0 AND "+\
     "page_is_redirect=0 AND page_is_stub=0")
     #View with info only for redirects (pages that were redirects when that revision was made)
     dbaccess.createView(self.acceso[1], view="redirects_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
     "rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
     "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
     "page_namespace=0 AND rev_is_redirect=1")
     #View with info only for current redirects
     dbaccess.createView(self.acceso[1], view="cur_redirects_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
     "rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
     "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
     "page_namespace=0 AND page_is_redirect=1")
     #View with info only for revisions of stub pages (pages that were stubs when that revision was made)
     dbaccess.createView(self.acceso[1], view="stubs_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
     " rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
     "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
     " page_namespace=0 AND rev_is_stub=1")
     #View with info only for revisions of current stub pages
     dbaccess.createView(self.acceso[1], view="cur_stubs_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
     " rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
     "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
     " page_namespace=0 AND page_is_stub=1")
     #From this point on, automatically create views for the set of pages included in the remaining namespaces in MediaWiki
     for nspace, nsnum in zip(self.nspaces[7:], range(1,16)):
         dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language,\
         columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"+\
         " rev_parent_id",
         query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
         "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
         " page_namespace="+str(nsnum))
         
     #View sumarizing the distribution of pages among namespaces
     dbaccess.dropView(self.acceso[1], "nspaces_"+self.language)
     dbaccess.createView(self.acceso[1],view="nspaces_"+self.language, columns="namespace, pages_in_nspace",\
     query="SELECT page_namespace, COUNT(*) FROM page GROUP BY page_namespace")
 
     ##    Intermidiate views for the minimun timestamp of every page [annons, and logged users]
     ## And other useful intermediate views regarding page evolution
     for nspace in self.nspaces:
         dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
         "_page_min_timestamp_logged")
         dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
         "_page_min_timestamp_logged", columns="page_id, rev_id, author, rev_timestamp",\
         query="SELECT page_id, rev_id,author, MIN(rev_timestamp) FROM "+\
         nspace+"_"+self.language+" WHERE author!=0 GROUP BY page_id")
         dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
         "_page_min_timestamp_annons")
         dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
         "_page_min_timestamp_annons",\
         columns="page_id, rev_id, author_text, rev_timestamp",\
         query="SELECT page_id,rev_id,author_text, MIN(rev_timestamp) FROM "+\
         nspace+"_"+self.language+" WHERE author=0 GROUP BY page_id")
         
         dbaccess.dropView(self.acceso[1],nspace+"_"+self.language+"_list_months")
         dbaccess.createView(self.acceso[1],view=nspace+"_"+self.language+"_list_months",\
         columns="month, year",query="SELECT MONTH(rev_timestamp) as month, "+\
         "YEAR(rev_timestamp) as year"+\
         " FROM "+nspace+"_"+self.language+" GROUP BY year, month ORDER BY year, month")
         
         dbaccess.dropView(self.acceso[1],nspace+"_"+self.language+"_list_quarters")
         dbaccess.createView(self.acceso[1],view=nspace+"_"+self.language+"_list_quarters",\
         columns="quarter, year",query="SELECT QUARTER(rev_timestamp) as quarter, "+\
         "YEAR(rev_timestamp) as year FROM "+nspace+"_"+self.language+" GROUP BY year,"+\
         " quarter ORDER BY year, quarter")
     
 ##    Close DB connection
     dbaccess.close_Connection(self.acceso[0])
Example #38
0
def measuring(idiomas):
    """
    Create some graphs following the research presented by Jakob Voss in his paper
    Measuring Wikipedia (ISSI 2005)

    For every language, the precomputed statistics tables are read from the
    database <language>_stub, every series is sorted in decreasing order and
    handed to __makeDataFile (or __makeDatesFile when the file name contains
    'date'). The lists of data files and of output graphics are then written to
    ./data/measuring_files_names.data and ./data/measuring_files_out.data, and
    the GNU R script ./measuring_Wiki.R is run (its output goes to debug_R).

    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    # Data files consumed by the R script; same order as the series list
    # built for every language below.
    filenames = ["total_edits.data", "noannons_edits.data", "annon_edits.data",
                 "authors_per_article_desc.data",
                 "articles_per_logged_author_desc.data",
                 "articles_per_anonymous_author_desc.data"]

    # Graphics produced by the R script, one per data file.
    filenames_out = ["total_edits_per_author.png",
                     "total_edits_per_noannon_author.png",
                     "total_edits_per_annon_author.png",
                     "diff_authors_per_article_descending.png",
                     "diff_articles_per_logged_author_descending.png",
                     "diff_articles_per_anonymous_author_descending.png"]

    def descending(rows, *extra):
        # __tup_to_list is defined elsewhere in this file; the series we need
        # is the last element of the list it returns. Sorting in decreasing
        # order lets the results be fitted to a power law.
        series = __tup_to_list(rows, *extra).pop()
        series.sort(reverse=True)
        return series

    def write_lines(path, lines):
        # Close the file even when a write fails, so the handle never leaks.
        f = open(path, 'w')
        try:
            for line in lines:
                f.write(line + "\n")
        finally:
            f.close()

    for idioma in idiomas:
        # Graphics from the paper that are NOT generated here:
        #  - Combined evolution graphics (database size, total number of words,
        #    total number of internal links, number of articles including
        #    redirects, active wikipedians with more than 5 contributions in a
        #    month, very active wikipedians with more than 100): ALREADY
        #    GENERATED BY ERIK ZATCHE'S OFFICIAL PERL SCRIPTS
        #  - Namespace size: generated in summary_evol() method
        #  - Evolution in time of article size (histogram). IDEA: download
        #    page.sql files for a language for each semester period
        #  - Ingoing and outgoing number of links per article: STILL TO BE
        #    DEVELOPED. Links for a given article must first be identified in
        #    the database; the links tables may help, but in these dump
        #    versions they are all empty. BROKEN LINKS also need to be
        #    considered
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        try:
            # Number of distinct authors per article (descending sorted graphic)
            # Already generated in summary_evol, ONLY NEED TO SORT AND ADJUST IN GNU R
            diffAuthorperArticle = dbaccess.query_SQL(
                acceso[1], "page_id, theCount", "stats_Article_NoAnnons_page_id_"+idioma)

            # Number of distinct articles per author (descending sorted graphic)
            diffArticlesNoann = dbaccess.query_SQL(
                acceso[1], "author, theCount", "stats_Article_NoAnnons_author_"+idioma)
            diffArticlesAnn = dbaccess.query_SQL(
                acceso[1], "author_text, theCount", "stats_Article_Annons_author_text_"+idioma)

            lisdiffauthorartic = descending(diffAuthorperArticle)
            lisdiffarticleaut = descending(diffArticlesNoann)
            lisdiffarticleannon = descending(diffArticlesAnn, 2)

            # Number of edits per author
            # We have already created GINI graphics for this parameter
            # ALSO AVAILABLE DATABASE TABLES WITH EVOLUTION IN TIME OF THIS PARAMETER
            tcnoann = dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_NoAnnons_author_"+idioma)
            tcauthor = dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_author_"+idioma)
            tc_ann = dbaccess.query_SQL(acceso[1], " * ", "stats_Contrib_Annons_author_text_"+idioma)

            listcnoann = descending(tcnoann)
            listcauthors = descending(tcauthor)
            # BTW, we are also obtaining but not using the IP addresses of annon users
            listcann = descending(tc_ann, 2)
        finally:
            # Release the connection even if a query or a conversion failed
            dbaccess.close_Connection(acceso[0])

        dataList = [listcauthors, listcnoann, listcann,
                    lisdiffauthorartic, lisdiffarticleaut, lisdiffarticleannon]

        for filename, data in zip(filenames, dataList):
            if 'date' in filename:
                __makeDatesFile(idioma, filename, data)
            else:
                __makeDataFile(idioma, filename, data)

        # Pass data filenames to the GNU R script with a file
        write_lines("./data/measuring_files_names.data",
                    ["./graphics/"+idioma+"/data/"+name for name in filenames])

        # Idem with graphic output filenames
        write_lines("./data/measuring_files_out.data",
                    ["./graphics/"+idioma+"/"+name for name in filenames_out])

        # CALL GNU R SCRIPT measuring_Wiki.R
        succ = os.system("R --vanilla < ./measuring_Wiki.R > debug_R")
        if succ == 0:
            # Parenthesised single expression: prints the same text under Python 2
            print("Funcion measuring_Wiki.R ejecutada con exito para el lenguage... "+idioma)
Example #39
0
    def bots(self):
        """
        Preprocessing actions with bots data

        For every language in self.languages, rebuilds three tables in the
        database wx_<language>wiki_research, each holding the number of
        revisions per year and month:
            - revs_bots: revisions by users listed in group 'bot' in user_groups
            - revs_logged: revisions made by logged authors (rev_user!=0)
            - revs_all: revisions made by all authors
        Then writes two tab-separated files with one row per language, year
        and month:
            - overall/data/perc-bots-all-revs.dat: % of all revisions due to bots
            - overall/data/perc-bots-logged-revs.dat: % of the revisions by
              logged authors due to bots

        self.language, self.dbname, self.access and self.perc_revs are
        overwritten on every iteration.
        """
        # Pieces shared by the three CREATE TABLE statements
        monthly_counts = "select year(rev_timestamp) theyear, month(rev_timestamp) themonth, "+\
        "count(*) num_revs from revision "
        by_month = "group by year(rev_timestamp), month(rev_timestamp) "+\
        "order by year(rev_timestamp), month(rev_timestamp)"
        # (table name, WHERE clause selecting the revisions counted in it)
        tables = [
            ("revs_bots", "where rev_user!=0 and rev_user in "+\
            "(select ug_user from user_groups where ug_group='bot') "),
            ("revs_logged", "where rev_user!=0 "),
            ("revs_all", ""),
        ]

        for self.language in self.languages:
            self.dbname = "wx_"+self.language+"wiki_research"
            self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            try:
                for table, condition in tables:
                    dbaccess.raw_query_SQL(self.access[1], "drop table if exists "+table)
                    dbaccess.raw_query_SQL(self.access[1], "create table "+table+" as "+\
                    monthly_counts+condition+by_month)
            finally:
                # Release the connection even if one of the statements failed
                dbaccess.close_Connection(self.access[0])

        #FILE perc-bots-all-revs.dat % of all revisions due to bots
        self._write_bot_percentages("overall/data/perc-bots-all-revs.dat",
        "select bot.theyear, bot.themonth, "+\
        "(bot.num_revs/tot.num_revs)*100 perc_revs from revs_bots as bot, revs_all as tot "+\
        "where bot.theyear=tot.theyear and bot.themonth=tot.themonth;")

        #FILE perc-bots-logged-revs.dat % of revisions by logged authors due to bots
        self._write_bot_percentages("overall/data/perc-bots-logged-revs.dat",
        "select bot.theyear, bot.themonth, "+\
        "(bot.num_revs/logged.num_revs)*100 perc_logged_revs from revs_bots as bot, "+\
        "revs_logged as logged where bot.theyear=logged.theyear and bot.themonth=logged.themonth;")

    def _write_bot_percentages(self, path, query):
        """
        Run query against the research database of every language in
        self.languages and dump the rows (year, month, percentage) to the
        tab-separated file path, adding the language as last column.

        The file is truncated and given a header line first; the rows of each
        language are then appended as soon as they have been retrieved.
        """
        out = open(path, 'w')
        try:
            out.write("year\tmonth\tperc_revs\tlang\n")
        finally:
            out.close()
        for self.language in self.languages:
            self.dbname = "wx_"+self.language+"wiki_research"
            self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            try:
                self.perc_revs = dbaccess.raw_query_SQL(self.access[1], query)
            finally:
                dbaccess.close_Connection(self.access[0])
            #Writing data to file
            out = open(path, 'a')
            try:
                for item in self.perc_revs:
                    out.write(str(int(item[0]))+"\t"+str(int(item[1]))+"\t"+\
                    str(float(item[2]))+"\t"+self.language+"\n")
            finally:
                out.close()
            self.pageinsert+=";"
            print self.pageinsert.encode('utf-8')
        elif self.options.monitor:
            while 1:
                try:
                    dbaccess.raw_query_SQL(self.acceso[1], self.pageinsert.encode('utf-8'))
                except (Exception), e:
                    print e
                else:
                    break
        #Reset status vars
        self.pageinsertrows=0
        self.pageinsertsize=0
        ########IF WE USE MONITOR MODE, CLOSE DB CONNECTION
        if self.options.monitor and not self.options.fileout and not self.options.streamout:
            dbaccess.close_Connection(self.acceso[1])
        ################################################
        #Checking out total time consumed and display end message
        self.timeCheck=datetime.datetime.now()
        self.timeDelta=self.timeCheck-self.start
        print >> sys.stderr, "\n"
        print >> sys.stderr, "File successfully parsed..."
        print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)" % (self.page_num,\
        float(self.page_num)/self.timeDelta.seconds, self.rev_num, float(self.rev_num)/self.timeDelta.seconds)
 
##Main zone
if __name__ == '__main__':
    usage = "usage: %prog [options]"
    parserc = OptionParser(usage)
    parserc.add_option("-t","--stubth", dest="stubth", type="int", metavar="STUBTH", default=256,
    help="Max. size in bytes to consider an article as stub [default: %default]")