Example #1
 def __init__(self, language="furwiki", dumptype="research", msqlu="", msqlp=""):
     """
     Receives the language and dump type to download.
     Creates and initializes the MySQL database for that dump; prints an error message if no valid MySQL credentials are provided.
     """
     self.language=language       #language to download
     self.dumptype=dumptype      #type of dump      
     self.files=["pages-meta-history.xml.7z", "redirect.sql.gz","page_restrictions.sql.gz",\
     "user_groups.sql.gz", "logging.sql.gz", "interwiki.sql.gz", "langlinks.sql.gz", "externallinks.sql.gz",\
     "templatelinks.sql.gz", "imagelinks.sql.gz", "categorylinks.sql.gz", "pagelinks.sql.gz", "oldimage.sql.gz",\
     "image.sql.gz"]
     self.filename=""
     self.filenameTemplate=string.Template("""$language-latest-$file""") #dump's filename in Wikimedia's server
     #URL to download the file
     self.urld=""
     self.urldTemplate=string.Template("""http://download.wikimedia.org/$language/latest/$language-latest-$file""")
     if (msqlu=="" or msqlp==""):
         print "Error initializing DB dump object. You must provide a valid MySQL username and password"
         return
     else:
         self.msqlu=msqlu   #MySQL username for accessing and editing the DB
         self.msqlp=msqlp   #MySQL password
     #We can manage two different types of dumps, stubs (without the text of every revision) and pages
     #(containing the text of every revision)
     #self.urld="http://download.wikimedia.org/"+self.language+"/latest/"+\
     #self.language+"-latest-pages-meta-history.xml.7z"  #File to download
     #patterns for files
     #http://download.wikimedia.org/furwiki/20060921/furwiki-20060921-pages-meta-history.xml.7z
     #http://download.wikimedia.org/amwiki/20061014/amwiki-20061014-stub-meta-history.xml.gz
     #Create /dumps directory if it does not exist yet
     directories=os.listdir("./")
     if ("dumps" not in directories):
         os.makedirs("./dumps")
     ## Initialize DB in MySQL: create DB and tables definitions
     print "Initializing DB for --> "+ self.language +"\n"
     #Retrieving connection and cursor to access the DB
     acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp,"mysql")
     dbaccess.createDB_SQL(acceso[1],"wx_"+self.language+"_"+self.dumptype)
     if self.dumptype=="research":
         command="mysql -u "+self.msqlu+" -p"+self.msqlp+" " +\
         "wx_"+self.language+"_"+self.dumptype+" < tables_research.sql > debug_mysql.log"
     elif self.dumptype=="standard":
         command="mysql -u "+self.msqlu+" -p"+self.msqlp+" " +\
         "wx_"+self.language+"_"+self.dumptype+" < tables_standard.sql > debug_mysql.log"
     ok=os.system(command)
     if ok == 0:
         acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp,\
         "wx_"+self.language+"_"+self.dumptype)
         dbaccess.raw_query_SQL(acceso[1], "alter table page max_rows = 200000000000 avg_row_length = 50")
         dbaccess.raw_query_SQL(acceso[1], "alter table revision max_rows = 200000000000 avg_row_length = 50")
         if self.dumptype=="standard":
             dbaccess.raw_query_SQL(acceso[1], "alter table text max_rows = 200000000000 avg_row_length = 50")
         dbaccess.close_Connection(acceso[0])
     else:
         print "Error! There was a problem initializing definitions for DB tables"
         dbaccess.close_Connection(acceso[0])
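
Note: every example on this page drives MySQL through the same small dbaccess helper module (plus DDL helpers such as createDB_SQL), which is not reproduced here. As a reading aid, a minimal sketch of the interface the calls above assume; the MySQLdb backend and the exact signatures are inferred from usage, not taken from the project source:

    import MySQLdb

    def get_Connection(host, port, user, passwd, db):
        # Returns a (connection, cursor) pair; callers index it as acceso[0]/acceso[1]
        conn = MySQLdb.connect(host=host, port=port, user=user, passwd=passwd, db=db)
        return (conn, conn.cursor())

    def raw_query_SQL(cursor, query):
        # Runs one SQL statement and returns whatever rows it produced
        cursor.execute(query)
        return cursor.fetchall()

    def close_Connection(conn):
        conn.close()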
Example #2
 def __init__(self, options):
     self.options=options
     if self.options.monitor and (not self.options.fileout and not self.options.streamout):
         self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,\
         self.options.user, self.options.passwd, self.options.database)
     self.nspace_dict={}
     self.codens=''
     self.page_dict={}
     self.rev_dict = {}
     self.stack=[]
     self.current_text = ''
     self.current_elem=None
     self.revfile=None
     self.pagefile=None
     self.page_num = 0
     self.rev_num=0
     self.last_page_len=0
     self.rev_count=0
     self.prior_rev_id='NULL'
     self.isRedirect='0'
     self.isStub='0'
     self.isMinor='0'
     self.inlinks=None # internal links
     self.outlinks=None # external links
     self.trans=None # translations to other language editions
     self.sections=None # sections (no matter their level)
     self.highwords=None #highlighted words (bold/italics/bold+italics)
     self.special=None #rev_text, special links filtered out
     ########################################
     ##REGEXPS
     ########################################
     self.pathighlight=r"\'\'+"#Regexp matching bold/italics/bold+italics wikitags
     self.pathighwords=r"\'\'+.*\'\'+" #Regexp for highlighted words
     self.pathtml=r"\<[^\>]+\>" #Regexp matching HTML tags
     self.patunicode=r"\&\w+\;|\&\#\d+\;|[\xc0-\xf7][\x80-\xbf]+" #Regexp matching unicode chars
     self.patspecial=r"\[\[[^\:\]]+\:[^\]]*\]\]" #Regexp matching special inlinks (image/category/interwiki)
     self.patinlink=r"\[\[.*\]\]" #Regexp matching inlinks (after filtering image/category/interwiki links)
     self.patoutlink=r"\s\[[^\[\]]*\]|http[s]?://" #Regexp matching outlinks
     self.patsection=r"\=\=+[\s]*[^\=]*[\s]*\=\=+" #Regexp matching section titles
     self.pattrans=r"\[\[..[.]?:"#Regexp matching translation links
     self.patitemize=r"\n\**" #Regexp matching itemize bullets and line branches
     self.patdumb=r"\)\(" #A quick fix to concatenate tuples in special insert strings
     self.fileErrPath="./errors.log"
     #TODO: Solve lookup in global scope if the special item did not show up in any previous revision
     #of the whole dump (maybe lookup query to DB??)
     self.highwords_dict={}; self.special_dict={}; self.inlinks_dict={};
     self.outlinks_dict={}; self.trans_dict={}
     self.highwords_id=1; self.special_id=1; self.inlinks_id=1; self.outlinks_id=1
     self.trans_id=1;
     self.highwords_rev_insert=[]; self.special_rev_insert=[]; self.inlinks_rev_insert=[]
     self.outlinks_rev_insert=[]; self.trans_rev_insert=[];
     self.revinsert=''
     self.pageinsert=''
     self.revinsertrows=0
     self.revinsertsize=0
     self.pageinsertrows=0
     self.pageinsertsize=0
     self.start=datetime.datetime.now()
     self.timeCheck=None
     self.timeDelta=None
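
The parsing state above is driven almost entirely by those regular expressions over raw wikitext. A quick illustration of how three of them behave on an invented sample (note that patspecial must be applied before patinlink, because the greedy patinlink would otherwise swallow category/image links too):

    import re

    sample = "'''Bold''' text with [[internal link]] and [[Category:Test]].\n== History =="

    print re.findall(r"\[\[[^\:\]]+\:[^\]]*\]\]", sample)    # ['[[Category:Test]]']
    print re.findall(r"\=\=+[\s]*[^\=]*[\s]*\=\=+", sample)  # ['== History ==']
    print re.findall(r"\[\[.*\]\]", sample)                  # greedy: one match spanning both links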
Example #3
 def __init__(self, options):
     self.fileErrPath = "./errors.log"
     self.options = options
     if options.monitor and (not options.fileout and not options.streamout):
         self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,\
         self.options.user, self.options.passwd, self.options.database)
     self.nspace_dict = {}
     self.codens = ''
     self.page_dict = {}
     self.rev_dict = {}
     self.stack = []
     self.current_text = ''
     self.current_elem = None
     self.revfile = None
     self.pagefile = None
     self.page_num = 0
     self.rev_num = 0
     self.last_page_len = 0
     self.rev_count = 0
     self.prior_rev_id = 'NULL'
     self.isRedirect = '0'
     self.isStub = '0'
     self.isMinor = '0'
     self.revinsert = ''
     self.pageinsert = ''
     self.textinsert = ''
     self.revinsertrows = 0
     self.revinsertsize = 0
     self.pageinsertrows = 0
     self.pageinsertsize = 0
     self.textinsertrows = 0
     self.textinsertsize = 0
     self.start = datetime.datetime.now()
     self.timeCheck = None
     self.timeDelta = None
Example #4
 def overall(self):
     """
     Preprocessing views and tables for redirects and talk pages
     """
     for self.language in self.languages:
         self.dbname="wx_"+self.language+"wiki_research"	
         self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         dbaccess.raw_query_SQL(self.access[1], "create or replace view page_redirect as "+\
         "(select page_id from page where page_namespace=0 and page_is_redirect=1)")
         dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_redirect as ("+\
         "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
         "(select page_id from page_redirect))")
         dbaccess.raw_query_SQL(self.access[1], "create or replace view page_talk as "+\
         "(select page_id from page where page_namespace=1)")
         dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_talk as ("+\
         "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
         "(select page_id from page_talk))")
         self.minyear=dbaccess.raw_query_SQL(self.access[1],"select min(year(rev_timestamp)) from revision")
         self.years=range(int(self.minyear[0][0])+1, 2009)
         for self.year in self.years:
             dbaccess.raw_query_SQL(self.access[1],"drop table if exists max_rev_talk_"+str(self.year))
                     dbaccess.raw_query_SQL(self.access[1],"create table max_rev_talk_"+str(self.year)+\
             " as (select max(rev_id) as max_id, rev_page from rev_talk "+\
             "where year(rev_timestamp)<"+str(self.year)+" group by rev_page)")
             dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add primary key (max_id)")
             dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add index (rev_page)")
             
         dbaccess.close_Connection(self.access[0])
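
The yearly snapshot loop above builds each statement by string concatenation; Example #1 already uses string.Template for its URLs, and the same idiom fits this DDL. A sketch under that assumption (the function name is invented; table and column names are the ones used above):

    import string
    import dbaccess  # the helper module these examples rely on

    create_tpl = string.Template(
        "create table max_rev_talk_$year as "
        "(select max(rev_id) as max_id, rev_page from rev_talk "
        "where year(rev_timestamp)<$year group by rev_page)")

    def build_yearly_talk_tables(cursor, first_year, last_year=2009):
        # One snapshot table per year, mirroring the loop in the method above
        for year in range(first_year, last_year):
            dbaccess.raw_query_SQL(cursor, "drop table if exists max_rev_talk_"+str(year))
            dbaccess.raw_query_SQL(cursor, create_tpl.substitute(year=year))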
Example #5
    def performAnalysis(self):

        ##        Get DB connection
        self.acceso = dbaccess.get_Connection("localhost", 3306, "root",
                                              "phoenix",
                                              self.language + "_stub")
        ##        Singleton objects to plot graphics in the class methods
        self.simpleGraph = graphic2D(self.filePath)
        ##        self.multiGraph=graphic2Dmulti(self.filePath)
        ##        self.giniGraph=graphicGini(self.filePath)
        ##        self.splitHistGraph=graphicSplitHist(self.filePath, self.dataPath)
        self.graph3D = graphic3D(self.filePath, self.dataPath)
        print "Starting analysis on DB " + self.language + "_stub\n"
        ##        self.UserNumContribsGroup(self.acceso[1])
        ##        self.UserNumContribsGenerations()
        authorsGini = [(95.9677, 4.046), (95.7015, 4.304), (96.2223, 4.363),
                       (95.7104, 4.395), (96.3844, 4.407), (92.4691, 4.528),
                       (95.0077, 4.603), (95.0071, 4.7298), (93.785, 5.051),
                       (93.6076, 5.888)]
        authorsGini.sort()
        ##authorsGini=[(4.046,95.9677),(4.304,95.7015),(4.363,96.2223),(4.395,95.7104),(4.407,96.3844),(4.528,92.4691),(4.603,95.0077),(4.7298,95.0071),(5.051,93.785),(5.888,93.6076)]

        self.simpleGraph.createGraphic(
            "authors-Gini", (authorsGini, ), "Gini coeff. (%)",
            "Number of different authors (log)",
            "Gini coeff. vs. number of registered authors in the top-ten Wikipedias."
        )
        ##            Close DB connection
        dbaccess.close_Connection(self.acceso[0])
        print "This is finished"
Example #6
    def prepro_pagelen(self):
        """
        Preprocessing tables for evolution of page length over time
        """
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"   
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            
            #VIEW page_main_nored (pages in main nspace excluding redirects)
            dbaccess.raw_query_SQL(self.access[1], "create or replace view page_main_nored as "+\
            "(select page_id from page where page_namespace=0 and page_is_redirect=0)")

            #VIEW rev_main_nored (revisions in main nspace in all pages, excluding redirects)
            dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_main_nored as ("+\
            "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
            "(select page_id from page_main_nored))")
            
            #TABLES max_rev_YYYY (latest revision for each page in main nspace, up to year YYYY)
            self.minyear=dbaccess.raw_query_SQL(self.access[1],"select min(year(rev_timestamp)) from revision")
            self.years=range(int(self.minyear[0][0])+1, 2009)
            for self.year in self.years:
                dbaccess.raw_query_SQL(self.access[1],"drop table if exists max_rev_"+str(self.year))
                dbaccess.raw_query_SQL(self.access[1],"create table max_rev_"+str(self.year)+\
                " as (select max(rev_id) as max_id, rev_page from rev_main_nored "+\
                "where year(rev_timestamp)<"+str(self.year)+" group by rev_page)")
                dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_"+str(self.year)+" add primary key (max_id)")
                dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_"+str(self.year)+" add index (rev_page)")
                
            dbaccess.close_Connection(self.access[0])
Example #7
    def __init__(self, options):
        self.options=options
        if self.options.monitor and (not self.options.fileout and not self.options.streamout):
            self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,\
            self.options.user, self.options.passwd, self.options.database)
        self.log_dict = {}
        self.stack=[]
        self.current_text = ''
        self.current_elem=None
        #self.log_count=0
        self.log_num=0

        self.nspace_dict={}
        self.codens=''
        
        ########################################
        ##REGEXPS
        ########################################
        

        ########################################
        #REMAINING GLOBAL ATTRIBUTES
        ########################################
        self.fileErrPath="./errors"+self.options.database+".log"
        self.loginsert=''
        self.loginsertrows=0
        self.loginsertsize=0
        self.start=datetime.datetime.now()
        self.timeCheck=None
        self.timeDelta=None
Example #8
 def infoPages(self):
     ##	Generates statistics per article
     ##	Get new DB connection
     self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
     "wx_"+self.language+"_"+self.conf.dumptype)
     
     #Local configuration
     target="page_id"
     intervals=["months", "quarters","weeks"]
     
     ###########################
     #Total num of revisions per page
     for nspace in self.nspaces:
         self.__total_rev(self.acceso[1], nspace+"_"+self.language, target)
     
     ###########################
     #Total number of different editors per page
     for nspace in self.nspaces:
         self.__total_rev_diff(self.acceso[1], nspace+"_"+self.language, target)
     
     ###########################
     #Total number of revisions per page for several time intervals
     #Currently, we are only interested in months, quarters and weeks
     for nspace in self.nspaces:
         for interval in intervals:
             self.__total_rev_time(self.acceso[1], interval,nspace+"_"+self.language, target)
     
     ###########################
     #Total number of different editors per page; per month, quarter and week
     for nspace in self.nspaces:
         for interval in intervals:
             self.__total_rev_diff_time(self.acceso[1], interval,nspace+"_"+self.language, target)
     
     #Close DB connection
     dbaccess.close_Connection(self.acceso[0])
Example #9
 def generalStatistics(self):
     ##  Computes the views containing general statistics and overall information:
     ##  For all namespaces (official and artificial):
     ################
     ##  View _overall_statistics1_months, which includes
     ##  Total num of pages with at least one edit in that month, total number of contribs,
     ##  total num of users who made at least 1 edit in that month (alive_users)
     ####################################
     ##  Parameters from Wikistats by Erik Zachte
     ####################################
     ##  Wikipedians: contributors, active wikipedians, very active wikipedians
     ##  Articles: (WARNING: readable contents are not being filtered out yet)
     ##  new articles per day, edits per article, bytes per article, % of articles over 0.5k,
     ##  % of articles over 2k
     ##  Total size of contribs per month
     ##  Size of pages and number of different authors who have edited them
     ####################################
     ##    Get new DB connection
     self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
     "wx_"+self.language+"_"+self.conf.dumptype)
     ##    General statistics
     for nspace in self.nspaces:
         self.__gral_stats(self.acceso[1], nspace + "_" + self.language)
      ##    Close DB connection
     dbaccess.close_Connection(self.acceso[0])
Example #10
def contributions(idiomas):
    """
    Create some graphs and files with statistical results about authors' contributions
    
    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    for idioma in idiomas:
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        #acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_pages")
        #dbaccess.query_SQL(acceso[1], "page_id, page_namespace", "page", where="page_namespace=0", create="pag_namespace")
        tcnoann=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_NoAnnons_author_"+idioma)
        tcauthor=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_author_"+idioma)
        #tc_ann=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_Annons_author_text_"+idioma)
        dbaccess.close_Connection(acceso[0])
        
        data=__tup_to_list(tcnoann)
        listay_tcnoann=data.pop()
        listax=data.pop()
        data=__tup_to_list(tcauthor)
        listay_tcauthor=data.pop()
        listax=data.pop()
        #data=__tup_to_list(tc_ann)
        #listay_tc_ann=data.pop()
        #listax=data.pop()
        r.png("graphics/"+idioma+"/gini_TContrib_NoAnn_"+idioma+".png")
        __lorenz_Curve(listay_tcnoann)
        r.png("graphics/"+idioma+"/gini_TContrib_"+idioma+".png")
        __lorenz_Curve(listay_tcauthor)
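
__tup_to_list and __lorenz_Curve are private helpers of this module and are not shown on this page. For reference, what the resulting Gini plots summarize can be computed directly; a self-contained sketch using the trapezoidal area under the Lorenz curve:

    def gini_coeff(values):
        # Gini coefficient of non-negative contribution counts:
        # 0 = perfectly equal, values near 1 = highly concentrated
        values = sorted(values)
        n = len(values)
        total = float(sum(values))
        cum = 0.0
        area = 0.0
        for v in values:
            prev = cum
            cum += v / total
            area += (prev + cum) / (2.0 * n)  # trapezoid under the Lorenz curve
        return 1 - 2 * area

    print gini_coeff([1, 1, 1, 1])    # 0.0
    print gini_coeff([0, 0, 0, 100])  # 0.75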
Example #11
    def infoAuthors(self):
        ##  Generates statistics per user
        ##  Get DB connection
        self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
        "wx_"+self.language+"_"+self.conf.dumptype)

        ##	local configuration: retrieving info for authors
        target = "author"
        ##	intervals might be days, weeks, months, quarters, years
        intervals = ["months", "quarters", "weeks"]

        ############################
        #Number of total revisions per author ID
        for nspace in self.nspaces:
            self.__total_rev(self.acceso[1], nspace + "_" + self.language,
                             target)

        ############################
        #Different articles edited per user
        for nspace in self.nspaces:
            self.__total_rev_diff(self.acceso[1], nspace + "_" + self.language,
                                  target)

        ############################
        #Total num of articles started per author
        #We consider as the beginning of an article the first revision of that article
        for nspace in self.nspaces:
            self.__total_page_init_author(self.acceso[1],
                                          nspace + "_" + self.language)

        ############################
        #Total number of revisions per author for several time intervals
        #Currently, we are only interested in data per months, quarters and weeks
        for nspace in self.nspaces:
            for interval in intervals:
                self.__total_rev_time(self.acceso[1], interval,
                                      nspace + "_" + self.language, target)

        ############################
        #Num of different articles revised per author for several time intervals
        for nspace in self.nspaces:
            for interval in intervals:
                self.__total_rev_diff_time(self.acceso[1], interval,
                                           nspace + "_" + self.language,
                                           target)

        ############################
        #Num of different articles initiated per author
        for nspace in self.nspaces:
            for interval in intervals:
                self.__total_page_init_author_time(
                    self.acceso[1], interval, nspace + "_" + self.language)

        ############################
        #   BIRTH AND DEATH ANALYSIS FOR THE AUTHOR COMMUNITY
        ############################

        #Close DB connection
        dbaccess.close_Connection(self.acceso[0])
Example #12
 def ratios(self):
     """
     .dat files showing interesting descriptive ratios
     """
     #FILE author-pages.dat: ratio of no. logged editors to no. user pages
     file=open("overall/data/author-pages.dat",'w')
     file.write("logged_authors\tuser_pages\tratio\tlang\n")
     file.close()
     for self.language in self.languages:
         self.dbname="wx_"+self.language+"wiki_research" 
         self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         #Obtain number of different logged authors
         self.logged_authors=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(rev_user)) from "+\
         "revision where rev_user!=0")
         #Obtain number of different user pages (nspace =2)
         self.user_pages=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
         "page where page_namespace=2")
         dbaccess.close_Connection(self.access[0])
         #Writing data to file
         file=open("overall/data/author-pages.dat",'a')
         file.write(str(int(self.logged_authors[0][0]))+"\t"+str(int(self.user_pages[0][0]))+"\t"+\
         str(float(self.user_pages[0][0])/float(self.logged_authors[0][0]))+"\t"+self.language+"\n")
         file.close()
         #print "Completed lang "+self.language+"\n"
 
     #FILE articles-talk-ratio.dat ratio of no. articles/no. talk pages (excluding redirects)
     file=open("overall/data/articles-talk-ratio.dat",'w')
     file.write("articles\ttalk\tratio\tlang\n")
     file.close()
     for self.language in self.languages:
         self.dbname="wx_"+self.language+"wiki_research" 
         self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         #Obtain number of articles excluding redirects
         self.articles=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
         "page where page_namespace=0 and page_is_redirect=0")
         #Obtain number of talk pages
         self.talk=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
         "page where page_namespace=1")
         dbaccess.close_Connection(self.access[0])
         #Writing data to file
         file=open("overall/data/articles-talk-ratio.dat",'a')
         file.write(str(int(self.articles[0][0]))+"\t"+str(int(self.talk[0][0]))+"\t"+\
         str(float(self.talk[0][0])/float(self.articles[0][0]))+"\t"+self.language+"\n")
         file.close()
Example #13
    def calculate(self):
        self.access = dbaccess.get_Connection("localhost", 3306, self.user,\
        self.passw, "wx_"+self.language+"wiki_"+self.dumptype)
        
        try:
            print "Creating table for logged users..."
            users=dbaccess.raw_query_SQL(self.access[1],"create table lag_info (rev_user INT(10) UNSIGNED NOT NULL,"+\
            "fecha1 datetime not null, fecha2 datetime not null)")
        except Exception, e:
            print "An exception occurred, the problem was the following:\n"
            print e
            print "*************\n\n"
Example #14
 def infoAuthors(self):
     ##  Generates statistics per user
     ##  Get DB connection
     self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
     "wx_"+self.language+"_"+self.conf.dumptype)
     
     ##	local configuration: retrieving info for authors
     target="author"
     ##	intervals might be days, weeks, months, quarters, years
     intervals=["months", "quarters","weeks"]
     
     ############################
     #Number of total revisions per author ID
     for nspace in self.nspaces:
         self.__total_rev(self.acceso[1], nspace+"_"+self.language, target)
     
     ############################
     #Different articles edited per user
     for nspace in self.nspaces:
         self.__total_rev_diff(self.acceso[1], nspace+"_"+self.language, target)
     
     ############################
     #Total num of articles started per author
     #We consider as the beginning of an article the first revision of that article
     for nspace in self.nspaces:
         self.__total_page_init_author(self.acceso[1], nspace+"_"+self.language)
     
     ############################
     #Total number of revisions per author for several time intervals
     #Currently, we are only interested in data per months, quarters and weeks
     for nspace in self.nspaces:
         for interval in intervals:
             self.__total_rev_time(self.acceso[1], interval,nspace+"_"+self.language, target)
     
     ############################
     #Num of different articles revised per author for several time intervals
     for nspace in self.nspaces:
         for interval in intervals:
             self.__total_rev_diff_time(self.acceso[1], interval,nspace+"_"+self.language, target)
     
     ############################
     #Num of different articles initiated per author
     for nspace in self.nspaces:
         for interval in intervals:
             self.__total_page_init_author_time(self.acceso[1], interval,nspace+"_"+self.language)
             
     ############################
     #   BIRTH AND DEATH ANALYSIS FOR THE AUTHOR COMMUNITY
     ############################
     
     #Close DB connection
     dbaccess.close_Connection(self.acceso[0])
Example #15
 def infoContents(self):
     ###########################
     #Contents analysis
     ###########################
     ##  Get DB connection
     self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
     "wx_"+self.language+"_"+self.conf.dumptype)
     ## For all namespaces (official and artificial):
     ## Evolution in time of the length of user contributions (per month; per quarter)
     ## Evolution in time of the length of pages (per month; per quarter supported but commented)
     for nspace in self.nspaces:
         self.__content_evolution(self.acceso[1], nspace+"_"+self.language)
     dbaccess.close_Connection(self.acceso[0])
Example #16
 def __init__(self, options):
     self.fileErrPath="./errors.log"; self.options=options
     if self.options.monitor and not self.options.fileout and not self.options.streamout:
         self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,\
         self.options.user, self.options.passwd, self.options.database)
     self.nspace_dict={}; self.codens=''; self.page_dict={}; self.rev_dict = {}
     self.stack=[]; self.current_text = ''; self.current_elem=None; self.revfile=None
     self.pagefile=None
     self.page_num = 0; self.rev_num=0; self.last_page_len=0; self.rev_count=0
     self.prior_rev_id='NULL'; self.isRedirect='0'; self.isStub='0'; self.isMinor='0'
     self.revinsert=''; self.pageinsert=''; self.textinsert=''
     self.revinsertrows=0; self.revinsertsize=0; self.pageinsertrows=0
     self.pageinsertsize=0; self.textinsertrows=0; self.textinsertsize=0
     self.start=datetime.datetime.now(); self.timeCheck=None; self.timeDelta=None
Example #17
 def infoContents(self):
     ###########################
     #Contents analysis
     ###########################
     ##  Get DB connection
     self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
     "wx_"+self.language+"_"+self.conf.dumptype)
     ## For all namespaces (official and artificial):
     ## Evolution in time of the length of user contributions (per month; per quarter)
     ## Evolution in time of the length of pages (per month; per quarter supported but commented)
     for nspace in self.nspaces:
         self.__content_evolution(self.acceso[1],
                                  nspace + "_" + self.language)
     dbaccess.close_Connection(self.acceso[0])
Example #18
 def overall(self):
     """
     Computes ratio files: logged editors vs. user pages, and articles vs. talk pages
     """
     file=open("author-pages.dat",'w')
     file.write("logged_authors\tuser_pages\tratio\tlang\n")
     file.close()
     for self.language in self.languages:
         self.dbname="wx_"+self.language+"wiki_research"	
         self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         self.logged_authors=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(rev_user)) from "+\
         "revision where rev_user!=0")
         self.user_pages=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
         "page where page_namespace=2")
         dbaccess.close_Connection(self.access[0])
         file=open("author-pages.dat",'a')
         file.write(str(int(self.logged_authors[0][0]))+"\t"+str(int(self.user_pages[0][0]))+"\t"+\
         str(float(self.user_pages[0][0])/float(self.logged_authors[0][0]))+"\t"+self.language+"\n")
         file.close()
         print "Completed lang "+self.language+"\n"
     
     file=open("articles-talk-ratio.dat",'w')
     file.write("articles\ttalk\tratio\tlang\n")
     file.close()
     for self.language in self.languages:
         self.dbname="wx_"+self.language+"wiki_research"	
         self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         self.articles=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
         "page where page_namespace=0 and page_is_redirect=0")
         self.talk=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\
         "page where page_namespace=1")
         dbaccess.close_Connection(self.access[0])
         file=open("articles-talk-ratio.dat",'a')
         file.write(str(int(self.articles[0][0]))+"\t"+str(int(self.talk[0][0]))+"\t"+\
         str(float(self.talk[0][0])/float(self.articles[0][0]))+"\t"+self.language+"\n")
         file.close()
         print "Completed lang "+self.language+"\n"
Example #19
 def cox_prop(self):
     """
     Creates intermediate files and tables for Cox proportional-hazards analysis
     """
     #Initialize file header
     f=open("wkp_cox_prop_all.dat",'w')
     f.write("Project,rev_user,min_ts,max_ts,in_talk,in_FAs\n")
     f.close()
     
     for self.language in self.languages:
         self.dbname="wx_"+self.language+"wiki_research"
     
         print "Starting language "+self.language+"\n"
         self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         
         ##TABLE: Create table of users in talk pages
         dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_in_talk")
         dbaccess.raw_query_SQL(self.access[1],"create table users_in_talk as (select distinct(rev_user) from revision "+\
         "where rev_page in (select page_id from page where page_namespace=1))")
         dbaccess.raw_query_SQL(self.access[1],"alter table users_in_talk add primary key (rev_user)")
         
         ##TABLE: Create table of users in FAs
         dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_in_FAs")
         dbaccess.raw_query_SQL(self.access[1],"create table users_in_FAs as (select distinct(rev_user) from revision_FAs)")
         dbaccess.raw_query_SQL(self.access[1],"alter table users_in_FAs add primary key (rev_user)")
         
         ##TABLE: MIX previous info with time_range_authors --> save result in new table time_range_cox
         dbaccess.raw_query_SQL(self.access[1],"drop table if exists time_range_cox")
         dbaccess.raw_query_SQL(self.access[1],"create table time_range_cox as (select rev_user, "+\
         "date(min_ts) as min_ts, date(max_ts) as max_ts, "+\
         "case when rev_user in (select rev_user from users_in_talk) then 1 else 0 end as in_talk, "+\
         "case when rev_user in (select rev_user from users_in_FAs) then 1 else 0 end as in_FAs "+\
         "from time_range_authors)")
     
         ##IN SYSTEM
         print "Interm. tables created proceeding to write out data..."+self.language+"\n"
         results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, min_ts, max_ts, in_talk, in_FAs "+\
         "from time_range_cox "+\
         " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
         #Close DB connection
         dbaccess.close_Connection(self.access[0])
     
         f=open("wkp_cox_prop_all.dat",'a')
         for result in results:
             f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\","+\
             str(int(result[3]))+","+str(int(result[4]))+"\n")
         f.close()
         print "Finished all cox-prop tasks for "+self.language+"\n"
Example #20
    def test_funciones(self):
        self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
        "wx_"+self.language+self.conf.dumptype)
        __total_rev(self.acceso[1], table="stats_nlwiki", target="author")
        ##	targets=["page_id"]
        ##	for target in targets:
        ##		__total_rev(self.acceso[1], language, target)
        ##		__total_rev_target(self.acceso[1], language, target)
        ##		__total_rev_time(self.acceso[1],"years",language, target)
        ##		__total_rev_target_time(self.acceso[1],"years",language, target)
        ##	__total_article_init_author(self.acceso[1], language)
        ##	__article_init_author_time(self.acceso[1],"years",language)

        ##    __article_rev_author_time(self.acceso[1], "years", language)
        ##	__total_rev_time(self.acceso[1],"months",language, "page_id")
        ##	__total_article_init_author(self.acceso[1], language, target="author")
        ##	__content_evolution(self.acceso[1], language)
        dbaccess.close_Connection(self.acceso[0])
Example #21
 def make_indexes(self):
     self.access = dbaccess.get_Connection(
         "localhost", 3306, self.user, self.passw, "wx_" + self.language + "wiki_" + self.dumptype
     )
     # Generate adequate indexes and keys in tables page and revision
     # try:
     # print "Generating index for page_len...\n"
     # dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE page ADD INDEX page_len(page_len)")
     # except Exception, e:
     # print "An exception ocurred, the problem was the following:\n"
     # print e
     # print "*************\n\n"
     try:
         print "Creating index for rev_timestamp"
         dbaccess.raw_query_SQL(self.access[1], "ALTER TABLE revision ADD INDEX timestamp(rev_timestamp)")
     except Exception, e:
         print "An exception ocurred, the problem was the following:\n"
         print e
         print "*************\n\n"
Example #22
    def time_range(self):
        """
        Creates intermediate tables with time frame of editors activity
        """
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            
            ##### TIME RANGE FOR AUTHORS IN ALL NAMESPACES
            #TABLE: Total no. of revisions made by every logged author
            dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS user_revs AS "+\
            "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND "+\
            "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
            dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")
            
            print "Created table user_revs for "+self.language+"wiki...\n"
            
            #TABLE: Min and max timestamp for every logged author + total num_revs
            dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS time_range_authors AS "+\
            "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM "+\
            "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x "+\
            "ORDER BY min_ts)")
            
            print "Created table time_range_authors for "+self.language+"wiki...\n"
            
            ##### TIME RANGE FOR AUTHORS IN MAIN ONLY
                print "Processing language "+self.language+"\n"
            #VIEW: Create view for filtering annons and bots
            #Filter from rev_main_nored revisions from logged authors only
            dbaccess.raw_query_SQL(self.access[1],"create or replace view revision_logged as (select * from rev_main_nored "+\
            " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot') )")
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists time_range_users")
            #TABLE: Intermediate table, storing for each logged author the min and max ts in the system
            dbaccess.raw_query_SQL(self.access[1],"create table time_range_users as (SELECT rev_user, "+\
            "min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision_logged group by rev_user)")
            dbaccess.raw_query_SQL(self.access[1],"alter table time_range_users add primary key (rev_user)")
            
            print "Created time_range_users for "+self.language +"\n"

            #Close DB connection
            dbaccess.close_Connection(self.access[0])
Example #23
def comparative_contributions():
    listaidiomas=["dewiki", "jawiki", "frwiki", "plwiki", "nlwiki", "itwiki", "ptwiki", "eswiki", "svwiki"]
##    lista=["eswiki", "svwiki"]
    
    r.png("graphics/AAA/gini_comparative_top10.png")
    flag=0
    for idioma in listaidiomas:
        print "Starting comparative Gini analysis for language..."+idioma+"\n"
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        tcnoann=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_NoAnnons_author_"+idioma)
        dbaccess.close_Connection(acceso[0])
        data=__tup_to_list(tcnoann)
        listay_tcnoann=data.pop()
        listax=data.pop()
        _lorenz_Comp_Curves(listay_tcnoann,flag)
        flag=1
    r.dev_off()
    print "Comparative graphic for Gini curves finished!!"
Example #24
    def infoPages(self):
        ##	Generates statistics per article
        ##	Get new DB connection
        self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
        "wx_"+self.language+"_"+self.conf.dumptype)

        #Local configuration
        target = "page_id"
        intervals = ["months", "quarters", "weeks"]

        ###########################
        #Total num of revisions per page
        for nspace in self.nspaces:
            self.__total_rev(self.acceso[1], nspace + "_" + self.language,
                             target)

        ###########################
        #Total number of different editors per page
        for nspace in self.nspaces:
            self.__total_rev_diff(self.acceso[1], nspace + "_" + self.language,
                                  target)

        ###########################
        #Total number of revisions per page for several time intervals
        #Currently, we are only interested in months, quarters and weeks
        for nspace in self.nspaces:
            for interval in intervals:
                self.__total_rev_time(self.acceso[1], interval,
                                      nspace + "_" + self.language, target)

        ###########################
        #Total number of different editors per page; per month, quarter and week
        for nspace in self.nspaces:
            for interval in intervals:
                self.__total_rev_diff_time(self.acceso[1], interval,
                                           nspace + "_" + self.language,
                                           target)

        #Close DB connection
        dbaccess.close_Connection(self.acceso[0])
Example #25
def histogram(idiomas):
    """
    Create histograms depicting article size distribution for a certain language version
    
    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    filenames=["boxplot_log.png", "histogram_log.png", "histogram_log_low.png", "histogram_log_high.png", "ecdf_log_low.png", "ecdf_log_high.png", "data/page_len_log.data", "/data/histograms.info", "ecdf_total.png"]
    
    for idioma in idiomas:
        print "Creando histogramas para el idioma ... "+idioma
        #Print to another file the names of graphics files, following the order in the GNU R script histogram.R
        f=open("./data/hist_files_names.data",'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
    
        #Considering only database pages corresponding to articles, with NAMESPACE=MAIN=0
        #dbaccess.dropTab_SQL(acceso[1], "aux")
        #dbaccess.query_SQL(acceso[1],"page_id, page_len","page", where="page_namespace=0", order="page_len", create="aux")
        result=dbaccess.query_SQL(acceso[1], "page_id, page_len", "aux")
        dbaccess.close_Connection(acceso[0])
        data=__tup_to_list(result)
        page_len=data.pop()
        for i in range(len(page_len)):
            if page_len[i]!=0:
                page_len[i]=math.log10(page_len[i])
        
        #Print to another file a list with article sizes to plot histograms
        f=open("./graphics/"+idioma+"/data/page_len_log.data", 'w')
        for value in page_len:
            f.writelines(str(value)+"\n")
        f.close()
        
        #CALL THE GNU R SCRIPT Histogram.R
        succ=os.system("R --vanilla < ./histogram.R > debug_R")
        if succ==0:
            print "Function histogram executed successfully for language... "+idioma
Example #26
    def performAnalysis(self):

        ##        Get DB connection
        self.acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", self.language + "_stub")
        ##        Singleton objects to plot graphics in the class methods
        self.simpleGraph = graphic2D(self.filePath)
        ##        self.multiGraph=graphic2Dmulti(self.filePath)
        ##        self.giniGraph=graphicGini(self.filePath)
        ##        self.splitHistGraph=graphicSplitHist(self.filePath, self.dataPath)
        self.graph3D = graphic3D(self.filePath, self.dataPath)
        print "Starting analysis on DB " + self.language + "_stub\n"
        ##        self.UserNumContribsGroup(self.acceso[1])
        ##        self.UserNumContribsGenerations()
        authorsGini = [
            (95.9677, 4.046),
            (95.7015, 4.304),
            (96.2223, 4.363),
            (95.7104, 4.395),
            (96.3844, 4.407),
            (92.4691, 4.528),
            (95.0077, 4.603),
            (95.0071, 4.7298),
            (93.785, 5.051),
            (93.6076, 5.888),
        ]
        authorsGini.sort()
        ##authorsGini=[(4.046,95.9677),(4.304,95.7015),(4.363,96.2223),(4.395,95.7104),(4.407,96.3844),(4.528,92.4691),(4.603,95.0077),(4.7298,95.0071),(5.051,93.785),(5.888,93.6076)]

        self.simpleGraph.createGraphic(
            "authors-Gini",
            (authorsGini,),
            "Gini coeff. (%)",
            "Number of different authors (log)",
            "Gini coeff. vs. number of registered authors in the top-ten Wikipedias.",
        )
        ##            Close DB connection
        dbaccess.close_Connection(self.acceso[0])
        print "This is finished"
Example #27
    def prepro_red_talk(self):
        """
        Data and evolution for redirects and talk pages
        """
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research" 
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)

            #VIEW page_redirect (pages with redirect flag activated)
            dbaccess.raw_query_SQL(self.access[1], "create or replace view page_redirect as "+\
            "(select page_id from page where page_namespace=0 and page_is_redirect=1)")

            #VIEW rev_redirect (revisions corresponding to redirect pages)
            dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_redirect as ("+\
            "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
            "(select page_id from page_redirect))")

            #VIEW page_talk (pages in talk nspace)
            dbaccess.raw_query_SQL(self.access[1], "create or replace view page_talk as "+\
            "(select page_id from page where page_namespace=1)")

            #VIEW rev_talk (revisions corresponding to talk pages)
            dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_talk as ("+\
            "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
            "(select page_id from page_talk))")

            #TABLES max_rev_talk_YYYY (latest revision for each page in talk nspace, up to year YYYY)
            self.minyear=dbaccess.raw_query_SQL(self.access[1],"select min(year(rev_timestamp)) from revision")
            self.years=range(int(self.minyear[0][0])+1, 2009)
            for self.year in self.years:
                dbaccess.raw_query_SQL(self.access[1],"drop table if exists max_rev_talk_"+str(self.year))
                dbaccess.raw_query_SQL(self.access[1],"create table max_rev_talk_"+str(self.year)+\
                " as (select max(rev_id) as max_id, rev_page from rev_talk "+\
                "where year(rev_timestamp)<"+str(self.year)+" group by rev_page)")
                dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add primary key (max_id)")
                dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add index (rev_page)")
                
            dbaccess.close_Connection(self.access[0])
Example #28
 def general_stats(self):
     """
     Preprocessing actions for general statistics scripts
     """
     #FILE page_len.dat, with info about length of pages
     self.f=open("overall/data/page_len.dat", 'w')
     self.f.write("page_len\tns\tis_redirect\tis_stub\tis_new\tlang\n")
     self.f.close()
     for self.language in self.languages:
         self.dbname="wx_"+self.language+"wiki_research" 
         self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         print "Retrieving info from "+self.language+"\n"
         results=dbaccess.raw_query_SQL(self.access[1], "SELECT page_len, page_namespace, page_is_redirect, page_is_stub, "+\
         "page_is_new FROM page")
         print "Updating page_len info file with "+self.language+"\n"
             
         self.f=open("overall/data/page_len.dat", 'a')
         for result in results:
             self.f.write(str(int(result[0]))+"\t"+str(int(result[1]))+"\t"+str(int(result[2]))+"\t"+\
             str(int(result[3]))+"\t"+str(int(result[4]))+"\t"+self.language+"\n")
         self.f.close()
         results=None
         dbaccess.close_Connection(self.access[0])
Ejemplo n.º 33
0
    def analyze(self):

        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            
            #Total no. of revisions made by every logged author
            dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS user_revs AS "+\
            "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND "+\
            "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
            dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")
            
            print "Created table user_revs for "+self.language+"wiki...\n"
            
            #Min and max timestamp for every logged author + total num_revs
            dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS time_range_allns AS "+\
            "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM "+\
            "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x "+\
            "ORDER BY min_ts)")
            
            print "Created table time_range_allns for "+self.language+"wiki...\n"
                
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
Ejemplo n.º 34
0
    def surv_files(self):
        """
        Creates all data files used as input for demography scripts in GNU R
        """
        #Initialize all files headers
        #FILE: Survival data for all users (including editors out of MAIN)
        f=open("wkp_surv_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        #FILE: Survival data for all logged users who edited in MAIN
        f=open("wkp_surv_main_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        #FILE: Survival data for all logged editors until they join the core (activity)
        f=open("wkp_surv_join_core_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        #FILE: Survival data for logged editors since they join the core until they leave it (activity)
        f=open("wkp_surv_in_core_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        #FILE: Survival data for logged editors since they leave the core until death (activity)
        f=open("wkp_surv_core_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
        #FILE: Survival data for all logged editors until they join the core (revisions)
        f=open("wkp_surv_join_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        #FILE: Survival data for logged editors since they join the core until they leave it (revisions)
        f=open("wkp_surv_in_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        #FILE: Survival data for logged editors since they leave the core until death (revisions)
        f=open("wkp_surv_core_rev_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
            
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
        
            print "Starting language "+self.language+"\n"
            ##IN SYSTEM
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_authors "+\
            " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()

            ##IN MAIN
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_users ")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_main_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished core users by activity for language "+self.language+"\n"

            ###########################
            ##REV CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_rev_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished all surv_file tasks for "+self.language+"\n"
Ejemplo n.º 35
0
 def __init__(self, conf, language="furwiki"):
     """
     Creates multiple views that provide a convenient interface to access quantitative data.
     It also generates the tables and views needed to store intermediate results, so that other
     methods can later store data directly.
     """
     self.conf=conf
     self.language=language
     ##List of namespaces to analyse. We have added new special namespaces (e.g. subsets of main)
     self.nspaces=["all","ns0","articles","redirects","cur_redirects","cur_stubs","stubs","talk",\
     "pageUser", "userTalk","meta", "metaTalk", "image", "imageTalk", "mediawiki",\
     "mediawikiTalk", "template", "templateTalk", "help", "helpTalk", "category", "categoryTalk"]
     
     ##Helper dictionaries for working with time intervals in the private methods below
     self.type_interval_columns={"days":"day, year", "weeks":"week, year", "months":"month, year",\
     "quarters":"quarter, year", "years":"year"}
     self.type_interval_select={"days":"DAYOFYEAR(rev_timestamp) AS day, YEAR(rev_timestamp) AS year ",\
     "weeks":"WEEK(rev_timestamp,1) AS week, YEAR(rev_timestamp) AS year ",\
     "months":"MONTH(rev_timestamp) AS month, YEAR(rev_timestamp) AS year ",\
     "quarters":"QUARTER(rev_timestamp) AS quarter, YEAR(rev_timestamp) AS year ",\
     "years":"YEAR(rev_timestamp) AS year "}
     self.type_interval_group={"days":"year, day", "weeks":"year, week", "months":"year, month",\
     "quarters":"year, quarter", "years":"year"}
     
     ##	Get new DB connection
     self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
     "wx_"+self.language+"_"+self.conf.dumptype)
     
     ##    Delete previous versions of views
     for nspace in self.nspaces:
         dbaccess.dropView(self.acceso[1], nspace+"_"+self.language)
     
     ##    Create updated versions for views from revision table
     #View summarizing all info for every revision (linking with info from table page)
     dbaccess.createView(self.acceso[1], view="all_"+self.language,\
     columns="rev_id, page_id, rev_len, page_ns, page_len, is_redirect, author, author_text,"+\
     " rev_timestamp, rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_namespace, page_len, rev_is_redirect,"+\
     " rev_user, rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id")
     #View summarizing info regarding pages in namespace=0 (including articles, stubs and redirects)
     dbaccess.createView(self.acceso[1], view="ns0_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, is_redirect, author, author_text,"+\
     " rev_timestamp, rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, page_is_redirect, rev_user,"+\
     " rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id"+\
     " AND page_namespace=0")
     #View summarizing info for articles (excluding pages that are currently redirects or stubs)
     dbaccess.createView(self.acceso[1], view="articles_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
     "rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text,"+\
     " rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND page_namespace=0 AND "+\
     "page_is_redirect=0 AND page_is_stub=0")
     #View with info only for redirects (pages that were redirects when that revision was made)
     dbaccess.createView(self.acceso[1], view="redirects_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
     "rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
     "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
     "page_namespace=0 AND rev_is_redirect=1")
     #View with info only for current redirects
     dbaccess.createView(self.acceso[1], view="cur_redirects_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
     "rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
     "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
     "page_namespace=0 AND page_is_redirect=1")
     #View with info only for revisions of stub pages (pages that were stubs when that revision was made)
     dbaccess.createView(self.acceso[1], view="stubs_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
     " rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
     "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
     " page_namespace=0 AND rev_is_stub=1")
     #View with info only for revisions of current stub pages
     dbaccess.createView(self.acceso[1], view="cur_stubs_"+self.language,\
     columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
     " rev_parent_id",
     query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
     "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
     " page_namespace=0 AND page_is_stub=1")
     #From this point on, automatically create views for the set of pages included in the remaining namespaces in MediaWiki
     for nspace, nsnum in zip(self.nspaces[7:], range(1,16)):
         dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language,\
         columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"+\
         " rev_parent_id",
         query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
         "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
         " page_namespace="+str(nsnum))
         
     #View summarizing the distribution of pages among namespaces
     dbaccess.dropView(self.acceso[1], "nspaces_"+self.language)
     dbaccess.createView(self.acceso[1],view="nspaces_"+self.language, columns="namespace, pages_in_nspace",\
     query="SELECT page_namespace, COUNT(*) FROM page GROUP BY page_namespace")
 
     ##    Intermediate views for the minimum timestamp of every page [anons and logged users]
     ## And other useful intermediate views regarding page evolution
     for nspace in self.nspaces:
         dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
         "_page_min_timestamp_logged")
         dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
         "_page_min_timestamp_logged", columns="page_id, rev_id, author, rev_timestamp",\
         query="SELECT page_id, rev_id,author, MIN(rev_timestamp) FROM "+\
         nspace+"_"+self.language+" WHERE author!=0 GROUP BY page_id")
         dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
         "_page_min_timestamp_annons")
         dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
         "_page_min_timestamp_annons",\
         columns="page_id, rev_id, author_text, rev_timestamp",\
         query="SELECT page_id,rev_id,author_text, MIN(rev_timestamp) FROM "+\
         nspace+"_"+self.language+" WHERE author=0 GROUP BY page_id")
         
         dbaccess.dropView(self.acceso[1],nspace+"_"+self.language+"_list_months")
         dbaccess.createView(self.acceso[1],view=nspace+"_"+self.language+"_list_months",\
         columns="month, year",query="SELECT MONTH(rev_timestamp) as month, "+\
         "YEAR(rev_timestamp) as year"+\
         " FROM "+nspace+"_"+self.language+" GROUP BY year, month ORDER BY year, month")
         
         dbaccess.dropView(self.acceso[1],nspace+"_"+self.language+"_list_quarters")
         dbaccess.createView(self.acceso[1],view=nspace+"_"+self.language+"_list_quarters",\
         columns="quarter, year",query="SELECT QUARTER(rev_timestamp) as quarter, "+\
         "YEAR(rev_timestamp) as year FROM "+nspace+"_"+self.language+" GROUP BY year,"+\
         " quarter ORDER BY year, quarter")
     
 ##    Close DB connection
     dbaccess.close_Connection(self.acceso[0])
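dbaccess.createView is the project's own wrapper. Judging from the view/columns/query arguments used throughout, and the dropView call that precedes it, it presumably issues SQL of the following shape; this is an assumption about the wrapper, not its verified source:

#Assumed shape of the statement emitted by dbaccess.createView (not verified):
def create_view_sql(view, columns, query):
    #e.g. CREATE VIEW all_furwiki (rev_id, page_id, ...) AS (SELECT ...)
    return "CREATE VIEW "+view+" ("+columns+") AS ("+query+")"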
Ejemplo n.º 36
0
    def __init__(self, conf, language="furwiki"):
        """
        Creates multiple views that provide a convenient interface to access quantitative data.
        It also generates the tables and views needed to store intermediate results, so that other
        methods can later store data directly.
        """
        self.conf = conf
        self.language = language
        ##List of namespaces to analyse. We have added new special namespaces (e.g. subsets of main)
        self.nspaces=["all","ns0","articles","redirects","cur_redirects","cur_stubs","stubs","talk",\
        "pageUser", "userTalk","meta", "metaTalk", "image", "imageTalk", "mediawiki",\
        "mediawikiTalk", "template", "templateTalk", "help", "helpTalk", "category", "categoryTalk"]

        ##Helper dictionaries for working with time intervals in the private methods below
        self.type_interval_columns={"days":"day, year", "weeks":"week, year", "months":"month, year",\
        "quarters":"quarter, year", "years":"year"}
        self.type_interval_select={"days":"DAYOFYEAR(rev_timestamp) AS day, YEAR(rev_timestamp) AS year ",\
        "weeks":"WEEK(rev_timestamp,1) AS week, YEAR(rev_timestamp) AS year ",\
        "months":"MONTH(rev_timestamp) AS month, YEAR(rev_timestamp) AS year ",\
        "quarters":"QUARTER(rev_timestamp) AS quarter, YEAR(rev_timestamp) AS year ",\
        "years":"YEAR(rev_timestamp) AS year "}
        self.type_interval_group={"days":"year, day", "weeks":"year, week", "months":"year, month",\
        "quarters":"year, quarter", "years":"year"}

        ##	Get new DB connection
        self.acceso = dbaccess.get_Connection("localhost", 3306, self.conf.msqlu, self.conf.msqlp,\
        "wx_"+self.language+"_"+self.conf.dumptype)

        ##    Delete previous versions of views
        for nspace in self.nspaces:
            dbaccess.dropView(self.acceso[1], nspace + "_" + self.language)

        ##    Create updated versions for views from revision table
        #View summarizing all info for every revision (linking with info from table page)
        dbaccess.createView(self.acceso[1], view="all_"+self.language,\
        columns="rev_id, page_id, rev_len, page_ns, page_len, is_redirect, author, author_text,"+\
        " rev_timestamp, rev_parent_id",
        query="SELECT rev_id, rev_page, rev_len, page_namespace, page_len, rev_is_redirect,"+\
        " rev_user, rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id")
        #View summarizing info regarding pages in namespace=0 (including articles, stubs and redirects)
        dbaccess.createView(self.acceso[1], view="ns0_"+self.language,\
        columns="rev_id, page_id, rev_len, page_len, page_title, is_redirect, author, author_text,"+\
        " rev_timestamp, rev_parent_id",
        query="SELECT rev_id, rev_page, rev_len, page_len, page_title, page_is_redirect, rev_user,"+\
        " rev_user_text, rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id"+\
        " AND page_namespace=0")
        #View summarizing info for articles (excluding pages that are currently redirects or stubs)
        dbaccess.createView(self.acceso[1], view="articles_"+self.language,\
        columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
        "rev_parent_id",
        query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text,"+\
        " rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND page_namespace=0 AND "+\
        "page_is_redirect=0 AND page_is_stub=0")
        #View with info only for redirects (pages that were redirects when that revision was made)
        dbaccess.createView(self.acceso[1], view="redirects_"+self.language,\
        columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
        "rev_parent_id",
        query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
        "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
        "page_namespace=0 AND rev_is_redirect=1")
        #View with info only for current redirects
        dbaccess.createView(self.acceso[1], view="cur_redirects_"+self.language,\
        columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp, "+\
        "rev_parent_id",
        query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
        "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND "+\
        "page_namespace=0 AND page_is_redirect=1")
        #View with info only for revisions of stub pages (pages that were stubs when that revision was made)
        dbaccess.createView(self.acceso[1], view="stubs_"+self.language,\
        columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
        " rev_parent_id",
        query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
        "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
        " page_namespace=0 AND rev_is_stub=1")
        #View with info only for revisions of current stub pages
        dbaccess.createView(self.acceso[1], view="cur_stubs_"+self.language,\
        columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"\
        " rev_parent_id",
        query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
        "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
        " page_namespace=0 AND page_is_stub=1")
        #From this point on, automatically create views for the set of pages included in the remaining namespaces in MediaWiki
        for nspace, nsnum in zip(self.nspaces[7:], range(1, 16)):
            dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language,\
            columns="rev_id, page_id, rev_len, page_len, page_title, author, author_text, rev_timestamp,"+\
            " rev_parent_id",
            query="SELECT rev_id, rev_page, rev_len, page_len, page_title, rev_user, rev_user_text, "+\
            "rev_timestamp, rev_parent_id FROM revision, page WHERE rev_page=page_id AND"+\
            " page_namespace="+str(nsnum))

        #View summarizing the distribution of pages among namespaces
        dbaccess.dropView(self.acceso[1], "nspaces_" + self.language)
        dbaccess.createView(self.acceso[1],view="nspaces_"+self.language, columns="namespace, pages_in_nspace",\
        query="SELECT page_namespace, COUNT(*) FROM page GROUP BY page_namespace")

        ##    Intermediate views for the minimum timestamp of every page [anons and logged users]
        ## And other useful intermediate views regarding page evolution
        for nspace in self.nspaces:
            dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
            "_page_min_timestamp_logged")
            dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
            "_page_min_timestamp_logged", columns="page_id, rev_id, author, rev_timestamp",\
            query="SELECT page_id, rev_id,author, MIN(rev_timestamp) FROM "+\
            nspace+"_"+self.language+" WHERE author!=0 GROUP BY page_id")
            dbaccess.dropView(self.acceso[1], nspace+"_"+self.language+\
            "_page_min_timestamp_annons")
            dbaccess.createView(self.acceso[1], view=nspace+"_"+self.language+\
            "_page_min_timestamp_annons",\
            columns="page_id, rev_id, author_text, rev_timestamp",\
            query="SELECT page_id,rev_id,author_text, MIN(rev_timestamp) FROM "+\
            nspace+"_"+self.language+" WHERE author=0 GROUP BY page_id")

            dbaccess.dropView(self.acceso[1],
                              nspace + "_" + self.language + "_list_months")
            dbaccess.createView(self.acceso[1],view=nspace+"_"+self.language+"_list_months",\
            columns="month, year",query="SELECT MONTH(rev_timestamp) as month, "+\
            "YEAR(rev_timestamp) as year"+\
            " FROM "+nspace+"_"+self.language+" GROUP BY year, month ORDER BY year, month")

            dbaccess.dropView(self.acceso[1],
                              nspace + "_" + self.language + "_list_quarters")
            dbaccess.createView(self.acceso[1],view=nspace+"_"+self.language+"_list_quarters",\
            columns="quarter, year",query="SELECT QUARTER(rev_timestamp) as quarter, "+\
            "YEAR(rev_timestamp) as year FROM "+nspace+"_"+self.language+" GROUP BY year,"+\
            " quarter ORDER BY year, quarter")

    ##    Close DB connection
        dbaccess.close_Connection(self.acceso[0])
Ejemplo n.º 37
0
    def analyze(self):
        #Initialize all files headers
        #Survival data for all users (including editors out of MAIN)
        f=open("wkp_surv_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        #Survival data for all logged users who edited in MAIN
        f=open("wkp_surv_main_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        f=open("wkp_surv_join_core_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        f=open("wkp_surv_in_core_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        f=open("wkp_surv_core_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
        f=open("wkp_surv_join_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        f=open("wkp_surv_in_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        f=open("wkp_surv_core_rev_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
            
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
	    
            print "Starting language "+self.language+"\n"
            ##IN SYSTEM
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_authors "+\
            "where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()

            ##IN MAIN
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_users ")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_main_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished core users for language "+self.language+"\n"
            ###########################
            ##REV CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_rev_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished all tasks for "+self.language+"\n"
Ejemplo n.º 38
0
import dbaccess
import csv
from pprint import pprint
import re

msqlu = 'root'
msqlp = 'qhshfl27'
msqldb = 'setags_ux'
msqlh = 'localhost'

acceso = dbaccess.get_Connection(msqlh, 3306, msqlu, msqlp, msqldb)


def getTags(rawtagstr):

    # split tags
    mlist = re.finditer("\<(?P<tag>[A-Za-z0-9\_\-]+)\>", rawtagstr)

    tags = []
    for m in mlist:
        tags.append(m.group('tag'))
    return tags
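
#Example: parsing a raw tag string in the angle-bracket format matched above
print(getTags("<python><regular-expression><mysql_db>"))
#-> ['python', 'regular-expression', 'mysql_db']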


def createTagChangeDB(filename, tablename1, tablename2):

    # read in csv data
    reader = csv.reader(open(filename, "rU"))
    header = reader.next()

    for (i, row) in enumerate(reader):
Ejemplo n.º 39
0
def summary_evol(idiomas):
    """
    Create some graphs summarizing the evolution in time of critical quantitative
    parameters for each language version to explore
    
    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
##  WARNING!! Please be careful when selecting values from tables storing the evolution in time of the number of articles, size, etc.
##  You must always use a GROUP BY (pageCount, limitDate) clause, due to
##  periods of inactivity that could otherwise generate duplicate entries in the graphics
    filenames=["page_dates.data", "page_Count_evol.data", "page_Len_Sum_log.data", "contribs_evol.data", "nspaces.data", "nspace_distrib.data", "diffArticles.data", "authors.data", "diff_authors_x_article.data", "authors_authors_per_pagelen.data", "pagelen_authors_per_pagelen.data"]

    filenames_out=["Tot_num_articles_absx_absy.png", "Tot_num_articles_absx_logy.png", "Tot_num_articles_logx_logy.png", "Tot_pagelensum_absx_absy.png", "Tot_pagelensum_absx_logy.png", "Tot_pagelensum_logx_logy.png", "Tot_contribs_absx_absy.png", "Tot_contribs_absx_logy.png", "Tot_contribs_logx_logy.png", "Diffs_articles_per_author.png", "Diffs_authors_per_article.png", "Diff_authors_against_page_len.png"]
    
    for idioma in idiomas:
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        #acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_pages")
        result=dbaccess.query_SQL(acceso[1], "pageCount, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        result2=dbaccess.query_SQL(acceso[1], "pageLenSum, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        result3=dbaccess.query_SQL(acceso[1], "contribs, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        
        resultnspace=dbaccess.query_SQL(acceso[1], "pages_nspace, namespace", "stats_nspace_"+idioma)
        
        diffArticlesNoann=dbaccess.query_SQL(acceso[1], "author, theCount", "stats_Article_NoAnnons_author_"+idioma)
        
        diffInitNoann=dbaccess.query_SQL(acceso[1], "author, theCount", "stats_Article_Init_NoAnnons_author_"+idioma)
        
        totRevperArticle=dbaccess.query_SQL(acceso[1], "page_id, theCount", "stats_Contrib_NoAnnons_page_id_"+idioma)
        
        diffAuthorperArticle=dbaccess.query_SQL(acceso[1], "page_id, theCount", "stats_Article_NoAnnons_page_id_"+idioma)
        
        dautxplen=dbaccess.query_SQL(acceso[1], "page_len, authors", "stats_pagelen_difauthors_"+idioma)
        
        dbaccess.close_Connection(acceso[0])
        
        data=__tup_to_list(result, 1)
        dates_x=data.pop()
        page_Count=data.pop()
        
##        if idioma=="frwiki":
        data2=__tup_to_list(result2, 2)
        dates_x=data2.pop()
        dates_x.pop(0)
        dates_x.pop(0)
        page_Len_Sum=data2.pop()
        page_Len_Sum.pop(0)
        page_Len_Sum.pop(0)
##        else:
##            data2=__tup_to_list(result2, 1)
##            dates_x=data2.pop()
##            page_Len_Sum=data2.pop()
        
        data3=__tup_to_list(result3, 1)
        dates_x=data3.pop()
        contribs=data3.pop()
        
        datanspace=__tup_to_list(resultnspace)
        namespaces=datanspace.pop()
        pages_nspace=datanspace.pop()
        
        dataDiffArticlesNoann=__tup_to_list(diffArticlesNoann)
        diffArticles=dataDiffArticlesNoann.pop()
        authors=dataDiffArticlesNoann.pop()
        
        dataDiffInitNoann=__tup_to_list(diffInitNoann)
        diffInitArticles=dataDiffInitNoann.pop()
        authors=dataDiffInitNoann.pop()
        
        datatotRevperArticle=__tup_to_list(totRevperArticle)
        totalRev=datatotRevperArticle.pop()
        article=datatotRevperArticle.pop()
        
        datadiffAuthorperArticle=__tup_to_list(diffAuthorperArticle)
        diffAuthors=datadiffAuthorperArticle.pop()
        article=datadiffAuthorperArticle.pop()
        
        datadautxplen=__tup_to_list(dautxplen)
        autxplen=datadautxplen.pop()
        lenautxplen=datadautxplen.pop()

##  Insert the query results into the data list in the proper order,
##  matching the file names we pass to the GNU R script summary_evol.R
        for i in range(len(page_Len_Sum)):
            if page_Len_Sum[i]!=0:
                page_Len_Sum[i]=math.log10(page_Len_Sum[i])
                
        dataList=[dates_x, page_Count, page_Len_Sum, contribs, namespaces, pages_nspace, diffArticles, authors, diffAuthors, autxplen, lenautxplen]

        for filename, data in zip (filenames, dataList):
            if(filename.find('date')!=-1):
                __makeDatesFile(idioma, filename, data)
            else:
                __makeDataFile(idioma, filename, data)
        
        ######################################
        
        #Pass data filenames to the GNU R script with a file
        f=open("./data/summary_files_names.data",'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/data/"+line+"\n")
        f.close()
        
        #Idem with graphic output filenames
        f=open("./data/summary_files_out.data",'w')
        for line in filenames_out:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
            
        #CALL THE GNU R SCRIPT summary_evol.R
        
        succ=os.system("R --vanilla < ./summary_evol.R > debug_R")
        if succ==0:
            print "Funcion summary_evol ejecutada con exito para el lenguage... "+idioma
Ejemplo n.º 40
0
def measuring(idiomas):
    """
    Create some graphs following the research presented by Jakob Voss in his paper
    Measuring Wikipedia (ISSI 2005)
    
    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
##   Generates some graphics reproducing those in Measuring Wikipedia article
    filenames=["total_edits.data", "noannons_edits.data", "annon_edits.data", "authors_per_article_desc.data", "articles_per_logged_author_desc.data",  "articles_per_anonymous_author_desc.data"]
    
    filenames_out=["total_edits_per_author.png", "total_edits_per_noannon_author.png", "total_edits_per_annon_author.png", "diff_authors_per_article_descending.png", "diff_articles_per_logged_author_descending.png", "diff_articles_per_anonymous_author_descending.png"]
    
    for idioma in idiomas:
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
    ##    acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_pages")
        #Combined evolution graphics
        #ALL THESE GRAPHICS ARE ALREADY GENERATED BY ERIK ZACHTE'S OFFICIAL PERL SCRIPTS
            #Database size
            #Total number of words
            #Total number of internal links
            #Number of articles (including redirects)
            #Number of active wikipedians (more than 5 contributions in a given month)
            #Number of very active wikipedians (more than 100 contributions in a given month)
        
        #Namespace size
            #OK, it is generated in summary_evol() method
            
        #Evolution in time of article size (histogram)
            #IDEA: Download page.sql files for a language for each semester period
            
        #Number of distinct authors per article (descending sorted graphic)
            #Already generated in summary_evol, ONLY NEED TO SORT AND ADJUST IN GNU R
        diffAuthorperArticle=dbaccess.query_SQL(acceso[1], "page_id, theCount", "stats_Article_NoAnnons_page_id_"+idioma)
        
        #Number of distinct articles per author (descending sorted graphic)
            #Idem as in the previous case
        diffArticlesNoann=dbaccess.query_SQL(acceso[1], "author, theCount", "stats_Article_NoAnnons_author_"+idioma)
        diffArticlesAnn=dbaccess.query_SQL(acceso[1], "author_text, theCount", "stats_Article_Annons_author_text_"+idioma)        
        
        data=__tup_to_list(diffAuthorperArticle)
        lisdiffauthorartic=data.pop()
        data=__tup_to_list(diffArticlesNoann)
        lisdiffarticleaut=data.pop()
        data=__tup_to_list(diffArticlesAnn,2)
        lisdiffarticleannon=data.pop()
##        Sort the results in descending order so they can be fitted to a power law
        lisdiffauthorartic.sort(reverse=True)
        lisdiffarticleaut.sort(reverse=True)
        lisdiffarticleannon.sort(reverse=True)
        
        #Number of edits per author
            #Retrieve results from database
            #We have already created GINI graphics for this parameter
            #ALSO AVAILABLE DATABASE TABLES WITH EVOLUTION IN TIME OF THIS PARAMETER
        
        tcnoann=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_NoAnnons_author_"+idioma)
        tcauthor=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_author_"+idioma)
        tc_ann=dbaccess.query_SQL(acceso[1]," * ","stats_Contrib_Annons_author_text_"+idioma)
        
        data=__tup_to_list(tcnoann)
        listcnoann=data.pop()
        data=__tup_to_list(tcauthor)
        listcauthors=data.pop()
        #BTW, we are also obtaining (but not using) the IP addresses of anon users
        data=__tup_to_list(tc_ann,2)
        listcann=data.pop()
        
##        Arrange the results in descending order to fit them to a power law
        listcnoann.sort(reverse=True)
        listcauthors.sort(reverse=True)
        listcann.sort(reverse=True)
        
        #Ingoing and outgoing number of links per article
            #STILL TO BE DEVELOPED
            #NEED TO FIRST IDENTIFY LINKS FOR A GIVEN ARTICLE IN THE DATABASE
            #LINKS TABLES MAY HELP, but in these dump versions they are all empty!!!
            
            #BROKEN LINKS also need to be considered
        
        dbaccess.close_Connection(acceso[0])
        
        dataList=[listcauthors, listcnoann, listcann, lisdiffauthorartic, lisdiffarticleaut, lisdiffarticleannon]
        
        for filename, data in zip (filenames, dataList):
            if(filename.find('date')!=-1):
                __makeDatesFile(idioma, filename, data)
            else:
                __makeDataFile(idioma, filename, data)
        
        #Pass data filenames to the GNU R script with a file
        f=open("./data/measuring_files_names.data",'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/data/"+line+"\n")
        f.close()
        
        #Idem with graphic output filenames
        f=open("./data/measuring_files_out.data",'w')
        for line in filenames_out:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
            
        #CALL GNU R SCRIPT measuring_Wiki.R
        
        succ=os.system("R --vanilla < ./measuring_Wiki.R > debug_R")
        if succ==0:
            print "Funcion measuring_Wiki.R ejecutada con exito para el lenguage... "+idioma
Ejemplo n.º 41
0
    def decompress (self):
        """
        Decompress the DB dumps into MySQL
        """
        ##TODO: Ad-hoc, remember to remove this, FOR GOD'S SAKE
        ##self.filename="mtwiki-latest-pages-meta-history.xml.7z" 
        if self.dumptype=="research":
            program="dump_sax_research.py"
        elif self.dumptype=="standard":
            program="dump_sax.py"
        else:
            print "Error! Unexpected type of dump received"
            return -1
        self.filename=self.filenameTemplate.safe_substitute(language=self.language,file=self.files[0])
        #Then we call our parser "dump_sax_research.py" to load data into MySQL
        command_7z="7za e -so dumps/"+self.filename+" | "+"python "+program+\
        " -u "+self.msqlu+" -p "+self.msqlp+" -d "+"wx_"+self.language+"_"+self.dumptype+\
        " --log "+self.language+".log"
        success=os.system(command_7z)
        if success == 0:
            print "DB "+"wx_"+self.language+\
            self.dumptype+" successfully decompressed...\n\n"
        else:
            print "Error! There was an error trying to decompress database --> "+\
            "wx_"+self.language+self.dumptype
            return -1
        #Loading into MySQL other interesting tables directly provided in SQL format
        #SQL code to generate the tables is embedded in the SQL file itself
##        for index in range(1,len(self.files)):
##            self.filename=self.filenameTemplate.safe_substitute(language=self.language, file=self.files[index])
##            command_gzip="gzip -d dumps/"+self.filename
##            command_mysql="mysql -u "+self.msqlu+" -p"+self.msqlp+\
##            " wx_"+self.language+"_"+self.dumptype+\
##            " < dumps/"+self.filename.rstrip(".gz")
##            command_comp="gzip dumps/"+self.filename.rstrip(".gz")
##            print "Decompressing "+self.filename+"..."
##            success=os.system(command_gzip)
##            if success==0:
##                print "Loading "+self.filename.rstrip(".gz")+" into MySQL database..."
##                success=os.system(command_mysql)
##                if success==0:
##                    print "Compressing again "+self.filename.rstrip(".gz")+"..."
##                    success=os.system(command_comp)
##                    if success!=0:
##                        print "Error compressing again "+self.filename.rstrip(".gz")
##                        return -1
##                else:
##                    print "Error loading "+self.filename.rstrip(".gz")
##                    return -1
##            else:
##                print "Error decompressing "+self.filename
##                return -1
        print "Generating indexes for tables page and revision...\n"
        print "Depending on the dump size this may take a while...\n"
        acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu,\
        self.msqlp, "wx_"+self.language+"_"+self.dumptype)
        #Generate adequate indexes and keys in tables page and revision
        print "Generating index for page_len...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE page ADD INDEX (page_len)")
        print "Modifying rev_timestamp to support DATETIME and creating index...\n"
        #dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision MODIFY rev_timestamp DATETIME")
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX timestamp (rev_timestamp)")
        print "Generating index for rev_page and rev_timestamp...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX page_timestamp(rev_page, rev_timestamp)")
        print "Generating index for rev_user and rev_timestamp...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)")
        print "Generating index for rev_user_text and timestamp...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)")
        dbaccess.close_Connection(acceso[0])
        print "Database ready for quantitative analysis...\n"
        print "Let's go on... Cross your fingers... ;-) \n\n\n"
        return success
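With the constructor defaults (language="furwiki", dumptype="research"), command_7z expands to the following pipeline (MySQL credentials elided here):

7za e -so dumps/furwiki-latest-pages-meta-history.xml.7z | python dump_sax_research.py -u USER -p PASS -d wx_furwiki_research --log furwiki.log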
Ejemplo n.º 42
0
def community_contrib(idiomas):
    for idioma in idiomas:
        list_admins=test_admins.process_admins(idioma)
        num_admins=list_admins.pop()
        where_clause1=list_admins.pop()
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        admins_ids=dbaccess.raw_query_SQL(acceso[1], "SELECT DISTINCT(author) FROM stats_"+idioma+" WHERE "+where_clause1+" LIMIT "+str(num_admins))
##        BUILD THE WHERE CLAUSE WITH THE ADMIN IDS
        list_admins_ids=[]
        for item in admins_ids:
            list_admins_ids.append(int(item[0]))
        where_clause2=test_admins.process_users_ids(list_admins_ids,idioma)
        edits_admin_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_NoAnnons_months_author_"+idioma+" ", where=where_clause2, group="year, month ", order="year, month")
        dates_admins=[]
        admins_contribs=[]
        for element in edits_admin_month:
            dates_admins.append(list(element[0:2]))
            admins_contribs.append(int(element[2]))
##        WRITE TO A FILE FOR PLOTTING (FIG 2)
##        RETRIEVE TOTAL CONTRIBUTIONS PER MONTH
        total_edits_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, group="year, month ")
        dates_contribs=[]
        total_contribs=[]
        for element in total_edits_month:
            dates_contribs.append(list(element[0:2]))
            total_contribs.append(int(element[2]))
##        DIVIDE THE FIRST LIST BY THE SECOND
        perc_contribs_admins=[]
        for admin_contrib, total_contrib in zip(admins_contribs, total_contribs):
            perc_contribs_admins.append((float(admin_contrib)/total_contrib))
##        WRITE TO A FILE FOR PLOTTING (FIG 1)

##    FIG 4 TOTAL EDITS MADE BY USERS WITH DIFFERENT EDIT LEVELS
##    CREATE CLUSTER OF USERS IDENTIFIED BY CONTRIBUTIONS LEVEL
##    5 LEVELS: <100, 100-1K, 1K-5K, 5K-10K, >10K
        users_level1=[]
        users_level2=[]
        users_level3=[]
        users_level4=[]
        users_level5=[]
        level1=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount<=100")
        for userid in level1:
            users_level1.append(int(userid[0]))
        level2=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>100 AND theCount<=1000")
        for userid in level2:
            users_level2.append(int(userid[0]))
        level3=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>1000 AND theCount<=5000")
        for userid in level3:
            users_level3.append(int(userid[0]))
        level4=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>5000 AND theCount<=10000")
        for userid in level4:
            users_level4.append(int(userid[0]))
        level5=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>10000")
        for userid in level5:
            users_level5.append(int(userid[0]))
        where_clause_level1=test_admins.process_users_ids(users_level1,idioma)
        where_clause_level2=test_admins.process_users_ids(users_level2,idioma)
        where_clause_level3=test_admins.process_users_ids(users_level3,idioma)
        where_clause_level4=test_admins.process_users_ids(users_level4,idioma)
        where_clause_level5=test_admins.process_users_ids(users_level5,idioma)
        
        contribs_level1_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level1, group="year, month")
        contribs_level2_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level2, group="year, month")
        contribs_level3_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level3, group="year, month")
        contribs_level4_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level4, group="year, month")
        contribs_level5_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level5, group="year, month")
        list_level1=__process_contribs(contribs_level1_month, total_contribs)
        perc_contribs_level1=list_level1.pop()
        contribs_level1=list_level1.pop()
        dates_level1=list_level1.pop()
        
        list_level2=__process_contribs(contribs_level2_month, total_contribs)
        perc_contribs_level2=list_level2.pop()
        contribs_level2=list_level2.pop()
        dates_level2=list_level2.pop()
        
        list_level3=__process_contribs(contribs_level3_month, total_contribs)
        perc_contribs_level3=list_level3.pop()
        contribs_level3=list_level3.pop()
        dates_level3=list_level3.pop()
        
        list_level4=__process_contribs(contribs_level4_month, total_contribs)
        perc_contribs_level4=list_level4.pop()
        contribs_level4=list_level4.pop()
        dates_level4=list_level4.pop()
        
        list_level5=__process_contribs(contribs_level5_month, total_contribs)
        perc_contribs_level5=list_level5.pop()
        contribs_level5=list_level5.pop()
        dates_level5=list_level5.pop()
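##        NOTE (hypothetical refactoring, not in the original script): the five
##        near-identical per-level blocks above could be folded into one loop:
##        thresholds={1:"theCount<=100", 2:"theCount>100 AND theCount<=1000",
##        3:"theCount>1000 AND theCount<=5000",
##        4:"theCount>5000 AND theCount<=10000", 5:"theCount>10000"}
##        per_level={}
##        for n in sorted(thresholds):
##            ids=[int(r[0]) for r in dbaccess.query_SQL(acceso[1],
##                select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma,
##                where=thresholds[n])]
##            wc=test_admins.process_users_ids(ids, idioma)
##            rows=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)",
##                tables="stats_Contrib_months_author_"+idioma, where=wc,
##                group="year, month")
##            per_level[n]=__process_contribs(rows, total_contribs)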
        
##    FIG 5 PLOT 4b
##    FIG 6 AVERAGE NUMBER OF EDITS PER USER PER MONTH FOR EACH LEVEL
##        RETRIEVE NUM USERS FOR EACH MONTH IN EACH LEVEL WHO HAVE MADE AT LEAST ONE CONTRIB
        num_users_1_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level1, group="year, month")
        num_users_2_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level2, group="year, month")
        num_users_3_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level3, group="year, month")
        num_users_4_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level4, group="year, month")
        num_users_5_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level5, group="year, month")
        list_users_1_month=[]
        for element in num_users_1_month:
            list_users_1_month.append(int(element[0]))
        list_users_2_month=[]
        for element in num_users_2_month:
            list_users_2_month.append(int(element[0]))
        list_users_3_month=[]
        for element in num_users_3_month:
            list_users_3_month.append(int(element[0]))
        list_users_4_month=[]
        for element in num_users_4_month:
            list_users_4_month.append(int(element[0]))
        list_users_5_month=[]
        for element in num_users_5_month:
            list_users_5_month.append(int(element[0]))
        
##        DIVIDE TOT NUM CONTRIBS PER LEVEL PER MONTH BY THE NUM USERS FOR EACH MONTH IN EACH LEVEL
        avg_contribs_user_1_month=[]
        for contribmonth, usermonth in zip(contribs_level1, list_users_1_month):
            avg_contribs_user_1_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_2_month=[]
        for contribmonth, usermonth in zip(contribs_level2, list_users_2_month):
            avg_contribs_user_2_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_3_month=[]
        for contribmonth, usermonth in zip(contribs_level3, list_users_3_month):
            avg_contribs_user_3_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_4_month=[]
        for contribmonth, usermonth in zip(contribs_level4, list_users_4_month):
            avg_contribs_user_4_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_5_month=[]
        for contribmonth, usermonth in zip(contribs_level5, list_users_5_month):
            avg_contribs_user_5_month.append(float(contribmonth)/usermonth)
        
##        FIG 7 POPULATION GROWTH FOR EACH USER GROUP
##        SIMPLY RETRIEVE list_users_X_month
##        FIG 8 % OF TOTAL POPULATION OF EACH USER GROUP
        perc_users_1_months=[]
        perc_users_2_months=[]
        perc_users_3_months=[]
        perc_users_4_months=[]
        perc_users_5_months=[]
        for e1, e2, e3, e4, e5 in zip(list_users_1_month,list_users_2_month,list_users_3_month,list_users_4_month,list_users_5_month):
            total_users_month=e1+e2+e3+e4+e5
            perc_users_1_months.append((float(e1)/total_users_month))
            perc_users_2_months.append((float(e2)/total_users_month))
            perc_users_3_months.append((float(e3)/total_users_month))
            perc_users_4_months.append((float(e4)/total_users_month))
            perc_users_5_months.append((float(e5)/total_users_month))
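        #Worked example: monthly user counts (e1..e5)=(10, 5, 3, 1, 1) give
        #total_users_month=20, so the level-1 share for that month is 10/20=0.5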
            
###############################
##    FINAL DUTIES, TRANSFER DATA AND EXECUTE R SCRIPT
        filenames=["dates_admin_contrib.data","contribs_admins_months.data", "perc_contribs_months.data","dates_level1_contrib.data", "contribs_level1_months.data", "perc_contribs_level1_months.data", "dates_level2_contrib.data", "contribs_level2_months.data", "perc_contribs_level2_months.data","dates_level3_contrib.data", "contribs_level3_months.data", "perc_contribs_level3_months.data","dates_level4_contrib.data", "contribs_level4_months.data", "perc_contribs_level4_months.data","dates_level5_contrib.data" ,"contribs_level5_months.data", "perc_contribs_level5_months.data", "avg_contribs_user_1_month.data", "avg_contribs_user_2_month.data", "avg_contribs_user_3_month.data", "avg_contribs_user_4_month.data", "avg_contribs_user_5_month.data", "users_1_month.data", "users_2_month.data", "users_3_month.data", "users_4_month.data", "users_5_month.data", "perc_users_1_months.data","perc_users_2_months.data", "perc_users_3_months.data", "perc_users_4_months.data", "perc_users_5_months.data"]
        
        filenames_out=["Figure1.png", "Figure_2.png", "Figure4.png", "Figure5.png", "Figure6.png", "Figure7.png", "Figure8.png"]
        
        dataList=[dates_contribs, admins_contribs, perc_contribs_admins,\
        dates_level1, contribs_level1, perc_contribs_level1,\
        dates_level2, contribs_level2, perc_contribs_level2,\
        dates_level3, contribs_level3, perc_contribs_level3,\
        dates_level4, contribs_level4, perc_contribs_level4,\
        dates_level5, contribs_level5, perc_contribs_level5,\
        avg_contribs_user_1_month, avg_contribs_user_2_month, avg_contribs_user_3_month,\
        avg_contribs_user_4_month, avg_contribs_user_5_month,\
        list_users_1_month, list_users_2_month, list_users_3_month, list_users_4_month, list_users_5_month,\
        perc_users_1_months, perc_users_2_months, perc_users_3_months, perc_users_4_months, perc_users_5_months]
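        #filenames and dataList must stay index-aligned: zip() below pairs the
        #n-th filename with the n-th data series (33 entries each), so any
        #addition to one list needs a matching entry in the other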
        
        #Dump every data series to its own file; date series are written as
        #plain lines, everything else goes through the generic helper
        for filename, data in zip(filenames, dataList):
            if 'date' in filename:
                f=open("./graphics/"+idioma+"/data/"+filename, 'w')
                for adate in data:
                    f.write(str(adate)+"\n")
                f.close()
            else:
                __makeDataFile(idioma, filename, data)
        
        #Pass the data filenames to the GNU R script through an intermediate file
        f=open("./data/community_contrib_files_names.data",'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/data/"+line+"\n")
        f.close()
        
        #Same for the graphic output filenames
        f=open("./data/community_contrib_files_out.data",'w')
        for line in filenames_out:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
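        #Design note: these two manifest files decouple Python from R; the R
        #script is expected to read one path per line (e.g. with readLines()),
        #which avoids passing 30+ file paths on the command line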
            
        #CALL GNU R SCRIPT community_contrib.R in batch mode; R's stdout is
        #redirected to debug_R for later inspection
        succ=os.system("R --vanilla < ./community_contrib.R > debug_R")
        #os.system returns the shell's exit status, so 0 means R finished cleanly
        if succ==0:
            print "Function community_contrib.R executed successfully for language... "+idioma