def endDocument(self):
    """Flush the pending revision INSERT when parsing finishes.

    Depending on the configured output mode the accumulated extended
    INSERT is appended to a SQL file, streamed to stdout for piping
    into mysql, or executed directly against the monitored database
    (with up to five retries).
    """
    if self.options.fileout:
        # Terminate the statement and append it to the SQL output file.
        self.revinsert += ";\n"
        self.revfile = codecs.open(self.options.revfile, "a", "utf_8")
        self.revfile.write(self.revinsert)
        self.revfile.close()
    elif self.options.streamout:
        # Encoded SQL stream, intended to be piped straight into mysql.
        self.revinsert += ";"
        print(self.revinsert.encode("utf_8"))
    elif self.options.monitor:
        # Execute directly; give up after five failed attempts.
        attempt = 0
        while attempt < 5:
            try:
                dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode("utf_8"))
            except Exception as e:
                # Log the error plus a prefix of the offending statement.
                self.printfile = codecs.open("error_" + self.options.database, "a", "utf_8")
                self.printfile.write(str(e) + "\n")
                self.printfile.write(self.revinsert[0:30] + "\n**********************************")
                self.printfile.close()
                attempt += 1
            else:
                break
Beispiel #2
0
 def endDocument(self):
     """Emit the final revision insert accumulated for this page.

     Output goes to a SQL file, to stdout, or straight into MySQL,
     depending on which of fileout / streamout / monitor is active.
     """
     if self.options.fileout:
         # Close the extended INSERT and append it to the .sql file.
         self.revinsert += ";\n"
         self.revfile = codecs.open(self.options.revfile, 'a', 'utf_8')
         self.revfile.write(self.revinsert)
         self.revfile.close()
     elif self.options.streamout:
         # Don't touch the filesystem: stream encoded SQL for mysql.
         self.revinsert += ";"
         print(self.revinsert.encode('utf_8'))
     elif self.options.monitor:
         # Direct execution, at most five attempts before giving up.
         for _attempt in range(5):
             try:
                 dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf_8'))
             except Exception as err:
                 # Record the failure before retrying.
                 self.printfile = codecs.open("error_" + self.options.database, 'a', 'utf_8')
                 self.printfile.write(str(err) + "\n")
                 self.printfile.write(self.revinsert[0:30] + "\n**********************************")
                 self.printfile.close()
             else:
                 break
Beispiel #3
0
def printSql(sqlquery, csvfile, header):
    """Run *sqlquery* and dump the result rows to *csvfile* as CSV.

    The first row written is *header*; every result row follows.

    FIX: the original passed an anonymous ``open(...)`` handle to
    ``csv.writer`` and never closed it, leaking the file descriptor and
    risking unflushed output; a ``with`` block now guarantees closure.

    :param sqlquery: SQL SELECT statement to execute
    :param csvfile: path of the CSV file to (over)write
    :param header: sequence of column names for the first row
    """
    with open(csvfile, "w") as handle:
        writer = csv.writer(handle)
        writer.writerow(header)
        # NOTE(review): `acceso` is a module-level DB connection tuple
        # created elsewhere in this file — confirm it is initialized
        # before this function is called.
        data = dbaccess.raw_query_SQL(acceso[1], sqlquery)
        for row in data:
            writer.writerow(row)
Beispiel #4
0
 def ratios(self):
     """
     Produce .dat files with descriptive ratios per language:
     logged editors vs. user pages, and articles vs. talk pages.
     """
     # Header for the editors/user-pages ratio file.
     # NOTE: the header goes to editors-userpages.dat while the data
     # rows below are appended to author-pages.dat, exactly as in the
     # original code.
     with open("overall/data/editors-userpages.dat", 'w') as fh:
         fh.write("logged_authors\tuser_pages\tratio\tlang\n")
     for self.language in self.languages:
         self.dbname = "wx_" + self.language + "wiki_research"
         self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         # Number of distinct logged editors (rev_user != 0).
         self.logged_authors = dbaccess.raw_query_SQL(self.access[1],
             "select count(distinct(rev_user)) from revision where rev_user!=0")
         # Number of distinct user pages (namespace 2).
         self.user_pages = dbaccess.raw_query_SQL(self.access[1],
             "select count(distinct(page_id)) from page where page_namespace=2")
         dbaccess.close_Connection(self.access[0])
         # Append one data row for this language.
         with open("overall/data/author-pages.dat", 'a') as fh:
             fh.write(str(int(self.logged_authors[0][0])) + "\t" +
                      str(int(self.user_pages[0][0])) + "\t" +
                      str(float(self.user_pages[0][0]) / float(self.logged_authors[0][0])) + "\t" +
                      self.language + "\n")

     # Header for the articles/talk-pages ratio file (redirects excluded).
     with open("overall/data/articles-talk-ratio.dat", 'w') as fh:
         fh.write("articles\ttalk\tratio\tlang\n")
     for self.language in self.languages:
         self.dbname = "wx_" + self.language + "wiki_research"
         self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         # Articles in the main namespace that are not redirects.
         self.articles = dbaccess.raw_query_SQL(self.access[1],
             "select count(distinct(page_id)) from page where page_namespace=0 and page_is_redirect=0")
         # Talk pages (namespace 1).
         self.talk = dbaccess.raw_query_SQL(self.access[1],
             "select count(distinct(page_id)) from page where page_namespace=1")
         dbaccess.close_Connection(self.access[0])
         # Append one data row for this language.
         with open("overall/data/articles-talk-ratio.dat", 'a') as fh:
             fh.write(str(int(self.articles[0][0])) + "\t" +
                      str(int(self.talk[0][0])) + "\t" +
                      str(float(self.talk[0][0]) / float(self.articles[0][0])) + "\t" +
                      self.language + "\n")
Beispiel #5
0
 def make_indexes(self):
     """Create the rev_timestamp index on the revision table of the
     dump database for the current language/dumptype.

     (A commented-out page_len index creation used to live here too.)
     """
     self.access = dbaccess.get_Connection(
         "localhost", 3306, self.user, self.passw,
         "wx_" + self.language + "wiki_" + self.dumptype)
     try:
         print("Creating index for rev_timestamp")
         dbaccess.raw_query_SQL(
             self.access[1],
             "ALTER TABLE revision ADD INDEX timestamp(rev_timestamp)")
     except Exception as e:
         # Index creation is best-effort: report the problem and carry on.
         print("An exception ocurred, the problem was the following:\n")
         print(e)
         print("*************\n\n")
Beispiel #6
0
    def overall(self):
        """
        Write author-pages.dat: distinct logged editors vs. distinct
        user pages (namespace 2) per language, plus their ratio.

        FIX: the original body mixed tab- and space-indented lines,
        which is rejected under ``python -tt`` and Python 3; the
        indentation is normalized to spaces.  The local ``file`` name,
        which shadowed the builtin, is renamed.
        """
        datfile = open("author-pages.dat", 'w')
        datfile.write("logged_authors\tuser_pages\tratio\tlang\n")
        datfile.close()
        for self.language in self.languages:
            self.dbname = "wx_" + self.language + "wiki_research"
            self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            # Distinct logged editors (rev_user != 0).
            self.logged_authors = dbaccess.raw_query_SQL(self.access[1],
                "select count(distinct(rev_user)) from revision where rev_user!=0")
            # Distinct user pages (namespace 2).
            self.user_pages = dbaccess.raw_query_SQL(self.access[1],
                "select count(distinct(page_id)) from page where page_namespace=2")
            dbaccess.close_Connection(self.access[0])
            # Append one data row for this language.
            datfile = open("author-pages.dat", 'a')
            datfile.write(str(int(self.logged_authors[0][0])) + "\t" +
                          str(int(self.user_pages[0][0])) + "\t" +
                          str(float(self.user_pages[0][0]) / float(self.logged_authors[0][0])) + "\t" +
                          self.language + "\n")
            datfile.close()
            print("Completed lang " + self.language + "\n")
Beispiel #7
0
    def calculate(self):
        """Create the lag_info scratch table (one row per logged user
        with two datetime bounds) in this language's dump database."""
        dbname = "wx_" + self.language + "wiki_" + self.dumptype
        self.access = dbaccess.get_Connection("localhost", 3306, self.user,
                                              self.passw, dbname)
        try:
            print("Creating table for logged users...")
            users = dbaccess.raw_query_SQL(
                self.access[1],
                "create table lag_info (rev_user INT(10) UNSIGNED NOT NULL,"
                "fecha1 datetime not null, fecha2 datetime not null)")
        except Exception as e:
            # Table may already exist; report and continue.
            print("An exception ocurred, the problem was the following:\n")
            print(e)
            print("*************\n\n")
 def endDocument(self):
     """Flush the last pending page INSERT when the dump ends.

     Appends to a SQL file, streams encoded SQL to stdout, or keeps
     executing against MySQL until the statement finally succeeds,
     depending on the active output option.
     """
     if self.options.fileout:
         # Terminate the statement and append it to the page SQL file.
         self.pageinsert += ";\n"
         self.pagefile = codecs.open(self.options.pagefile, 'a', 'utf_8')
         self.pagefile.write(self.pageinsert)
         self.pagefile.close()
     elif self.options.streamout:
         # Encoded SQL stream for direct piping into mysql.
         self.pageinsert += ";"
         print(self.pageinsert.encode('utf-8'))
     elif self.options.monitor:
         # Retry forever: only a successful execution ends the loop.
         done = False
         while not done:
             try:
                 dbaccess.raw_query_SQL(self.acceso[1], self.pageinsert.encode('utf-8'))
             except Exception as e:
                 print(e)
             else:
                 done = True
    def commitsPerPeriodPerCommiter(self):
        """
        Write one line per (period, commiter) pair to the
        commits_per_period_per_commiter data file, formatted as:
          period commiter commits
        """
        # The file is opened (and truncated) before the query runs,
        # matching the original side-effect order.
        out = open(self.dataPath + "commits_per_period_per_commiter", "w")

        rows = dbaccess.raw_query_SQL(self.acceso[1],
                                      "select * from contribs_period_author_" + self.language)
        for row in rows:
            # Columns: 0=period, 1=commiter, 2=commits.
            out.write(" ".join((row[0], row[1], row[2])) + "\n")

        out.close()
Beispiel #10
0
 def __init__(self, language="furwiki", dumptype="research", msqlu="", msqlp=""):
     """
     Prepare a MySQL database for downloading a Wikipedia dump.

     Creates the ./dumps directory if needed, creates the database
     ``wx_<language>_<dumptype>`` and loads its table definitions,
     then enlarges the max_rows hints for the big tables.

     :param language: wiki to download (e.g. "furwiki")
     :param dumptype: "research" or "standard" dump layout
     :param msqlu: MySQL user name (required)
     :param msqlp: MySQL password (required)

     FIX: the original only printed an error on missing credentials or
     an unknown dumptype and then crashed later (AttributeError on
     self.msqlu / NameError on ``command``); both paths now return
     early after printing.
     """
     self.language = language     # language to download
     self.dumptype = dumptype     # type of dump
     # Companion SQL dumps fetched alongside the page history.
     self.files = ["pages-meta-history.xml.7z", "redirect.sql.gz", "page_restrictions.sql.gz",
                   "user_groups.sql.gz", "logging.sql.gz", "interwiki.sql.gz", "langlinks.sql.gz",
                   "externallinks.sql.gz", "templatelinks.sql.gz", "imagelinks.sql.gz",
                   "categorylinks.sql.gz", "pagelinks.sql.gz", "oldimage.sql.gz", "image.sql.gz"]
     self.filename = ""
     # Dump filename pattern on Wikimedia's download server.
     self.filenameTemplate = string.Template("""$language-latest-$file""")
     self.urld = ""
     self.urldTemplate = string.Template("""http://download.wikimedia.org/$language/latest/$language-latest-$file""")
     if (msqlu == "" or msqlp == ""):
         print("Error initializing DB dump object. You must provide a valid MySQL username and password")
         return
     self.msqlu = msqlu   # MySQL username for accessing and editing the DB
     self.msqlp = msqlp   # MySQL password
     # Create ./dumps directory if it does not exist yet.
     if "dumps" not in os.listdir("./"):
         os.makedirs("./dumps")
     ## Initialize DB in MySQL: create DB and table definitions.
     print("Initializing DB for --> " + self.language + "\n")
     acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp, "mysql")
     dbaccess.createDB_SQL(acceso[1], "wx_" + self.language + "_" + self.dumptype)
     if self.dumptype == "research":
         tables_script = "tables_research.sql"
     elif self.dumptype == "standard":
         tables_script = "tables_standard.sql"
     else:
         print("Error! Unknown dumptype: " + self.dumptype)
         dbaccess.close_Connection(acceso[0])
         return
     # SECURITY NOTE: the password appears on the command line (visible
     # in `ps`); kept for compatibility with the original behaviour.
     command = ("mysql -u " + self.msqlu + " -p" + self.msqlp + " " +
                "wx_" + self.language + "_" + self.dumptype +
                " < " + tables_script + " > debug_mysql.log")
     ok = os.system(command)
     if ok == 0:
         acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp,
                                          "wx_" + self.language + "_" + self.dumptype)
         # Hint MySQL to size the big tables for huge row counts.
         dbaccess.raw_query_SQL(acceso[1], "alter table page max_rows = 200000000000 avg_row_length = 50")
         dbaccess.raw_query_SQL(acceso[1], "alter table revision max_rows = 200000000000 avg_row_length = 50")
         if self.dumptype == "standard":
             # The standard layout also stores revision text.
             dbaccess.raw_query_SQL(acceso[1], "alter table text max_rows = 200000000000 avg_row_length = 50")
         dbaccess.close_Connection(acceso[0])
     else:
         print("Error! There was a problem initializing definitions for DB tables")
         dbaccess.close_Connection(acceso[0])
Beispiel #11
0
 def __init__(self, language="furwiki", dumptype="research", msqlu="", msqlp=""):
     """
     Set up the MySQL database that will receive a Wikipedia dump.

     Creates ./dumps if missing, creates ``wx_<language>_<dumptype>``
     and loads the matching table definitions, then applies max_rows
     sizing hints to the large tables.

     :param language: wiki to download (e.g. "furwiki")
     :param dumptype: "research" or "standard" dump layout
     :param msqlu: MySQL user name (required)
     :param msqlp: MySQL password (required)

     FIX: missing credentials and unknown dumptypes previously only
     printed an error and then crashed later (AttributeError on
     self.msqlu / NameError on ``command``); both now return early.
     """
     self.language = language     # language to download
     self.dumptype = dumptype     # type of dump
     # Companion SQL dumps fetched alongside the page history.
     self.files = ["pages-meta-history.xml.7z", "redirect.sql.gz", "page_restrictions.sql.gz",
                   "user_groups.sql.gz", "logging.sql.gz", "interwiki.sql.gz", "langlinks.sql.gz",
                   "externallinks.sql.gz", "templatelinks.sql.gz", "imagelinks.sql.gz",
                   "categorylinks.sql.gz", "pagelinks.sql.gz", "oldimage.sql.gz", "image.sql.gz"]
     self.filename = ""
     # Dump filename pattern on Wikimedia's download server.
     self.filenameTemplate = string.Template("""$language-latest-$file""")
     self.urld = ""
     self.urldTemplate = string.Template("""http://download.wikimedia.org/$language/latest/$language-latest-$file""")
     if (msqlu == "" or msqlp == ""):
         print("Error initializing DB dump object. You must provide a valid MySQL username and password")
         return
     self.msqlu = msqlu   # MySQL username for accessing and editing the DB
     self.msqlp = msqlp   # MySQL password
     # Create ./dumps directory if it does not exist yet.
     if "dumps" not in os.listdir("./"):
         os.makedirs("./dumps")
     ## Initialize DB in MySQL: create DB and table definitions.
     print("Initializing DB for --> " + self.language + "\n")
     acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp, "mysql")
     dbaccess.createDB_SQL(acceso[1], "wx_" + self.language + "_" + self.dumptype)
     if self.dumptype == "research":
         tables_script = "tables_research.sql"
     elif self.dumptype == "standard":
         tables_script = "tables_standard.sql"
     else:
         print("Error! Unknown dumptype: " + self.dumptype)
         dbaccess.close_Connection(acceso[0])
         return
     # SECURITY NOTE: the password appears on the command line (visible
     # in `ps`); kept for compatibility with the original behaviour.
     command = ("mysql -u " + self.msqlu + " -p" + self.msqlp + " " +
                "wx_" + self.language + "_" + self.dumptype +
                " < " + tables_script + " > debug_mysql.log")
     ok = os.system(command)
     if ok == 0:
         acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp,
                                          "wx_" + self.language + "_" + self.dumptype)
         # Hint MySQL to size the big tables for huge row counts.
         dbaccess.raw_query_SQL(acceso[1], "alter table page max_rows = 200000000000 avg_row_length = 50")
         dbaccess.raw_query_SQL(acceso[1], "alter table revision max_rows = 200000000000 avg_row_length = 50")
         if self.dumptype == "standard":
             # The standard layout also stores revision text.
             dbaccess.raw_query_SQL(acceso[1], "alter table text max_rows = 200000000000 avg_row_length = 50")
         dbaccess.close_Connection(acceso[0])
     else:
         print("Error! There was a problem initializing definitions for DB tables")
         dbaccess.close_Connection(acceso[0])
Beispiel #12
0
    def commitsCommiterAllPeriods(self, commiter, arrayCommits):
        """
        Accumulate the per-period commit counts of *commiter* into
        *arrayCommits* (indexed by period number).  The caller should
        zero the array before the first call if it is being filled in.
        """
        query = ("select period, contribs from contribs_period_author_" +
                 self.language + " where author=" + str(commiter) +
                 " group by period")
        for entry in dbaccess.raw_query_SQL(self.acceso[1], query):
            # entry: 0=period, 1=commits for that period.
            arrayCommits[int(entry[0])] += int(entry[1])
Beispiel #13
0
    def commitsPerPeriod(self):
        """
        Write one "period commits commiters" line per period to the
        data_per_period file, and fill self.commitsPeriodDict
        (commits per period) and self.commitersPeriodDict
        (commiters per period).

        FIX: the file handle was never closed; a ``with`` block now
        guarantees it is flushed and closed.
        """
        with open(self.dataPath + 'data_per_period', 'w') as filehand:
            # Commits and distinct commiters per period, as rows.
            commitsPeriod = dbaccess.raw_query_SQL(self.acceso[1],
                "select period, sum(contribs), count(DISTINCT(author))"
                " from contribs_period_author_" + self.language + " group by period")
            for row in commitsPeriod:
                # Columns: 0=period, 1=commits, 2=commiters.
                filehand.write(
                    str(row[0]) + ' ' + str(row[1]) + ' ' + str(row[2]) + '\n')
                self.commitsPeriodDict[int(row[0])] = int(row[1])
                self.commitersPeriodDict[int(row[0])] = int(row[2])
Beispiel #14
0
 def general_stats(self):
     """
     Dump per-page length and flag info for every language into
     overall/data/page_len.dat (one row per page).
     """
     # Write the column header first, truncating any previous file.
     self.f = open("overall/data/page_len.dat", 'w')
     self.f.write("page_len\tns\tis_redirect\tis_stub\tis_new\tlang\n")
     self.f.close()
     for self.language in self.languages:
         self.dbname = "wx_" + self.language + "wiki_research"
         self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         print("Retrieving info from " + self.language + "\n")
         results = dbaccess.raw_query_SQL(self.access[1],
             "SELECT page_len, page_namespace, page_is_redirect, page_is_stub, page_is_new FROM page")
         print("Updating page_len info file with " + self.language + "\n")

         self.f = open("overall/data/page_len.dat", 'a')
         for result in results:
             # Five integer columns followed by the language tag.
             fields = [str(int(result[i])) for i in range(5)]
             self.f.write("\t".join(fields) + "\t" + self.language + "\n")
         self.f.close()
         # Drop the (potentially large) result set before the next language.
         results = None
         dbaccess.close_Connection(self.access[0])
    def commitsCommiterAllPeriods(self, commiter, arrayCommits):
        """
        Add the commit counts of *commiter*, broken down by period,
        into *arrayCommits*.  The caller zeroes the array before the
        first call when filling it in incrementally.
        """
        sql = "".join([
            "select period, contribs from contribs_period_author_",
            self.language,
            " where author=",
            str(commiter),
            " group by period",
        ])
        rows = dbaccess.raw_query_SQL(self.acceso[1], sql)
        for row in rows:
            # row: 0=period, 1=commits.
            idx = int(row[0])
            arrayCommits[idx] = arrayCommits[idx] + int(row[1])
    def commitsPerPeriod(self):
        """
        Write "period commits commiters" lines to the data_per_period
        file and record the per-period totals in self.commitsPeriodDict
        and self.commitersPeriodDict.

        FIX: the file handle was never closed; a ``with`` block now
        guarantees it is flushed and closed.
        """
        with open(self.dataPath + "data_per_period", "w") as filehand:
            # Commits and distinct commiters per period, as rows.
            commitsPeriod = dbaccess.raw_query_SQL(
                self.acceso[1],
                "select period, sum(contribs), count(DISTINCT(author))"
                + " from contribs_period_author_"
                + self.language
                + " group by period",
            )
            for row in commitsPeriod:
                # Columns: 0=period, 1=commits, 2=commiters.
                filehand.write(str(row[0]) + " " + str(row[1]) + " " + str(row[2]) + "\n")
                self.commitsPeriodDict[int(row[0])] = int(row[1])
                self.commitersPeriodDict[int(row[0])] = int(row[2])
    def analyze(self):
        """Create the user_revs and time_range_allns helper tables in
        every configured language's research database."""
        for self.language in self.languages:
            self.dbname = "wx_" + self.language + "wiki_research"
            self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            cursor = self.access[1]

            # Total revisions per logged, non-bot author.
            dbaccess.raw_query_SQL(cursor,
                "CREATE TABLE IF NOT EXISTS user_revs AS "
                "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND "
                "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
            dbaccess.raw_query_SQL(cursor, "ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")

            print("Created table user_revs for " + self.language + "wiki...\n")

            # Min/max timestamp plus total num_revs per logged author.
            dbaccess.raw_query_SQL(cursor,
                "CREATE TABLE IF NOT EXISTS time_range_allns AS "
                "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM "
                "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x "
                "ORDER BY min_ts)")

            print("Created table time_range_allns for " + self.language + "wiki...\n")

            # Close DB connection before moving to the next language.
            dbaccess.close_Connection(self.access[0])
Beispiel #18
0
    def time_range(self):
        """
        Creates intermediate tables with the time frame of editors'
        activity, per language:

          * user_revs          -- revisions per logged, non-bot author
          * time_range_authors -- min/max timestamp per author, all ns
          * time_range_users   -- min/max timestamp per author, main ns
                                  only (via the revision_logged view)

        FIX: the "Processing language" print was mis-indented in the
        original (an IndentationError); it is re-aligned with the loop
        body at the start of the main-namespace section.
        """
        for self.language in self.languages:
            self.dbname = "wx_" + self.language + "wiki_research"
            self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)

            ##### TIME RANGE FOR AUTHORS IN ALL NAMESPACES
            # TABLE: total no. of revisions made by every logged author
            dbaccess.raw_query_SQL(self.access[1], "CREATE TABLE IF NOT EXISTS user_revs AS " +
                "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND " +
                "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
            dbaccess.raw_query_SQL(self.access[1], "ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")

            print("Created table user_revs for " + self.language + "wiki...\n")

            # TABLE: min and max timestamp per logged author + total num_revs
            dbaccess.raw_query_SQL(self.access[1], "CREATE TABLE IF NOT EXISTS time_range_authors AS " +
                "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM " +
                "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x " +
                "ORDER BY min_ts)")

            print("Created table time_range_authors for " + self.language + "wiki...\n")

            ##### TIME RANGE FOR AUTHORS IN MAIN ONLY
            print("Processing language " + self.language + "\n")
            # VIEW: keep only revisions by logged, non-bot authors
            # (rev_main_nored already excludes redirects).
            dbaccess.raw_query_SQL(self.access[1], "create or replace view revision_logged as (select * from rev_main_nored " +
                " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot') )")
            dbaccess.raw_query_SQL(self.access[1], "drop table if exists time_range_users")
            # TABLE: per-author min and max timestamp in the main namespace
            dbaccess.raw_query_SQL(self.access[1], "create table time_range_users as (SELECT rev_user, " +
                "min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision_logged group by rev_user)")
            dbaccess.raw_query_SQL(self.access[1], "alter table time_range_users add primary key (rev_user)")

            print("Created time_range_users for " + self.language + "\n")

            # Close DB connection before the next language.
            dbaccess.close_Connection(self.access[0])
Beispiel #19
0
    def prepro_red_talk(self):
        """
        Build views and yearly snapshot tables for redirect and talk
        pages in every configured language's research database.
        """
        for self.language in self.languages:
            self.dbname = "wx_" + self.language + "wiki_research"
            self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            cur = self.access[1]

            # Redirect pages in the main namespace, and their revisions.
            dbaccess.raw_query_SQL(cur, "create or replace view page_redirect as " +
                "(select page_id from page where page_namespace=0 and page_is_redirect=1)")
            dbaccess.raw_query_SQL(cur, "create or replace view rev_redirect as (" +
                "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in " +
                "(select page_id from page_redirect))")

            # Talk pages (namespace 1), and their revisions.
            dbaccess.raw_query_SQL(cur, "create or replace view page_talk as " +
                "(select page_id from page where page_namespace=1)")
            dbaccess.raw_query_SQL(cur, "create or replace view rev_talk as (" +
                "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in " +
                "(select page_id from page_talk))")

            # max_rev_talk_YYYY tables: latest talk revision per page
            # among revisions strictly before year YYYY.
            self.minyear = dbaccess.raw_query_SQL(cur, "select min(year(rev_timestamp)) from revision")
            self.years = range(int(self.minyear[0][0]) + 1, 2009)
            for self.year in self.years:
                tbl = "max_rev_talk_" + str(self.year)
                dbaccess.raw_query_SQL(cur, "drop table if exists " + tbl)
                dbaccess.raw_query_SQL(cur, "create table " + tbl +
                    " as (select max(rev_id) as max_id, rev_page from rev_talk " +
                    "where year(rev_timestamp)<" + str(self.year) + " group by rev_page)")
                dbaccess.raw_query_SQL(cur, "alter table " + tbl + " add primary key (max_id)")
                dbaccess.raw_query_SQL(cur, "alter table " + tbl + " add index (rev_page)")

            dbaccess.close_Connection(self.access[0])
Beispiel #20
0
    def endElement(self, name):
##    Defining tasks to manage contents from the last readed tag
##        Catching the namespace of this page
        if name=='namespace':
            self.nspace_dict[self.current_text]=self.codens
            
        elif name=='id':
            if self.stack[-1]=='contributor':
                ##Detecting contributor's attributes inside a revision
                self.rev_dict['rev_user']=self.current_text
            elif self.stack[-1]=='revision':
                self.rev_dict[name]=self.current_text
            elif self.stack[-1]=='page':
                self.page_dict[name]=self.current_text
            else:
                self.f=open(self.fileErrPath,'w')
                if len(self.stack)>0:
                    self.f.write("Unsupported parent tag for '"+name+"': "+self.stack[-1])
                self.f.close()
          
        elif name=='ip':
            self.rev_dict['rev_user']='******'
            self.rev_dict['username']=self.current_text
            
        elif name=='timestamp':
            ##Adequate formatting of timestamps
            self.rev_dict['timestamp']=self.current_text.replace('Z','').replace('T',' ')
                
        elif name=='contributor':
            ##Pop contributor tag from the stack
            self.stack.pop()
        #####################################################
        ## END OF REVISION
        #####################################################
        elif name=='revision':
            self.rev_count+=1
            ##Store whether this is a redirect or stub page or not
            ##TODO: Substitute the find command with a regexp
            if len(self.rev_dict['text'])>0:
                if string.upper(self.rev_dict['text'][0:9])=='#REDIRECT':
                    self.isRedirect='1'
                else:
                    self.isRedirect='0'
            ## Takes from the first argument the threshold for stub's length
            if str(2*len(self.rev_dict['text']))<=self.options.stubth:
                self.isStub='1'
            else:
                self.isStub='0'
                
            ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (RESEARCH VERSION)######
            ##Values order: (rev_id, rev_page, rev_user, rev_user_text, rev_timestamp,
            ##rev_len, rev_letters_raw, rev_words_raw, rev_words_filter, rev_in_links, rev_out_links, rev_trans,
            ##rev_sections, rev_bolds, rev_italics, rev_bolditalics
            ##rev_parent_id, rev_is_redirect, rev_is_stub, rev_is_minor, rev_comment)
            ##Calculation of additional fancy statistics AND
            ##Detection and stripping of wiki tags and HTML tags
            ##We also store inlinks, outlinks and special links

#### **********************

            #self.rev_dict['text']=re.sub(self.pathtml, '', self.rev_dict['text']) #filter HTML tags
            #self.rev_dict['text']=re.sub(self.patunicode, 'X', self.rev_dict['text']) #convert unicode chars to X
###            self.highwords=re.findall(self.pathighwords, self.rev_dict['text']) #detect highlighted words
###            for i in range(len(self.highwords)):
###                self.highwords[i]=re.sub(self.pathighlight,'', self.highwords[i]) #Filter bold/italics tags
            #self.rev_dict['text']=re.sub(self.pathighlight, '', self.rev_dict['text']) #filter highlight tags
###            self.special=re.findall(self.patspecial, self.rev_dict['text']) #detect special links
            #self.trans=re.findall(self.pattrans, self.rev_dict['text']) #detect translation links
            #self.rev_dict['text']=re.sub(self.patspecial, '', self.rev_dict['text']) #filter out special links (after detecting trans)
            #self.inlinks=re.findall(self.patinlink, self.rev_dict['text']) #detect inlinks
            #self.outlinks=re.findall(self.patoutlink, self.rev_dict['text']) #detect outlinks
            #self.sections=re.findall(self.patsection, self.rev_dict['text']) #detect sections
            #self.rev_dict['text']=re.sub(self.patitemize, '', self.rev_dict['text']) #filter out itemize bullets and line branches

#### **********************
            
            # Build current row for revinsert

            ## IMPORTANT PERFORMANCE NOTE: using str.join instead of plain '+' operator
            ## for increased performance

            try:
              #newrevinsert="("+self.rev_dict['id']+","+self.page_dict['id']+","+\
              #self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\
              #'"'+","+'"'+self.rev_dict['timestamp']+\
              #'"'+","+str(2*len(self.rev_dict['text']))+\
              #","+str(len(self.rev_dict['text']))+\
              #","+str(len(self.rev_dict['text'].split())) 
      ###            ","+str(len(self.highwords))+","+str(len(self.special))+\
              #newrevinsert+=","+str(len(self.inlinks))+","+str(len(self.outlinks))+","+str(len(self.trans))+","+str(len(self.sections))+\
              #","+self.prior_rev_id+","+self.isRedirect+","+self.isStub+","+self.isMinor

              newrevinsert="".join(["(",self.rev_dict['id'],",",self.page_dict['id'],",",
              self.rev_dict['rev_user'],",",'"',self.rev_dict['username'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"'),
              '"',",",'"',self.rev_dict['timestamp'],'"',
              ",",str(len(self.rev_dict['text'])),
              #",",str(len(self.rev_dict['text'])),",",str(len(self.rev_dict['text'].split())),
              #",",str(len(self.inlinks)),",",str(len(self.outlinks)),",",str(len(self.trans)),",",str(len(self.sections)),
              ",",self.prior_rev_id,",",self.isRedirect,",",self.isStub,",",self.isMinor])
              if self.rev_dict.has_key('comment'):
                #newrevinsert+=","+'"'+self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+'"'
                newrevinsert="".join([newrevinsert,
                ",",'"',self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"'),'"'])
              else:
                #newrevinsert+=",''"
                newrevinsert="".join([newrevinsert,",''"])
              #newrevinsert+=")"
              newrevinsert="".join([newrevinsert,")"])

            # In case that any field is missing or flawed, skip this revision and log to standard error
            except (KeyError), e:
              self.printfile = codecs.open("error_"+self.options.database,'a','utf_8')
              self.printfile.write("Offending rev_dict was = \n")
              self.printfile.write(str(self.rev_dict))
              self.printfile.write("\n")
              self.printfile.write("Offending page_dict was = \n")
              self.printfile.write(str(self.page_dict))
              self.printfile.write("\n")
              self.printfile.write("====================================================\n")
              self.printfile.write(str(e)+"\n")
              self.printfile.write("====================================================\n\n")
              self.printfile.close()
              return
            
            #############################################
            # CONSTRUCT DICTIONARIES WITH SPECIAL ITEMS
            #############################################
##            for item in self.highwords:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.highwords_dict.get(item)
##                if (stumble==None):
##                    self.highwords_dict[item]=self.highwords_id
##                    self.highwords_rev_insert.append((self.rev_dict['id'],self.highwords_id))
##                    self.highwords_id+=1
##                else:
##                    self.highwords_rev_insert.append((self.rev_dict['id'],stumble))
##            for item in self.special:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.special_dict.get(item)
##                if (stumble==None):
##                    self.special_dict[item]=self.special_id
##                    self.special_rev_insert.append((self.rev_dict['id'], self.special_id))
##                    self.special_id+=1
##                else:
##                    self.special_rev_insert.append((self.rev_dict['id'], stumble))
##            for item in self.inlinks:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.inlinks_dict.get(item)
##                if (stumble==None):
##                    self.inlinks_dict[item]=self.inlinks_id
##                    self.inlinks_rev_insert.append((self.rev_dict['id'], self.inlinks_id))
##                    self.inlinks_id+=1
##                else:
##                    self.inlinks_rev_insert.append((self.rev_dict['id'], stumble))
##            for item in self.outlinks:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.outlinks_dict.get(item)
##                if (stumble==None):
##                    self.outlinks_dict[item]=self.outlinks_id
##                    self.outlinks_rev_insert.append((self.rev_dict['id'], self.outlinks_id))
##                    self.outlinks_id+=1
##                else:
##                    self.outlinks_rev_insert.append((self.rev_dict['id'], stumble))
##            for item in self.trans:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.trans_dict.get(item)
##                if (stumble==None):
##                    self.trans_dict[item]=self.trans_id
##                    self.trans_rev_insert.append((self.rev_dict['id'], self.trans_id))
##                    self.trans_id+=1
##                else:
##                    self.trans_rev_insert.append((self.rev_dict['id'], stumble))
            ##############################################
            ## LOOK-AHEAD ALGORITHM

            ## IMPORTANT PERFORMANCE NOTE: using str.join instead of plain '+' operator
            ## for increased performance

            ##############################################
            if self.revinsertrows==0:
                #Always allow at least one row in extended inserts
                #self.revinsert="INSERT INTO revision VALUES"+newrevinsert
                self.revinsert="".join(["INSERT INTO revision VALUES",newrevinsert])
                self.revinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            elif ((self.revinsertrows+1)<=self.options.imaxrows) or\
            (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024):
                #Append new row to self.revinsert
                #self.revinsert+=","+newrevinsert
                self.revinsert="".join([self.revinsert,",",newrevinsert])
                self.revinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            else:
                #We must finish and write currrent insert and begin a new one
                if self.options.fileout:
                    #self.revinsert+=";\n"
                    self.revinsert="".join([self.revinsert,";\n"])
                    #self.revinsert
                    # Write output to SQL file
                    self.revfile = codecs.open(self.options.revfile,'a','utf_8')
                    self.revfile.write(revinsert)
                    self.revfile.close()
                elif self.options.streamout:
                    # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
                    #self.revinsert+=";"
                    self.revinsert="".join([self.revinsert,";"])
                    print self.revinsert.encode('utf_8')
                elif self.options.monitor:
                    chances=0
                    while chances<5:
                        try:
                            dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf_8'))
                        except (Exception), e:
                            self.printfile = codecs.open("error_"+self.options.database,'a','utf_8')
                            self.printfile.write(str(e)+"\n")
                            self.printfile.write("".join([self.revinsert[0:30],"\n**********************************"]))
                            self.printfile.close()
                            chances+=1
                        else:
                            break
                self.revinsert="".join(["INSERT INTO revision VALUES",newrevinsert])
                self.revinsertrows=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
Beispiel #21
0
def community_contrib(idiomas):
    for idioma in idiomas:
        list_admins=test_admins.process_admins(idioma)
        num_admins=list_admins.pop()
        where_clause1=list_admins.pop()
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        admins_ids=dbaccess.raw_query_SQL(acceso[1], "SELECT DISTINCT(author) FROM stats_"+idioma+" WHERE "+where_clause1+" LIMIT "+str(num_admins))
##        MONTAR WHERE CLAUSE CON ADMINS IDS
        list_admins_ids=[]
        for item in list_admins_ids:
            list_admins_ids.append(int(item[0]))
        where_clause2=test_admins.process_users_ids(list_admins_ids,idioma)
        edits_admin_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_NoAnnons_months_author_"+idioma+" ", where=where_clause2, group="year, month ", order="year, month")
        dates_admins=[]
        admins_contribs=[]
        for element in edits_admin_month:
            dates_admins.append(list(element[0:2]))
            admins_contribs.append(int(element[2]))
##        PASAR A UN ARCHIVO PARA PLOT (FIG 2)
##        RECUPERAMOS CONTRIBUCIONES TOTALES POR MESES
        total_edits_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, group="year, month ")
        dates_contribs=[]
        total_contribs=[]
        for element in total_edits_month:
            dates_contribs.append(list(element[0:2]))
            total_contribs.append(int(element[2]))
##        DIVIDIR LA PRIMERA LISTA POR LA SEGUNDA
        perc_contribs_admins=[]
        for admin_contrib, total_contrib in zip(admins_contribs, total_contribs):
            perc_contribs_admins.append((float(admin_contrib)/total_contrib))
##        PASAR A UN ARCHIVO PARA PLOT (FIG 1)

##    FIG 4 TOTAL EDITS MADE BY USERS WITH DIFFERENT EDIT LEVELS
##    CREATE CLUSTER OF USERS IDENTIFIED BY CONTRIBUTIONS LEVEL
##    5 LEVELS: <100, 100-1K, 1K-5K, 5K-10K, >10K
        users_level1=[]
        users_level2=[]
        users_level3=[]
        users_level4=[]
        users_level5=[]
        level1=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount<=100")
        for userid in level1:
            users_level1.append(int(userid[0]))
        level2=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>100 AND theCount<=1000")
        for userid in level2:
            users_level2.append(int(userid[0]))
        level3=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>1000 AND theCount<=5000")
        for userid in level3:
            users_level3.append(int(userid[0]))
        level4=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>5000 AND theCount<=10000")
        for userid in level4:
            users_level4.append(int(userid[0]))
        level5=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)", tables="stats_Contrib_author_"+idioma, where="theCount>10000")
        for userid in level5:
            users_level5.append(int(userid[0]))
        where_clause_level1=test_admins.process_users_ids(users_level1,idioma)
        where_clause_level2=test_admins.process_users_ids(users_level2,idioma)
        where_clause_level3=test_admins.process_users_ids(users_level3,idioma)
        where_clause_level4=test_admins.process_users_ids(users_level4,idioma)
        where_clause_level5=test_admins.process_users_ids(users_level5,idioma)
        
        contribs_level1_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level1, group="year, month")
        contribs_level2_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level2, group="year, month")
        contribs_level3_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level3, group="year, month")
        contribs_level4_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level4, group="year, month")
        contribs_level5_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level5, group="year, month")
        list_level1=__process_contribs(contribs_level1_month, total_contribs)
        perc_contribs_level1=list_level1.pop()
        contribs_level1=list_level1.pop()
        dates_level1=list_level1.pop()
        
        list_level2=__process_contribs(contribs_level2_month, total_contribs)
        perc_contribs_level2=list_level2.pop()
        contribs_level2=list_level2.pop()
        dates_level2=list_level2.pop()
        
        list_level3=__process_contribs(contribs_level3_month, total_contribs)
        perc_contribs_level3=list_level3.pop()
        contribs_level3=list_level3.pop()
        dates_level3=list_level1.pop()
        
        list_level4=__process_contribs(contribs_level4_month, total_contribs)
        perc_contribs_level4=list_level4.pop()
        contribs_level4=list_level4.pop()
        dates_level4=list_level4.pop()
        
        list_level5=__process_contribs(contribs_level5_month, total_contribs)
        perc_contribs_level5=list_level5.pop()
        contribs_level5=list_level5.pop()
        dates_level5=list_level5.pop()
        
##    FIG 5 PLOT 4b
##    FIG 6 AVERAGE NUMBER OF EDITS PER USER PER MONTH FOR EACH LEVEL
##        RETRIEVE NUM USERS FOR EACH MONTH IN EACH LEVEL WHO HAVE MADE AT LEAST ONE CONTRIB
        num_users_1_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level1, group="year, month")
        num_users_2_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level2, group="year, month")
        num_users_3_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level3, group="year, month")
        num_users_4_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level4, group="year, month")
        num_users_5_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)", tables="stats_Contrib_months_author_"+idioma, where=where_clause_level5, group="year, month")
        list_users_1_month=[]
        for element in num_users_1_month:
            list_users_1_month.append(int(element[0]))
        list_users_2_month=[]
        for element in num_users_2_month:
            list_users_2_month.append(int(element[0]))
        list_users_3_month=[]
        for element in num_users_3_month:
            list_users_3_month.append(int(element[0]))
        list_users_4_month=[]
        for element in num_users_4_month:
            list_users_4_month.append(int(element[0]))
        list_users_5_month=[]
        for element in num_users_5_month:
            list_users_5_month.append(int(element[0]))
        
##        DIVIDE TOT NUM CONTRIBS PER LEVEL PER MONTH BY THE NUM USERS FOR EACH MONTH IN EACH LEVEL
        avg_contribs_user_1_month=[]
        for contribmonth, usermonth in zip(contribs_level1, list_users_1_month):
            avg_contribs_user_1_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_2_month=[]
        for contribmonth, usermonth in zip(contribs_level2, list_users_2_month):
            avg_contribs_user_2_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_3_month=[]
        for contribmonth, usermonth in zip(contribs_level3, list_users_3_month):
            avg_contribs_user_3_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_4_month=[]
        for contribmonth, usermonth in zip(contribs_level4, list_users_4_month):
            avg_contribs_user_4_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_5_month=[]
        for contribmonth, usermonth in zip(contribs_level5, list_users_5_month):
            avg_contribs_user_5_month.append(float(contribmonth)/usermonth)
        
##        FIG 7 POPULATION GROWTH FOR EACH USER GROUP
##        SIMPLY RETRIEVE list_users_X_month
##        FIG 8 % OF TOTAL POPULATION OF EACH USER GROUP
        perc_users_1_months=[]
        perc_users_2_months=[]
        perc_users_3_months=[]
        perc_users_4_months=[]
        perc_users_5_months=[]
        for e1, e2, e3, e4, e5 in zip(list_users_1_month,list_users_2_month,list_users_3_month,list_users_4_month,list_users_5_month):
            total_users_month=e1+e2+e3+e4+e5
            perc_users_1_months.append((float(e1)/total_users_month))
            perc_users_2_months.append((float(e2)/total_users_month))
            perc_users_3_months.append((float(e3)/total_users_month))
            perc_users_4_months.append((float(e4)/total_users_month))
            perc_users_5_months.append((float(e5)/total_users_month))
            
###############################
##    FINAL DUTIES, TRANSFER DATA AND EXECUTE R SCRIPT
        filenames=["dates_admin_contrib.data","contribs_admins_months.data", "perc_contribs_months.data","dates_level1_contrib.data", "contribs_level1_months.data", "perc_contribs_level1_months.data", "dates_level2_contrib.data", "contribs_level2_months.data", "perc_contribs_level2_months.data","dates_level3_contrib.data", "contribs_level3_months.data", "perc_contribs_level3_months.data","dates_level4_contrib.data", "contribs_level4_months.data", "perc_contribs_level4_months.data","dates_level5_contrib.data" ,"contribs_level5_months.data", "perc_contribs_level5_months.data", "avg_contribs_user_1_month.data", "avg_contribs_user_2_month.data", "avg_contribs_user_3_month.data", "avg_contribs_user_4_month.data", "avg_contribs_user_5_month.data", "users_1_month.data", "users_2_month.data", "users_3_month.data", "users_4_month.data", "users_5_month.data", "perc_users_1_months.data","perc_users_2_months.data", "perc_users_3_months.data", "perc_users_4_months.data", "perc_users_5_months.data"]
        
        filenames_out=["Figure1.png", "Figure_2.png", "Figure4.png", "Figure5.png", "Figure6.png", "Figure7.png", "Figure8.png"]
        
        dataList=[dates_contribs, admins_contribs, perc_contribs_admins, dates_level1, contribs_level1, perc_contribs_level1,dates_level2, contribs_level2, perc_contribs_level2,dates_level3, contribs_level3, perc_contribs_level3, dates_level4, contribs_level4, perc_contribs_level4,dates_level5, contribs_level5, perc_contribs_level5, avg_contribs_user_1_month, avg_contribs_user_2_month, avg_contribs_user_3_month, avg_contribs_user_4_month, avg_contribs_user_5_month, list_users_1_month, list_users_2_month, list_users_3_month, list_users_4_month, list_users_5_month, perc_users_1_months, perc_users_2_months, perc_users_3_months, perc_users_4_months, perc_users_5_months]
        
        for filename, data in zip (filenames, dataList):
            if(filename.find('date')!=-1):
                f=open("./graphics/"+idioma+"/data/"+filename, 'w')
                for adate in data:
                    f.writelines(str(adate)+"\n")
                f.close()
            else:
                __makeDataFile(idioma, filename, data)
        
        #Pass data filenames to the GNU R script with a file
        f=open("./data/community_contrib_files_names.data",'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/data/"+line+"\n")
        f.close()
        
        #Idem with graphic output filenames
        f=open("./data/community_contrib_files_out.data",'w')
        for line in filenames_out:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
            
        #CALL GNU R SCRIPT measuring_Wiki.R
        
        succ=os.system("R --vanilla < ./community_contrib.R > debug_R")
        if succ==0:
            print "Funcion community_contrib.R ejecutada con exito para el lenguage... "+idioma
Beispiel #22
0
    def analyze(self):
        #Initialize all files headers
        #Survival data for all users (including editors out of MAIN)
        f=open("wkp_surv_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        #Survival data for all logged users who edited in MAIN
        f=open("wkp_surv_main_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        f=open("wkp_surv_join_core_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        f=open("wkp_surv_in_core_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        f=open("wkp_surv_core_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
        f=open("wkp_surv_join_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        f=open("wkp_surv_in_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        f=open("wkp_surv_core_rev_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
            
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
	    
            print "Starting language "+self.language+"\n"
            ##IN SYSTEM
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_authors "+\
            "where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()

            ##IN MAIN
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_users ")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_main_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished core users for language "+self.language+"\n"
            ###########################
            ##REV CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_rev_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished all tasks for "+self.language+"\n"
Beispiel #23
0
def printSql2(sqlquery):
    """Run *sqlquery* through the module-level DB connection and print
    every result row, one per line."""
    for record in dbaccess.raw_query_SQL(acceso[1], sqlquery):
        print(record)
    def endElement(self, name):
##    Defining tasks to manage contents from the last readed tag
##        Catching the namespace of this page
        if name=='namespace':
            self.nspace_dict[self.current_text]=self.codens
 
        elif name=='id':
            if self.stack[-1]=='contributor':
                ##Detecting contributor's attributes inside a revision
                self.rev_dict['rev_user']=self.current_text
            elif self.stack[-1]=='revision':
                self.rev_dict[name]=self.current_text
            elif self.stack[-1]=='page':
                self.page_dict[name]=self.current_text
            else:
                self.f=open(self.fileErrPath,'w')
                if len(self.stack)>0:
                    self.f.write("Unsupported parent tag for '"+name+"': "+self.stack[-1])
                self.f.close()
 
        elif name=='ip':
            self.rev_dict['rev_user']='******'
            self.rev_dict['username']=self.current_text
 
        elif name=='timestamp':
            ##Adequate formatting of timestamps
            self.rev_dict['timestamp']=self.current_text.replace('Z','').replace('T',' ')
 
        elif name=='contributor':
            ##Pop contributor tag from the stack
            self.stack.pop()
 
        elif name=='revision':
            self.rev_count+=1
            ##Store whether this is a redirect or stub page or not
            if len(self.rev_dict['text'])>0:
                if self.rev_dict['text'][0:9].upper()=='#REDIRECT':
                    self.isRedirect='1'
                else:
                    self.isRedirect='0'
            ## Takes from the first argument the threshold for stub's length
            if str(2*len(self.rev_dict['text']))<=self.options.stubth:
                self.isStub='1'
            else:
                self.isStub='0'
 
            ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (STANDARD VERSION)######
            ##Values order: (rev_id, rev_page, [[rev_text_id=rev_id]], rev_comment,
            ##rev_user, rev_user_text, rev_timestamp, rev_is_minor)
            # Build current row for revinsert
            try:
                newrevinsert="("+self.rev_dict['id']+","+self.page_dict['id']+","+self.rev_dict['id']
                if self.rev_dict.has_key('comment'):
                    newrevinsert+=","+'"'+self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+'"'
                else:
                    newrevinsert+=",''"
                newrevinsert+=","+self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].\
                replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\
                '"'+","+'"'+self.rev_dict['timestamp']+\
                '"'+","+self.isMinor+")"
 
            # In case that any field is missing or flawed, skip this revision and log to standard error
            except (KeyError), e:
                self.printfile = codecs.open("error.log",'a','utf_8')
                self.printfile.write("Offending rev_dict was = \n")
                self.printfile.write(str(self.rev_dict))
                self.printfile.write("\n")
                self.printfile.write("Offending page_dict was = \n")
                self.printfile.write(str(self.page_dict))
                self.printfile.write("\n")
                self.printfile.write("====================================================\n")
                self.printfile.write(str(e)+"\n")
                self.printfile.write("====================================================\n\n")
                self.printfile.close()
                return
 
            if self.revinsertrows==0:
                #Always allow at least one row in extended inserts
                self.revinsert="INSERT INTO revision VALUES"+newrevinsert
                self.revinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            elif (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024) and\
            ((self.revinsertrows+1)<=self.options.imaxrows):
                #Append new row to self.revinsert
                self.revinsert+=","+newrevinsert
                self.revinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            else:
                #We must finish and write currrent insert and begin a new one
                if self.options.fileout:
                    self.revinsert+=";\n"
                    # Write output to SQL file
                    self.revfile = codecs.open(self.options.revfile,'a','utf_8')
                    self.revfile.write(revinsert)
                    self.revfile.close()
                elif self.options.streamout:
                    # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
                    self.revinsert+=";"
                    print self.revinsert.encode('utf-8')
                elif self.options.monitor:
                    while 1:
                        try:
                            dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf-8'))
                        except (Exception), e:
                            print e
                        else:
                            break
                self.revinsert="INSERT INTO revision VALUES"+newrevinsert
                self.revinsertrows=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
class wikiHandler(ContentHandler):
    """Parse an XML file generated by Wikipedia Export page into SQL data
    suitable to be imported by MySQL"""
    def __init__(self, options):
        self.fileErrPath="./errors.log"; self.options=options
        if self.options.monitor and not self.options.fileout and not self.options.streamout:
            self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,\
            self.options.user, self.options.passwd, self.options.database)
        self.nspace_dict={}; self.codens=''; self.page_dict={}; self.rev_dict = {}
        self.stack=[]; self.current_text = ''; self.current_elem=None; self.revfile=None
        self.pagefile=None
        self.page_num = 0; self.rev_num=0; self.last_page_len=0; self.rev_count=0
        self.prior_rev_id='NULL'; self.isRedirect='0'; self.isStub='0'; self.isMinor='0'
        self.revinsert=''; self.pageinsert=''; self.textinsert=''
        self.revinsertrows=0; self.revinsertsize=0; self.pageinsertrows=0
        self.pageinsertsize=0; self.textinsertrows=0; self.textinsertsize=0
        self.start=datetime.datetime.now(); self.timeCheck=None; self.timeDelta=None
 
    def startElement(self, name, attrs):
##    Here we define which tags we want to catch
##        In this case, we only want to recall the name of the tags in a stack
##        so we can later look up the parent node of a new tag
##        (for instance, to discriminate among page id, rev id and contributor id
##        all of them with the name=="id")
        if name=='page' or name=='revision' or name=='contributor':
            self.stack.append(name)
        elif name=='namespace':
            self.codens=attrs.get('key')
        elif name=='minor':
            self.isMinor='1'
        self.current_text=''
        self.current_elem=name
        return
 
    def endElement(self, name):
##    Defining tasks to manage contents from the last readed tag
##        Catching the namespace of this page
        if name=='namespace':
            self.nspace_dict[self.current_text]=self.codens
 
        elif name=='id':
            if self.stack[-1]=='contributor':
                ##Detecting contributor's attributes inside a revision
                self.rev_dict['rev_user']=self.current_text
            elif self.stack[-1]=='revision':
                self.rev_dict[name]=self.current_text
            elif self.stack[-1]=='page':
                self.page_dict[name]=self.current_text
            else:
                self.f=open(self.fileErrPath,'w')
                if len(self.stack)>0:
                    self.f.write("Unsupported parent tag for '"+name+"': "+self.stack[-1])
                self.f.close()
 
        elif name=='ip':
            self.rev_dict['rev_user']='******'
            self.rev_dict['username']=self.current_text
 
        elif name=='timestamp':
            ##Adequate formatting of timestamps
            self.rev_dict['timestamp']=self.current_text.replace('Z','').replace('T',' ')
 
        elif name=='contributor':
            ##Pop contributor tag from the stack
            self.stack.pop()
 
        elif name=='revision':
            self.rev_count+=1
            ##Store whether this is a redirect or stub page or not
            if len(self.rev_dict['text'])>0:
                if self.rev_dict['text'][0:9].upper()=='#REDIRECT':
                    self.isRedirect='1'
                else:
                    self.isRedirect='0'
            ## Takes from the first argument the threshold for stub's length
            if str(2*len(self.rev_dict['text']))<=self.options.stubth:
                self.isStub='1'
            else:
                self.isStub='0'
 
            ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (STANDARD VERSION)######
            ##Values order: (rev_id, rev_page, [[rev_text_id=rev_id]], rev_comment,
            ##rev_user, rev_user_text, rev_timestamp, rev_is_minor)
            # Build current row for revinsert
            try:
                newrevinsert="("+self.rev_dict['id']+","+self.page_dict['id']+","+self.rev_dict['id']
                if self.rev_dict.has_key('comment'):
                    newrevinsert+=","+'"'+self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+'"'
                else:
                    newrevinsert+=",''"
                newrevinsert+=","+self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].\
                replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\
                '"'+","+'"'+self.rev_dict['timestamp']+\
                '"'+","+self.isMinor+")"
 
            # In case that any field is missing or flawed, skip this revision and log to standard error
            except (KeyError), e:
                self.printfile = codecs.open("error.log",'a','utf_8')
                self.printfile.write("Offending rev_dict was = \n")
                self.printfile.write(str(self.rev_dict))
                self.printfile.write("\n")
                self.printfile.write("Offending page_dict was = \n")
                self.printfile.write(str(self.page_dict))
                self.printfile.write("\n")
                self.printfile.write("====================================================\n")
                self.printfile.write(str(e)+"\n")
                self.printfile.write("====================================================\n\n")
                self.printfile.close()
                return
 
            if self.revinsertrows==0:
                #Always allow at least one row in extended inserts
                self.revinsert="INSERT INTO revision VALUES"+newrevinsert
                self.revinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            elif (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024) and\
            ((self.revinsertrows+1)<=self.options.imaxrows):
                #Append new row to self.revinsert
                self.revinsert+=","+newrevinsert
                self.revinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            else:
                #We must finish and write currrent insert and begin a new one
                if self.options.fileout:
                    self.revinsert+=";\n"
                    # Write output to SQL file
                    self.revfile = codecs.open(self.options.revfile,'a','utf_8')
                    self.revfile.write(revinsert)
                    self.revfile.close()
                elif self.options.streamout:
                    # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
                    self.revinsert+=";"
                    print self.revinsert.encode('utf-8')
                elif self.options.monitor:
                    while 1:
                        try:
                            dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf-8'))
                        except (Exception), e:
                            print e
                        else:
                            break
                self.revinsert="INSERT INTO revision VALUES"+newrevinsert
                self.revinsertrows=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
 
            ##################################################
            ##CONSTRUCTION OF EXTENDED INSERTS FOR TABLE TEXT
            ##Template for each row:
            ## (old_id, old_text, old_flags)
            newtextinsert="("+self.rev_dict['id']+','+'"'+\
            self.rev_dict['text'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\
            '",'+'"utf8")'
            if self.textinsertrows==0:
                #Always allow at least one row in extended inserts
                self.textinsert="INSERT INTO text VALUES"+newtextinsert
                self.textinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.textinsertsize=len(self.textinsert)*2
            elif (self.textinsertsize+(2*len(newtextinsert))<=self.options.imaxsize*1024) and\
            ((self.textinsertrows+1)<=self.options.imaxrows):
                #Append new row to self.revinsert
                self.textinsert+=","+newtextinsert
                self.textinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.textinsertsize=len(self.textinsert)*2
            else:
                #We must finish and write currrent insert and begin a new one
                if self.options.fileout:
                    self.textinsert+=";\n"
                    # Write output to SQL file
                    self.textfile = codecs.open(self.options.textfile,'a','utf_8')
                    self.textfile.write(textinsert)
                    self.textfile.close()
                elif self.options.streamout:
                    # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
                    self.textinsert+=";"
                    print self.textinsert.encode('utf-8')
                elif self.options.monitor:
                    while 1:
                        try:
                            dbaccess.raw_query_SQL(self.acceso[1], self.textinsert.encode('utf-8'))
                        except (Exception), e:
                            print e
                        else:
                            break
                self.textinsert="INSERT INTO text VALUES"+newtextinsert
                self.textinsertrows=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.textinsertsize=len(self.textinsert)*2
Beispiel #26
0
 # NOTE(review): fragment of an endDocument()-style flush routine; the
 # enclosing "def" header is missing from this snippet.  It writes out the
 # pending pageinsert / revinsert / textinsert buffers using the output
 # mode selected in self.options, then resets the insert bookkeeping.
 #We must write the last pageinsert before finishing this dump
 if self.options.fileout:
 # Write output to SQL file
     self.pageinsert+=";\n"
     self.pagefile = codecs.open(self.options.pagefile,'a','utf_8')
     self.pagefile.write(self.pageinsert)
     self.pagefile.close()
 elif self.options.streamout:
     # DON'T WRITE SQL TO FILES, GENERATE ENCODED SQL STREAM FOR MYSQL
     self.pageinsert+=";"
     print self.pageinsert.encode('utf_8')
 elif self.options.monitor:
     # Retry the query up to 5 times, logging each failure with a short
     # prefix of the offending statement
     chances=0
     while chances<5:
         try:
             dbaccess.raw_query_SQL(self.acceso[1], self.pageinsert.encode('utf_8'))
         except (Exception), e:
             self.printfile = codecs.open("error_"+self.options.database,'a','utf_8')
             self.printfile.write(str(e)+"\n")
             self.printfile.write(self.pageinsert[0:30]+"\n**********************************")
             self.printfile.close()
             chances+=1
         else:
             break
 #Reset status vars
 self.pageinsertrows=0
 self.pageinsertsize=0
 #INSERT NAMESPACES CODES AND TITLES IN SPECIAL TABLE
 nspaces= self.nspace_dict.iteritems()
 insertns='INSERT INTO namespaces VALUES'
 first_loop=True
 ################################################
 #We must write the last revinsert before finishing this page
 if self.options.fileout:
     self.revinsert+=";\n"
 # Write output to SQL file
     self.revfile = codecs.open(self.options.revfile,'a','utf_8')
     self.revfile.write(self.revinsert)
     self.revfile.close()
 elif self.options.streamout:
     # DON'T WRITE SQL TO FILES, GENERATE ENCODED SQL STREAM FOR MYSQL
     self.revinsert+=";"
     print self.revinsert.encode('utf-8')
 elif self.options.monitor:
     # NOTE(review): unlike the pageinsert branch above, this retry loop
     # has no attempt cap and only prints the error
     while 1:
         try:
             dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf-8'))
         except (Exception), e:
             print e
         else:
             break
 #Reset status vars
 self.revinsertrows=0
 self.revinsertsize=0
 ################################################
 ##Same for Insert into text table
 if self.options.fileout:
     self.textinsert+=";\n"
 # Write output to SQL file
     self.textfile = codecs.open(self.options.textfile,'a','utf_8')
     self.textfile.write(self.textinsert)
     self.textfile.close()
Beispiel #28
0
    def endElement(self, name):
##    Defining tasks to manage contents from the last readed tag
##        Catching the namespace of this page
        if name=='namespace':
            self.nspace_dict[self.current_text]=self.codens
            
        elif name=='id':
            if self.stack[-1]=='contributor':
                ##Detecting contributor's attributes inside a revision
                self.rev_dict['rev_user']=self.current_text
            elif self.stack[-1]=='revision':
                self.rev_dict[name]=self.current_text
            elif self.stack[-1]=='page':
                self.page_dict[name]=self.current_text
            else:
                self.f=open(self.fileErrPath,'w')
                if len(self.stack)>0:
                    self.f.write("Unsupported parent tag for '"+name+"': "+self.stack[-1])
                self.f.close()
                
        elif name=='ip':
            self.rev_dict['rev_user']='******'
            self.rev_dict['username']=self.current_text
            
        elif name=='timestamp':
            ##Adequate formatting of timestamps
            self.rev_dict['timestamp']=self.current_text.replace('Z','').replace('T',' ')
                
        elif name=='contributor':
            ##Pop contributor tag from the stack
            self.stack.pop()
        #####################################################
        ## END OF REVISION
        #####################################################
        elif name=='revision':
            self.rev_count+=1
            ##Store whether this is a redirect or stub page or not
            ##TODO: Substitute the find command with a regexp
            if len(self.rev_dict['text'])>0:
                if string.upper(self.rev_dict['text'][0:9])=='#REDIRECT':
                    self.isRedirect='1'
                else:
                    self.isRedirect='0'
            ## Takes from the first argument the threshold for stub's length
            if 2*len(self.rev_dict['text'])<=self.options.stubth:
                self.isStub='1'
            else:
                self.isStub='0'
                
            ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (RESEARCH VERSION)######
            ##Values order: (rev_id, rev_page, rev_user, rev_user_text, rev_timestamp,
            ##rev_len, rev_letters_raw, rev_words_raw, rev_words_filter, rev_in_links, rev_out_links, rev_trans,
            ##rev_sections, rev_bolds, rev_italics, rev_bolditalics
            ##rev_parent_id, rev_is_redirect, rev_is_stub, rev_is_minor, rev_comment)
            ##Calculation of additional fancy statistics AND
            ##Detection and stripping of wiki tags and HTML tags
            ##We also store inlinks, outlinks and special links
            self.rev_dict['text']=re.sub(self.pathtml, '', self.rev_dict['text']) #filter HTML tags
            self.rev_dict['text']=re.sub(self.patunicode, 'X', self.rev_dict['text']) #convert unicode chars to X
##            self.highwords=re.findall(self.pathighwords, self.rev_dict['text']) #detect highlighted words
##            for i in range(len(self.highwords)):
##                self.highwords[i]=re.sub(self.pathighlight,'', self.highwords[i]) #Filter bold/italics tags
            self.rev_dict['text']=re.sub(self.pathighlight, '', self.rev_dict['text']) #filter highlight tags
##            self.special=re.findall(self.patspecial, self.rev_dict['text']) #detect special links
            self.trans=re.findall(self.pattrans, self.rev_dict['text']) #detect translation links
            self.rev_dict['text']=re.sub(self.patspecial, '', self.rev_dict['text']) #filter out special links (after detecting trans)
            self.inlinks=re.findall(self.patinlink, self.rev_dict['text']) #detect inlinks
            self.outlinks=re.findall(self.patoutlink, self.rev_dict['text']) #detect outlinks
            self.sections=re.findall(self.patsection, self.rev_dict['text']) #detect sections
            self.rev_dict['text']=re.sub(self.patitemize, '', self.rev_dict['text']) #filter out itemize bullets and line branches
            # Build current row for revinsert
            newrevinsert="("+self.rev_dict['id']+","+self.page_dict['id']+","+\
            self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\
            '"'+","+'"'+self.rev_dict['timestamp']+\
            '"'+","+str(2*len(self.rev_dict['text']))+\
            ","+str(len(self.rev_dict['text']))+\
            ","+str(len(self.rev_dict['text'].split())) 
##            ","+str(len(self.highwords))+","+str(len(self.special))+\
            newrevinsert+=","+str(len(self.inlinks))+","+str(len(self.outlinks))+","+str(len(self.trans))+","+str(len(self.sections))+\
            ","+self.prior_rev_id+","+self.isRedirect+","+self.isStub+","+self.isMinor
            if self.rev_dict.has_key('comment'):
                newrevinsert+=","+'"'+self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+'"'
            else:
                newrevinsert+=",''"
            newrevinsert+=")"
            #############################################
            ## CONSTRUCT DICTIONARIES WITH SPECIAL ITEMS
            #############################################
##            for item in self.highwords:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.highwords_dict.get(item)
##                if (stumble==None):
##                    self.highwords_dict[item]=self.highwords_id
##                    self.highwords_rev_insert.append((self.rev_dict['id'],self.highwords_id))
##                    self.highwords_id+=1
##                else:
##                    self.highwords_rev_insert.append((self.rev_dict['id'],stumble))
##            for item in self.special:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.special_dict.get(item)
##                if (stumble==None):
##                    self.special_dict[item]=self.special_id
##                    self.special_rev_insert.append((self.rev_dict['id'], self.special_id))
##                    self.special_id+=1
##                else:
##                    self.special_rev_insert.append((self.rev_dict['id'], stumble))
##            for item in self.inlinks:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.inlinks_dict.get(item)
##                if (stumble==None):
##                    self.inlinks_dict[item]=self.inlinks_id
##                    self.inlinks_rev_insert.append((self.rev_dict['id'], self.inlinks_id))
##                    self.inlinks_id+=1
##                else:
##                    self.inlinks_rev_insert.append((self.rev_dict['id'], stumble))
##            for item in self.outlinks:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.outlinks_dict.get(item)
##                if (stumble==None):
##                    self.outlinks_dict[item]=self.outlinks_id
##                    self.outlinks_rev_insert.append((self.rev_dict['id'], self.outlinks_id))
##                    self.outlinks_id+=1
##                else:
##                    self.outlinks_rev_insert.append((self.rev_dict['id'], stumble))
##            for item in self.trans:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.trans_dict.get(item)
##                if (stumble==None):
##                    self.trans_dict[item]=self.trans_id
##                    self.trans_rev_insert.append((self.rev_dict['id'], self.trans_id))
##                    self.trans_id+=1
##                else:
##                    self.trans_rev_insert.append((self.rev_dict['id'], stumble))
            ##############################################
            ## LOOK-AHEAD ALGORITHM
            ##############################################
            if self.revinsertrows==0:
                #Always allow at least one row in extended inserts
                self.revinsert="INSERT INTO revision VALUES"+newrevinsert
                self.revinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            elif (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024) and\
            ((self.revinsertrows+1)<=self.options.imaxrows):
                #Append new row to self.revinsert
                self.revinsert+=","+newrevinsert
                self.revinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            else:
                #We must finish and write currrent insert and begin a new one
                if self.options.fileout:
                    self.revinsert+=";\n"
                    # Write output to SQL file
                    self.revfile = codecs.open(self.options.revfile,'a','utf_8')
                    self.revfile.write(revinsert)
                    self.revfile.close()
                elif self.options.streamout:
                    # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
                    self.revinsert+=";"
                    print self.revinsert.encode('utf_8')
                    print self.revinsert.encode('utf_8')
                elif self.options.monitor:
                    chances=0
                    while chances<5:
                        try:
                            dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf_8'))
                        except (Exception), e:
                            self.printfile = codecs.open("error_"+self.options.database,'a','utf_8')
                            self.printfile.write(str(e)+"\n")
                            self.printfile.write(self.revinsert[0:30]+"\n**********************************")
                            self.printfile.close()
                            chances+=1
                        else:
                            break
                self.revinsert="INSERT INTO revision VALUES"+newrevinsert
                self.revinsertrows=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            ##################################################
            ##Store this rev_id to recall it when processing the following revision, if it exists
            self.prior_rev_id=self.rev_dict['id']
            ##Store this rev_len to recall it for the current page_len, in case this is the last revision for that page
            self.last_page_len=2*len(self.rev_dict['text'])
            self.rev_dict.clear()
            self.stack.pop()
            self.isMinor='0'
            self.inlinks=[]; self.outlinks=[]; self.trans=[]; self.sections=[]
            self.highwords=[]; self.special=[]
            self.rev_num+=1
            if self.options.verbose and self.options.log is None:
                # Display status report
                if self.rev_num % 1000 == 0:
                    self.timeCheck=datetime.datetime.now()
                    self.timeDelta=self.timeCheck-self.start
                    if self.timeDelta.seconds==0:
                        print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)"\
                        % (self.page_num, 1e6*float(self.page_num)/self.timeDelta.microseconds,\
                        self.rev_num, 1e6*float(self.rev_num)/self.timeDelta.microseconds)
                        self.printfile = codecs.open(self.fileErrPath,'a','utf_8')
                        self.printfile.write("page "+str(self.page_num)+" ("+\
                        str( 1e6*float(self.page_num)/self.timeDelta.microseconds)+" pags. per sec.), revision "+\
                        str(self.rev_num)+ " ("+str(1e6*float(self.rev_num)/self.timeDelta.microseconds)+" revs. per sec.)\n")
                        self.printfile.close()
                    else:
                        print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)"\
                        % (self.page_num, float(self.page_num)/self.timeDelta.seconds,\
                        self.rev_num, float(self.rev_num)/self.timeDelta.seconds)
                        self.printfile = codecs.open(self.fileErrPath,'a','utf_8')
                        self.printfile.write("page "+str(self.page_num)+" ("+\
                        str( float(self.page_num)/self.timeDelta.seconds)+" pags./sec.), revision "+\
                        str(self.rev_num)+ " ("+str(float(self.rev_num)/self.timeDelta.seconds)+" revs./sec.)\n")
                        self.printfile.close()
            if self.options.verbose and self.options.log is not None:
                if self.rev_num%1000==0:
                    self.timeCheck=datetime.datetime.now()
                    self.timeDelta=self.timeCheck-self.start
                    if self.timeDelta.seconds==0:
                        self.printfile = codecs.open(self.options.log,'a','utf_8')
                        self.printfile.write("page "+str(self.page_num)+" ("+\
                        str( 1e6*float(self.page_num)/self.timeDelta.microseconds)+" pags./sec.), revision "+\
                        str(self.rev_num)+ " ("+str(1e6*float(self.rev_num)/self.timeDelta.microseconds)+" revs./sec.)\n")
                        self.printfile.close()
                    else:
                        self.printfile = codecs.open(self.options.log,'a','utf_8')
                        self.printfile.write("page "+str(self.page_num)+" ("+\
                        str( float(self.page_num)/self.timeDelta.seconds)+" pags./sec.), revision "+\
                        str(self.rev_num)+ " ("+str(float(self.rev_num)/self.timeDelta.seconds)+" revs./sec.)\n")
                        self.printfile.close()
Beispiel #29
0
    def bots(self):
        """
        Preprocessing actions with bots data.

        For each language edition, (re)builds the summary tables
        revs_bots, revs_logged and revs_all (revision counts per
        year/month), then writes the percentage of revisions made by
        bots — relative to all revisions and to logged-in revisions —
        into overall/data/perc-bots-*.dat tab-separated files.
        """
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            #TABLE revs_bots (revisions made by officially identified bots, by year, month)
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists revs_bots")
            dbaccess.raw_query_SQL(self.access[1], "create table revs_bots as select year(rev_timestamp) "+\
            "theyear, month(rev_timestamp) themonth, count(*) num_revs from revision where rev_user!=0 "+\
            "and rev_user in (select ug_user from user_groups where ug_group='bot') group by "+\
            "year(rev_timestamp), month(rev_timestamp) order by year(rev_timestamp), month(rev_timestamp)")

            #TABLE revs_logged (revisions made by logged authors, by year, month)
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists revs_logged")
            dbaccess.raw_query_SQL(self.access[1],"create table revs_logged as select year(rev_timestamp) "+\
            "theyear, month(rev_timestamp) themonth, count(*) num_revs from revision where rev_user!=0 "+\
            "group by year(rev_timestamp), month(rev_timestamp) order by year(rev_timestamp), month(rev_timestamp)")

            #TABLE revs_all (revisions made by all authors, by year, month)
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists revs_all")
            dbaccess.raw_query_SQL(self.access[1], "create table revs_all as select year(rev_timestamp) "+\
            "theyear, month(rev_timestamp) themonth, count(*) num_revs from revision "+\
            "group by year(rev_timestamp), month(rev_timestamp) order by year(rev_timestamp), month(rev_timestamp)")

            dbaccess.close_Connection(self.access[0])

        #FILE perc-bots-all-revs.dat % of all revisions due to bots
        # (local renamed from 'file' to avoid shadowing the builtin)
        datafile=open("overall/data/perc-bots-all-revs.dat",'w')
        datafile.write("year\tmonth\tperc_revs\tlang\n")
        datafile.close()
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            #Obtain % of total no. revs due to bots, by year, month
            self.perc_revs=dbaccess.raw_query_SQL(self.access[1], "select bot.theyear, bot.themonth, "+\
            "(bot.num_revs/tot.num_revs)*100 perc_revs from revs_bots as bot, revs_all as tot "+\
            "where bot.theyear=tot.theyear and bot.themonth=tot.themonth;")
            dbaccess.close_Connection(self.access[0])
            #Writing data to file
            datafile=open("overall/data/perc-bots-all-revs.dat",'a')
            for item in self.perc_revs:
                datafile.write(str(int(item[0]))+"\t"+str(int(item[1]))+"\t"+\
                str(float(item[2]))+"\t"+self.language+"\n")
            datafile.close()

        #FILE perc-bots-logged-revs.dat % of logged-in revisions due to bots
        datafile=open("overall/data/perc-bots-logged-revs.dat",'w')
        datafile.write("year\tmonth\tperc_revs\tlang\n")
        datafile.close()
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            #Obtain % of no. revs by logged editors due to bots, by year, month
            # FIX: was raw_query_sql -> AttributeError; the dbaccess helper is
            # raw_query_SQL everywhere else in this file
            self.perc_revs=dbaccess.raw_query_SQL(self.access[1], "select bot.theyear, bot.themonth, "+\
            "(bot.num_revs/logged.num_revs)*100 perc_logged_revs from revs_bots as bot, "+\
            "revs_logged as logged where bot.theyear=logged.theyear and bot.themonth=logged.themonth;")
            # FIX: was close_connection -> AttributeError; helper is close_Connection
            dbaccess.close_Connection(self.access[0])
            #Writing data to file
            datafile=open("overall/data/perc-bots-logged-revs.dat",'a')
            for item in self.perc_revs:
                datafile.write(str(int(item[0]))+"\t"+str(int(item[1]))+"\t"+\
                str(float(item[2]))+"\t"+self.language+"\n")
            datafile.close()
Beispiel #30
0
class wikiHandler(ContentHandler):
    """Parse an XML file generated by Wikipedia Export page into SQL data
    suitable to be imported by MySQL"""
    def __init__(self, options):
        """Set up all parser state from the command-line *options*.

        options must provide at least: monitor, fileout and streamout
        (output-mode flags); machine, port, user, passwd and database
        (DB connection data, read only in monitor mode); and stubth,
        imaxsize, imaxrows, revfile, pagefile, verbose and log, which
        are read later during parsing.
        """
        self.options=options
        # Open a live DB connection only in monitor mode; file/stream
        # output modes never touch the database from this handler.
        if self.options.monitor and (not self.options.fileout and not self.options.streamout):
            self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,\
            self.options.user, self.options.passwd, self.options.database)
        self.nspace_dict={}        # namespace title -> numeric namespace code
        self.codens=''             # key attribute of the <namespace> being read
        self.page_dict={}          # attributes of the <page> being parsed
        self.rev_dict = {}         # attributes of the <revision> being parsed
        self.stack=[]              # open-tag stack, used to resolve ambiguous <id>
        self.current_text = ''     # character data gathered for the current element
        self.current_elem=None     # name of the element currently open
        self.revfile=None          # SQL output file for revision inserts (fileout mode)
        self.pagefile=None         # SQL output file for page inserts (fileout mode)
        self.page_num = 0          # total pages processed so far
        self.rev_num=0             # total revisions processed so far
        self.last_page_len=0       # byte length of the latest revision, reused as page_len
        self.rev_count=0           # revisions seen for the current page
        self.prior_rev_id='NULL'   # id of the previous revision (parent rev id)
        self.isRedirect='0'
        self.isStub='0'
        self.isMinor='0'
        self.inlinks=None # internal links
        self.outlinks=None # external links
        self.trans=None # translations to other language editions
        self.sections=None # sections (no matter their level)
        self.highwords=None #highlighted words (bold/italics/bold+italics)
        self.special=None #rev_text, special links filtered out
        ########################################
        ##REGEXPS (stored as pattern strings; re.sub/re.findall compile on use)
        ########################################
        self.pathighlight=r"\'\'+"#Regexp matching bold/italics/bold+italics wikitags
        self.pathighwords=r"\'\'+.*\'\'+" #Regexp for highlighted words
        self.pathtml=r"\<[^\>]+\>" #Regexp matching HTML tags
        self.patunicode=r"\&\w+\;|\&\#\d+\;|[\xc0-\xf7][\x80-\xbf]+" #Regexp matching unicode chars
        self.patspecial=r"\[\[[^\:\]]+\:[^\]]*\]\]" #Regexp matching special inlinks (image/category/interwiki)
        self.patinlink=r"\[\[.*\]\]" #Regexp matching inlinks (after filtering image/category/interwiki links)
        self.patoutlink=r"\s\[[^\[\]]*\]|http[s]?://" #Regexp matching outlinks
        self.patsection=r"\=\=+[\s]*[^\=]*[\s]*\=\=+" #Regexp matching section titles
        self.pattrans=r"\[\[..[.]?:"#Regexp matching translation links
        self.patitemize=r"\n\**" #Regexp matching itemize bullets and line branches
        self.patdumb=r"\)\(" #A rapid solution to concatenate tuples in special instert strings
        self.fileErrPath="./errors.log"   # error/status log used by verbose mode
        #TODO: Solve lookup in global scope if the special item did not show up in any previous revision
        #of the whole dump (maybe lookup query to DB??)
        # Per-page lookup tables, id counters and (rev_id, item_id) pairs for
        # special items: highlighted words, special links, in/out links, translations.
        self.highwords_dict={}; self.special_dict={}; self.inlinks_dict={};
        self.outlinks_dict={}; self.trans_dict={}
        self.highwords_id=1; self.special_id=1; self.inlinks_id=1; self.outlinks_id=1
        self.trans_id=1;
        self.highwords_rev_insert=[]; self.special_rev_insert=[]; self.inlinks_rev_insert=[]
        self.outlinks_rev_insert=[]; self.trans_rev_insert=[];
        # Extended-insert accumulators plus their row-count/byte-size trackers
        self.revinsert=''
        self.pageinsert=''
        self.revinsertrows=0
        self.revinsertsize=0
        self.pageinsertrows=0
        self.pageinsertsize=0
        self.start=datetime.datetime.now()   # wall-clock start, for pags/revs per sec. reports
        self.timeCheck=None
        self.timeDelta=None
        
    def startElement(self, name, attrs):
        """Track the tag *name* that just opened.

        Pushes page/revision/contributor tags on the stack so a later <id>
        can be attributed to the right parent, grabs the numeric key of a
        <namespace>, flags minor revisions, and resets the character-data
        buffer for the new element.
        """
        if name in ('page', 'revision', 'contributor'):
            self.stack.append(name)
        elif name == 'namespace':
            self.codens = attrs.get('key')
        elif name == 'minor':
            self.isMinor = '1'
        # Every element starts with an empty text buffer
        self.current_text = ''
        self.current_elem = name
        
    def endElement(self, name):
##    Defining tasks to manage contents from the last readed tag
##        Catching the namespace of this page
        if name=='namespace':
            self.nspace_dict[self.current_text]=self.codens
            
        elif name=='id':
            if self.stack[-1]=='contributor':
                ##Detecting contributor's attributes inside a revision
                self.rev_dict['rev_user']=self.current_text
            elif self.stack[-1]=='revision':
                self.rev_dict[name]=self.current_text
            elif self.stack[-1]=='page':
                self.page_dict[name]=self.current_text
            else:
                self.f=open(self.fileErrPath,'w')
                if len(self.stack)>0:
                    self.f.write("Unsupported parent tag for '"+name+"': "+self.stack[-1])
                self.f.close()
                
        elif name=='ip':
            self.rev_dict['rev_user']='******'
            self.rev_dict['username']=self.current_text
            
        elif name=='timestamp':
            ##Adequate formatting of timestamps
            self.rev_dict['timestamp']=self.current_text.replace('Z','').replace('T',' ')
                
        elif name=='contributor':
            ##Pop contributor tag from the stack
            self.stack.pop()
        #####################################################
        ## END OF REVISION
        #####################################################
        elif name=='revision':
            self.rev_count+=1
            ##Store whether this is a redirect or stub page or not
            ##TODO: Substitute the find command with a regexp
            if len(self.rev_dict['text'])>0:
                if string.upper(self.rev_dict['text'][0:9])=='#REDIRECT':
                    self.isRedirect='1'
                else:
                    self.isRedirect='0'
            ## Takes from the first argument the threshold for stub's length
            if 2*len(self.rev_dict['text'])<=self.options.stubth:
                self.isStub='1'
            else:
                self.isStub='0'
                
            ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (RESEARCH VERSION)######
            ##Values order: (rev_id, rev_page, rev_user, rev_user_text, rev_timestamp,
            ##rev_len, rev_letters_raw, rev_words_raw, rev_words_filter, rev_in_links, rev_out_links, rev_trans,
            ##rev_sections, rev_bolds, rev_italics, rev_bolditalics
            ##rev_parent_id, rev_is_redirect, rev_is_stub, rev_is_minor, rev_comment)
            ##Calculation of additional fancy statistics AND
            ##Detection and stripping of wiki tags and HTML tags
            ##We also store inlinks, outlinks and special links
            self.rev_dict['text']=re.sub(self.pathtml, '', self.rev_dict['text']) #filter HTML tags
            self.rev_dict['text']=re.sub(self.patunicode, 'X', self.rev_dict['text']) #convert unicode chars to X
##            self.highwords=re.findall(self.pathighwords, self.rev_dict['text']) #detect highlighted words
##            for i in range(len(self.highwords)):
##                self.highwords[i]=re.sub(self.pathighlight,'', self.highwords[i]) #Filter bold/italics tags
            self.rev_dict['text']=re.sub(self.pathighlight, '', self.rev_dict['text']) #filter highlight tags
##            self.special=re.findall(self.patspecial, self.rev_dict['text']) #detect special links
            self.trans=re.findall(self.pattrans, self.rev_dict['text']) #detect translation links
            self.rev_dict['text']=re.sub(self.patspecial, '', self.rev_dict['text']) #filter out special links (after detecting trans)
            self.inlinks=re.findall(self.patinlink, self.rev_dict['text']) #detect inlinks
            self.outlinks=re.findall(self.patoutlink, self.rev_dict['text']) #detect outlinks
            self.sections=re.findall(self.patsection, self.rev_dict['text']) #detect sections
            self.rev_dict['text']=re.sub(self.patitemize, '', self.rev_dict['text']) #filter out itemize bullets and line branches
            # Build current row for revinsert
            newrevinsert="("+self.rev_dict['id']+","+self.page_dict['id']+","+\
            self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\
            '"'+","+'"'+self.rev_dict['timestamp']+\
            '"'+","+str(2*len(self.rev_dict['text']))+\
            ","+str(len(self.rev_dict['text']))+\
            ","+str(len(self.rev_dict['text'].split())) 
##            ","+str(len(self.highwords))+","+str(len(self.special))+\
            newrevinsert+=","+str(len(self.inlinks))+","+str(len(self.outlinks))+","+str(len(self.trans))+","+str(len(self.sections))+\
            ","+self.prior_rev_id+","+self.isRedirect+","+self.isStub+","+self.isMinor
            if self.rev_dict.has_key('comment'):
                newrevinsert+=","+'"'+self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+'"'
            else:
                newrevinsert+=",''"
            newrevinsert+=")"
            #############################################
            ## CONSTRUCT DICTIONARIES WITH SPECIAL ITEMS
            #############################################
##            for item in self.highwords:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.highwords_dict.get(item)
##                if (stumble==None):
##                    self.highwords_dict[item]=self.highwords_id
##                    self.highwords_rev_insert.append((self.rev_dict['id'],self.highwords_id))
##                    self.highwords_id+=1
##                else:
##                    self.highwords_rev_insert.append((self.rev_dict['id'],stumble))
##            for item in self.special:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.special_dict.get(item)
##                if (stumble==None):
##                    self.special_dict[item]=self.special_id
##                    self.special_rev_insert.append((self.rev_dict['id'], self.special_id))
##                    self.special_id+=1
##                else:
##                    self.special_rev_insert.append((self.rev_dict['id'], stumble))
##            for item in self.inlinks:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.inlinks_dict.get(item)
##                if (stumble==None):
##                    self.inlinks_dict[item]=self.inlinks_id
##                    self.inlinks_rev_insert.append((self.rev_dict['id'], self.inlinks_id))
##                    self.inlinks_id+=1
##                else:
##                    self.inlinks_rev_insert.append((self.rev_dict['id'], stumble))
##            for item in self.outlinks:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.outlinks_dict.get(item)
##                if (stumble==None):
##                    self.outlinks_dict[item]=self.outlinks_id
##                    self.outlinks_rev_insert.append((self.rev_dict['id'], self.outlinks_id))
##                    self.outlinks_id+=1
##                else:
##                    self.outlinks_rev_insert.append((self.rev_dict['id'], stumble))
##            for item in self.trans:
##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
##                stumble=self.trans_dict.get(item)
##                if (stumble==None):
##                    self.trans_dict[item]=self.trans_id
##                    self.trans_rev_insert.append((self.rev_dict['id'], self.trans_id))
##                    self.trans_id+=1
##                else:
##                    self.trans_rev_insert.append((self.rev_dict['id'], stumble))
            ##############################################
            ## LOOK-AHEAD ALGORITHM
            ##############################################
            if self.revinsertrows==0:
                #Always allow at least one row in extended inserts
                self.revinsert="INSERT INTO revision VALUES"+newrevinsert
                self.revinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            elif (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024) and\
            ((self.revinsertrows+1)<=self.options.imaxrows):
                #Append new row to self.revinsert
                self.revinsert+=","+newrevinsert
                self.revinsertrows+=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            else:
                #We must finish and write currrent insert and begin a new one
                if self.options.fileout:
                    self.revinsert+=";\n"
                    # Write output to SQL file
                    self.revfile = codecs.open(self.options.revfile,'a','utf_8')
                    self.revfile.write(revinsert)
                    self.revfile.close()
                elif self.options.streamout:
                    # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
                    self.revinsert+=";"
                    print self.revinsert.encode('utf_8')
                    print self.revinsert.encode('utf_8')
                elif self.options.monitor:
                    chances=0
                    while chances<5:
                        try:
                            dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf_8'))
                        except (Exception), e:
                            self.printfile = codecs.open("error_"+self.options.database,'a','utf_8')
                            self.printfile.write(str(e)+"\n")
                            self.printfile.write(self.revinsert[0:30]+"\n**********************************")
                            self.printfile.close()
                            chances+=1
                        else:
                            break
                self.revinsert="INSERT INTO revision VALUES"+newrevinsert
                self.revinsertrows=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize=len(self.revinsert)*2
            ##################################################
            ##Store this rev_id to recall it when processing the following revision, if it exists
            self.prior_rev_id=self.rev_dict['id']
            ##Store this rev_len to recall it for the current page_len, in case this is the last revision for that page
            self.last_page_len=2*len(self.rev_dict['text'])
            self.rev_dict.clear()
            self.stack.pop()
            self.isMinor='0'
            self.inlinks=[]; self.outlinks=[]; self.trans=[]; self.sections=[]
            self.highwords=[]; self.special=[]
            self.rev_num+=1
            if self.options.verbose and self.options.log is None:
                # Display status report
                if self.rev_num % 1000 == 0:
                    self.timeCheck=datetime.datetime.now()
                    self.timeDelta=self.timeCheck-self.start
                    if self.timeDelta.seconds==0:
                        print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)"\
                        % (self.page_num, 1e6*float(self.page_num)/self.timeDelta.microseconds,\
                        self.rev_num, 1e6*float(self.rev_num)/self.timeDelta.microseconds)
                        self.printfile = codecs.open(self.fileErrPath,'a','utf_8')
                        self.printfile.write("page "+str(self.page_num)+" ("+\
                        str( 1e6*float(self.page_num)/self.timeDelta.microseconds)+" pags. per sec.), revision "+\
                        str(self.rev_num)+ " ("+str(1e6*float(self.rev_num)/self.timeDelta.microseconds)+" revs. per sec.)\n")
                        self.printfile.close()
                    else:
                        print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)"\
                        % (self.page_num, float(self.page_num)/self.timeDelta.seconds,\
                        self.rev_num, float(self.rev_num)/self.timeDelta.seconds)
                        self.printfile = codecs.open(self.fileErrPath,'a','utf_8')
                        self.printfile.write("page "+str(self.page_num)+" ("+\
                        str( float(self.page_num)/self.timeDelta.seconds)+" pags./sec.), revision "+\
                        str(self.rev_num)+ " ("+str(float(self.rev_num)/self.timeDelta.seconds)+" revs./sec.)\n")
                        self.printfile.close()
            if self.options.verbose and self.options.log is not None:
                if self.rev_num%1000==0:
                    self.timeCheck=datetime.datetime.now()
                    self.timeDelta=self.timeCheck-self.start
                    if self.timeDelta.seconds==0:
                        self.printfile = codecs.open(self.options.log,'a','utf_8')
                        self.printfile.write("page "+str(self.page_num)+" ("+\
                        str( 1e6*float(self.page_num)/self.timeDelta.microseconds)+" pags./sec.), revision "+\
                        str(self.rev_num)+ " ("+str(1e6*float(self.rev_num)/self.timeDelta.microseconds)+" revs./sec.)\n")
                        self.printfile.close()
                    else:
                        self.printfile = codecs.open(self.options.log,'a','utf_8')
                        self.printfile.write("page "+str(self.page_num)+" ("+\
                        str( float(self.page_num)/self.timeDelta.seconds)+" pags./sec.), revision "+\
                        str(self.rev_num)+ " ("+str(float(self.rev_num)/self.timeDelta.seconds)+" revs./sec.)\n")
                        self.printfile.close()
        #################################################
        ## END OF PAGE
        #################################################
        elif name=='page':
            ################################################
            ################################################
            #GENERATE AND COMMIT SPECIAL VALUES INSERTS FOR ALL REVISIONS OF THIS PAGE
            # CREATE INSERT STRINGS FROM values
            ##HIGHLIGHTED WORDS
##            self.high_insert_st='INSERT INTO highlight VALUES'
##            for item in self.highwords_dict.iteritems():
##                self.high_insert_st+="("+str(item[1])+',"'+item[0]+'")'
##            self.high_insert_st=re.sub(self.patdumb, "),(", self.high_insert_st)
####            self.debug(self.high_insert_st)
##            self.high_rev_insert_st='INSERT INTO rev_highlight VALUES'
##            for item in self.highwords_rev_insert:
##                self.high_rev_insert_st+="("+str(item[0])+","+str(item[1])+")"
##            self.high_rev_insert_st=re.sub(self.patdumb, "),(", self.high_rev_insert_st)
####            self.debug(self.high_rev_insert_st)
##            ##SPECIAL LINKS
##            self.special_insert_st='INSERT INTO special VALUES'
##            for item in self.special_dict.iteritems():
##                self.special_insert_st+="("+str(item[1])+',"'+item[0]+'")'
##            self.special_insert_st=re.sub(self.patdumb,"),(", self.special_insert_st)
####            self.debug(self.special_insert_st)
##            self.special_rev_insert_st='INSERT INTO rev_special VALUES'
##            for item in self.special_rev_insert:
##                self.special_rev_insert_st+="("+str(item[0])+","+str(item[1])+")"
##            self.special_rev_insert_st=re.sub(self.patdumb,"),(",self.special_rev_insert_st)
##            self.debug(self.special_rev_insert_st)
            ##INLINKS
##            self.inlinks_insert_st='INSERT INTO inlink VALUES'
##            for item in self.inlinks_dict.iteritems():
##                self.inlinks_insert_st+="("+str(item[1])+',"'+item[0]+'")'
##            self.inlinks_insert_st=re.sub(self.patdumb,"),(", self.inlinks_insert_st)
####            self.debug(self.inlinks_insert_st)
##            self.inlinks_rev_insert_st='INSERT INTO rev_inlink VALUES'
##            for item in self.inlinks_rev_insert:
##                self.inlinks_rev_insert_st+="("+str(item[0])+","+str(item[1])+")"
##            self.inlinks_rev_insert_st=re.sub(self.patdumb,"),(", self.inlinks_rev_insert_st)
####            self.debug(self.inlinks_rev_insert_st)
##            ##OUTLINKS
##            self.outlinks_insert_st='INSERT INTO outlink VALUES'
##            for item in self.outlinks_dict.iteritems():
##                self.outlinks_insert_st+="("+str(item[1])+',"'+item[0]+'")'
##            self.outlinks_insert_st=re.sub(self.patdumb,"),(", self.outlinks_insert_st)
####            self.debug(self.outlinks_insert_st)
##            self.outlinks_rev_insert_st='INSERT INTO rev_outlink VALUES'
##            for item in self.outlinks_rev_insert:
##                self.outlinks_rev_insert_st+="("+str(item[0])+","+str(item[1])+")"
##            self.outlinks_rev_insert_st=re.sub(self.patdumb,"),(",self.outlinks_rev_insert_st)
####            self.debug(self.outlinks_rev_insert_st)
##            ##TRANSLATION LINKS
##            self.trans_insert_st='INSERT INTO trans VALUES'
##            for item in self.trans_dict.iteritems():
##                self.trans_insert_st+="("+str(item[1])+',"'+item[0]+'")'
##            self.trans_insert_st=re.sub(self.patdumb,"),(",self.trans_insert_st)
####            self.debug(self.trans_insert_st)
##            self.trans_rev_insert_st='INSERT INTO rev_trans VALUES'
##            for item in self.trans_rev_insert:
##                self.trans_rev_insert_st+="("+str(item[0])+","+str(item[1])+")"
##            self.trans_rev_insert_st=re.sub(self.patdumb,"),(", self.trans_rev_insert_st)
##            self.debug(self.trans_rev_insert_st)
            #COMMIT NEAT INSERTS
            if self.options.fileout:
                self.high_insert_st="\n;";self.high_rev_insert_st="\n;"; self.special_insert_st="\n;"
                self.special_rev_insert_st="\n;"; self.inlinks_insert_st="\n;"; self.inlinks_rev_insert_st="\n;"
                self.outlinks_insert_st="\n;"; self.outlinks_rev_insert_st="\n;"; self.trans_insert_st="\n;"
                self.trans_rev_insert_st="\n;"
            # Write output to SQL file
            # TODO: get a filename for special inserts
                self.neatfile = codecs.open('neat.sql','a','utf_8')
                if len(self.highwords_dict)>0:
                    self.neatfile.write(self.high_insert_st);
                    self.neatfile.write(self.high_rev_insert_st);
                if len(self.special_dict)>0:
                    self.neatfile.write(self.special_insert_st)
                    self.neatfile.write(self.special_rev_insert_st);
                if len(self.inlinks_dict)>0:
                    self.neatfile.write(self.inlinks_insert_st);
                    self.neatfile.write(self.inlinks_rev_insert_st)
                if len(self.outlinks_dict)>0:
                    self.neatfile.write(self.outlinks_insert_st);
                    self.neatfile.write(self.outlinks_rev_insert_st)
                if len(self.trans_dict)>0:
                    self.neatfile.write(self.trans_insert_st); 
                    self.neatfile.write(self.trans_rev_insert_st)
                self.neatfile.close()
            elif self.options.streamout:
                # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
                self.high_insert_st=";";self.high_rev_insert_st=";"; self.special_insert_st=";"
                self.special_rev_insert_st=";"; self.inlinks_insert_st=";"; self.inlinks_rev_insert_st=";"
                self.outlinks_insert_st=";"; self.outlinks_rev_insert_st=";"; self.trans_insert_st=";"
                self.trans_rev_insert_st=";"
                if len(self.highwords_dict)>0:
                    print self.high_insert_st.encode('utf_8');
                    print self.high_rev_insert_st.encode('utf_8')
                if len(self.special_dict)>0:
                    print self.special_insert_st.encode('utf_8');
                    print self.special_rev_insert_st.encode('utf_8')
                if len(self.inlinks_dict)>0:
                    print self.inlinks_insert_st.encode('utf_8');
                    print self.inlinks_rev_insert_st.enconde('utf_8')
                if len(self.outlinks_dict)>0:
                    print self.outlinks_insert_st.encode('utf_8'); 
                    print self.outlinks_rev_insert_st.encode('utf_8')
                if len(self.trans_dict)>0:
                    print self.trans_insert_st.encode('utf_8'); 
                    print self.trans_rev_insert_st.encode('utf_8')
##            elif self.options.monitor:
##                while 1:
##                    try:
##                        print str(len(self.highwords_dict))+" "+\
##                        str(len(self.special_dict))+ " "+str(len(self.inlinks_dict))+ " "+\
##                        str(len(self.outlinks_dict))+ " "+str(len(self.trans_dict))+"\n"
##                        if len(self.highwords_dict)>0:
##                            dbaccess.raw_query_SQL(self.acceso[1], self.high_insert_st.encode('utf_8'))
##                            dbaccess.raw_query_SQL(self.acceso[1], self.high_rev_insert_st.encode('utf_8'))
##                        if len(self.special_dict)>0:
##                            dbaccess.raw_query_SQL(self.acceso[1], self.special_insert_st.encode('utf_8'))
##                            dbaccess.raw_query_SQL(self.acceso[1], self.special_rev_insert_st.encode('utf_8'))
##                        if len(self.inlinks_dict)>0:
##                            dbaccess.raw_query_SQL(self.acceso[1], self.inlinks_insert_st.encode('utf_8'))
##                            dbaccess.raw_query_SQL(self.acceso[1], self.inlinks_rev_insert_st.encode('utf_8'))
##                        if len(self.outlinks_dict)>0:
##                            dbaccess.raw_query_SQL(self.acceso[1], self.outlinks_insert_st.encode('utf_8'))
##                            dbaccess.raw_query_SQL(self.acceso[1], self.outlinks_rev_insert_st.encode('utf_8'))
##                        if len(self.trans_dict)>0:
##                            dbaccess.raw_query_SQL(self.acceso[1], self.trans_insert_st.encode('utf_8'))
##                            dbaccess.raw_query_SQL(self.acceso[1], self.trans_rev_insert_st.encode('utf_8'))
##                    except (Exception), e:
##                        print e
##                    else:
##                        break
            #Reset status vars
            self.high_insert_st=""; self.high_rev_insert_st=""; self.special_insert_st=""
            self.special_rev_insert_st=""; self.inlinks_insert_st=""; self.inlinks_rev_insert_st=""
            self.outlinks_insert_st=""; self.outlinks_rev_insert_st=""; self.trans_insert_st=""
            self.trans_rev_insert_st=""
            self.highwords_dict={}; self.special_dict={}; self.inlinks_dict={};
            self.outlinks_dict={}; self.trans_dict={}
            self.highwords_rev_insert=[]; self.special_rev_insert=[]; self.inlinks_rev_insert=[]
            self.outlinks_rev_insert=[]; self.trans_rev_insert=[];
            ################################################
            ##Recovering namespace for this page
            if self.nspace_dict.has_key(self.page_dict['title'].split(':')[0]):
                self.page_dict['namespace']=self.nspace_dict[self.page_dict['title'].split(':')[0]]
            else:
                self.page_dict['namespace']='0'
            #########################################
            #CONSTRUCTION OF EXTENDED INSERT FOR PAGES (RESEARCH VERSION)
            #########################################
            ##Values order for page (page_id, page_namespace, page_title,  page_latest, page_len, page_is_redirect,
            ##page_is_stub, page_random, page_is_new, page_restrictions)
            newpageinsert="("+self.page_dict['id']+","+\
            self.page_dict['namespace']+","+'"'+\
            self.page_dict['title'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\
            '"'+","+self.prior_rev_id+","+str(self.last_page_len)+\
            ","+self.isRedirect+","+self.isStub+","+str(random.random())
            if self.rev_count>1:
                newpageinsert+=",1"
            else:
                newpageinsert+=",0"
            if self.page_dict.has_key('restrictions'):
                newpageinsert+=","+'"'+self.page_dict['restrictions']+'"'
            else:
                newpageinsert+=",''"
            newpageinsert+=")"
            if self.pageinsertrows==0:
                self.pageinsert="INSERT INTO page VALUES"+newpageinsert
                self.pageinsertrows+=1
                self.pageinsertsize=len(self.pageinsert)*2
            elif (self.pageinsertsize+(2*len(newpageinsert))<=self.options.imaxsize*1024) and\
            (self.pageinsertrows+1<=self.options.imaxrows):
                #Append current row to extended insert
                self.pageinsert+=","+newpageinsert
                self.pageinsertrows+=1
                self.pageinsertsize=len(self.pageinsert)*2
            else:
                #We must write this extended insert and begin a new one
                if self.options.fileout:
                    #Write extended insert to file
                    self.pageinsert+=";\n"
                    self.pagefile = codecs.open(self.options.pagefile,'a','utf_8')
                    self.pagefile.write(self.pageinsert)
                    self.pagefile.close()
                elif self.options.streamout:
                    #Write extended insert to sys.stdout (stream to MySQL)
                    self.pageinsert+=";"
                    print self.pageinsert.encode('utf_8')
                # Monitor mode: send the accumulated extended INSERT for the
                # page table straight to MySQL, retrying up to 5 times and
                # logging each failure to a per-database error file.
                elif self.options.monitor:
                    chances=0
                    while chances<5:
                        try:
                            dbaccess.raw_query_SQL(self.acceso[1], self.pageinsert.encode('utf_8'))
                        except (Exception), e:
                            self.printfile = codecs.open("error_"+self.options.database,'a','utf_8')
                            self.printfile.write(str(e)+"\n")
                            # Only the first 30 chars of the failed statement are logged
                            self.printfile.write(self.pageinsert[0:30]+"\n**********************************")
                            self.printfile.close()
                            chances+=1
                        else:
                            break
                # Start a fresh extended INSERT with the row that did not fit
                self.pageinsert="INSERT INTO page VALUES"+newpageinsert
                self.pageinsertrows=1
                # Conservative size estimate: 2 bytes per character
                self.pageinsertsize=len(self.pageinsert)*2
 # We must write the last pageinsert before finishing this dump
 # NOTE(review): the dedent below looks mangled by extraction; this appears
 # to be the tail of an endDocument()-style final flush routine -- confirm
 # against the original file before relying on its structure.
 if self.options.fileout:
     # Write output to SQL file
     self.pageinsert += ";\n"
     self.pagefile = codecs.open(self.options.pagefile, "a", "utf_8")
     self.pagefile.write(self.pageinsert)
     self.pagefile.close()
 elif self.options.streamout:
     # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
     self.pageinsert += ";"
     print self.pageinsert.encode("utf_8")
 elif self.options.monitor:
     # Same 5-attempt retry policy as above for the final flush
     chances = 0
     while chances < 5:
         try:
             dbaccess.raw_query_SQL(self.acceso[1], self.pageinsert.encode("utf_8"))
         except (Exception), e:
             self.printfile = codecs.open("error_" + self.options.database, "a", "utf_8")
             self.printfile.write(str(e) + "\n")
             self.printfile.write(self.pageinsert[0:30] + "\n**********************************")
             self.printfile.close()
             chances += 1
         else:
             break
 # Reset status vars
 self.pageinsertrows = 0
 self.pageinsertsize = 0
 # INSERT NAMESPACES CODES AND TITLES IN SPECIAL TABLE
 nspaces = self.nspace_dict.iteritems()
 insertns = "INSERT INTO namespaces VALUES"
 first_loop = True
    def endElement(self, name):
        ##    Defining tasks to manage contents from the last readed tag
        ##        Catching the namespace of this page
        if name == "namespace":
            self.nspace_dict[self.current_text] = self.codens

        elif name == "id":
            if self.stack[-1] == "contributor":
                ##Detecting contributor's attributes inside a revision
                self.rev_dict["rev_user"] = self.current_text
            elif self.stack[-1] == "revision":
                self.rev_dict[name] = self.current_text
            elif self.stack[-1] == "page":
                self.page_dict[name] = self.current_text
            else:
                self.f = open(self.fileErrPath, "w")
                if len(self.stack) > 0:
                    self.f.write("Unsupported parent tag for '" + name + "': " + self.stack[-1])
                self.f.close()

        elif name == "ip":
            self.rev_dict["rev_user"] = "******"
            self.rev_dict["username"] = self.current_text

        elif name == "timestamp":
            ##Adequate formatting of timestamps
            self.rev_dict["timestamp"] = self.current_text.replace("Z", "").replace("T", " ")

        elif name == "contributor":
            ##Pop contributor tag from the stack
            self.stack.pop()
        #####################################################
        ## END OF REVISION
        #####################################################
        elif name == "revision":
            self.rev_count += 1
            ##Store whether this is a redirect or stub page or not
            ##TODO: Substitute the find command with a regexp
            if len(self.rev_dict["text"]) > 0:
                if string.upper(self.rev_dict["text"][0:9]) == "#REDIRECT":
                    self.isRedirect = "1"
                else:
                    self.isRedirect = "0"
            ## Takes from the first argument the threshold for stub's length
            if 2 * len(self.rev_dict["text"]) <= self.options.stubth:
                self.isStub = "1"
            else:
                self.isStub = "0"

            ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (RESEARCH VERSION)######
            ##Values order: (rev_id, rev_page, rev_user, rev_user_text, rev_timestamp,
            ##rev_len, rev_letters_raw, rev_words_raw, rev_words_filter, rev_in_links, rev_out_links, rev_trans,
            ##rev_sections, rev_bolds, rev_italics, rev_bolditalics
            ##rev_parent_id, rev_is_redirect, rev_is_stub, rev_is_minor, rev_comment)
            ##Calculation of additional fancy statistics AND
            ##Detection and stripping of wiki tags and HTML tags
            ##We also store inlinks, outlinks and special links
            self.rev_dict["text"] = re.sub(self.pathtml, "", self.rev_dict["text"])  # filter HTML tags
            self.rev_dict["text"] = re.sub(self.patunicode, "X", self.rev_dict["text"])  # convert unicode chars to X
            ##            self.highwords=re.findall(self.pathighwords, self.rev_dict['text']) #detect highlighted words
            ##            for i in range(len(self.highwords)):
            ##                self.highwords[i]=re.sub(self.pathighlight,'', self.highwords[i]) #Filter bold/italics tags
            self.rev_dict["text"] = re.sub(self.pathighlight, "", self.rev_dict["text"])  # filter highlight tags
            ##            self.special=re.findall(self.patspecial, self.rev_dict['text']) #detect special links
            self.trans = re.findall(self.pattrans, self.rev_dict["text"])  # detect translation links
            self.rev_dict["text"] = re.sub(
                self.patspecial, "", self.rev_dict["text"]
            )  # filter out special links (after detecting trans)
            self.inlinks = re.findall(self.patinlink, self.rev_dict["text"])  # detect inlinks
            self.outlinks = re.findall(self.patoutlink, self.rev_dict["text"])  # detect outlinks
            self.sections = re.findall(self.patsection, self.rev_dict["text"])  # detect sections
            self.rev_dict["text"] = re.sub(
                self.patitemize, "", self.rev_dict["text"]
            )  # filter out itemize bullets and line branches
            # Build current row for revinsert
            newrevinsert = (
                "("
                + self.rev_dict["id"]
                + ","
                + self.page_dict["id"]
                + ","
                + self.rev_dict["rev_user"]
                + ","
                + '"'
                + self.rev_dict["username"].replace("\\", "\\\\").replace("'", "\\'").replace('"', '\\"')
                + '"'
                + ","
                + '"'
                + self.rev_dict["timestamp"]
                + '"'
                + ","
                + str(2 * len(self.rev_dict["text"]))
                + ","
                + str(len(self.rev_dict["text"]))
                + ","
                + str(len(self.rev_dict["text"].split()))
            )
            ##            ","+str(len(self.highwords))+","+str(len(self.special))+\
            newrevinsert += (
                ","
                + str(len(self.inlinks))
                + ","
                + str(len(self.outlinks))
                + ","
                + str(len(self.trans))
                + ","
                + str(len(self.sections))
                + ","
                + self.prior_rev_id
                + ","
                + self.isRedirect
                + ","
                + self.isStub
                + ","
                + self.isMinor
            )
            if self.rev_dict.has_key("comment"):
                newrevinsert += (
                    ","
                    + '"'
                    + self.rev_dict["comment"].replace("\\", "\\\\").replace("'", "\\'").replace('"', '\\"')
                    + '"'
                )
            else:
                newrevinsert += ",''"
            newrevinsert += ")"
            #############################################
            ## CONSTRUCT DICTIONARIES WITH SPECIAL ITEMS
            #############################################
            ##            for item in self.highwords:
            ##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
            ##                stumble=self.highwords_dict.get(item)
            ##                if (stumble==None):
            ##                    self.highwords_dict[item]=self.highwords_id
            ##                    self.highwords_rev_insert.append((self.rev_dict['id'],self.highwords_id))
            ##                    self.highwords_id+=1
            ##                else:
            ##                    self.highwords_rev_insert.append((self.rev_dict['id'],stumble))
            ##            for item in self.special:
            ##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
            ##                stumble=self.special_dict.get(item)
            ##                if (stumble==None):
            ##                    self.special_dict[item]=self.special_id
            ##                    self.special_rev_insert.append((self.rev_dict['id'], self.special_id))
            ##                    self.special_id+=1
            ##                else:
            ##                    self.special_rev_insert.append((self.rev_dict['id'], stumble))
            ##            for item in self.inlinks:
            ##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
            ##                stumble=self.inlinks_dict.get(item)
            ##                if (stumble==None):
            ##                    self.inlinks_dict[item]=self.inlinks_id
            ##                    self.inlinks_rev_insert.append((self.rev_dict['id'], self.inlinks_id))
            ##                    self.inlinks_id+=1
            ##                else:
            ##                    self.inlinks_rev_insert.append((self.rev_dict['id'], stumble))
            ##            for item in self.outlinks:
            ##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
            ##                stumble=self.outlinks_dict.get(item)
            ##                if (stumble==None):
            ##                    self.outlinks_dict[item]=self.outlinks_id
            ##                    self.outlinks_rev_insert.append((self.rev_dict['id'], self.outlinks_id))
            ##                    self.outlinks_id+=1
            ##                else:
            ##                    self.outlinks_rev_insert.append((self.rev_dict['id'], stumble))
            ##            for item in self.trans:
            ##                item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')
            ##                stumble=self.trans_dict.get(item)
            ##                if (stumble==None):
            ##                    self.trans_dict[item]=self.trans_id
            ##                    self.trans_rev_insert.append((self.rev_dict['id'], self.trans_id))
            ##                    self.trans_id+=1
            ##                else:
            ##                    self.trans_rev_insert.append((self.rev_dict['id'], stumble))
            ##############################################
            ## LOOK-AHEAD ALGORITHM
            ##############################################
            if self.revinsertrows == 0:
                # Always allow at least one row in extended inserts
                self.revinsert = "INSERT INTO revision VALUES" + newrevinsert
                self.revinsertrows += 1
                # Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize = len(self.revinsert) * 2
            elif (self.revinsertsize + (2 * len(newrevinsert)) <= self.options.imaxsize * 1024) and (
                (self.revinsertrows + 1) <= self.options.imaxrows
            ):
                # Append new row to self.revinsert
                self.revinsert += "," + newrevinsert
                self.revinsertrows += 1
                # Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize = len(self.revinsert) * 2
            else:
                # We must finish and write currrent insert and begin a new one
                if self.options.fileout:
                    self.revinsert += ";\n"
                    # Write output to SQL file
                    self.revfile = codecs.open(self.options.revfile, "a", "utf_8")
                    self.revfile.write(revinsert)
                    self.revfile.close()
                elif self.options.streamout:
                    # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
                    self.revinsert += ";"
                    print self.revinsert.encode("utf_8")
                    print self.revinsert.encode("utf_8")
                elif self.options.monitor:
                    chances = 0
                    while chances < 5:
                        try:
                            dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode("utf_8"))
                        except (Exception), e:
                            self.printfile = codecs.open("error_" + self.options.database, "a", "utf_8")
                            self.printfile.write(str(e) + "\n")
                            self.printfile.write(self.revinsert[0:30] + "\n**********************************")
                            self.printfile.close()
                            chances += 1
                        else:
                            break
                self.revinsert = "INSERT INTO revision VALUES" + newrevinsert
                self.revinsertrows = 1
                # Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize = len(self.revinsert) * 2
            ##################################################
            ##Store this rev_id to recall it when processing the following revision, if it exists
            self.prior_rev_id = self.rev_dict["id"]
            ##Store this rev_len to recall it for the current page_len, in case this is the last revision for that page
            self.last_page_len = 2 * len(self.rev_dict["text"])
            self.rev_dict.clear()
            self.stack.pop()
            self.isMinor = "0"
            self.inlinks = []
            self.outlinks = []
            self.trans = []
            self.sections = []
            self.highwords = []
            self.special = []
            self.rev_num += 1
            if self.options.verbose and self.options.log is None:
                # Display status report
                if self.rev_num % 1000 == 0:
                    self.timeCheck = datetime.datetime.now()
                    self.timeDelta = self.timeCheck - self.start
                    if self.timeDelta.seconds == 0:
                        print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)" % (
                            self.page_num,
                            1e6 * float(self.page_num) / self.timeDelta.microseconds,
                            self.rev_num,
                            1e6 * float(self.rev_num) / self.timeDelta.microseconds,
                        )
                        self.printfile = codecs.open(self.fileErrPath, "a", "utf_8")
                        self.printfile.write(
                            "page "
                            + str(self.page_num)
                            + " ("
                            + str(1e6 * float(self.page_num) / self.timeDelta.microseconds)
                            + " pags. per sec.), revision "
                            + str(self.rev_num)
                            + " ("
                            + str(1e6 * float(self.rev_num) / self.timeDelta.microseconds)
                            + " revs. per sec.)\n"
                        )
                        self.printfile.close()
                    else:
                        print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)" % (
                            self.page_num,
                            float(self.page_num) / self.timeDelta.seconds,
                            self.rev_num,
                            float(self.rev_num) / self.timeDelta.seconds,
                        )
                        self.printfile = codecs.open(self.fileErrPath, "a", "utf_8")
                        self.printfile.write(
                            "page "
                            + str(self.page_num)
                            + " ("
                            + str(float(self.page_num) / self.timeDelta.seconds)
                            + " pags./sec.), revision "
                            + str(self.rev_num)
                            + " ("
                            + str(float(self.rev_num) / self.timeDelta.seconds)
                            + " revs./sec.)\n"
                        )
                        self.printfile.close()
            if self.options.verbose and self.options.log is not None:
                if self.rev_num % 1000 == 0:
                    self.timeCheck = datetime.datetime.now()
                    self.timeDelta = self.timeCheck - self.start
                    if self.timeDelta.seconds == 0:
                        self.printfile = codecs.open(self.options.log, "a", "utf_8")
                        self.printfile.write(
                            "page "
                            + str(self.page_num)
                            + " ("
                            + str(1e6 * float(self.page_num) / self.timeDelta.microseconds)
                            + " pags./sec.), revision "
                            + str(self.rev_num)
                            + " ("
                            + str(1e6 * float(self.rev_num) / self.timeDelta.microseconds)
                            + " revs./sec.)\n"
                        )
                        self.printfile.close()
                    else:
                        self.printfile = codecs.open(self.options.log, "a", "utf_8")
                        self.printfile.write(
                            "page "
                            + str(self.page_num)
                            + " ("
                            + str(float(self.page_num) / self.timeDelta.seconds)
                            + " pags./sec.), revision "
                            + str(self.rev_num)
                            + " ("
                            + str(float(self.rev_num) / self.timeDelta.seconds)
                            + " revs./sec.)\n"
                        )
                        self.printfile.close()
Beispiel #33
0
 def overall(self):
     """
     Preprocessing tables for evolution of page length over time.

     For every language, (re)creates views over main-namespace non-redirect
     pages and their revisions, then for each year from the first full year
     after the earliest revision up to 2008 builds a max_rev_<year> table
     holding the latest revision id per page before that year.

     FIX: the per-year statements were dedented out of the inner ``for``
     loop (an IndentationError as written); the intended nesting is
     reconstructed so drop/create/alter run once per year.
     """
     for self.language in self.languages:
         self.dbname = "wx_" + self.language + "wiki_research"
         self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
         dbaccess.raw_query_SQL(self.access[1], "create or replace view page_main_nored as " +
                                "(select page_id from page where page_namespace=0 and page_is_redirect=0)")
         dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_main_nored as (" +
                                "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in " +
                                "(select page_id from page_main_nored))")
         # Years from the first full year after the earliest revision up to 2008
         self.minyear = dbaccess.raw_query_SQL(self.access[1], "select min(year(rev_timestamp)) from revision")
         self.years = range(int(self.minyear[0][0]) + 1, 2009)
         for self.year in self.years:
             dbaccess.raw_query_SQL(self.access[1], "drop table if exists max_rev_" + str(self.year))
             dbaccess.raw_query_SQL(self.access[1], "create table max_rev_" + str(self.year) +
                                    " as (select max(rev_id) as max_id, rev_page from rev_main_nored " +
                                    "where year(rev_timestamp)<" + str(self.year) + " group by rev_page)")
             dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_" + str(self.year) + " add primary key (max_id)")
             dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_" + str(self.year) + " add index (rev_page)")
         dbaccess.close_Connection(self.access[0])
Beispiel #34
0
 ################################################
 # We must write the last revinsert before finishing this page
 # NOTE(review): fragment of an endDocument()-style final flush routine;
 # the enclosing def is not visible here -- confirm against the original.
 if self.options.fileout:
     self.revinsert += ";\n"
     # Write output to SQL file
     self.revfile = codecs.open(self.options.revfile, 'a', 'utf_8')
     self.revfile.write(self.revinsert)
     self.revfile.close()
 elif self.options.streamout:
     # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
     self.revinsert += ";"
     print self.revinsert.encode('utf_8')
 elif self.options.monitor:
     # NOTE(review): unbounded retry -- loops forever on a persistent DB
     # error (other variants in this file give up after 5 attempts)
     while 1:
         try:
             dbaccess.raw_query_SQL(self.acceso[1],
                                    self.revinsert.encode('utf_8'))
         except (Exception), e:
             print e
         else:
             break
 # Reset status vars
 self.revinsertrows = 0
 self.revinsertsize = 0
 ################################################
 ## Same for the extended INSERT into the text table
 if self.options.fileout:
     self.textinsert += ";\n"
     # Write output to SQL file
     self.textfile = codecs.open(self.options.textfile, 'a',
                                 'utf_8')
     self.textfile.write(self.textinsert)
Beispiel #35
0
    def endElement(self, name):
        ##    Defining tasks to manage contents from the last readed tag
        ##        Catching the namespace of this page
        if name == 'namespace':
            self.nspace_dict[self.current_text] = self.codens

        elif name == 'id':
            if self.stack[-1] == 'contributor':
                ##Detecting contributor's attributes inside a revision
                self.rev_dict['rev_user'] = self.current_text
            elif self.stack[-1] == 'revision':
                self.rev_dict[name] = self.current_text
            elif self.stack[-1] == 'page':
                self.page_dict[name] = self.current_text
            else:
                self.f = open(self.fileErrPath, 'w')
                if len(self.stack) > 0:
                    self.f.write("Unsupported parent tag for '" + name +
                                 "': " + self.stack[-1])
                self.f.close()

        elif name == 'ip':
            self.rev_dict['rev_user'] = '******'
            self.rev_dict['username'] = self.current_text

        elif name == 'timestamp':
            ##Adequate formatting of timestamps
            self.rev_dict['timestamp'] = self.current_text.replace(
                'Z', '').replace('T', ' ')

        elif name == 'contributor':
            ##Pop contributor tag from the stack
            self.stack.pop()

        elif name == 'revision':
            self.rev_count += 1
            ##Store whether this is a redirect or stub page or not
            if len(self.rev_dict['text']) > 0:
                if string.upper(self.rev_dict['text'][0:9]) == '#REDIRECT':
                    self.isRedirect = '1'
                else:
                    self.isRedirect = '0'
            ## Takes from the first argument the threshold for stub's length
            if str(2 * len(self.rev_dict['text'])) <= self.options.stubth:
                self.isStub = '1'
            else:
                self.isStub = '0'

            ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (STANDARD VERSION)######
            ##Values order: (rev_id, rev_page, [[rev_text_id=rev_id]], rev_comment,
            ##rev_user, rev_user_text, rev_timestamp, rev_is_minor)
            # Build current row for revinsert
            newrevinsert = "(" + self.rev_dict['id'] + "," + self.page_dict[
                'id'] + "," + self.rev_dict['id']
            if self.rev_dict.has_key('comment'):
                newrevinsert += "," + '"' + self.rev_dict['comment'].replace(
                    "\\", "\\\\").replace("'", "\\'").replace('"', '\\"') + '"'
            else:
                newrevinsert += ",''"
            newrevinsert+=","+self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].\
            replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\
            '"'+","+'"'+self.rev_dict['timestamp']+\
            '"'+","+self.isMinor+")"
            if self.revinsertrows == 0:
                #Always allow at least one row in extended inserts
                self.revinsert = "INSERT INTO revision VALUES" + newrevinsert
                self.revinsertrows += 1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize = len(self.revinsert) * 2
            elif (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024) and\
            ((self.revinsertrows+1)<=self.options.imaxrows):
                #Append new row to self.revinsert
                self.revinsert += "," + newrevinsert
                self.revinsertrows += 1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize = len(self.revinsert) * 2
            else:
                #We must finish and write currrent insert and begin a new one
                if self.options.fileout:
                    self.revinsert += ";\n"
                    # Write output to SQL file
                    self.revfile = codecs.open(self.options.revfile, 'a',
                                               'utf_8')
                    self.revfile.write(revinsert)
                    self.revfile.close()
                elif self.options.streamout:
                    # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
                    self.revinsert += ";"
                    print self.revinsert.encode('utf_8')
                elif self.options.monitor:
                    while 1:
                        try:
                            dbaccess.raw_query_SQL(
                                self.acceso[1], self.revinsert.encode('utf_8'))
                        except (Exception), e:
                            print e
                        else:
                            break
                self.revinsert = "INSERT INTO revision VALUES" + newrevinsert
                self.revinsertrows = 1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.revinsertsize = len(self.revinsert) * 2

            ##################################################
            ##CONSTRUCTION OF EXTENDED INSERTS FOR TABLE TEXT
            ##Template for each row:
            ## (old_id, old_text, old_flags)
            newtextinsert = "(" + self.rev_dict['id'] + ',' + '"'
            if self.options.inject != None:
                newtextinsert += self.options.inject
            newtextinsert+=self.rev_dict['text'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\
            '",'+'"utf8")'
            if self.textinsertrows == 0:
                #Always allow at least one row in extended inserts
                self.textinsert = "INSERT INTO text VALUES" + newtextinsert
                self.textinsertrows += 1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.textinsertsize = len(self.textinsert) * 2
            elif (self.textinsertsize+(2*len(newtextinsert))<=self.options.imaxsize*1024) and\
            ((self.textinsertrows+1)<=self.options.imaxrows):
                #Append new row to self.revinsert
                self.textinsert += "," + newtextinsert
                self.textinsertrows += 1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.textinsertsize = len(self.textinsert) * 2
            else:
                #We must finish and write currrent insert and begin a new one
                if self.options.fileout:
                    self.textinsert += ";\n"
                    # Write output to SQL file
                    self.textfile = codecs.open(self.options.textfile, 'a',
                                                'utf_8')
                    self.textfile.write(textinsert)
                    self.textfile.close()
                elif self.options.streamout:
                    # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
                    self.textinsert += ";"
                    print self.textinsert.encode('utf_8')
                elif self.options.monitor:
                    while 1:
                        try:
                            dbaccess.raw_query_SQL(
                                self.acceso[1],
                                self.textinsert.encode('utf_8'))
                        except (Exception), e:
                            print e
                        else:
                            break
                self.textinsert = "INSERT INTO text VALUES" + newtextinsert
                self.textinsertrows = 1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.textinsertsize = len(self.textinsert) * 2
Beispiel #36
0
    def analyze(self):
        #Initialize file header
        f=open("wkp_cox_prop_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts,in_talk,in_FAs\n")
        f.close()
        
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
	    
            print "Starting language "+self.language+"\n"
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            ##Create table of users in talk pages
            
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_in_talk")
            dbaccess.raw_query_SQL(self.access[1],"create table users_in_talk as (select distinct(rev_user) from revision "+\
            "where rev_page in (select page_id from page where page_namespace=1))")
            dbaccess.raw_query_SQL(self.access[1],"alter table users_in_talk add primary key (rev_user)")
            
            ##Create table of users in FAs
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_in_FAs")
            dbaccess.raw_query_SQL(self.access[1],"create table users_in_FAs as (select distinct(rev_user) from revision_FAs)")
            dbaccess.raw_query_SQL(self.access[1],"alter table users_in_FAs add primary key (rev_user)")
            
            ##MIX previous info with time_range_authors --> save result in new table time_range_cox
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists time_range_cox")
            dbaccess.raw_query_SQL(self.access[1],"create table time_range_cox as (select rev_user, "+\
            "date(min_ts) as min_ts, date(max_ts) as max_ts, "+\
            "case when rev_user in (select rev_user from users_in_talk) then 1 else 0 end as in_talk, "+\
            "case when rev_user in (select rev_user from users_in_FAs) then 1 else 0 end as in_FAs "+\
            "from time_range_authors)")
	    
            ##IN SYSTEM
            print "Interm. tables created proceeding to write out data..."+self.language+"\n"
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, min_ts, max_ts, in_talk, in_FAs "+\
            "from time_range_cox "+\
            " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
	    
            f=open("wkp_cox_prop_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\","+\
                str(int(result[3]))+","+str(int(result[4]))+"\n")
            f.close()
            print "Finished all tasks for "+self.language+"\n"
Beispiel #37
0
 def decompress (self):
     """
     Decompress the DB dumps and load them into MySQL.

     Pipes the 7zip page/revision XML dump through the SAX parser matching
     self.dumptype (dump_sax_research.py or dump_sax.py), then gunzips,
     loads and re-gzips every auxiliary SQL dump in self.files[1:], and
     finally creates the indexes on tables page and revision needed by the
     later quantitative analysis.

     Returns 0 on success, -1 on any error.
     """
     if self.dumptype=="research":
         program="dump_sax_research.py"
     elif self.dumptype=="standard":
         program="dump_sax.py"
     else:
         print "Error! Unexpected type of dump received"
         return -1
     #Target database name, built once instead of re-concatenating it below
     dbname="wx_"+self.language+"_"+self.dumptype
     self.filename=self.filenameTemplate.safe_substitute(language=self.language,file=self.files[0])
     #Stream the decompressed XML dump straight into our parser (no temp file)
     command_7z="7za e -so dumps/"+self.filename+" | "+"python "+program+\
     " -u "+self.msqlu+" -p "+self.msqlp+" -d "+dbname+\
     " --log "+self.language+".log"
     success=os.system(command_7z)
     if success == 0:
         print "DB "+dbname+" successfully decompressed...\n\n"
     else:
         print "Error! There was an error trying to decompress database --> "+dbname
         return -1
     #Loading into MySQL other interesting tables directly provided in SQL format
     #SQL code to generate the tables is embedded in the SQL file itself
     for index in range(1,len(self.files)):
         self.filename=self.filenameTemplate.safe_substitute(language=self.language, file=self.files[index])
         #BUGFIX: str.strip(".gz") strips ANY of the characters '.', 'g', 'z'
         #from BOTH ends (a char-set strip, not a suffix strip), mangling
         #names such as "gawiki-...". Remove the literal ".gz" suffix instead.
         if self.filename.endswith(".gz"):
             plainname=self.filename[:-len(".gz")]
         else:
             plainname=self.filename
         command_gzip="gzip -d dumps/"+self.filename
         command_mysql="mysql -u "+self.msqlu+" -p"+self.msqlp+\
         " "+dbname+\
         " < dumps/"+plainname
         command_comp="gzip dumps/"+plainname
         print "Decompressing "+self.filename+"..."
         success=os.system(command_gzip)
         if success==0:
             print "Loading "+plainname+" into MySQL database..."
             success=os.system(command_mysql)
             if success==0:
                 print "Compressing again "+plainname+"..."
                 success=os.system(command_comp)
                 if success!=0:
                     print "Error compressing again "+plainname
                     return -1
             else:
                 print "Error loading "+plainname
                 return -1
         else:
             print "Error decompressing "+self.filename
             return -1
     print "Generating indexes for tables page and revision...\n"
     print "Depending on the dump size this may take a while...\n"
     acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu,\
     self.msqlp, dbname)
     #Generate adequate indexes and keys in tables page and revision;
     #these back the per-page/per-user/per-timestamp queries run later
     print "Generating index for page_len...\n"
     dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE page ADD INDEX (page_len)")
     print "Modifying rev_timestamp to support DATETIME and creating index...\n"
     dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision MODIFY rev_timestamp DATETIME")
     dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX timestamp (rev_timestamp)")
     print "Generating index for rev_page and rev_timestamp...\n"
     dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX page_timestamp(rev_page, rev_timestamp)")
     print "Generating index for rev_user and rev_timestamp...\n"
     dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)")
     print "Generating index for rev_user_text and timestamp...\n"
     dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)")
     dbaccess.close_Connection(acceso[0])
     print "Database ready for quantitative analysis...\n"
     print "Let's go on... Cross your fingers... ;-) \n\n\n"
     return success
Beispiel #38
0
     # NOTE(review): truncated fragment -- the enclosing "def" line and the
     # opening try/except this print belongs to are outside this excerpt.
     print "An exception ocurred, the problem was the following:\n"
     print e
     print "*************\n\n"
 # Each index is created in its own try/except so that one failure
 # (e.g. index already existing) does not abort the remaining ones;
 # errors are only printed, then execution continues.
 try:
     print "Generating index for rev_page and rev_timestamp...\n"
     dbaccess.raw_query_SQL(
         self.access[1], "ALTER TABLE revision ADD INDEX page_timestamp(rev_page, rev_timestamp)"
     )
 except Exception, e:
     print "An exception ocurred, the problem was the following:\n"
     print e
     print "*************\n\n"
 try:
     print "Generating index for rev_user and rev_timestamp...\n"
     dbaccess.raw_query_SQL(
         self.access[1], "ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)"
     )
 except Exception, e:
     print "An exception ocurred, the problem was the following:\n"
     print e
     print "*************\n\n"
 # Index on rev_user_text deliberately left disabled (commented out below).
 # try:
 # print "Generating index for rev_user_text and timestamp...\n"
 # dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)")
 # except Exception, e:
 # print "An exception ocurred, the problem was the following:\n"
 # print e
 # print "*************\n\n"
 # NOTE(review): the message below is missing separator spaces/underscores --
 # it prints e.g. "Databasewx_enwiki_research ready...". Left unchanged here.
 print "Database" + "wx_" + self.language + "wiki_" + self.dumptype + " ready for quantitative analysis...\n"
 ##Close connection to DB server
 dbaccess.close_Connection(self.access[0])
Beispiel #39
0
    def core_prepro(self):
        """
        Creates intermediate tables with info about core members (by activity
        and by top % of total number of revisions).

        For each language DB ("wx_<lang>wiki_research") this builds:
          - core_limits_monthly: per (year, month) totals of distinct logged
            users and of revisions in revision_logged;
          - users_core_monthly: per-month membership of the "activity" core
            (top-10% most active authors of that month);
          - users_rev_core_monthly: per-month membership of the "revisions"
            core (smallest author set accumulating 10% of the month's revs);
          - users_core / users_rev_core: per-user aggregation with first/last
            timestamps both overall (min_ts/max_ts, from time_range_users)
            and inside the core (min_ts_core/max_ts_core).
        """
        # NOTE: binds the loop variable to self.language so other methods see
        # the language currently being processed.
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)

            #Obtain the list of years and months, with total num. of revisions and total num of logged users
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists core_limits_monthly")
            dbaccess.raw_query_SQL(self.access[1],"create table core_limits_monthly as "+\
            "(select year(rev_timestamp) as year, month(rev_timestamp) as month, "+\
            "count(distinct(rev_user)) num_users, count(*) num_revs from revision_logged group by year, month "+\
            "order by year, month)")
            
            print "Created table core_limits_monthly "+self.language+"\n"
            
            #One row per (year, month): [year, month, num_users, num_revs]
            date_range=dbaccess.raw_query_SQL(self.access[1],"select * from core_limits_monthly "+\
            "order by year, month")
            
            #Core users: top-10% of total number of authors in that month
            #Core users with top-10% of total number of revisions in that month
            
            #Loop for each month
            #First month creates the tables; following months insert into them
            need_create=True
            #LOOP FOR EACH MONTH IN LANG
            for adate in date_range:
                print "Processing year "+str(adate[0])+" month "+str(adate[1])+"\n"
                total_users=adate[2] #Total number of authors in that month
                total_revs=adate[3] #Total number of revisions in that month
                # To take the core of top-10% most active authors in that month
                limit_auth=int(round(total_users*0.1))+1
                # To take the core of authors responsible for top-10% of tot num.revs in that month
                limit_revs=int(round(total_revs*0.1))
                count_users=0
                count_revs=0
                insert_users=True
                insert_revs=True
                    
                #Get the list of active logged users for that month (descendent order!)
                ##IMPORTANT NOTE: FIRST APPLY SUBQUERY TO FILTER ALL REVISIONS IN THIS MONTH
                ##THEN APPLY THE GROUP AND ORDER CLAUSES ON THAT SUBQUERY
                ##THIS WAY, WE SAVE **A LOT** OF TIME DURING THIS PREPROCESSING STAGE
                month_users=dbaccess.raw_query_SQL(self.access[1],"select rev_user, count(*) num_revs_month from "+\
                "(select rev_user, rev_timestamp from revision_logged where "+\
                "year(rev_timestamp)="+str(int(adate[0]))+" and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                "order by num_revs_month desc" )
                
                #Calculate num. of authors accumulating top-10% of revs in that month
                #(walk the descending list until 10% of the month's revisions is reached)
                for auser in month_users:
                    count_revs=count_revs+int(auser[1])
                    count_users=count_users+1
                    if (count_revs>limit_revs):
                        break
                    
                if (need_create):
                    #TABLE: Monthly info for users in core (by activity)
                    dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_core_monthly")
                    dbaccess.raw_query_SQL(self.access[1],"create table users_core_monthly as (select "+\
                    "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                    "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                    "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                    " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                    "order by num_revs_month desc limit "+str(limit_auth)+")" )
                    
                    #TABLE: Monthly info for users in core (by revisions)
                    dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_rev_core_monthly")
                    dbaccess.raw_query_SQL(self.access[1],"create table users_rev_core_monthly as (select "+\
                    "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                    "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                    "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                    " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                    "order by num_revs_month desc limit "+str(count_users)+")" )
                            
                    print "Created tables monthly data for "+self.language+"\n"
                    need_create=False
                
                else:
                    #Insert info in table with monthly info for users in core (by activity)
                    dbaccess.raw_query_SQL(self.access[1],"insert into users_core_monthly (select "+\
                    "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                    "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                    "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                    " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                    "order by num_revs_month desc limit "+str(limit_auth)+")" )
                    
                    #Insert info in table with monthly info for users in core (by revisions)
                    dbaccess.raw_query_SQL(self.access[1],"insert into users_rev_core_monthly (select "+\
                    "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                    "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                    "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                    " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                    "order by num_revs_month desc limit "+str(count_users)+")" )
                    
                print "Inserted monthly data for "+self.language+"\n"
                ####NOTE: WE ARE SUPPOSING THAT USERS DOES NOT LEAVE THE CORE SUBSEQUENTLY, TO COME BACK AGAIN, i.e.
                ####ONCE THEY JOIN THE CORE, WE ASSUME THAT THE DEFINITELY LEAVE IT AT max_ts_core
                ####BY THE MOMENT, WE WILL STICK TO THIS ASSUMPTION. LATER ON, WE CAN SEE HOW TO IDENTIFY BLANK PERIODS
            
            #Insert in table of core users values
            #users_core = top-10% most active authors in each month
            #users_rev_core = authors accumulating top-10% of tot. num. of revs. in that month
            
            #TABLE: Users in core by activity (user, min_ts, max_ts, min_ts_core, max_ts_core)
            print "Creating table users_core for "+ self.language+"\n"
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_core")
            dbaccess.raw_query_SQL(self.access[1], "create table users_core as (select x.*, "+\
            "(select min_ts from time_range_users r where r.rev_user=x.rev_user) min_ts, (select max_ts from "+\
            "time_range_users s where s.rev_user=x.rev_user) max_ts "+\
            "from (select rev_user, min(lower_ts_month) min_ts_core, "+\
            "max(upper_ts_month) max_ts_core from users_core_monthly group by rev_user) x)")
            
            #TABLE: Users in core by activity (user, min_ts, max_ts, min_ts_core, max_ts_core)
            print "Creating table users_rev_core for "+ self.language+"\n"
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_rev_core")
            dbaccess.raw_query_SQL(self.access[1], "create table users_rev_core as (select x.*, "+\
            "(select min_ts from time_range_users r where r.rev_user=x.rev_user) min_ts, (select max_ts from "+\
            "time_range_users s where s.rev_user=x.rev_user) max_ts "+\
            "from (select rev_user, min(lower_ts_month) min_ts_core, "+\
            "max(upper_ts_month) max_ts_core from users_rev_core_monthly group by rev_user) x)")
            
            print "All core_prepro tasks finished for"+ self.language+"\n"

            #Close DB connection
            dbaccess.close_Connection(self.access[0])
Beispiel #40
0
            # NOTE(review): truncated fragment -- the enclosing "def" and the
            # preceding try/except are outside this excerpt. It also mixes
            # tabs and spaces for indentation (IndentationError/TabError risk).
            print "*************\n\n"
	
        try:
            print "Retrieving list of logged users..."
            #Logged users only (rev_user!=0), excluding members of the 'bot' group
            users=dbaccess.raw_query_SQL(self.access[1],"select distinct(rev_user) from revision where rev_user!=0 "+\
	    "and rev_user not in (select ug_user from user_groups where ug_group='bot')")
        except Exception, e:
            print "An exception ocurred, the problem was the following:\n"
            print e
            print "*************\n\n"
	print "Composing lag info, and inserting in db table...\n"
	for user in users:
	  history=[]
	  try:
	      print "User "+str(int(user[0]))+"..."
	      # NOTE(review): the "******" below looks like a redacted interpolation
	      # of the user id (e.g. +str(int(user[0]))+) -- invalid syntax as-is;
	      # restore before running.
	      history=dbaccess.raw_query_SQL(self.access[1],"select rev_user, rev_timestamp from revision "+\
	      "where rev_user="******" order by rev_timestamp")
	  except Exception, e:
	      print "An exception ocurred in user processing, the problem was the following:\n"
	      print e
	      print "*************\n\n"
	  # It only makes sense to insert information if there are at least 2 editions for a certain user
	  # NOTE(review): length() is not a builtin -- presumably len(history); confirm.
	  if length(history)>1:
	    j=0
	    result=[]
	    query=""
	    #Pair each revision timestamp with the next one (inter-edit lag tuples)
	    for item in history:
		if (j+1)<len(history):
		    result.append((item[0], item[1], history[j+1][1]))
		    j=j+1
	    k=0
	    for item in result:
Beispiel #41
0
    def surv_files(self):
        """
        Creates all data files used as input for demography scripts in GNU R
        """
        #Initialize all files headers
        #FILE: Survival data for all users (including editors out of MAIN)
        f=open("wkp_surv_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        #FILE: Survival data for all logged users who edited in MAIN
        f=open("wkp_surv_main_all.dat",'w')
        f.write("Project,rev_user,min_ts,max_ts\n")
        f.close()
        #FILE: Survival data for all logged editors until they join the core (activity)
        f=open("wkp_surv_join_core_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        #FILE: Survival data for logged editors since they join the core until they leave it (activity)
        f=open("wkp_surv_in_core_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        #FILE: Survival data for loged editors since they leave the core until death (activity)
        f=open("wkp_surv_core_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
        #FILE: Survival data for all logged editors until they join the core (revisions)
        f=open("wkp_surv_join_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts,min_ts_core\n")
        f.close()
        #FILE: Survival data for logged editors since they join the core until they leave it (revisions)
        f=open("wkp_surv_in_core_rev_all.dat",'w')
        f.write("Project,rev_user,min_ts_core,max_ts_core\n")
        f.close()
        #FILE: Survival data for loged editors since they leave the core until death (revisions)
        f=open("wkp_surv_core_rev_to_max_ts_all.dat",'w')
        f.write("Project,rev_user,max_ts_core,max_ts\n")
        f.close()
            
        for self.language in self.languages:
            self.dbname="wx_"+self.language+"wiki_research"
        
            print "Starting language "+self.language+"\n"
            ##IN SYSTEM
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_authors "+\
            " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()

            ##IN MAIN
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_users ")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_main_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished core users by activity for language "+self.language+"\n"

            ###########################
            ##REV CORE
            ##JOIN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_join_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##IN CORE
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_in_core_rev_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            ##CORE TO DEATH
            self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
            results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_rev_core")
            #Close DB connection
            dbaccess.close_Connection(self.access[0])
            
            f=open("wkp_surv_core_rev_to_max_ts_all.dat",'a')
            for result in results:
                f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
            f.close()
            
            print "Finished all surv_file tasks for "+self.language+"\n"
Beispiel #42
0
    def decompress (self):
        """
        Decompress the DB dumps into MySQL

        Pipes the 7zip page/revision XML dump through the SAX parser matching
        self.dumptype and then creates the indexes on tables page and revision
        used by the later analysis queries. In this variant the loading of the
        auxiliary gzipped SQL dumps is disabled (commented out below).
        Returns 0 on success, -1 on error.
        """
        ##TODO: ad-hoc override, remember to remove this!
        ##self.filename="mtwiki-latest-pages-meta-history.xml.7z" 
        if self.dumptype=="research":
            program="dump_sax_research.py"
        elif self.dumptype=="standard":
            program="dump_sax.py"
        else:
            print "Error! Unexpected type of dump received"
            return -1
        self.filename=self.filenameTemplate.safe_substitute(language=self.language,file=self.files[0])
        #Then we call our parser "dump_sax_research.py" to load data into MySQL
        #(streams the decompressed XML through a pipe; no temporary file)
        command_7z="7za e -so dumps/"+self.filename+" | "+"python "+program+\
        " -u "+self.msqlu+" -p "+self.msqlp+" -d "+"wx_"+self.language+"_"+self.dumptype+\
        " --log "+self.language+".log"
        success=os.system(command_7z)
        if success == 0:
            print "DB "+"wx_"+self.language+\
            self.dumptype+" successfully decompressed...\n\n"
        else:
            print "Error! There was an error trying to decompress database --> "+\
            "wx_"+self.language+self.dumptype
            return -1
        #Loading into MySQL other interesting tables directly provided in SQL format
        #SQL code to generate the tables is embedded in the SQL file itself
        #NOTE(review): the loop below is deliberately disabled in this variant;
        #also, the rstrip(".gz") calls inside it strip a character SET, not the
        #".gz" suffix -- fix before re-enabling.
##        for index in range(1,len(self.files)):
##            self.filename=self.filenameTemplate.safe_substitute(language=self.language, file=self.files[index])
##            command_gzip="gzip -d dumps/"+self.filename
##            command_mysql="mysql -u "+self.msqlu+" -p"+self.msqlp+\
##            " wx_"+self.language+"_"+self.dumptype+\
##            " < dumps/"+self.filename.rstrip(".gz")
##            command_comp="gzip dumps/"+self.filename.rstrip(".gz")
##            print "Decompressing "+self.filename+"..."
##            success=os.system(command_gzip)
##            if success==0:
##                print "Loading "+self.filename.rstrip(".gz")+" into MySQL database..."
##                success=os.system(command_mysql)
##                if success==0:
##                    print "Compressing again "+self.filename.rstrip(".gz")+"..."
##                    success=os.system(command_comp)
##                    if success!=0:
##                        print "Error compressing again "+self.filename.rstrip(".gz")
##                        return -1
##                else:
##                    print "Error loading "+self.filename.rstrip(".gz")
##                    return -1
##            else:
##                print "Error decompressing "+self.filename
##                return -1
        print "Generating indexes for tables page and revision...\n"
        print "Depending on the dump size this may take a while...\n"
        acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu,\
        self.msqlp, "wx_"+self.language+"_"+self.dumptype)
        #Generate adequate indexes and keys in tables page and revision
        print "Generating index for page_len...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE page ADD INDEX (page_len)")
        print "Modifying rev_timestamp to support DATETIME and creating index...\n"
        #NOTE(review): unlike the sibling decompress(), the MODIFY to DATETIME
        #is disabled here -- the print above is therefore misleading.
        #dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision MODIFY rev_timestamp DATETIME")
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX timestamp (rev_timestamp)")
        print "Generating index for rev_page and rev_timestamp...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX page_timestamp(rev_page, rev_timestamp)")
        print "Generating index for rev_user and rev_timestamp...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)")
        print "Generating index for rev_user_text and timestamp...\n"
        dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)")
        dbaccess.close_Connection(acceso[0])
        print "Database ready for quantitative analysis...\n"
        print "Let's go on... Cross your fingers... ;-) \n\n\n"
        return success
Beispiel #43
0
                #We must finish and write currrent insert and begin a new one
                if self.options.fileout:
                    self.loginsert+=";\n"
                    # Write output to SQL file
                    self.revfile = codecs.open(self.options.sqlfile,'a','utf_8')
                    self.revfile.write(self.loginsert)
                    self.revfile.close()
                elif self.options.streamout:
                    # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL
                    self.loginsert+=";"
                    print self.loginsert.encode('utf_8')
                elif self.options.monitor:
                    chances=0
                    while chances<5:
                        try:
                            dbaccess.raw_query_SQL(self.acceso[1], self.loginsert.encode('utf_8'))
                        except (Exception), e:
                            self.printfile = codecs.open("error_"+self.options.database,'a','utf_8')
                            self.printfile.write(str(e)+"\n")
                            self.printfile.write(self.loginsert[0:30]+"\n**********************************")
                            self.printfile.close()
                            chances+=1
                        else:
                            break
                self.loginsert="INSERT INTO logging VALUES"+newloginsert
                self.loginsertrows=1
                #Conservative approach: assuming 2 bytes per UTF-8 character
                self.loginsertsize=len(self.loginsert)*2

            ##################################################