def endDocument(self):
    """SAX callback invoked once the whole XML dump has been parsed.

    Flushes the last pending extended INSERT for the revision table
    through the output channel selected in self.options:
      * fileout   -- append the SQL to self.options.revfile
      * streamout -- print UTF-8 encoded SQL to stdout (piped to mysql)
      * monitor   -- execute directly against MySQL, retrying up to 5
                     times and logging failures to "error_<database>"
    """
    ################################################
    # We must write the last revinsert before finishing this page
    if self.options.fileout:
        self.revinsert += ";\n"
        # Write output to SQL file
        self.revfile = codecs.open(self.options.revfile, "a", "utf_8")
        self.revfile.write(self.revinsert)
        self.revfile.close()
    elif self.options.streamout:
        # DON'T WRITE SQL TO FILES, GENERATE ENCODED SQL STREAM FOR MYSQL
        self.revinsert += ";"
        print self.revinsert.encode("utf_8")
    elif self.options.monitor:
        # Execute the INSERT directly; on failure append the error and
        # the first 30 chars of the offending statement to the error
        # log, then retry (at most 5 attempts in total).
        chances = 0
        while chances < 5:
            try:
                dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode("utf_8"))
            except (Exception), e:
                self.printfile = codecs.open("error_" + self.options.database, "a", "utf_8")
                self.printfile.write(str(e) + "\n")
                self.printfile.write(self.revinsert[0:30] + "\n**********************************")
                self.printfile.close()
                chances += 1
            else:
                break
def endDocument(self): ################################################ #We must write the las revinsert before finishing this page if self.options.fileout: self.revinsert+=";\n" # Write output to SQL file self.revfile = codecs.open(self.options.revfile,'a','utf_8') self.revfile.write(self.revinsert) self.revfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.revinsert+=";" print self.revinsert.encode('utf_8') elif self.options.monitor: chances=0 while chances<5: try: dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf_8')) except (Exception), e: self.printfile = codecs.open("error_"+self.options.database,'a','utf_8') self.printfile.write(str(e)+"\n") self.printfile.write(self.revinsert[0:30]+"\n**********************************") self.printfile.close() chances+=1 else: break
def printSql(sqlquery, csvfile, header):
    """Run *sqlquery* against the module-level DB connection and dump
    the result set to *csvfile* as CSV.

    sqlquery -- SQL string executed through dbaccess.raw_query_SQL
    csvfile  -- path of the output file (overwritten)
    header   -- sequence of column names written as the first row
    """
    # Bug fix: the original passed an anonymous open() to csv.writer and
    # never closed the handle.  'wb' is the documented mode for the csv
    # module under Python 2 (prevents spurious blank lines on \r\n
    # platforms).
    fileobj = open(csvfile, "wb")
    try:
        writer = csv.writer(fileobj)
        writer.writerow(header)
        # NOTE(review): relies on the module-level 'acceso' connection
        # pair being initialized before this is called.
        data = dbaccess.raw_query_SQL(acceso[1], sqlquery)
        for row in data:
            writer.writerow(row)
    finally:
        fileobj.close()
def ratios(self):
    """.dat files showing interesting descriptive ratios.

    Produces, for every language in self.languages:
      * a logged-editors / user-pages ratio row, and
      * an articles / talk-pages ratio row (redirects excluded).
    """
    # FILE author-pages.dat: ratio no. logged editors / no. user pages
    # NOTE(review): the header is written to editors-userpages.dat but
    # the data rows below are appended to author-pages.dat -- these look
    # like they should be the same file; confirm the intended target
    # before unifying.  (Local renamed from 'file' so the builtin is not
    # shadowed.)
    datafile = open("overall/data/editors-userpages.dat", 'w')
    datafile.write("logged_authors\tuser_pages\tratio\tlang\n")
    datafile.close()
    for self.language in self.languages:
        self.dbname = "wx_" + self.language + "wiki_research"
        self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        # Obtain number of different logged authors
        self.logged_authors = dbaccess.raw_query_SQL(self.access[1],
            "select count(distinct(rev_user)) from " +
            "revision where rev_user!=0")
        # Obtain number of different user pages (nspace = 2)
        self.user_pages = dbaccess.raw_query_SQL(self.access[1],
            "select count(distinct(page_id)) from " +
            "page where page_namespace=2")
        dbaccess.close_Connection(self.access[0])
        # Append one row per language: authors, user pages, ratio, lang
        datafile = open("overall/data/author-pages.dat", 'a')
        datafile.write(str(int(self.logged_authors[0][0])) + "\t" + str(int(self.user_pages[0][0])) + "\t" +
            str(float(self.user_pages[0][0]) / float(self.logged_authors[0][0])) + "\t" + self.language + "\n")
        datafile.close()
    # FILE articles-talk-ratio.dat: ratio no. articles / no. talk pages
    # (excluding redirects)
    datafile = open("overall/data/articles-talk-ratio.dat", 'w')
    datafile.write("articles\ttalk\tratio\tlang\n")
    datafile.close()
    for self.language in self.languages:
        self.dbname = "wx_" + self.language + "wiki_research"
        self.access = dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        # Obtain number of articles excluding redirects
        self.articles = dbaccess.raw_query_SQL(self.access[1],
            "select count(distinct(page_id)) from " +
            "page where page_namespace=0 and page_is_redirect=0")
        # Obtain number of talk pages
        self.talk = dbaccess.raw_query_SQL(self.access[1],
            "select count(distinct(page_id)) from " +
            "page where page_namespace=1")
        dbaccess.close_Connection(self.access[0])
        # Append one row per language: articles, talk pages, ratio, lang
        datafile = open("overall/data/articles-talk-ratio.dat", 'a')
        datafile.write(str(int(self.articles[0][0])) + "\t" + str(int(self.talk[0][0])) + "\t" +
            str(float(self.talk[0][0]) / float(self.articles[0][0])) + "\t" + self.language + "\n")
        datafile.close()
def make_indexes(self): self.access = dbaccess.get_Connection( "localhost", 3306, self.user, self.passw, "wx_" + self.language + "wiki_" + self.dumptype ) # Generate adequate indexes and keys in tables page and revision # try: # print "Generating index for page_len...\n" # dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE page ADD INDEX page_len(page_len)") # except Exception, e: # print "An exception ocurred, the problem was the following:\n" # print e # print "*************\n\n" try: print "Creating index for rev_timestamp" dbaccess.raw_query_SQL(self.access[1], "ALTER TABLE revision ADD INDEX timestamp(rev_timestamp)") except Exception, e: print "An exception ocurred, the problem was the following:\n" print e print "*************\n\n"
def overall(self): """ Preprocessing tables for evolution of page length over time """ file=open("author-pages.dat",'w') file.write("logged_authors\tuser_pages\tratio\tlang\n") file.close() for self.language in self.languages: self.dbname="wx_"+self.language+"wiki_research" self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) self.logged_authors=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(rev_user)) from "+\ "revision where rev_user!=0") self.user_pages=dbaccess.raw_query_SQL(self.access[1], "select count(distinct(page_id)) from "+\ "page where page_namespace=2") dbaccess.close_Connection(self.access[0]) file=open("author-pages.dat",'a') file.write(str(int(self.logged_authors[0][0]))+"\t"+str(int(self.user_pages[0][0]))+"\t"+\ str(float(self.user_pages[0][0])/float(self.logged_authors[0][0]))+"\t"+self.language+"\n") file.close() print "Completed lang "+self.language+"\n"
def calculate(self): self.access = dbaccess.get_Connection("localhost", 3306, self.user,\ self.passw, "wx_"+self.language+"wiki_"+self.dumptype) try: print "Creating table for logged users..." users=dbaccess.raw_query_SQL(self.access[1],"create table lag_info (rev_user INT(10) UNSIGNED NOT NULL,"+\ "fecha1 datetime not null, fecha2 datetime not null)") except Exception, e: print "An exception ocurred, the problem was the following:\n" print e print "*************\n\n"
def endDocument(self): ################################################ #We must write the last pageinsert before finishing this dump if self.options.fileout: # Write output to SQL file self.pageinsert+=";\n" self.pagefile = codecs.open(self.options.pagefile,'a','utf_8') self.pagefile.write(self.pageinsert) self.pagefile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.pageinsert+=";" print self.pageinsert.encode('utf-8') elif self.options.monitor: while 1: try: dbaccess.raw_query_SQL(self.acceso[1], self.pageinsert.encode('utf-8')) except (Exception), e: print e else: break
def commitsPerPeriodPerCommiter(self):
    """Print lines to a file, each line representing:
    period commiter commits
    """
    filehand = open(self.dataPath + "commits_per_period_per_commiter", "w")
    result = dbaccess.raw_query_SQL(self.acceso[1], "select * from contribs_period_author_" + self.language)
    for row in result:  # 0:period, 1:commiter, 2:commits
        # Bug fix: DB columns come back as numbers; coerce with str()
        # before concatenating (the sibling commitsPerPeriod already
        # does this) instead of raising TypeError.
        filehand.write(str(row[0]) + " " + str(row[1]) + " " + str(row[2]) + "\n")
    filehand.close()
def __init__(self, language="furwiki", dumptype="research", msqlu="", msqlp=""):
    """
    It receives the language and dumptype to download.

    NOTE(review): the original docstring claims an int return (0 on
    success, -1 on error), but __init__ cannot return a value; errors
    are only printed below.
    """
    self.language=language #language to download
    self.dumptype=dumptype #type of dump
    # Dump files fetched from the Wikimedia servers for this language.
    self.files=["pages-meta-history.xml.7z", "redirect.sql.gz","page_restrictions.sql.gz",\
    "user_groups.sql.gz", "logging.sql.gz", "interwiki.sql.gz", "langlinks.sql.gz", "externallinks.sql.gz",\
    "templatelinks.sql.gz", "imagelinks.sql.gz", "categorylinks.sql.gz", "pagelinks.sql.gz", "oldimage.sql.gz",\
    "image.sql.gz"]
    self.filename=""
    self.filenameTemplate=string.Template("""$language-latest-$file""") #dump's filename in Wikimedia's server
    #URL to download the file
    self.urld=""
    self.urldTemplate=string.Template("""http://download.wikimedia.org/$language/latest/$language-latest-$file""")
    if (msqlu=="" or msqlp==""):
        print "Error initializing DB dump object. You must provide a valid MySQL username and password"
    else:
        # NOTE(review): the nesting below was reconstructed from a
        # whitespace-mangled source; the whole DB-initialization path is
        # assumed to run only when credentials were supplied (otherwise
        # self.msqlu/self.msqlp would be undefined) -- confirm.
        self.msqlu=msqlu #MySQL username for accessing and editing the DB
        self.msqlp=msqlp #MySQL password
        #We can manage two different types of dumps, stubs (without the text of every revision)
        #and pages (containing the text of every revision)
        #self.urld="http://download.wikimedia.org/"+self.language+"/latest/"+\
        #self.language+"-latest-pages-meta-history.xml.7z" #File to download
        #patterns for files
        #http://download.wikimedia.org/furwiki/20060921/furwiki-20060921-pages-meta-history.xml.7z
        #http://download.wikimedia.org/amwiki/20061014/amwiki-20061014-stub-meta-history.xml.gz
        #Create /dumps directory if it does not exist yet
        directories=os.listdir("./")
        if ("dumps" not in directories):
            os.makedirs("./dumps")
        ## Initialize DB in MySQL: create DB and tables definitions
        print "Initializing DB for --> "+ self.language +"\n"
        #Retrieving connection and cursor to access the DB
        acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp,"mysql")
        dbaccess.createDB_SQL(acceso[1],"wx_"+self.language+"_"+self.dumptype)
        # Load the table definitions matching the dump flavour through
        # the mysql command-line client.
        if self.dumptype=="research":
            command="mysql -u "+self.msqlu+" -p"+self.msqlp+" " +\
            "wx_"+self.language+"_"+self.dumptype+" < tables_research.sql > debug_mysql.log"
        elif self.dumptype=="standard":
            command="mysql -u "+self.msqlu+" -p"+self.msqlp+" " +\
            "wx_"+self.language+"_"+self.dumptype+" < tables_standard.sql > debug_mysql.log"
        ok=os.system(command)
        if ok == 0:
            # Raise the row-count limits on the big tables so MyISAM can
            # hold a full history dump.
            acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu, self.msqlp,\
            "wx_"+self.language+"_"+self.dumptype)
            dbaccess.raw_query_SQL(acceso[1], "alter table page max_rows = 200000000000 avg_row_length = 50")
            dbaccess.raw_query_SQL(acceso[1], "alter table revision max_rows = 200000000000 avg_row_length = 50")
            if self.dumptype=="standard":
                dbaccess.raw_query_SQL(acceso[1], "alter table text max_rows = 200000000000 avg_row_length = 50")
            dbaccess.close_Connection(acceso[0])
        else:
            print "Error! There was a problem initializing definitions for DB tables"
            dbaccess.close_Connection(acceso[0])
def commitsCommiterAllPeriods(self, commiter, arrayCommits):
    """Accumulate one commiter's per-period commit counts into
    arrayCommits.

    arrayCommits must be indexable by period number and zeroed by the
    caller before the first call; successive calls keep adding.
    """
    query = ("select period, contribs from contribs_period_author_"
             + self.language + " where author=" + str(commiter)
             + " group by period")
    rows = dbaccess.raw_query_SQL(self.acceso[1], query)
    for period_raw, contribs_raw in rows:
        # Commit counts are accumulated per integer period index.
        arrayCommits[int(period_raw)] += int(contribs_raw)
def commitsPerPeriod(self):
    """Print lines to a file, each line representing:
    period commits commiters

    Also fills self.commitsPeriodDict (commits per period) and
    self.commitersPeriodDict (commiters per period).
    """
    filehand = open(self.dataPath + 'data_per_period', 'w')
    try:
        # Commits per period, as an array of rows
        commitsPeriod = dbaccess.raw_query_SQL(self.acceso[1],
            "select period, sum(contribs), count(DISTINCT(author))" +
            " from contribs_period_author_" + self.language + " group by period")
        for row in commitsPeriod:  # 0:period, 1:commits, 2:commiters
            filehand.write(str(row[0]) + ' ' + str(row[1]) + ' ' + str(row[2]) + '\n')
            self.commitsPeriodDict[int(row[0])] = int(row[1])
            self.commitersPeriodDict[int(row[0])] = int(row[2])
    finally:
        # Bug fix: the original never closed this file handle.
        filehand.close()
def general_stats(self): """ Preprocessing actions for general statistics scripts """ #FILE page_len.dat, with info about length of pages self.f=open("overall/data/page_len.dat", 'w') self.f.write("page_len\tns\tis_redirect\tis_stub\tis_new\tlang\n") self.f.close() for self.language in self.languages: self.dbname="wx_"+self.language+"wiki_research" self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) print "Retrieving info from "+self.language+"\n" results=dbaccess.raw_query_SQL(self.access[1], "SELECT page_len, page_namespace, page_is_redirect, page_is_stub, "+\ "page_is_new FROM page") print "Updating page_len info file with "+self.language+"\n" self.f=open("overall/data/page_len.dat", 'a') for result in results: self.f.write(str(int(result[0]))+"\t"+str(int(result[1]))+"\t"+str(int(result[2]))+"\t"+\ str(int(result[3]))+"\t"+str(int(result[4]))+"\t"+self.language+"\n") self.f.close() results=None dbaccess.close_Connection(self.access[0])
def commitsCommiterAllPeriods(self, commiter, arrayCommits):
    """Add the given commiter's commit count for every period into
    arrayCommits (zero the array before the first call; repeated calls
    accumulate).
    """
    sql = ("select period, contribs from contribs_period_author_%s"
           " where author=%s group by period") % (self.language, commiter)
    for entry in dbaccess.raw_query_SQL(self.acceso[1], sql):
        # entry: (period, contribs)
        idx = int(entry[0])
        arrayCommits[idx] = arrayCommits[idx] + int(entry[1])
def commitsPerPeriod(self):
    """Print lines to a file, each line representing:
    period commits commiters

    Also fills self.commitsPeriodDict (commits per period) and
    self.commitersPeriodDict (commiters per period).
    """
    out = open(self.dataPath + "data_per_period", "w")
    try:
        # One row per period: period, total commits, distinct commiters.
        rows = dbaccess.raw_query_SQL(
            self.acceso[1],
            "select period, sum(contribs), count(DISTINCT(author))"
            + " from contribs_period_author_"
            + self.language
            + " group by period",
        )
        for period, commits, commiters in rows:
            out.write(str(period) + " " + str(commits) + " " + str(commiters) + "\n")
            self.commitsPeriodDict[int(period)] = int(commits)
            self.commitersPeriodDict[int(period)] = int(commiters)
    finally:
        # Bug fix: the original leaked this file handle.
        out.close()
def analyze(self):
    """For each language DB, build summary tables of logged-author
    activity:

      * user_revs        -- revision count per logged, non-bot author
      * time_range_allns -- first/last revision timestamp per author
                            (all namespaces) plus its total num_revs
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Total no. of revisions made by every logged author
        #(rev_user!=0 drops anonymous edits; the subquery drops bots)
        dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS user_revs AS "+\
        "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND "+\
        "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
        dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")
        print "Created table user_revs for "+self.language+"wiki...\n"
        #Min and max timestamp for every logged author + total num_revs
        #(correlated subquery pulls num_revs from user_revs per author)
        dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS time_range_allns AS "+\
        "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM "+\
        "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x "+\
        "ORDER BY min_ts)")
        print "Created table time_range_allns for "+self.language+"wiki...\n"
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
def time_range(self):
    """ Creates intermediate tables with time frame of editors activity

    Per language DB this builds:
      * user_revs          -- revisions per logged, non-bot author
      * time_range_authors -- min/max revision timestamp per author
                              (all namespaces) plus total num_revs
      * revision_logged    -- view over rev_main_nored restricted to
                              logged, non-bot authors (main namespace)
      * time_range_users   -- min/max timestamp per author in main
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        ##### TIME RANGE FOR AUTHORS IN ALL NAMESPACES
        #TABLE: Total no. of revisions made by every logged author
        dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS user_revs AS "+\
        "SELECT rev_user, count(*) num_revs from revision WHERE rev_user!=0 AND "+\
        "rev_user not in (SELECT ug_user FROM user_groups WHERE ug_group='bot') GROUP BY rev_user")
        dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE user_revs ADD PRIMARY KEY (rev_user)")
        print "Created table user_revs for "+self.language+"wiki...\n"
        #TABLE: Min and max timestamp for every logged author + total num_revs
        dbaccess.raw_query_SQL(self.access[1],"CREATE TABLE IF NOT EXISTS time_range_authors AS "+\
        "(SELECT x.*, (select num_revs from user_revs d where d.rev_user=x.rev_user) num_revs FROM "+\
        "(SELECT rev_user, min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision group by rev_user) x "+\
        "ORDER BY min_ts)")
        print "Created table time_range_authors for "+self.language+"wiki...\n"
        ##### TIME RANGE FOR AUTHORS IN MAIN ONLY
        print "Processing language "+self.language+"\n"
        #VIEW: Create view for filtering annons and bots
        #Filter from rev_main_nored revisions from logged authors only
        dbaccess.raw_query_SQL(self.access[1],"create or replace view revision_logged as (select * from rev_main_nored "+\
        " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot') )")
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists time_range_users")
        #TABLE: Intermediate table, storing for each logged author the min and max ts in the system
        dbaccess.raw_query_SQL(self.access[1],"create table time_range_users as (SELECT rev_user, "+\
        "min(rev_timestamp) min_ts, max(rev_timestamp) max_ts from revision_logged group by rev_user)")
        dbaccess.raw_query_SQL(self.access[1],"alter table time_range_users add primary key (rev_user)")
        print "Created time_range_users for "+self.language +"\n"
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
def prepro_red_talk(self):
    """ Data and evolution for redirects and talk pages

    Per language DB this creates views page_redirect / rev_redirect
    (redirect pages and their revisions) and page_talk / rev_talk
    (talk-namespace pages and their revisions), plus one snapshot
    table max_rev_talk_YYYY per year holding the latest talk-page
    revision before that year.
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #VIEW page_redirect (pages with redirect flag activated)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view page_redirect as "+\
        "(select page_id from page where page_namespace=0 and page_is_redirect=1)")
        #VIEW rev_redirect (revisions corresponding to redirect pages)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_redirect as ("+\
        "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
        "(select page_id from page_redirect))")
        #VIEW page_talk (pages in talk nspace)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view page_talk as "+\
        "(select page_id from page where page_namespace=1)")
        #VIEW rev_talk (revisions corresponding to talk pages)
        dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_talk as ("+\
        "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
        "(select page_id from page_talk))")
        #TABLES max_rev_talk_YYYY (latest revision for each page in talk nspace, in year YYYY)
        #NOTE(review): the upper bound 2009 is hard-coded -- the dataset
        #presumably ends in 2008; update when processing newer dumps.
        self.minyear=dbaccess.raw_query_SQL(self.access[1],"select min(year(rev_timestamp)) from revision")
        self.years=range(int(self.minyear[0][0])+1, 2009)
        for self.year in self.years:
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists max_rev_talk_"+str(self.year))
            dbaccess.raw_query_SQL(self.access[1],"create table max_rev_talk_"+str(self.year)+\
            " as (select max(rev_id) as max_id, rev_page from rev_talk "+\
            "where year(rev_timestamp)<"+str(self.year)+" group by rev_page)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add primary key (max_id)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_talk_"+str(self.year)+" add index (rev_page)")
        dbaccess.close_Connection(self.access[0])
def endElement(self, name): ## Defining tasks to manage contents from the last readed tag ## Catching the namespace of this page if name=='namespace': self.nspace_dict[self.current_text]=self.codens elif name=='id': if self.stack[-1]=='contributor': ##Detecting contributor's attributes inside a revision self.rev_dict['rev_user']=self.current_text elif self.stack[-1]=='revision': self.rev_dict[name]=self.current_text elif self.stack[-1]=='page': self.page_dict[name]=self.current_text else: self.f=open(self.fileErrPath,'w') if len(self.stack)>0: self.f.write("Unsupported parent tag for '"+name+"': "+self.stack[-1]) self.f.close() elif name=='ip': self.rev_dict['rev_user']='******' self.rev_dict['username']=self.current_text elif name=='timestamp': ##Adequate formatting of timestamps self.rev_dict['timestamp']=self.current_text.replace('Z','').replace('T',' ') elif name=='contributor': ##Pop contributor tag from the stack self.stack.pop() ##################################################### ## END OF REVISION ##################################################### elif name=='revision': self.rev_count+=1 ##Store whether this is a redirect or stub page or not ##TODO: Substitute the find command with a regexp if len(self.rev_dict['text'])>0: if string.upper(self.rev_dict['text'][0:9])=='#REDIRECT': self.isRedirect='1' else: self.isRedirect='0' ## Takes from the first argument the threshold for stub's length if str(2*len(self.rev_dict['text']))<=self.options.stubth: self.isStub='1' else: self.isStub='0' ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (RESEARCH VERSION)###### ##Values order: (rev_id, rev_page, rev_user, rev_user_text, rev_timestamp, ##rev_len, rev_letters_raw, rev_words_raw, rev_words_filter, rev_in_links, rev_out_links, rev_trans, ##rev_sections, rev_bolds, rev_italics, rev_bolditalics ##rev_parent_id, rev_is_redirect, rev_is_stub, rev_is_minor, rev_comment) ##Calculation of additional fancy statistics AND ##Detection and stripping of wiki tags and 
HTML tags ##We also store inlinks, outlinks and special links #### ********************** #self.rev_dict['text']=re.sub(self.pathtml, '', self.rev_dict['text']) #filter HTML tags #self.rev_dict['text']=re.sub(self.patunicode, 'X', self.rev_dict['text']) #convert unicode chars to X ### self.highwords=re.findall(self.pathighwords, self.rev_dict['text']) #detect highlighted words ### for i in range(len(self.highwords)): ### self.highwords[i]=re.sub(self.pathighlight,'', self.highwords[i]) #Filter bold/italics tags #self.rev_dict['text']=re.sub(self.pathighlight, '', self.rev_dict['text']) #filter highlight tags ### self.special=re.findall(self.patspecial, self.rev_dict['text']) #detect special links #self.trans=re.findall(self.pattrans, self.rev_dict['text']) #detect translation links #self.rev_dict['text']=re.sub(self.patspecial, '', self.rev_dict['text']) #filter out special links (after detecting trans) #self.inlinks=re.findall(self.patinlink, self.rev_dict['text']) #detect inlinks #self.outlinks=re.findall(self.patoutlink, self.rev_dict['text']) #detect outlinks #self.sections=re.findall(self.patsection, self.rev_dict['text']) #detect sections #self.rev_dict['text']=re.sub(self.patitemize, '', self.rev_dict['text']) #filter out itemize bullets and line branches #### ********************** # Build current row for revinsert ## IMPORTANT PERFORMANCE NOTE: using str.join instead of plain '+' operator ## for increased performance try: #newrevinsert="("+self.rev_dict['id']+","+self.page_dict['id']+","+\ #self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\ #'"'+","+'"'+self.rev_dict['timestamp']+\ #'"'+","+str(2*len(self.rev_dict['text']))+\ #","+str(len(self.rev_dict['text']))+\ #","+str(len(self.rev_dict['text'].split())) ### ","+str(len(self.highwords))+","+str(len(self.special))+\ 
#newrevinsert+=","+str(len(self.inlinks))+","+str(len(self.outlinks))+","+str(len(self.trans))+","+str(len(self.sections))+\ #","+self.prior_rev_id+","+self.isRedirect+","+self.isStub+","+self.isMinor newrevinsert="".join(["(",self.rev_dict['id'],",",self.page_dict['id'],",", self.rev_dict['rev_user'],",",'"',self.rev_dict['username'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"'), '"',",",'"',self.rev_dict['timestamp'],'"', ",",str(len(self.rev_dict['text'])), #",",str(len(self.rev_dict['text'])),",",str(len(self.rev_dict['text'].split())), #",",str(len(self.inlinks)),",",str(len(self.outlinks)),",",str(len(self.trans)),",",str(len(self.sections)), ",",self.prior_rev_id,",",self.isRedirect,",",self.isStub,",",self.isMinor]) if self.rev_dict.has_key('comment'): #newrevinsert+=","+'"'+self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+'"' newrevinsert="".join([newrevinsert, ",",'"',self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"'),'"']) else: #newrevinsert+=",''" newrevinsert="".join([newrevinsert,",''"]) #newrevinsert+=")" newrevinsert="".join([newrevinsert,")"]) # In case that any field is missing or flawed, skip this revision and log to standard error except (KeyError), e: self.printfile = codecs.open("error_"+self.options.database,'a','utf_8') self.printfile.write("Offending rev_dict was = \n") self.printfile.write(str(self.rev_dict)) self.printfile.write("\n") self.printfile.write("Offending page_dict was = \n") self.printfile.write(str(self.page_dict)) self.printfile.write("\n") self.printfile.write("====================================================\n") self.printfile.write(str(e)+"\n") self.printfile.write("====================================================\n\n") self.printfile.close() return ############################################# # CONSTRUCT DICTIONARIES WITH SPECIAL ITEMS ############################################# ## for item in self.highwords: ## 
item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.highwords_dict.get(item) ## if (stumble==None): ## self.highwords_dict[item]=self.highwords_id ## self.highwords_rev_insert.append((self.rev_dict['id'],self.highwords_id)) ## self.highwords_id+=1 ## else: ## self.highwords_rev_insert.append((self.rev_dict['id'],stumble)) ## for item in self.special: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.special_dict.get(item) ## if (stumble==None): ## self.special_dict[item]=self.special_id ## self.special_rev_insert.append((self.rev_dict['id'], self.special_id)) ## self.special_id+=1 ## else: ## self.special_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.inlinks: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.inlinks_dict.get(item) ## if (stumble==None): ## self.inlinks_dict[item]=self.inlinks_id ## self.inlinks_rev_insert.append((self.rev_dict['id'], self.inlinks_id)) ## self.inlinks_id+=1 ## else: ## self.inlinks_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.outlinks: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.outlinks_dict.get(item) ## if (stumble==None): ## self.outlinks_dict[item]=self.outlinks_id ## self.outlinks_rev_insert.append((self.rev_dict['id'], self.outlinks_id)) ## self.outlinks_id+=1 ## else: ## self.outlinks_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.trans: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.trans_dict.get(item) ## if (stumble==None): ## self.trans_dict[item]=self.trans_id ## self.trans_rev_insert.append((self.rev_dict['id'], self.trans_id)) ## self.trans_id+=1 ## else: ## self.trans_rev_insert.append((self.rev_dict['id'], stumble)) ############################################## ## LOOK-AHEAD ALGORITHM ## IMPORTANT PERFORMANCE NOTE: using str.join instead of plain '+' 
operator ## for increased performance ############################################## if self.revinsertrows==0: #Always allow at least one row in extended inserts #self.revinsert="INSERT INTO revision VALUES"+newrevinsert self.revinsert="".join(["INSERT INTO revision VALUES",newrevinsert]) self.revinsertrows+=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2 elif ((self.revinsertrows+1)<=self.options.imaxrows) or\ (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024): #Append new row to self.revinsert #self.revinsert+=","+newrevinsert self.revinsert="".join([self.revinsert,",",newrevinsert]) self.revinsertrows+=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2 else: #We must finish and write currrent insert and begin a new one if self.options.fileout: #self.revinsert+=";\n" self.revinsert="".join([self.revinsert,";\n"]) #self.revinsert # Write output to SQL file self.revfile = codecs.open(self.options.revfile,'a','utf_8') self.revfile.write(revinsert) self.revfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL #self.revinsert+=";" self.revinsert="".join([self.revinsert,";"]) print self.revinsert.encode('utf_8') elif self.options.monitor: chances=0 while chances<5: try: dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf_8')) except (Exception), e: self.printfile = codecs.open("error_"+self.options.database,'a','utf_8') self.printfile.write(str(e)+"\n") self.printfile.write("".join([self.revinsert[0:30],"\n**********************************"])) self.printfile.close() chances+=1 else: break self.revinsert="".join(["INSERT INTO revision VALUES",newrevinsert]) self.revinsertrows=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2
def community_contrib(idiomas):
    """Generate per-language contribution statistics and plots.

    For each language in *idiomas*, queries the <lang>_stub database for
    monthly contribution data of admins and of five user-activity levels,
    writes the resulting series to data files under ./graphics/<lang>/data/,
    and finally runs the GNU R script community_contrib.R over those files.

    Parameters:
        idiomas -- iterable of language codes (e.g. ["es", "en"]).
    """
    for idioma in idiomas:
        list_admins=test_admins.process_admins(idioma)
        num_admins=list_admins.pop()
        where_clause1=list_admins.pop()
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        admins_ids=dbaccess.raw_query_SQL(acceso[1], "SELECT DISTINCT(author) FROM stats_"+idioma+\
            " WHERE "+where_clause1+" LIMIT "+str(num_admins))
        # Build the WHERE clause with the admins' ids.
        # FIX: the original iterated over the empty list being built
        # (list_admins_ids) instead of the query result (admins_ids),
        # so the clause never contained a single admin id.
        list_admins_ids=[]
        for item in admins_ids:
            list_admins_ids.append(int(item[0]))
        where_clause2=test_admins.process_users_ids(list_admins_ids,idioma)
        edits_admin_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)",
            tables="stats_Contrib_NoAnnons_months_author_"+idioma+" ",
            where=where_clause2, group="year, month ", order="year, month")
        dates_admins=[]
        admins_contribs=[]
        for element in edits_admin_month:
            dates_admins.append(list(element[0:2]))
            admins_contribs.append(int(element[2]))
        # Dumped to a file below for plotting (FIG 2).
        # Retrieve total contributions per month.
        total_edits_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)",
            tables="stats_Contrib_months_author_"+idioma, group="year, month ")
        dates_contribs=[]
        total_contribs=[]
        for element in total_edits_month:
            dates_contribs.append(list(element[0:2]))
            total_contribs.append(int(element[2]))
        # Admin share of total monthly edits (FIG 1).
        perc_contribs_admins=[]
        for admin_contrib, total_contrib in zip(admins_contribs, total_contribs):
            perc_contribs_admins.append(float(admin_contrib)/total_contrib)
        ## FIG 4: total edits made by users with different edit levels.
        # Cluster users by total contribution level:
        # <=100, 100-1K, 1K-5K, 5K-10K, >10K edits.
        users_level1=[]
        users_level2=[]
        users_level3=[]
        users_level4=[]
        users_level5=[]
        level1=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)",
            tables="stats_Contrib_author_"+idioma, where="theCount<=100")
        for userid in level1:
            users_level1.append(int(userid[0]))
        level2=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)",
            tables="stats_Contrib_author_"+idioma, where="theCount>100 AND theCount<=1000")
        for userid in level2:
            users_level2.append(int(userid[0]))
        level3=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)",
            tables="stats_Contrib_author_"+idioma, where="theCount>1000 AND theCount<=5000")
        for userid in level3:
            users_level3.append(int(userid[0]))
        level4=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)",
            tables="stats_Contrib_author_"+idioma, where="theCount>5000 AND theCount<=10000")
        for userid in level4:
            users_level4.append(int(userid[0]))
        level5=dbaccess.query_SQL(acceso[1], select="DISTINCT(author)",
            tables="stats_Contrib_author_"+idioma, where="theCount>10000")
        for userid in level5:
            users_level5.append(int(userid[0]))
        where_clause_level1=test_admins.process_users_ids(users_level1,idioma)
        where_clause_level2=test_admins.process_users_ids(users_level2,idioma)
        where_clause_level3=test_admins.process_users_ids(users_level3,idioma)
        where_clause_level4=test_admins.process_users_ids(users_level4,idioma)
        where_clause_level5=test_admins.process_users_ids(users_level5,idioma)
        contribs_level1_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)",
            tables="stats_Contrib_months_author_"+idioma, where=where_clause_level1, group="year, month")
        contribs_level2_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)",
            tables="stats_Contrib_months_author_"+idioma, where=where_clause_level2, group="year, month")
        contribs_level3_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)",
            tables="stats_Contrib_months_author_"+idioma, where=where_clause_level3, group="year, month")
        contribs_level4_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)",
            tables="stats_Contrib_months_author_"+idioma, where=where_clause_level4, group="year, month")
        contribs_level5_month=dbaccess.query_SQL(acceso[1], select="year, month, SUM(theCount)",
            tables="stats_Contrib_months_author_"+idioma, where=where_clause_level5, group="year, month")
        # __process_contribs returns [dates, contribs, perc] -- pop in
        # reverse order.
        list_level1=__process_contribs(contribs_level1_month, total_contribs)
        perc_contribs_level1=list_level1.pop()
        contribs_level1=list_level1.pop()
        dates_level1=list_level1.pop()
        list_level2=__process_contribs(contribs_level2_month, total_contribs)
        perc_contribs_level2=list_level2.pop()
        contribs_level2=list_level2.pop()
        dates_level2=list_level2.pop()
        list_level3=__process_contribs(contribs_level3_month, total_contribs)
        perc_contribs_level3=list_level3.pop()
        contribs_level3=list_level3.pop()
        # FIX: the original popped dates_level3 from list_level1, which is
        # already empty at this point (would raise IndexError); pop from
        # list_level3 like every other level does.
        dates_level3=list_level3.pop()
        list_level4=__process_contribs(contribs_level4_month, total_contribs)
        perc_contribs_level4=list_level4.pop()
        contribs_level4=list_level4.pop()
        dates_level4=list_level4.pop()
        list_level5=__process_contribs(contribs_level5_month, total_contribs)
        perc_contribs_level5=list_level5.pop()
        contribs_level5=list_level5.pop()
        dates_level5=list_level5.pop()
        ## FIG 5 PLOT 4b
        ## FIG 6: average number of edits per user per month for each level.
        # Retrieve the number of users in each level who made at least one
        # contribution each month.
        num_users_1_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)",
            tables="stats_Contrib_months_author_"+idioma, where=where_clause_level1, group="year, month")
        num_users_2_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)",
            tables="stats_Contrib_months_author_"+idioma, where=where_clause_level2, group="year, month")
        num_users_3_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)",
            tables="stats_Contrib_months_author_"+idioma, where=where_clause_level3, group="year, month")
        num_users_4_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)",
            tables="stats_Contrib_months_author_"+idioma, where=where_clause_level4, group="year, month")
        num_users_5_month=dbaccess.query_SQL(acceso[1], select="COUNT(DISTINCT author)",
            tables="stats_Contrib_months_author_"+idioma, where=where_clause_level5, group="year, month")
        list_users_1_month=[]
        for element in num_users_1_month:
            list_users_1_month.append(int(element[0]))
        list_users_2_month=[]
        for element in num_users_2_month:
            list_users_2_month.append(int(element[0]))
        list_users_3_month=[]
        for element in num_users_3_month:
            list_users_3_month.append(int(element[0]))
        list_users_4_month=[]
        for element in num_users_4_month:
            list_users_4_month.append(int(element[0]))
        list_users_5_month=[]
        for element in num_users_5_month:
            list_users_5_month.append(int(element[0]))
        # Divide total contributions per level per month by the number of
        # users active that month in that level.
        avg_contribs_user_1_month=[]
        for contribmonth, usermonth in zip(contribs_level1, list_users_1_month):
            avg_contribs_user_1_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_2_month=[]
        for contribmonth, usermonth in zip(contribs_level2, list_users_2_month):
            avg_contribs_user_2_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_3_month=[]
        for contribmonth, usermonth in zip(contribs_level3, list_users_3_month):
            avg_contribs_user_3_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_4_month=[]
        for contribmonth, usermonth in zip(contribs_level4, list_users_4_month):
            avg_contribs_user_4_month.append(float(contribmonth)/usermonth)
        avg_contribs_user_5_month=[]
        for contribmonth, usermonth in zip(contribs_level5, list_users_5_month):
            avg_contribs_user_5_month.append(float(contribmonth)/usermonth)
        ## FIG 7: population growth for each user group (list_users_X_month).
        ## FIG 8: % of total population of each user group.
        perc_users_1_months=[]
        perc_users_2_months=[]
        perc_users_3_months=[]
        perc_users_4_months=[]
        perc_users_5_months=[]
        for e1, e2, e3, e4, e5 in zip(list_users_1_month,list_users_2_month,list_users_3_month,list_users_4_month,list_users_5_month):
            total_users_month=e1+e2+e3+e4+e5
            perc_users_1_months.append(float(e1)/total_users_month)
            perc_users_2_months.append(float(e2)/total_users_month)
            perc_users_3_months.append(float(e3)/total_users_month)
            perc_users_4_months.append(float(e4)/total_users_month)
            perc_users_5_months.append(float(e5)/total_users_month)
        ###############################
        ## Final duties: transfer data files and execute the R script.
        filenames=["dates_admin_contrib.data","contribs_admins_months.data",
            "perc_contribs_months.data","dates_level1_contrib.data",
            "contribs_level1_months.data", "perc_contribs_level1_months.data",
            "dates_level2_contrib.data", "contribs_level2_months.data",
            "perc_contribs_level2_months.data","dates_level3_contrib.data",
            "contribs_level3_months.data", "perc_contribs_level3_months.data",
            "dates_level4_contrib.data", "contribs_level4_months.data",
            "perc_contribs_level4_months.data","dates_level5_contrib.data",
            "contribs_level5_months.data", "perc_contribs_level5_months.data",
            "avg_contribs_user_1_month.data", "avg_contribs_user_2_month.data",
            "avg_contribs_user_3_month.data", "avg_contribs_user_4_month.data",
            "avg_contribs_user_5_month.data", "users_1_month.data",
            "users_2_month.data", "users_3_month.data", "users_4_month.data",
            "users_5_month.data", "perc_users_1_months.data",
            "perc_users_2_months.data", "perc_users_3_months.data",
            "perc_users_4_months.data", "perc_users_5_months.data"]
        filenames_out=["Figure1.png", "Figure_2.png", "Figure4.png",
            "Figure5.png", "Figure6.png", "Figure7.png", "Figure8.png"]
        dataList=[dates_contribs, admins_contribs, perc_contribs_admins,
            dates_level1, contribs_level1, perc_contribs_level1,
            dates_level2, contribs_level2, perc_contribs_level2,
            dates_level3, contribs_level3, perc_contribs_level3,
            dates_level4, contribs_level4, perc_contribs_level4,
            dates_level5, contribs_level5, perc_contribs_level5,
            avg_contribs_user_1_month, avg_contribs_user_2_month,
            avg_contribs_user_3_month, avg_contribs_user_4_month,
            avg_contribs_user_5_month, list_users_1_month,
            list_users_2_month, list_users_3_month, list_users_4_month,
            list_users_5_month, perc_users_1_months, perc_users_2_months,
            perc_users_3_months, perc_users_4_months, perc_users_5_months]
        for filename, data in zip(filenames, dataList):
            if filename.find('date')!=-1:
                # Date series are written directly, one date per line.
                f=open("./graphics/"+idioma+"/data/"+filename, 'w')
                for adate in data:
                    f.write(str(adate)+"\n")
                f.close()
            else:
                __makeDataFile(idioma, filename, data)
        # Pass data filenames to the GNU R script through a file.
        f=open("./data/community_contrib_files_names.data",'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/data/"+line+"\n")
        f.close()
        # Idem with graphic output filenames.
        f=open("./data/community_contrib_files_out.data",'w')
        for line in filenames_out:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
        # Call the GNU R script community_contrib.R.
        succ=os.system("R --vanilla < ./community_contrib.R > debug_R")
        if succ==0:
            print("Funcion community_contrib.R ejecutada con exito para el lenguage... "+idioma)
def analyze(self):
    """Dump survival-analysis data for every language to CSV .dat files.

    Writes the CSV header of each output file first, then for every language
    in self.languages appends one row per user
    (``Project,rev_user,"<period start>","<period end>"``) for each survival
    period: whole system, MAIN namespace, and the core / rev-core
    join/stay/leave periods.
    """
    # (filename, CSV header, SQL query) for every survival data set,
    # in the order the files are initialized and filled.
    datasets=[
        # Survival data for all users (including editors out of MAIN),
        # excluding anonymous (rev_user=0) and bot accounts.
        ("wkp_surv_all.dat", "Project,rev_user,min_ts,max_ts\n",
         "SELECT rev_user, date(min_ts), date(max_ts) from time_range_authors "+\
         "where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')"),
        # Survival data for all logged users who edited in MAIN.
        ("wkp_surv_main_all.dat", "Project,rev_user,min_ts,max_ts\n",
         "SELECT rev_user, date(min_ts), date(max_ts) from time_range_users "),
        # Core users: join, stay and leave periods.
        ("wkp_surv_join_core_all.dat", "Project,rev_user,min_ts,min_ts_core\n",
         "SELECT rev_user, date(min_ts), date(min_ts_core) from users_core"),
        ("wkp_surv_in_core_all.dat", "Project,rev_user,min_ts_core,max_ts_core\n",
         "SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_core"),
        ("wkp_surv_core_to_max_ts_all.dat", "Project,rev_user,max_ts_core,max_ts\n",
         "SELECT rev_user, date(max_ts_core), date(max_ts) from users_core"),
        # Rev-core users: same three periods over users_rev_core.
        ("wkp_surv_join_core_rev_all.dat", "Project,rev_user,min_ts,min_ts_core\n",
         "SELECT rev_user, date(min_ts), date(min_ts_core) from users_rev_core"),
        ("wkp_surv_in_core_rev_all.dat", "Project,rev_user,min_ts_core,max_ts_core\n",
         "SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_rev_core"),
        ("wkp_surv_core_rev_to_max_ts_all.dat", "Project,rev_user,max_ts_core,max_ts\n",
         "SELECT rev_user, date(max_ts_core), date(max_ts) from users_rev_core"),
    ]
    # Initialize all files with their CSV headers.
    for filename, header, _query in datasets:
        f=open(filename,'w')
        f.write(header)
        f.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        print("Starting language "+self.language+"\n")
        for index, (filename, _header, query) in enumerate(datasets):
            self._dump_survival(filename, query)
            if index==4:
                # First five data sets cover system/MAIN/core; report
                # progress exactly where the original version did.
                print("Finished core users for language "+self.language+"\n")
        print("Finished all tasks for "+self.language+"\n")

def _dump_survival(self, filename, query):
    """Run *query* against the current language DB and append one CSV row per result to *filename*.

    Each row is ``<language>,<int user id>,"<col1>","<col2>"``. Opens a fresh
    DB connection per query and closes it before writing, mirroring the
    original per-query connection pattern.
    """
    self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
    results=dbaccess.raw_query_SQL(self.access[1], query)
    # Close the DB connection before touching the filesystem.
    dbaccess.close_Connection(self.access[0])
    f=open(filename,'a')
    for result in results:
        f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n")
    f.close()
def printSql2(sqlquery):
    """Execute *sqlquery* through the module-level connection and echo every result row."""
    for result_row in dbaccess.raw_query_SQL(acceso[1], sqlquery):
        print(result_row)
def endElement(self, name):
    """SAX end-of-element handler: accumulate page/revision data and emit extended INSERTs.

    Dispatches on the closing tag: records namespaces, ids, contributor info
    and timestamps into the working dictionaries, and on </revision> builds
    one row of the extended INSERT for the revision table, flushing the
    accumulated INSERT (to file, stdout stream, or the monitor DB) whenever
    it would exceed the configured row/size limits.
    """
    ## Catching the namespace of this page
    if name=='namespace':
        self.nspace_dict[self.current_text]=self.codens
    elif name=='id':
        # <id> appears under <contributor>, <revision> and <page>; the
        # parent tag on top of the stack disambiguates.
        if self.stack[-1]=='contributor':
            ##Detecting contributor's attributes inside a revision
            self.rev_dict['rev_user']=self.current_text
        elif self.stack[-1]=='revision':
            self.rev_dict[name]=self.current_text
        elif self.stack[-1]=='page':
            self.page_dict[name]=self.current_text
        else:
            self.f=open(self.fileErrPath,'w')
            if len(self.stack)>0:
                self.f.write("Unsupported parent tag for '"+name+"': "+self.stack[-1])
            self.f.close()
    elif name=='ip':
        # Anonymous contributor: mask the user id, keep the IP as username.
        self.rev_dict['rev_user']='******'
        self.rev_dict['username']=self.current_text
    elif name=='timestamp':
        ##Adequate formatting of timestamps (ISO 8601 -> MySQL DATETIME)
        self.rev_dict['timestamp']=self.current_text.replace('Z','').replace('T',' ')
    elif name=='contributor':
        ##Pop contributor tag from the stack
        self.stack.pop()
    elif name=='revision':
        self.rev_count+=1
        ##Store whether this is a redirect or stub page or not
        if len(self.rev_dict['text'])>0:
            if self.rev_dict['text'][0:9].upper()=='#REDIRECT':
                self.isRedirect='1'
            else:
                self.isRedirect='0'
            ## Takes from the first argument the threshold for stub's length
            # FIX: the original compared str(2*len(text)) against the
            # threshold, which is a lexicographic (wrong) comparison; the
            # research version of this handler compares numerically.
            # Assumes options.stubth is numeric -- TODO confirm optparse type.
            if 2*len(self.rev_dict['text'])<=self.options.stubth:
                self.isStub='1'
            else:
                self.isStub='0'
        ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (STANDARD VERSION)######
        ##Values order: (rev_id, rev_page, [[rev_text_id=rev_id]], rev_comment,
        ##rev_user, rev_user_text, rev_timestamp, rev_is_minor)
        # Build current row for revinsert
        try:
            newrevinsert="("+self.rev_dict['id']+","+self.page_dict['id']+","+self.rev_dict['id']
            if 'comment' in self.rev_dict:
                newrevinsert+=","+'"'+self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+'"'
            else:
                newrevinsert+=",''"
            newrevinsert+=","+self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].\
                replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\
                '"'+","+'"'+self.rev_dict['timestamp']+\
                '"'+","+self.isMinor+")"
        # In case that any field is missing or flawed, skip this revision
        # and log the offending dictionaries for inspection.
        except KeyError as e:
            self.printfile = codecs.open("error.log",'a','utf_8')
            self.printfile.write("Offending rev_dict was = \n")
            self.printfile.write(str(self.rev_dict))
            self.printfile.write("\n")
            self.printfile.write("Offending page_dict was = \n")
            self.printfile.write(str(self.page_dict))
            self.printfile.write("\n")
            self.printfile.write("====================================================\n")
            self.printfile.write(str(e)+"\n")
            self.printfile.write("====================================================\n\n")
            self.printfile.close()
            return
        if self.revinsertrows==0:
            #Always allow at least one row in extended inserts
            self.revinsert="INSERT INTO revision VALUES"+newrevinsert
            self.revinsertrows+=1
            #Conservative approach: assuming 2 bytes per UTF-8 character
            self.revinsertsize=len(self.revinsert)*2
        elif (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024) and\
            ((self.revinsertrows+1)<=self.options.imaxrows):
            #Append new row to self.revinsert
            self.revinsert+=","+newrevinsert
            self.revinsertrows+=1
            #Conservative approach: assuming 2 bytes per UTF-8 character
            self.revinsertsize=len(self.revinsert)*2
        else:
            #We must finish and write current insert and begin a new one
            if self.options.fileout:
                self.revinsert+=";\n"
                # Write output to SQL file
                self.revfile = codecs.open(self.options.revfile,'a','utf_8')
                # FIX: the original wrote the undefined local name
                # 'revinsert' (NameError); the accumulated insert lives in
                # self.revinsert.
                self.revfile.write(self.revinsert)
                self.revfile.close()
            elif self.options.streamout:
                # DON'T WRITE SQL TO FILES, GENERATE ENCODED SQL STREAM FOR MYSQL
                self.revinsert+=";"
                print(self.revinsert.encode('utf-8'))
            elif self.options.monitor:
                # FIX: the original retried forever on a persistent DB
                # error ('while 1'); bound the retries like the other
                # versions of this handler do.
                chances=0
                while chances<5:
                    try:
                        dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf-8'))
                    except Exception as e:
                        print(e)
                        chances+=1
                    else:
                        break
            self.revinsert="INSERT INTO revision VALUES"+newrevinsert
            self.revinsertrows=1
            #Conservative approach: assuming 2 bytes per UTF-8 character
            self.revinsertsize=len(self.revinsert)*2
class wikiHandler(ContentHandler): """Parse an XML file generated by Wikipedia Export page into SQL data suitable to be imported by MySQL""" def __init__(self, options): self.fileErrPath="./errors.log"; self.options=options if self.options.monitor and not self.options.fileout and not self.options.streamout: self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,\ self.options.user, self.options.passwd, self.options.database) self.nspace_dict={}; self.codens=''; self.page_dict={}; self.rev_dict = {} self.stack=[]; self.current_text = ''; self.current_elem=None; self.revfile=None self.pagefile=None self.page_num = 0; self.rev_num=0; self.last_page_len=0; self.rev_count=0 self.prior_rev_id='NULL'; self.isRedirect='0'; self.isStub='0'; self.isMinor='0' self.revinsert=''; self.pageinsert=''; self.textinsert='' self.revinsertrows=0; self.revinsertsize=0; self.pageinsertrows=0 self.pageinsertsize=0; self.textinsertrows=0; self.textinsertsize=0 self.start=datetime.datetime.now(); self.timeCheck=None; self.timeDelta=None def startElement(self, name, attrs): ## Here we define which tags we want to catch ## In this case, we only want to recall the name of the tags in a stack ## so we can later look up the parent node of a new tag ## (for instance, to discriminate among page id, rev id and contributor id ## all of them with the name=="id") if name=='page' or name=='revision' or name=='contributor': self.stack.append(name) elif name=='namespace': self.codens=attrs.get('key') elif name=='minor': self.isMinor='1' self.current_text='' self.current_elem=name return def endElement(self, name): ## Defining tasks to manage contents from the last readed tag ## Catching the namespace of this page if name=='namespace': self.nspace_dict[self.current_text]=self.codens elif name=='id': if self.stack[-1]=='contributor': ##Detecting contributor's attributes inside a revision self.rev_dict['rev_user']=self.current_text elif self.stack[-1]=='revision': 
self.rev_dict[name]=self.current_text elif self.stack[-1]=='page': self.page_dict[name]=self.current_text else: self.f=open(self.fileErrPath,'w') if len(self.stack)>0: self.f.write("Unsupported parent tag for '"+name+"': "+self.stack[-1]) self.f.close() elif name=='ip': self.rev_dict['rev_user']='******' self.rev_dict['username']=self.current_text elif name=='timestamp': ##Adequate formatting of timestamps self.rev_dict['timestamp']=self.current_text.replace('Z','').replace('T',' ') elif name=='contributor': ##Pop contributor tag from the stack self.stack.pop() elif name=='revision': self.rev_count+=1 ##Store whether this is a redirect or stub page or not if len(self.rev_dict['text'])>0: if self.rev_dict['text'][0:9].upper()=='#REDIRECT': self.isRedirect='1' else: self.isRedirect='0' ## Takes from the first argument the threshold for stub's length if str(2*len(self.rev_dict['text']))<=self.options.stubth: self.isStub='1' else: self.isStub='0' ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (STANDARD VERSION)###### ##Values order: (rev_id, rev_page, [[rev_text_id=rev_id]], rev_comment, ##rev_user, rev_user_text, rev_timestamp, rev_is_minor) # Build current row for revinsert try: newrevinsert="("+self.rev_dict['id']+","+self.page_dict['id']+","+self.rev_dict['id'] if self.rev_dict.has_key('comment'): newrevinsert+=","+'"'+self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+'"' else: newrevinsert+=",''" newrevinsert+=","+self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].\ replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\ '"'+","+'"'+self.rev_dict['timestamp']+\ '"'+","+self.isMinor+")" # In case that any field is missing or flawed, skip this revision and log to standard error except (KeyError), e: self.printfile = codecs.open("error.log",'a','utf_8') self.printfile.write("Offending rev_dict was = \n") self.printfile.write(str(self.rev_dict)) self.printfile.write("\n") self.printfile.write("Offending page_dict 
was = \n") self.printfile.write(str(self.page_dict)) self.printfile.write("\n") self.printfile.write("====================================================\n") self.printfile.write(str(e)+"\n") self.printfile.write("====================================================\n\n") self.printfile.close() return if self.revinsertrows==0: #Always allow at least one row in extended inserts self.revinsert="INSERT INTO revision VALUES"+newrevinsert self.revinsertrows+=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2 elif (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024) and\ ((self.revinsertrows+1)<=self.options.imaxrows): #Append new row to self.revinsert self.revinsert+=","+newrevinsert self.revinsertrows+=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2 else: #We must finish and write currrent insert and begin a new one if self.options.fileout: self.revinsert+=";\n" # Write output to SQL file self.revfile = codecs.open(self.options.revfile,'a','utf_8') self.revfile.write(revinsert) self.revfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.revinsert+=";" print self.revinsert.encode('utf-8') elif self.options.monitor: while 1: try: dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf-8')) except (Exception), e: print e else: break self.revinsert="INSERT INTO revision VALUES"+newrevinsert self.revinsertrows=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2 ################################################## ##CONSTRUCTION OF EXTENDED INSERTS FOR TABLE TEXT ##Template for each row: ## (old_id, old_text, old_flags) newtextinsert="("+self.rev_dict['id']+','+'"'+\ self.rev_dict['text'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\ '",'+'"utf8")' if self.textinsertrows==0: #Always allow at least one row in 
extended inserts self.textinsert="INSERT INTO text VALUES"+newtextinsert self.textinsertrows+=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.textinsertsize=len(self.textinsert)*2 elif (self.textinsertsize+(2*len(newtextinsert))<=self.options.imaxsize*1024) and\ ((self.textinsertrows+1)<=self.options.imaxrows): #Append new row to self.revinsert self.textinsert+=","+newtextinsert self.textinsertrows+=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.textinsertsize=len(self.textinsert)*2 else: #We must finish and write currrent insert and begin a new one if self.options.fileout: self.textinsert+=";\n" # Write output to SQL file self.textfile = codecs.open(self.options.textfile,'a','utf_8') self.textfile.write(textinsert) self.textfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.textinsert+=";" print self.textinsert.encode('utf-8') elif self.options.monitor: while 1: try: dbaccess.raw_query_SQL(self.acceso[1], self.textinsert.encode('utf-8')) except (Exception), e: print e else: break self.textinsert="INSERT INTO text VALUES"+newtextinsert self.textinsertrows=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.textinsertsize=len(self.textinsert)*2
#We must write the last pageinsert before finishing this dump if self.options.fileout: # Write output to SQL file self.pageinsert+=";\n" self.pagefile = codecs.open(self.options.pagefile,'a','utf_8') self.pagefile.write(self.pageinsert) self.pagefile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.pageinsert+=";" print self.pageinsert.encode('utf_8') elif self.options.monitor: chances=0 while chances<5: try: dbaccess.raw_query_SQL(self.acceso[1], self.pageinsert.encode('utf_8')) except (Exception), e: self.printfile = codecs.open("error_"+self.options.database,'a','utf_8') self.printfile.write(str(e)+"\n") self.printfile.write(self.pageinsert[0:30]+"\n**********************************") self.printfile.close() chances+=1 else: break #Reset status vars self.pageinsertrows=0 self.pageinsertsize=0 #INSERT NAMESPACES CODES AND TITLES IN SPECIAL TABLE nspaces= self.nspace_dict.iteritems() insertns='INSERT INTO namespaces VALUES' first_loop=True
################################################ #We must write the las revinsert before finishing this page if self.options.fileout: self.revinsert+=";\n" # Write output to SQL file self.revfile = codecs.open(self.options.revfile,'a','utf_8') self.revfile.write(self.revinsert) self.revfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.revinsert+=";" print self.revinsert.encode('utf-8') elif self.options.monitor: while 1: try: dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf-8')) except (Exception), e: print e else: break #Reset status vars self.revinsertrows=0 self.revinsertsize=0 ################################################ ##Same for Insert into text table if self.options.fileout: self.textinsert+=";\n" # Write output to SQL file self.textfile = codecs.open(self.options.textfile,'a','utf_8') self.textfile.write(self.textinsert) self.textfile.close()
def endElement(self, name): ## Defining tasks to manage contents from the last readed tag ## Catching the namespace of this page if name=='namespace': self.nspace_dict[self.current_text]=self.codens elif name=='id': if self.stack[-1]=='contributor': ##Detecting contributor's attributes inside a revision self.rev_dict['rev_user']=self.current_text elif self.stack[-1]=='revision': self.rev_dict[name]=self.current_text elif self.stack[-1]=='page': self.page_dict[name]=self.current_text else: self.f=open(self.fileErrPath,'w') if len(self.stack)>0: self.f.write("Unsupported parent tag for '"+name+"': "+self.stack[-1]) self.f.close() elif name=='ip': self.rev_dict['rev_user']='******' self.rev_dict['username']=self.current_text elif name=='timestamp': ##Adequate formatting of timestamps self.rev_dict['timestamp']=self.current_text.replace('Z','').replace('T',' ') elif name=='contributor': ##Pop contributor tag from the stack self.stack.pop() ##################################################### ## END OF REVISION ##################################################### elif name=='revision': self.rev_count+=1 ##Store whether this is a redirect or stub page or not ##TODO: Substitute the find command with a regexp if len(self.rev_dict['text'])>0: if string.upper(self.rev_dict['text'][0:9])=='#REDIRECT': self.isRedirect='1' else: self.isRedirect='0' ## Takes from the first argument the threshold for stub's length if 2*len(self.rev_dict['text'])<=self.options.stubth: self.isStub='1' else: self.isStub='0' ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (RESEARCH VERSION)###### ##Values order: (rev_id, rev_page, rev_user, rev_user_text, rev_timestamp, ##rev_len, rev_letters_raw, rev_words_raw, rev_words_filter, rev_in_links, rev_out_links, rev_trans, ##rev_sections, rev_bolds, rev_italics, rev_bolditalics ##rev_parent_id, rev_is_redirect, rev_is_stub, rev_is_minor, rev_comment) ##Calculation of additional fancy statistics AND ##Detection and stripping of wiki tags and HTML 
tags ##We also store inlinks, outlinks and special links self.rev_dict['text']=re.sub(self.pathtml, '', self.rev_dict['text']) #filter HTML tags self.rev_dict['text']=re.sub(self.patunicode, 'X', self.rev_dict['text']) #convert unicode chars to X ## self.highwords=re.findall(self.pathighwords, self.rev_dict['text']) #detect highlighted words ## for i in range(len(self.highwords)): ## self.highwords[i]=re.sub(self.pathighlight,'', self.highwords[i]) #Filter bold/italics tags self.rev_dict['text']=re.sub(self.pathighlight, '', self.rev_dict['text']) #filter highlight tags ## self.special=re.findall(self.patspecial, self.rev_dict['text']) #detect special links self.trans=re.findall(self.pattrans, self.rev_dict['text']) #detect translation links self.rev_dict['text']=re.sub(self.patspecial, '', self.rev_dict['text']) #filter out special links (after detecting trans) self.inlinks=re.findall(self.patinlink, self.rev_dict['text']) #detect inlinks self.outlinks=re.findall(self.patoutlink, self.rev_dict['text']) #detect outlinks self.sections=re.findall(self.patsection, self.rev_dict['text']) #detect sections self.rev_dict['text']=re.sub(self.patitemize, '', self.rev_dict['text']) #filter out itemize bullets and line branches # Build current row for revinsert newrevinsert="("+self.rev_dict['id']+","+self.page_dict['id']+","+\ self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\ '"'+","+'"'+self.rev_dict['timestamp']+\ '"'+","+str(2*len(self.rev_dict['text']))+\ ","+str(len(self.rev_dict['text']))+\ ","+str(len(self.rev_dict['text'].split())) ## ","+str(len(self.highwords))+","+str(len(self.special))+\ newrevinsert+=","+str(len(self.inlinks))+","+str(len(self.outlinks))+","+str(len(self.trans))+","+str(len(self.sections))+\ ","+self.prior_rev_id+","+self.isRedirect+","+self.isStub+","+self.isMinor if self.rev_dict.has_key('comment'): 
newrevinsert+=","+'"'+self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+'"' else: newrevinsert+=",''" newrevinsert+=")" ############################################# ## CONSTRUCT DICTIONARIES WITH SPECIAL ITEMS ############################################# ## for item in self.highwords: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.highwords_dict.get(item) ## if (stumble==None): ## self.highwords_dict[item]=self.highwords_id ## self.highwords_rev_insert.append((self.rev_dict['id'],self.highwords_id)) ## self.highwords_id+=1 ## else: ## self.highwords_rev_insert.append((self.rev_dict['id'],stumble)) ## for item in self.special: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.special_dict.get(item) ## if (stumble==None): ## self.special_dict[item]=self.special_id ## self.special_rev_insert.append((self.rev_dict['id'], self.special_id)) ## self.special_id+=1 ## else: ## self.special_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.inlinks: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.inlinks_dict.get(item) ## if (stumble==None): ## self.inlinks_dict[item]=self.inlinks_id ## self.inlinks_rev_insert.append((self.rev_dict['id'], self.inlinks_id)) ## self.inlinks_id+=1 ## else: ## self.inlinks_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.outlinks: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.outlinks_dict.get(item) ## if (stumble==None): ## self.outlinks_dict[item]=self.outlinks_id ## self.outlinks_rev_insert.append((self.rev_dict['id'], self.outlinks_id)) ## self.outlinks_id+=1 ## else: ## self.outlinks_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.trans: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.trans_dict.get(item) ## if (stumble==None): ## 
self.trans_dict[item]=self.trans_id ## self.trans_rev_insert.append((self.rev_dict['id'], self.trans_id)) ## self.trans_id+=1 ## else: ## self.trans_rev_insert.append((self.rev_dict['id'], stumble)) ############################################## ## LOOK-AHEAD ALGORITHM ############################################## if self.revinsertrows==0: #Always allow at least one row in extended inserts self.revinsert="INSERT INTO revision VALUES"+newrevinsert self.revinsertrows+=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2 elif (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024) and\ ((self.revinsertrows+1)<=self.options.imaxrows): #Append new row to self.revinsert self.revinsert+=","+newrevinsert self.revinsertrows+=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2 else: #We must finish and write currrent insert and begin a new one if self.options.fileout: self.revinsert+=";\n" # Write output to SQL file self.revfile = codecs.open(self.options.revfile,'a','utf_8') self.revfile.write(revinsert) self.revfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.revinsert+=";" print self.revinsert.encode('utf_8') print self.revinsert.encode('utf_8') elif self.options.monitor: chances=0 while chances<5: try: dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf_8')) except (Exception), e: self.printfile = codecs.open("error_"+self.options.database,'a','utf_8') self.printfile.write(str(e)+"\n") self.printfile.write(self.revinsert[0:30]+"\n**********************************") self.printfile.close() chances+=1 else: break self.revinsert="INSERT INTO revision VALUES"+newrevinsert self.revinsertrows=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2 ################################################## ##Store this rev_id to recall it when 
processing the following revision, if it exists self.prior_rev_id=self.rev_dict['id'] ##Store this rev_len to recall it for the current page_len, in case this is the last revision for that page self.last_page_len=2*len(self.rev_dict['text']) self.rev_dict.clear() self.stack.pop() self.isMinor='0' self.inlinks=[]; self.outlinks=[]; self.trans=[]; self.sections=[] self.highwords=[]; self.special=[] self.rev_num+=1 if self.options.verbose and self.options.log is None: # Display status report if self.rev_num % 1000 == 0: self.timeCheck=datetime.datetime.now() self.timeDelta=self.timeCheck-self.start if self.timeDelta.seconds==0: print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)"\ % (self.page_num, 1e6*float(self.page_num)/self.timeDelta.microseconds,\ self.rev_num, 1e6*float(self.rev_num)/self.timeDelta.microseconds) self.printfile = codecs.open(self.fileErrPath,'a','utf_8') self.printfile.write("page "+str(self.page_num)+" ("+\ str( 1e6*float(self.page_num)/self.timeDelta.microseconds)+" pags. per sec.), revision "+\ str(self.rev_num)+ " ("+str(1e6*float(self.rev_num)/self.timeDelta.microseconds)+" revs. 
per sec.)\n") self.printfile.close() else: print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)"\ % (self.page_num, float(self.page_num)/self.timeDelta.seconds,\ self.rev_num, float(self.rev_num)/self.timeDelta.seconds) self.printfile = codecs.open(self.fileErrPath,'a','utf_8') self.printfile.write("page "+str(self.page_num)+" ("+\ str( float(self.page_num)/self.timeDelta.seconds)+" pags./sec.), revision "+\ str(self.rev_num)+ " ("+str(float(self.rev_num)/self.timeDelta.seconds)+" revs./sec.)\n") self.printfile.close() if self.options.verbose and self.options.log is not None: if self.rev_num%1000==0: self.timeCheck=datetime.datetime.now() self.timeDelta=self.timeCheck-self.start if self.timeDelta.seconds==0: self.printfile = codecs.open(self.options.log,'a','utf_8') self.printfile.write("page "+str(self.page_num)+" ("+\ str( 1e6*float(self.page_num)/self.timeDelta.microseconds)+" pags./sec.), revision "+\ str(self.rev_num)+ " ("+str(1e6*float(self.rev_num)/self.timeDelta.microseconds)+" revs./sec.)\n") self.printfile.close() else: self.printfile = codecs.open(self.options.log,'a','utf_8') self.printfile.write("page "+str(self.page_num)+" ("+\ str( float(self.page_num)/self.timeDelta.seconds)+" pags./sec.), revision "+\ str(self.rev_num)+ " ("+str(float(self.rev_num)/self.timeDelta.seconds)+" revs./sec.)\n") self.printfile.close()
def bots(self):
    """Preprocessing actions with bots data.

    For every language edition: (re)creates three per-(year, month) summary
    tables — revs_bots (revisions by officially identified bots), revs_logged
    (revisions by logged-in authors) and revs_all (all revisions) — and then
    dumps to .dat files the percentage of revisions due to bots, first over
    all revisions, then over logged revisions only.
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #TABLE revs_bots (revisions made by officially identified bots, by year, month)
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists revs_bots")
        dbaccess.raw_query_SQL(self.access[1], "create table revs_bots as select year(rev_timestamp) "+\
        "theyear, month(rev_timestamp) themonth, count(*) num_revs from revision where rev_user!=0 "+\
        "and rev_user in (select ug_user from user_groups where ug_group='bot') group by "+\
        "year(rev_timestamp), month(rev_timestamp) order by year(rev_timestamp), month(rev_timestamp)")
        #TABLE revs_logged (revisions made by logged authors, by year, month)
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists revs_logged")
        dbaccess.raw_query_SQL(self.access[1],"create table revs_logged as select year(rev_timestamp) "+\
        "theyear, month(rev_timestamp) themonth, count(*) num_revs from revision where rev_user!=0 "+\
        "group by year(rev_timestamp), month(rev_timestamp) order by year(rev_timestamp), month(rev_timestamp)")
        #TABLE revs_all (revisions made by all authors, by year, month)
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists revs_all")
        dbaccess.raw_query_SQL(self.access[1], "create table revs_all as select year(rev_timestamp) "+\
        "theyear, month(rev_timestamp) themonth, count(*) num_revs from revision "+\
        "group by year(rev_timestamp), month(rev_timestamp) order by year(rev_timestamp), month(rev_timestamp)")
        dbaccess.close_Connection(self.access[0])
    #FILE perc-bots-all-revs.dat  % of all revisions due to bots
    #(truncate the file, write the header, then append one row per month)
    file=open("overall/data/perc-bots-all-revs.dat",'w')
    file.write("year\tmonth\tperc_revs\tlang\n")
    file.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Obtain % of total no. revs due to bots, by year, month
        self.perc_revs=dbaccess.raw_query_SQL(self.access[1], "select bot.theyear, bot.themonth, "+\
        "(bot.num_revs/tot.num_revs)*100 perc_revs from revs_bots as bot, revs_all as tot "+\
        "where bot.theyear=tot.theyear and bot.themonth=tot.themonth;")
        dbaccess.close_Connection(self.access[0])
        #Writing data to file
        file=open("overall/data/perc-bots-all-revs.dat",'a')
        for item in self.perc_revs:
            file.write(str(int(item[0]))+"\t"+str(int(item[1]))+"\t"+\
            str(float(item[2]))+"\t"+self.language+"\n")
        file.close()
    #FILE perc-bots-logged-revs.dat  % of logged revisions due to bots
    file=open("overall/data/perc-bots-logged-revs.dat",'w')
    file.write("year\tmonth\tperc_revs\tlang\n")
    file.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Obtain % of no. revs by logged editors due to bots, by year, month
        #FIX: this loop called dbaccess.raw_query_sql and dbaccess.close_connection,
        #which are miscased (AttributeError at runtime); every other call site in
        #this module uses raw_query_SQL / close_Connection
        self.perc_revs=dbaccess.raw_query_SQL(self.access[1], "select bot.theyear, bot.themonth, "+\
        "(bot.num_revs/logged.num_revs)*100 perc_logged_revs from revs_bots as bot, "+\
        "revs_logged as logged where bot.theyear=logged.theyear and bot.themonth=logged.themonth;")
        dbaccess.close_Connection(self.access[0])
        #Writing data to file
        file=open("overall/data/perc-bots-logged-revs.dat",'a')
        for item in self.perc_revs:
            file.write(str(int(item[0]))+"\t"+str(int(item[1]))+"\t"+\
            str(float(item[2]))+"\t"+self.language+"\n")
        file.close()
class wikiHandler(ContentHandler): """Parse an XML file generated by Wikipedia Export page into SQL data suitable to be imported by MySQL""" def __init__(self, options): self.options=options if self.options.monitor and (not self.options.fileout and not self.options.streamout): self.acceso = dbaccess.get_Connection(self.options.machine, self.options.port,\ self.options.user, self.options.passwd, self.options.database) self.nspace_dict={} self.codens='' self.page_dict={} self.rev_dict = {} self.stack=[] self.current_text = '' self.current_elem=None self.revfile=None self.pagefile=None self.page_num = 0 self.rev_num=0 self.last_page_len=0 self.rev_count=0 self.prior_rev_id='NULL' self.isRedirect='0' self.isStub='0' self.isMinor='0' self.inlinks=None # internal links self.outlinks=None # external links self.trans=None # translations to other language editions self.sections=None # sections (no matter their level) self.highwords=None #highlighted words (bold/italics/bold+italics) self.special=None #rev_text, special links filtered out ######################################## ##REGEXPS ######################################## self.pathighlight=r"\'\'+"#Regexp matching bold/italics/bold+italics wikitags self.pathighwords=r"\'\'+.*\'\'+" #Regexp for highlighted words self.pathtml=r"\<[^\>]+\>" #Regexp matching HTML tags self.patunicode=r"\&\w+\;|\&\#\d+\;|[\xc0-\xf7][\x80-\xbf]+" #Regexp matching unicode chars self.patspecial=r"\[\[[^\:\]]+\:[^\]]*\]\]" #Regexp matching special inlinks (image/category/interwiki) self.patinlink=r"\[\[.*\]\]" #Regexp matching inlinks (after filtering image/category/interwiki links) self.patoutlink=r"\s\[[^\[\]]*\]|http[s]?://" #Regexp matching outlinks self.patsection=r"\=\=+[\s]*[^\=]*[\s]*\=\=+" #Regexp matching section titles self.pattrans=r"\[\[..[.]?:"#Regexp matching translation links self.patitemize=r"\n\**" #Regexp matching itemize bullets and line branches self.patdumb=r"\)\(" #A rapid solution to concatenate tuples in special 
instert strings self.fileErrPath="./errors.log" #TODO: Solve lookup in global scope if the special item did not show up in any previous revision #of the whole dump (maybe lookup query to DB??) self.highwords_dict={}; self.special_dict={}; self.inlinks_dict={}; self.outlinks_dict={}; self.trans_dict={} self.highwords_id=1; self.special_id=1; self.inlinks_id=1; self.outlinks_id=1 self.trans_id=1; self.highwords_rev_insert=[]; self.special_rev_insert=[]; self.inlinks_rev_insert=[] self.outlinks_rev_insert=[]; self.trans_rev_insert=[]; self.revinsert='' self.pageinsert='' self.revinsertrows=0 self.revinsertsize=0 self.pageinsertrows=0 self.pageinsertsize=0 self.start=datetime.datetime.now() self.timeCheck=None self.timeDelta=None def startElement(self, name, attrs): ## Here we define which tags we want to catch ## In this case, we only want to recall the name of the tags in a stack ## so we can later look up the parent node of a new tag ## (for instance, to discriminate among page id, rev id and contributor id ## all of them with the name=="id") if name=='page' or name=='revision' or name=='contributor': self.stack.append(name) elif name=='namespace': self.codens=attrs.get('key') elif name=='minor': self.isMinor='1' self.current_text='' self.current_elem=name return def endElement(self, name): ## Defining tasks to manage contents from the last readed tag ## Catching the namespace of this page if name=='namespace': self.nspace_dict[self.current_text]=self.codens elif name=='id': if self.stack[-1]=='contributor': ##Detecting contributor's attributes inside a revision self.rev_dict['rev_user']=self.current_text elif self.stack[-1]=='revision': self.rev_dict[name]=self.current_text elif self.stack[-1]=='page': self.page_dict[name]=self.current_text else: self.f=open(self.fileErrPath,'w') if len(self.stack)>0: self.f.write("Unsupported parent tag for '"+name+"': "+self.stack[-1]) self.f.close() elif name=='ip': self.rev_dict['rev_user']='******' 
self.rev_dict['username']=self.current_text elif name=='timestamp': ##Adequate formatting of timestamps self.rev_dict['timestamp']=self.current_text.replace('Z','').replace('T',' ') elif name=='contributor': ##Pop contributor tag from the stack self.stack.pop() ##################################################### ## END OF REVISION ##################################################### elif name=='revision': self.rev_count+=1 ##Store whether this is a redirect or stub page or not ##TODO: Substitute the find command with a regexp if len(self.rev_dict['text'])>0: if string.upper(self.rev_dict['text'][0:9])=='#REDIRECT': self.isRedirect='1' else: self.isRedirect='0' ## Takes from the first argument the threshold for stub's length if 2*len(self.rev_dict['text'])<=self.options.stubth: self.isStub='1' else: self.isStub='0' ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (RESEARCH VERSION)###### ##Values order: (rev_id, rev_page, rev_user, rev_user_text, rev_timestamp, ##rev_len, rev_letters_raw, rev_words_raw, rev_words_filter, rev_in_links, rev_out_links, rev_trans, ##rev_sections, rev_bolds, rev_italics, rev_bolditalics ##rev_parent_id, rev_is_redirect, rev_is_stub, rev_is_minor, rev_comment) ##Calculation of additional fancy statistics AND ##Detection and stripping of wiki tags and HTML tags ##We also store inlinks, outlinks and special links self.rev_dict['text']=re.sub(self.pathtml, '', self.rev_dict['text']) #filter HTML tags self.rev_dict['text']=re.sub(self.patunicode, 'X', self.rev_dict['text']) #convert unicode chars to X ## self.highwords=re.findall(self.pathighwords, self.rev_dict['text']) #detect highlighted words ## for i in range(len(self.highwords)): ## self.highwords[i]=re.sub(self.pathighlight,'', self.highwords[i]) #Filter bold/italics tags self.rev_dict['text']=re.sub(self.pathighlight, '', self.rev_dict['text']) #filter highlight tags ## self.special=re.findall(self.patspecial, self.rev_dict['text']) #detect special links 
self.trans=re.findall(self.pattrans, self.rev_dict['text']) #detect translation links self.rev_dict['text']=re.sub(self.patspecial, '', self.rev_dict['text']) #filter out special links (after detecting trans) self.inlinks=re.findall(self.patinlink, self.rev_dict['text']) #detect inlinks self.outlinks=re.findall(self.patoutlink, self.rev_dict['text']) #detect outlinks self.sections=re.findall(self.patsection, self.rev_dict['text']) #detect sections self.rev_dict['text']=re.sub(self.patitemize, '', self.rev_dict['text']) #filter out itemize bullets and line branches # Build current row for revinsert newrevinsert="("+self.rev_dict['id']+","+self.page_dict['id']+","+\ self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\ '"'+","+'"'+self.rev_dict['timestamp']+\ '"'+","+str(2*len(self.rev_dict['text']))+\ ","+str(len(self.rev_dict['text']))+\ ","+str(len(self.rev_dict['text'].split())) ## ","+str(len(self.highwords))+","+str(len(self.special))+\ newrevinsert+=","+str(len(self.inlinks))+","+str(len(self.outlinks))+","+str(len(self.trans))+","+str(len(self.sections))+\ ","+self.prior_rev_id+","+self.isRedirect+","+self.isStub+","+self.isMinor if self.rev_dict.has_key('comment'): newrevinsert+=","+'"'+self.rev_dict['comment'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+'"' else: newrevinsert+=",''" newrevinsert+=")" ############################################# ## CONSTRUCT DICTIONARIES WITH SPECIAL ITEMS ############################################# ## for item in self.highwords: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.highwords_dict.get(item) ## if (stumble==None): ## self.highwords_dict[item]=self.highwords_id ## self.highwords_rev_insert.append((self.rev_dict['id'],self.highwords_id)) ## self.highwords_id+=1 ## else: ## self.highwords_rev_insert.append((self.rev_dict['id'],stumble)) ## for item in self.special: ## 
item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.special_dict.get(item) ## if (stumble==None): ## self.special_dict[item]=self.special_id ## self.special_rev_insert.append((self.rev_dict['id'], self.special_id)) ## self.special_id+=1 ## else: ## self.special_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.inlinks: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.inlinks_dict.get(item) ## if (stumble==None): ## self.inlinks_dict[item]=self.inlinks_id ## self.inlinks_rev_insert.append((self.rev_dict['id'], self.inlinks_id)) ## self.inlinks_id+=1 ## else: ## self.inlinks_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.outlinks: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.outlinks_dict.get(item) ## if (stumble==None): ## self.outlinks_dict[item]=self.outlinks_id ## self.outlinks_rev_insert.append((self.rev_dict['id'], self.outlinks_id)) ## self.outlinks_id+=1 ## else: ## self.outlinks_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.trans: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.trans_dict.get(item) ## if (stumble==None): ## self.trans_dict[item]=self.trans_id ## self.trans_rev_insert.append((self.rev_dict['id'], self.trans_id)) ## self.trans_id+=1 ## else: ## self.trans_rev_insert.append((self.rev_dict['id'], stumble)) ############################################## ## LOOK-AHEAD ALGORITHM ############################################## if self.revinsertrows==0: #Always allow at least one row in extended inserts self.revinsert="INSERT INTO revision VALUES"+newrevinsert self.revinsertrows+=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2 elif (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024) and\ ((self.revinsertrows+1)<=self.options.imaxrows): #Append new row to 
self.revinsert self.revinsert+=","+newrevinsert self.revinsertrows+=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2 else: #We must finish and write currrent insert and begin a new one if self.options.fileout: self.revinsert+=";\n" # Write output to SQL file self.revfile = codecs.open(self.options.revfile,'a','utf_8') self.revfile.write(revinsert) self.revfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.revinsert+=";" print self.revinsert.encode('utf_8') print self.revinsert.encode('utf_8') elif self.options.monitor: chances=0 while chances<5: try: dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf_8')) except (Exception), e: self.printfile = codecs.open("error_"+self.options.database,'a','utf_8') self.printfile.write(str(e)+"\n") self.printfile.write(self.revinsert[0:30]+"\n**********************************") self.printfile.close() chances+=1 else: break self.revinsert="INSERT INTO revision VALUES"+newrevinsert self.revinsertrows=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize=len(self.revinsert)*2 ################################################## ##Store this rev_id to recall it when processing the following revision, if it exists self.prior_rev_id=self.rev_dict['id'] ##Store this rev_len to recall it for the current page_len, in case this is the last revision for that page self.last_page_len=2*len(self.rev_dict['text']) self.rev_dict.clear() self.stack.pop() self.isMinor='0' self.inlinks=[]; self.outlinks=[]; self.trans=[]; self.sections=[] self.highwords=[]; self.special=[] self.rev_num+=1 if self.options.verbose and self.options.log is None: # Display status report if self.rev_num % 1000 == 0: self.timeCheck=datetime.datetime.now() self.timeDelta=self.timeCheck-self.start if self.timeDelta.seconds==0: print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)"\ % 
(self.page_num, 1e6*float(self.page_num)/self.timeDelta.microseconds,\ self.rev_num, 1e6*float(self.rev_num)/self.timeDelta.microseconds) self.printfile = codecs.open(self.fileErrPath,'a','utf_8') self.printfile.write("page "+str(self.page_num)+" ("+\ str( 1e6*float(self.page_num)/self.timeDelta.microseconds)+" pags. per sec.), revision "+\ str(self.rev_num)+ " ("+str(1e6*float(self.rev_num)/self.timeDelta.microseconds)+" revs. per sec.)\n") self.printfile.close() else: print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)"\ % (self.page_num, float(self.page_num)/self.timeDelta.seconds,\ self.rev_num, float(self.rev_num)/self.timeDelta.seconds) self.printfile = codecs.open(self.fileErrPath,'a','utf_8') self.printfile.write("page "+str(self.page_num)+" ("+\ str( float(self.page_num)/self.timeDelta.seconds)+" pags./sec.), revision "+\ str(self.rev_num)+ " ("+str(float(self.rev_num)/self.timeDelta.seconds)+" revs./sec.)\n") self.printfile.close() if self.options.verbose and self.options.log is not None: if self.rev_num%1000==0: self.timeCheck=datetime.datetime.now() self.timeDelta=self.timeCheck-self.start if self.timeDelta.seconds==0: self.printfile = codecs.open(self.options.log,'a','utf_8') self.printfile.write("page "+str(self.page_num)+" ("+\ str( 1e6*float(self.page_num)/self.timeDelta.microseconds)+" pags./sec.), revision "+\ str(self.rev_num)+ " ("+str(1e6*float(self.rev_num)/self.timeDelta.microseconds)+" revs./sec.)\n") self.printfile.close() else: self.printfile = codecs.open(self.options.log,'a','utf_8') self.printfile.write("page "+str(self.page_num)+" ("+\ str( float(self.page_num)/self.timeDelta.seconds)+" pags./sec.), revision "+\ str(self.rev_num)+ " ("+str(float(self.rev_num)/self.timeDelta.seconds)+" revs./sec.)\n") self.printfile.close() ################################################# ## END OF PAGE ################################################# elif name=='page': ################################################ 
################################################ #GENERATE AND COMMIT SPECIAL VALUES INSERTS FOR ALL REVISIONS OF THIS PAGE # CREATE INSERT STRINGS FROM values ##HIGHLIGHTED WORDS ## self.high_insert_st='INSERT INTO highlight VALUES' ## for item in self.highwords_dict.iteritems(): ## self.high_insert_st+="("+str(item[1])+',"'+item[0]+'")' ## self.high_insert_st=re.sub(self.patdumb, "),(", self.high_insert_st) #### self.debug(self.high_insert_st) ## self.high_rev_insert_st='INSERT INTO rev_highlight VALUES' ## for item in self.highwords_rev_insert: ## self.high_rev_insert_st+="("+str(item[0])+","+str(item[1])+")" ## self.high_rev_insert_st=re.sub(self.patdumb, "),(", self.high_rev_insert_st) #### self.debug(self.high_rev_insert_st) ## ##SPECIAL LINKS ## self.special_insert_st='INSERT INTO special VALUES' ## for item in self.special_dict.iteritems(): ## self.special_insert_st+="("+str(item[1])+',"'+item[0]+'")' ## self.special_insert_st=re.sub(self.patdumb,"),(", self.special_insert_st) #### self.debug(self.special_insert_st) ## self.special_rev_insert_st='INSERT INTO rev_special VALUES' ## for item in self.special_rev_insert: ## self.special_rev_insert_st+="("+str(item[0])+","+str(item[1])+")" ## self.special_rev_insert_st=re.sub(self.patdumb,"),(",self.special_rev_insert_st) ## self.debug(self.special_rev_insert_st) ##INLINKS ## self.inlinks_insert_st='INSERT INTO inlink VALUES' ## for item in self.inlinks_dict.iteritems(): ## self.inlinks_insert_st+="("+str(item[1])+',"'+item[0]+'")' ## self.inlinks_insert_st=re.sub(self.patdumb,"),(", self.inlinks_insert_st) #### self.debug(self.inlinks_insert_st) ## self.inlinks_rev_insert_st='INSERT INTO rev_inlink VALUES' ## for item in self.inlinks_rev_insert: ## self.inlinks_rev_insert_st+="("+str(item[0])+","+str(item[1])+")" ## self.inlinks_rev_insert_st=re.sub(self.patdumb,"),(", self.inlinks_rev_insert_st) #### self.debug(self.inlinks_rev_insert_st) ## ##OUTLINKS ## self.outlinks_insert_st='INSERT INTO outlink VALUES' ## 
for item in self.outlinks_dict.iteritems(): ## self.outlinks_insert_st+="("+str(item[1])+',"'+item[0]+'")' ## self.outlinks_insert_st=re.sub(self.patdumb,"),(", self.outlinks_insert_st) #### self.debug(self.outlinks_insert_st) ## self.outlinks_rev_insert_st='INSERT INTO rev_outlink VALUES' ## for item in self.outlinks_rev_insert: ## self.outlinks_rev_insert_st+="("+str(item[0])+","+str(item[1])+")" ## self.outlinks_rev_insert_st=re.sub(self.patdumb,"),(",self.outlinks_rev_insert_st) #### self.debug(self.outlinks_rev_insert_st) ## ##TRANSLATION LINKS ## self.trans_insert_st='INSERT INTO trans VALUES' ## for item in self.trans_dict.iteritems(): ## self.trans_insert_st+="("+str(item[1])+',"'+item[0]+'")' ## self.trans_insert_st=re.sub(self.patdumb,"),(",self.trans_insert_st) #### self.debug(self.trans_insert_st) ## self.trans_rev_insert_st='INSERT INTO rev_trans VALUES' ## for item in self.trans_rev_insert: ## self.trans_rev_insert_st+="("+str(item[0])+","+str(item[1])+")" ## self.trans_rev_insert_st=re.sub(self.patdumb,"),(", self.trans_rev_insert_st) ## self.debug(self.trans_rev_insert_st) #COMMIT NEAT INSERTS if self.options.fileout: self.high_insert_st="\n;";self.high_rev_insert_st="\n;"; self.special_insert_st="\n;" self.special_rev_insert_st="\n;"; self.inlinks_insert_st="\n;"; self.inlinks_rev_insert_st="\n;" self.outlinks_insert_st="\n;"; self.outlinks_rev_insert_st="\n;"; self.trans_insert_st="\n;" self.trans_rev_insert_st="\n;" # Write output to SQL file # TODO: get a filename for special inserts self.neatfile = codecs.open('neat.sql','a','utf_8') if len(self.highwords_dict)>0: self.neatfile.write(self.high_insert_st); self.neatfile.write(self.high_rev_insert_st); if len(self.special_dict)>0: self.neatfile.write(self.special_insert_st) self.neatfile.write(self.special_rev_insert_st); if len(self.inlinks_dict)>0: self.neatfile.write(self.inlinks_insert_st); self.neatfile.write(self.inlinks_rev_insert_st) if len(self.outlinks_dict)>0: 
self.neatfile.write(self.outlinks_insert_st); self.neatfile.write(self.outlinks_rev_insert_st) if len(self.trans_dict)>0: self.neatfile.write(self.trans_insert_st); self.neatfile.write(self.trans_rev_insert_st) self.neatfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.high_insert_st=";";self.high_rev_insert_st=";"; self.special_insert_st=";" self.special_rev_insert_st=";"; self.inlinks_insert_st=";"; self.inlinks_rev_insert_st=";" self.outlinks_insert_st=";"; self.outlinks_rev_insert_st=";"; self.trans_insert_st=";" self.trans_rev_insert_st=";" if len(self.highwords_dict)>0: print self.high_insert_st.encode('utf_8'); print self.high_rev_insert_st.encode('utf_8') if len(self.special_dict)>0: print self.special_insert_st.encode('utf_8'); print self.special_rev_insert_st.encode('utf_8') if len(self.inlinks_dict)>0: print self.inlinks_insert_st.encode('utf_8'); print self.inlinks_rev_insert_st.enconde('utf_8') if len(self.outlinks_dict)>0: print self.outlinks_insert_st.encode('utf_8'); print self.outlinks_rev_insert_st.encode('utf_8') if len(self.trans_dict)>0: print self.trans_insert_st.encode('utf_8'); print self.trans_rev_insert_st.encode('utf_8') ## elif self.options.monitor: ## while 1: ## try: ## print str(len(self.highwords_dict))+" "+\ ## str(len(self.special_dict))+ " "+str(len(self.inlinks_dict))+ " "+\ ## str(len(self.outlinks_dict))+ " "+str(len(self.trans_dict))+"\n" ## if len(self.highwords_dict)>0: ## dbaccess.raw_query_SQL(self.acceso[1], self.high_insert_st.encode('utf_8')) ## dbaccess.raw_query_SQL(self.acceso[1], self.high_rev_insert_st.encode('utf_8')) ## if len(self.special_dict)>0: ## dbaccess.raw_query_SQL(self.acceso[1], self.special_insert_st.encode('utf_8')) ## dbaccess.raw_query_SQL(self.acceso[1], self.special_rev_insert_st.encode('utf_8')) ## if len(self.inlinks_dict)>0: ## dbaccess.raw_query_SQL(self.acceso[1], self.inlinks_insert_st.encode('utf_8')) ## 
dbaccess.raw_query_SQL(self.acceso[1], self.inlinks_rev_insert_st.encode('utf_8')) ## if len(self.outlinks_dict)>0: ## dbaccess.raw_query_SQL(self.acceso[1], self.outlinks_insert_st.encode('utf_8')) ## dbaccess.raw_query_SQL(self.acceso[1], self.outlinks_rev_insert_st.encode('utf_8')) ## if len(self.trans_dict)>0: ## dbaccess.raw_query_SQL(self.acceso[1], self.trans_insert_st.encode('utf_8')) ## dbaccess.raw_query_SQL(self.acceso[1], self.trans_rev_insert_st.encode('utf_8')) ## except (Exception), e: ## print e ## else: ## break #Reset status vars self.high_insert_st=""; self.high_rev_insert_st=""; self.special_insert_st="" self.special_rev_insert_st=""; self.inlinks_insert_st=""; self.inlinks_rev_insert_st="" self.outlinks_insert_st=""; self.outlinks_rev_insert_st=""; self.trans_insert_st="" self.trans_rev_insert_st="" self.highwords_dict={}; self.special_dict={}; self.inlinks_dict={}; self.outlinks_dict={}; self.trans_dict={} self.highwords_rev_insert=[]; self.special_rev_insert=[]; self.inlinks_rev_insert=[] self.outlinks_rev_insert=[]; self.trans_rev_insert=[]; ################################################ ##Recovering namespace for this page if self.nspace_dict.has_key(self.page_dict['title'].split(':')[0]): self.page_dict['namespace']=self.nspace_dict[self.page_dict['title'].split(':')[0]] else: self.page_dict['namespace']='0' ######################################### #CONSTRUCTION OF EXTENDED INSERT FOR PAGES (RESEARCH VERSION) ######################################### ##Values order for page (page_id, page_namespace, page_title, page_latest, page_len, page_is_redirect, ##page_is_stub, page_random, page_is_new, page_restrictions) newpageinsert="("+self.page_dict['id']+","+\ self.page_dict['namespace']+","+'"'+\ self.page_dict['title'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\ '"'+","+self.prior_rev_id+","+str(self.last_page_len)+\ ","+self.isRedirect+","+self.isStub+","+str(random.random()) if self.rev_count>1: newpageinsert+=",1" 
else: newpageinsert+=",0" if self.page_dict.has_key('restrictions'): newpageinsert+=","+'"'+self.page_dict['restrictions']+'"' else: newpageinsert+=",''" newpageinsert+=")" if self.pageinsertrows==0: self.pageinsert="INSERT INTO page VALUES"+newpageinsert self.pageinsertrows+=1 self.pageinsertsize=len(self.pageinsert)*2 elif (self.pageinsertsize+(2*len(newpageinsert))<=self.options.imaxsize*1024) and\ (self.pageinsertrows+1<=self.options.imaxrows): #Append current row to extended insert self.pageinsert+=","+newpageinsert self.pageinsertrows+=1 self.pageinsertsize=len(self.pageinsert)*2 else: #We must write this extended insert and begin a new one if self.options.fileout: #Write extended insert to file self.pageinsert+=";\n" self.pagefile = codecs.open(self.options.pagefile,'a','utf_8') self.pagefile.write(self.pageinsert) self.pagefile.close() elif self.options.streamout: #Write extended insert to sys.stdout (stream to MySQL) self.pageinsert+=";" print self.pageinsert.encode('utf_8') elif self.options.monitor: chances=0 while chances<5: try: dbaccess.raw_query_SQL(self.acceso[1], self.pageinsert.encode('utf_8')) except (Exception), e: self.printfile = codecs.open("error_"+self.options.database,'a','utf_8') self.printfile.write(str(e)+"\n") self.printfile.write(self.pageinsert[0:30]+"\n**********************************") self.printfile.close() chances+=1 else: break self.pageinsert="INSERT INTO page VALUES"+newpageinsert self.pageinsertrows=1 self.pageinsertsize=len(self.pageinsert)*2
# We must write the last pageinsert before finishing this dump if self.options.fileout: # Write output to SQL file self.pageinsert += ";\n" self.pagefile = codecs.open(self.options.pagefile, "a", "utf_8") self.pagefile.write(self.pageinsert) self.pagefile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.pageinsert += ";" print self.pageinsert.encode("utf_8") elif self.options.monitor: chances = 0 while chances < 5: try: dbaccess.raw_query_SQL(self.acceso[1], self.pageinsert.encode("utf_8")) except (Exception), e: self.printfile = codecs.open("error_" + self.options.database, "a", "utf_8") self.printfile.write(str(e) + "\n") self.printfile.write(self.pageinsert[0:30] + "\n**********************************") self.printfile.close() chances += 1 else: break # Reset status vars self.pageinsertrows = 0 self.pageinsertsize = 0 # INSERT NAMESPACES CODES AND TITLES IN SPECIAL TABLE nspaces = self.nspace_dict.iteritems() insertns = "INSERT INTO namespaces VALUES" first_loop = True
def endElement(self, name): ## Defining tasks to manage contents from the last readed tag ## Catching the namespace of this page if name == "namespace": self.nspace_dict[self.current_text] = self.codens elif name == "id": if self.stack[-1] == "contributor": ##Detecting contributor's attributes inside a revision self.rev_dict["rev_user"] = self.current_text elif self.stack[-1] == "revision": self.rev_dict[name] = self.current_text elif self.stack[-1] == "page": self.page_dict[name] = self.current_text else: self.f = open(self.fileErrPath, "w") if len(self.stack) > 0: self.f.write("Unsupported parent tag for '" + name + "': " + self.stack[-1]) self.f.close() elif name == "ip": self.rev_dict["rev_user"] = "******" self.rev_dict["username"] = self.current_text elif name == "timestamp": ##Adequate formatting of timestamps self.rev_dict["timestamp"] = self.current_text.replace("Z", "").replace("T", " ") elif name == "contributor": ##Pop contributor tag from the stack self.stack.pop() ##################################################### ## END OF REVISION ##################################################### elif name == "revision": self.rev_count += 1 ##Store whether this is a redirect or stub page or not ##TODO: Substitute the find command with a regexp if len(self.rev_dict["text"]) > 0: if string.upper(self.rev_dict["text"][0:9]) == "#REDIRECT": self.isRedirect = "1" else: self.isRedirect = "0" ## Takes from the first argument the threshold for stub's length if 2 * len(self.rev_dict["text"]) <= self.options.stubth: self.isStub = "1" else: self.isStub = "0" ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (RESEARCH VERSION)###### ##Values order: (rev_id, rev_page, rev_user, rev_user_text, rev_timestamp, ##rev_len, rev_letters_raw, rev_words_raw, rev_words_filter, rev_in_links, rev_out_links, rev_trans, ##rev_sections, rev_bolds, rev_italics, rev_bolditalics ##rev_parent_id, rev_is_redirect, rev_is_stub, rev_is_minor, rev_comment) ##Calculation of additional fancy 
statistics AND ##Detection and stripping of wiki tags and HTML tags ##We also store inlinks, outlinks and special links self.rev_dict["text"] = re.sub(self.pathtml, "", self.rev_dict["text"]) # filter HTML tags self.rev_dict["text"] = re.sub(self.patunicode, "X", self.rev_dict["text"]) # convert unicode chars to X ## self.highwords=re.findall(self.pathighwords, self.rev_dict['text']) #detect highlighted words ## for i in range(len(self.highwords)): ## self.highwords[i]=re.sub(self.pathighlight,'', self.highwords[i]) #Filter bold/italics tags self.rev_dict["text"] = re.sub(self.pathighlight, "", self.rev_dict["text"]) # filter highlight tags ## self.special=re.findall(self.patspecial, self.rev_dict['text']) #detect special links self.trans = re.findall(self.pattrans, self.rev_dict["text"]) # detect translation links self.rev_dict["text"] = re.sub( self.patspecial, "", self.rev_dict["text"] ) # filter out special links (after detecting trans) self.inlinks = re.findall(self.patinlink, self.rev_dict["text"]) # detect inlinks self.outlinks = re.findall(self.patoutlink, self.rev_dict["text"]) # detect outlinks self.sections = re.findall(self.patsection, self.rev_dict["text"]) # detect sections self.rev_dict["text"] = re.sub( self.patitemize, "", self.rev_dict["text"] ) # filter out itemize bullets and line branches # Build current row for revinsert newrevinsert = ( "(" + self.rev_dict["id"] + "," + self.page_dict["id"] + "," + self.rev_dict["rev_user"] + "," + '"' + self.rev_dict["username"].replace("\\", "\\\\").replace("'", "\\'").replace('"', '\\"') + '"' + "," + '"' + self.rev_dict["timestamp"] + '"' + "," + str(2 * len(self.rev_dict["text"])) + "," + str(len(self.rev_dict["text"])) + "," + str(len(self.rev_dict["text"].split())) ) ## ","+str(len(self.highwords))+","+str(len(self.special))+\ newrevinsert += ( "," + str(len(self.inlinks)) + "," + str(len(self.outlinks)) + "," + str(len(self.trans)) + "," + str(len(self.sections)) + "," + self.prior_rev_id + "," + 
self.isRedirect + "," + self.isStub + "," + self.isMinor ) if self.rev_dict.has_key("comment"): newrevinsert += ( "," + '"' + self.rev_dict["comment"].replace("\\", "\\\\").replace("'", "\\'").replace('"', '\\"') + '"' ) else: newrevinsert += ",''" newrevinsert += ")" ############################################# ## CONSTRUCT DICTIONARIES WITH SPECIAL ITEMS ############################################# ## for item in self.highwords: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.highwords_dict.get(item) ## if (stumble==None): ## self.highwords_dict[item]=self.highwords_id ## self.highwords_rev_insert.append((self.rev_dict['id'],self.highwords_id)) ## self.highwords_id+=1 ## else: ## self.highwords_rev_insert.append((self.rev_dict['id'],stumble)) ## for item in self.special: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.special_dict.get(item) ## if (stumble==None): ## self.special_dict[item]=self.special_id ## self.special_rev_insert.append((self.rev_dict['id'], self.special_id)) ## self.special_id+=1 ## else: ## self.special_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.inlinks: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.inlinks_dict.get(item) ## if (stumble==None): ## self.inlinks_dict[item]=self.inlinks_id ## self.inlinks_rev_insert.append((self.rev_dict['id'], self.inlinks_id)) ## self.inlinks_id+=1 ## else: ## self.inlinks_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.outlinks: ## item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.outlinks_dict.get(item) ## if (stumble==None): ## self.outlinks_dict[item]=self.outlinks_id ## self.outlinks_rev_insert.append((self.rev_dict['id'], self.outlinks_id)) ## self.outlinks_id+=1 ## else: ## self.outlinks_rev_insert.append((self.rev_dict['id'], stumble)) ## for item in self.trans: ## 
item=item.replace("\\","\\\\").replace("'","\\'").replace('"', '\\"') ## stumble=self.trans_dict.get(item) ## if (stumble==None): ## self.trans_dict[item]=self.trans_id ## self.trans_rev_insert.append((self.rev_dict['id'], self.trans_id)) ## self.trans_id+=1 ## else: ## self.trans_rev_insert.append((self.rev_dict['id'], stumble)) ############################################## ## LOOK-AHEAD ALGORITHM ############################################## if self.revinsertrows == 0: # Always allow at least one row in extended inserts self.revinsert = "INSERT INTO revision VALUES" + newrevinsert self.revinsertrows += 1 # Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize = len(self.revinsert) * 2 elif (self.revinsertsize + (2 * len(newrevinsert)) <= self.options.imaxsize * 1024) and ( (self.revinsertrows + 1) <= self.options.imaxrows ): # Append new row to self.revinsert self.revinsert += "," + newrevinsert self.revinsertrows += 1 # Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize = len(self.revinsert) * 2 else: # We must finish and write currrent insert and begin a new one if self.options.fileout: self.revinsert += ";\n" # Write output to SQL file self.revfile = codecs.open(self.options.revfile, "a", "utf_8") self.revfile.write(revinsert) self.revfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.revinsert += ";" print self.revinsert.encode("utf_8") print self.revinsert.encode("utf_8") elif self.options.monitor: chances = 0 while chances < 5: try: dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode("utf_8")) except (Exception), e: self.printfile = codecs.open("error_" + self.options.database, "a", "utf_8") self.printfile.write(str(e) + "\n") self.printfile.write(self.revinsert[0:30] + "\n**********************************") self.printfile.close() chances += 1 else: break self.revinsert = "INSERT INTO revision VALUES" + newrevinsert 
self.revinsertrows = 1 # Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize = len(self.revinsert) * 2 ################################################## ##Store this rev_id to recall it when processing the following revision, if it exists self.prior_rev_id = self.rev_dict["id"] ##Store this rev_len to recall it for the current page_len, in case this is the last revision for that page self.last_page_len = 2 * len(self.rev_dict["text"]) self.rev_dict.clear() self.stack.pop() self.isMinor = "0" self.inlinks = [] self.outlinks = [] self.trans = [] self.sections = [] self.highwords = [] self.special = [] self.rev_num += 1 if self.options.verbose and self.options.log is None: # Display status report if self.rev_num % 1000 == 0: self.timeCheck = datetime.datetime.now() self.timeDelta = self.timeCheck - self.start if self.timeDelta.seconds == 0: print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)" % ( self.page_num, 1e6 * float(self.page_num) / self.timeDelta.microseconds, self.rev_num, 1e6 * float(self.rev_num) / self.timeDelta.microseconds, ) self.printfile = codecs.open(self.fileErrPath, "a", "utf_8") self.printfile.write( "page " + str(self.page_num) + " (" + str(1e6 * float(self.page_num) / self.timeDelta.microseconds) + " pags. per sec.), revision " + str(self.rev_num) + " (" + str(1e6 * float(self.rev_num) / self.timeDelta.microseconds) + " revs. 
per sec.)\n" ) self.printfile.close() else: print >> sys.stderr, "page %d (%f pags./sec.), revision %d (%f revs./sec.)" % ( self.page_num, float(self.page_num) / self.timeDelta.seconds, self.rev_num, float(self.rev_num) / self.timeDelta.seconds, ) self.printfile = codecs.open(self.fileErrPath, "a", "utf_8") self.printfile.write( "page " + str(self.page_num) + " (" + str(float(self.page_num) / self.timeDelta.seconds) + " pags./sec.), revision " + str(self.rev_num) + " (" + str(float(self.rev_num) / self.timeDelta.seconds) + " revs./sec.)\n" ) self.printfile.close() if self.options.verbose and self.options.log is not None: if self.rev_num % 1000 == 0: self.timeCheck = datetime.datetime.now() self.timeDelta = self.timeCheck - self.start if self.timeDelta.seconds == 0: self.printfile = codecs.open(self.options.log, "a", "utf_8") self.printfile.write( "page " + str(self.page_num) + " (" + str(1e6 * float(self.page_num) / self.timeDelta.microseconds) + " pags./sec.), revision " + str(self.rev_num) + " (" + str(1e6 * float(self.rev_num) / self.timeDelta.microseconds) + " revs./sec.)\n" ) self.printfile.close() else: self.printfile = codecs.open(self.options.log, "a", "utf_8") self.printfile.write( "page " + str(self.page_num) + " (" + str(float(self.page_num) / self.timeDelta.seconds) + " pags./sec.), revision " + str(self.rev_num) + " (" + str(float(self.rev_num) / self.timeDelta.seconds) + " revs./sec.)\n" ) self.printfile.close()
def overall(self):
    """ Preprocessing tables for evolution of page length over time.

    For each language in self.languages, connects to the research DB
    ("wx_<lang>wiki_research") and creates:
      * view page_main_nored: main-namespace pages that are not redirects
      * view rev_main_nored: the revisions belonging to those pages
      * one table max_rev_<year> per year, holding for every page the id of
        its latest revision *before* that year (a per-year snapshot of the
        wiki), with a primary key on max_id and an index on rev_page.
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        # Main-namespace, non-redirect pages only
        dbaccess.raw_query_SQL(self.access[1], "create or replace view page_main_nored as "+\
        "(select page_id from page where page_namespace=0 and page_is_redirect=0)")
        # Revisions restricted to the pages selected above
        dbaccess.raw_query_SQL(self.access[1], "create or replace view rev_main_nored as ("+\
        "select rev_id, rev_user, rev_page, rev_timestamp, rev_len from revision where rev_page in "+\
        "(select page_id from page_main_nored))")
        # First year with any revision in this wiki; snapshots start the year after
        self.minyear=dbaccess.raw_query_SQL(self.access[1],"select min(year(rev_timestamp)) from revision")
        # NOTE(review): upper bound 2009 is hard-coded -- presumably the dump
        # date at the time of writing; confirm before reusing on newer dumps.
        self.years=range(int(self.minyear[0][0])+1, 2009)
        for self.year in self.years:
            dbaccess.raw_query_SQL(self.access[1],"drop table if exists max_rev_"+str(self.year))
            # Latest revision per page strictly before 1 Jan of self.year
            dbaccess.raw_query_SQL(self.access[1],"create table max_rev_"+str(self.year)+\
            " as (select max(rev_id) as max_id, rev_page from rev_main_nored "+\
            "where year(rev_timestamp)<"+str(self.year)+" group by rev_page)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_"+str(self.year)+" add primary key (max_id)")
            dbaccess.raw_query_SQL(self.access[1], "alter table max_rev_"+str(self.year)+" add index (rev_page)")
        dbaccess.close_Connection(self.access[0])
################################################ #We must write the las revinsert before finishing this page if self.options.fileout: self.revinsert += ";\n" # Write output to SQL file self.revfile = codecs.open(self.options.revfile, 'a', 'utf_8') self.revfile.write(self.revinsert) self.revfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.revinsert += ";" print self.revinsert.encode('utf_8') elif self.options.monitor: while 1: try: dbaccess.raw_query_SQL(self.acceso[1], self.revinsert.encode('utf_8')) except (Exception), e: print e else: break #Reset status vars self.revinsertrows = 0 self.revinsertsize = 0 ################################################ ##Same for Insert into text table if self.options.fileout: self.textinsert += ";\n" # Write output to SQL file self.textfile = codecs.open(self.options.textfile, 'a', 'utf_8') self.textfile.write(self.textinsert)
def endElement(self, name): ## Defining tasks to manage contents from the last readed tag ## Catching the namespace of this page if name == 'namespace': self.nspace_dict[self.current_text] = self.codens elif name == 'id': if self.stack[-1] == 'contributor': ##Detecting contributor's attributes inside a revision self.rev_dict['rev_user'] = self.current_text elif self.stack[-1] == 'revision': self.rev_dict[name] = self.current_text elif self.stack[-1] == 'page': self.page_dict[name] = self.current_text else: self.f = open(self.fileErrPath, 'w') if len(self.stack) > 0: self.f.write("Unsupported parent tag for '" + name + "': " + self.stack[-1]) self.f.close() elif name == 'ip': self.rev_dict['rev_user'] = '******' self.rev_dict['username'] = self.current_text elif name == 'timestamp': ##Adequate formatting of timestamps self.rev_dict['timestamp'] = self.current_text.replace( 'Z', '').replace('T', ' ') elif name == 'contributor': ##Pop contributor tag from the stack self.stack.pop() elif name == 'revision': self.rev_count += 1 ##Store whether this is a redirect or stub page or not if len(self.rev_dict['text']) > 0: if string.upper(self.rev_dict['text'][0:9]) == '#REDIRECT': self.isRedirect = '1' else: self.isRedirect = '0' ## Takes from the first argument the threshold for stub's length if str(2 * len(self.rev_dict['text'])) <= self.options.stubth: self.isStub = '1' else: self.isStub = '0' ####CONSTRUCTION OF EXTENDED INSERTS FOR REVISIONS (STANDARD VERSION)###### ##Values order: (rev_id, rev_page, [[rev_text_id=rev_id]], rev_comment, ##rev_user, rev_user_text, rev_timestamp, rev_is_minor) # Build current row for revinsert newrevinsert = "(" + self.rev_dict['id'] + "," + self.page_dict[ 'id'] + "," + self.rev_dict['id'] if self.rev_dict.has_key('comment'): newrevinsert += "," + '"' + self.rev_dict['comment'].replace( "\\", "\\\\").replace("'", "\\'").replace('"', '\\"') + '"' else: newrevinsert += ",''" 
newrevinsert+=","+self.rev_dict['rev_user']+","+'"'+self.rev_dict['username'].\ replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\ '"'+","+'"'+self.rev_dict['timestamp']+\ '"'+","+self.isMinor+")" if self.revinsertrows == 0: #Always allow at least one row in extended inserts self.revinsert = "INSERT INTO revision VALUES" + newrevinsert self.revinsertrows += 1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize = len(self.revinsert) * 2 elif (self.revinsertsize+(2*len(newrevinsert))<=self.options.imaxsize*1024) and\ ((self.revinsertrows+1)<=self.options.imaxrows): #Append new row to self.revinsert self.revinsert += "," + newrevinsert self.revinsertrows += 1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize = len(self.revinsert) * 2 else: #We must finish and write currrent insert and begin a new one if self.options.fileout: self.revinsert += ";\n" # Write output to SQL file self.revfile = codecs.open(self.options.revfile, 'a', 'utf_8') self.revfile.write(revinsert) self.revfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.revinsert += ";" print self.revinsert.encode('utf_8') elif self.options.monitor: while 1: try: dbaccess.raw_query_SQL( self.acceso[1], self.revinsert.encode('utf_8')) except (Exception), e: print e else: break self.revinsert = "INSERT INTO revision VALUES" + newrevinsert self.revinsertrows = 1 #Conservative approach: assuming 2 bytes per UTF-8 character self.revinsertsize = len(self.revinsert) * 2 ################################################## ##CONSTRUCTION OF EXTENDED INSERTS FOR TABLE TEXT ##Template for each row: ## (old_id, old_text, old_flags) newtextinsert = "(" + self.rev_dict['id'] + ',' + '"' if self.options.inject != None: newtextinsert += self.options.inject newtextinsert+=self.rev_dict['text'].replace("\\","\\\\").replace("'","\\'").replace('"', '\\"')+\ '",'+'"utf8")' if self.textinsertrows == 0: 
#Always allow at least one row in extended inserts self.textinsert = "INSERT INTO text VALUES" + newtextinsert self.textinsertrows += 1 #Conservative approach: assuming 2 bytes per UTF-8 character self.textinsertsize = len(self.textinsert) * 2 elif (self.textinsertsize+(2*len(newtextinsert))<=self.options.imaxsize*1024) and\ ((self.textinsertrows+1)<=self.options.imaxrows): #Append new row to self.revinsert self.textinsert += "," + newtextinsert self.textinsertrows += 1 #Conservative approach: assuming 2 bytes per UTF-8 character self.textinsertsize = len(self.textinsert) * 2 else: #We must finish and write currrent insert and begin a new one if self.options.fileout: self.textinsert += ";\n" # Write output to SQL file self.textfile = codecs.open(self.options.textfile, 'a', 'utf_8') self.textfile.write(textinsert) self.textfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.textinsert += ";" print self.textinsert.encode('utf_8') elif self.options.monitor: while 1: try: dbaccess.raw_query_SQL( self.acceso[1], self.textinsert.encode('utf_8')) except (Exception), e: print e else: break self.textinsert = "INSERT INTO text VALUES" + newtextinsert self.textinsertrows = 1 #Conservative approach: assuming 2 bytes per UTF-8 character self.textinsertsize = len(self.textinsert) * 2
def analyze(self):
    """Export per-user survival-analysis data to wkp_cox_prop_all.dat (CSV).

    For each language: builds helper tables of users who edited talk pages
    (namespace 1) and users who edited Featured Articles, joins them with
    time_range_authors into time_range_cox, then appends one CSV row per
    logged, non-bot user: Project,rev_user,min_ts,max_ts,in_talk,in_FAs.
    """
    #Initialize file header
    f=open("wkp_cox_prop_all.dat",'w')
    f.write("Project,rev_user,min_ts,max_ts,in_talk,in_FAs\n")
    f.close()
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        print "Starting language "+self.language+"\n"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        ##Create table of users in talk pages (namespace 1)
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_in_talk")
        dbaccess.raw_query_SQL(self.access[1],"create table users_in_talk as (select distinct(rev_user) from revision "+\
        "where rev_page in (select page_id from page where page_namespace=1))")
        dbaccess.raw_query_SQL(self.access[1],"alter table users_in_talk add primary key (rev_user)")
        ##Create table of users in FAs (requires pre-existing revision_FAs table)
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_in_FAs")
        dbaccess.raw_query_SQL(self.access[1],"create table users_in_FAs as (select distinct(rev_user) from revision_FAs)")
        dbaccess.raw_query_SQL(self.access[1],"alter table users_in_FAs add primary key (rev_user)")
        ##MIX previous info with time_range_authors --> save result in new table time_range_cox
        ## in_talk / in_FAs become 0/1 indicator columns
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists time_range_cox")
        dbaccess.raw_query_SQL(self.access[1],"create table time_range_cox as (select rev_user, "+\
        "date(min_ts) as min_ts, date(max_ts) as max_ts, "+\
        "case when rev_user in (select rev_user from users_in_talk) then 1 else 0 end as in_talk, "+\
        "case when rev_user in (select rev_user from users_in_FAs) then 1 else 0 end as in_FAs "+\
        "from time_range_authors)")
        ##IN SYSTEM
        print "Interm. tables created proceeding to write out data..."+self.language+"\n"
        # Exclude anonymous edits (rev_user=0) and registered bots
        results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, min_ts, max_ts, in_talk, in_FAs "+\
        "from time_range_cox "+\
        " where rev_user!=0 and rev_user not in (select ug_user from user_groups where ug_group='bot')")
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
        #Append one CSV row per user; timestamps are double-quoted
        f=open("wkp_cox_prop_all.dat",'a')
        for result in results:
            f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\","+\
            str(int(result[3]))+","+str(int(result[4]))+"\n")
        f.close()
        print "Finished all tasks for "+self.language+"\n"
def decompress (self): """ Decompress the DB dumps into MySQL """ ##TODO: Ad-hoc acuerdate de quitar esto POR DIOSSS ##self.filename="mtwiki-latest-pages-meta-history.xml.7z" if self.dumptype=="research": program="dump_sax_research.py" elif self.dumptype=="standard": program="dump_sax.py" else: print "Error! Unexpected type of dump received" return -1 self.filename=self.filenameTemplate.safe_substitute(language=self.language,file=self.files[0]) #Then we call our parser "dump_sax_research.py" to load data into MySQL command_7z="7za e -so dumps/"+self.filename+" | "+"python "+program+\ " -u "+self.msqlu+" -p "+self.msqlp+" -d "+"wx_"+self.language+"_"+self.dumptype+\ " --log "+self.language+".log" success=os.system(command_7z) if success == 0: print "DB "+"wx_"+self.language+\ self.dumptype+" successfully decompressed...\n\n" else: print "Error! There was an error trying to decompress database --> "+\ "wx_"+self.language+self.dumptype return -1 #Loading into MySQL other interesting tables directly provided in SQL format #SQL code to generate the tables is embedded in the SQL file itself for index in range(1,len(self.files)): self.filename=self.filenameTemplate.safe_substitute(language=self.language, file=self.files[index]) command_gzip="gzip -d dumps/"+self.filename command_mysql="mysql -u "+self.msqlu+" -p"+self.msqlp+\ " wx_"+self.language+"_"+self.dumptype+\ " < dumps/"+self.filename.strip(".gz") command_comp="gzip dumps/"+self.filename.strip(".gz") print "Decompressing "+self.filename+"..." success=os.system(command_gzip) if success==0: print "Loading "+self.filename.strip(".gz")+" into MySQL database..." success=os.system(command_mysql) if success==0: print "Compressing again "+self.filename.strip(".gz")+"..." 
success=os.system(command_comp) if success!=0: print "Error compressing again "+self.filename.strip(".gz") return -1 else: print "Error loading "+self.filename.strip(".gz") return -1 else: print "Error decompressing "+self.filename return -1 print "Generating indexes for tables page and revision...\n" print "Depending on the dump size this may take a while...\n" acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu,\ self.msqlp, "wx_"+self.language+"_"+self.dumptype) #Generate adequate indexes and keys in tables page and revision print "Generating index for page_len...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE page ADD INDEX (page_len)") print "Modifying rev_timestamp to support DATETIME and creating index...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision MODIFY rev_timestamp DATETIME") dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX timestamp (rev_timestamp)") print "Generating index for rev_page and rev_timestamp...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX page_timestamp(rev_page, rev_timestamp)") print "Generating index for rev_user and rev_timestamp...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)") print "Generating index for rev_user_text and timestamp...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)") dbaccess.close_Connection(acceso[0]) print "Database ready for quantitative analysis...\n" print "Let's go on... Cross your fingers... ;-) \n\n\n" return success
print "An exception ocurred, the problem was the following:\n" print e print "*************\n\n" try: print "Generating index for rev_page and rev_timestamp...\n" dbaccess.raw_query_SQL( self.access[1], "ALTER TABLE revision ADD INDEX page_timestamp(rev_page, rev_timestamp)" ) except Exception, e: print "An exception ocurred, the problem was the following:\n" print e print "*************\n\n" try: print "Generating index for rev_user and rev_timestamp...\n" dbaccess.raw_query_SQL( self.access[1], "ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)" ) except Exception, e: print "An exception ocurred, the problem was the following:\n" print e print "*************\n\n" # try: # print "Generating index for rev_user_text and timestamp...\n" # dbaccess.raw_query_SQL(self.access[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)") # except Exception, e: # print "An exception ocurred, the problem was the following:\n" # print e # print "*************\n\n" print "Database" + "wx_" + self.language + "wiki_" + self.dumptype + " ready for quantitative analysis...\n" ##Close connection to DB server dbaccess.close_Connection(self.access[0])
def core_prepro(self):
    """
    Creates intermediate tables with info about core members.

    For every language, two notions of monthly "core" are computed:
      * by activity:  the top-10% most active logged authors of the month
      * by revisions: the smallest set of authors accumulating the top-10%
        of the month's total number of revisions

    Results are stored in DB tables core_limits_monthly, users_core_monthly,
    users_rev_core_monthly, users_core and users_rev_core.
    """
    for self.language in self.languages:
        self.dbname="wx_"+self.language+"wiki_research"
        self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname)
        #Obtain the list of years and months, with total num. of revisions and total num of logged users
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists core_limits_monthly")
        dbaccess.raw_query_SQL(self.access[1],"create table core_limits_monthly as "+\
        "(select year(rev_timestamp) as year, month(rev_timestamp) as month, "+\
        "count(distinct(rev_user)) num_users, count(*) num_revs from revision_logged group by year, month "+\
        "order by year, month)")
        print "Created table core_limits_monthly "+self.language+"\n"
        date_range=dbaccess.raw_query_SQL(self.access[1],"select * from core_limits_monthly "+\
        "order by year, month")
        #Core users: top-10% of total number of authors in that month
        #Core users with top-10% of total number of revisions in that month
        #Loop for each month
        # First month creates the monthly tables; subsequent months insert into them
        need_create=True
        #LOOP FOR EACH MONTH IN LANG
        for adate in date_range:
            # adate row layout: (year, month, num_users, num_revs)
            print "Processing year "+str(adate[0])+" month "+str(adate[1])+"\n"
            total_users=adate[2] #Total number of authors in that month
            total_revs=adate[3] #Total number of revisions in that month
            # To take the core of top-10% most active authors in that month
            limit_auth=int(round(total_users*0.1))+1
            # To take the core of authors responsible for top-10% of tot num.revs in that month
            limit_revs=int(round(total_revs*0.1))
            count_users=0
            count_revs=0
            # NOTE(review): insert_users/insert_revs are assigned but never read
            # anywhere below — they look like leftovers from an earlier version.
            insert_users=True
            insert_revs=True
            #Get the list of active logged users for that month (descendent order!)
            ##IMPORTANT NOTE: FIRST APPLY SUBQUERY TO FILTER ALL REVISIONS IN THIS MONTH
            ##THEN APPLY THE GROUP AND ORDER CLAUSES ON THAT SUBQUERY
            ##THIS WAY, WE SAVE **A LOT** OF TIME DURING THIS PREPROCESSING STAGE
            month_users=dbaccess.raw_query_SQL(self.access[1],"select rev_user, count(*) num_revs_month from "+\
            "(select rev_user, rev_timestamp from revision_logged where "+\
            "year(rev_timestamp)="+str(int(adate[0]))+" and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
            "order by num_revs_month desc" )
            #Calculate num. of authors accumulating top-10% of revs in that month
            # (month_users is sorted by activity, so we walk it from the top)
            for auser in month_users:
                count_revs=count_revs+int(auser[1])
                count_users=count_users+1
                if (count_revs>limit_revs):
                    break
            if (need_create):
                #TABLE: Monthly info for users in core (by activity)
                dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_core_monthly")
                dbaccess.raw_query_SQL(self.access[1],"create table users_core_monthly as (select "+\
                "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                "order by num_revs_month desc limit "+str(limit_auth)+")" )
                #TABLE: Monthly info for users in core (by revisions)
                dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_rev_core_monthly")
                dbaccess.raw_query_SQL(self.access[1],"create table users_rev_core_monthly as (select "+\
                "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                "order by num_revs_month desc limit "+str(count_users)+")" )
                print "Created tables monthly data for "+self.language+"\n"
                need_create=False
            else:
                #Insert info in table with monthly info for users in core (by activity)
                dbaccess.raw_query_SQL(self.access[1],"insert into users_core_monthly (select "+\
                "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                "order by num_revs_month desc limit "+str(limit_auth)+")" )
                #Insert info in table with monthly info for users in core (by revisions)
                dbaccess.raw_query_SQL(self.access[1],"insert into users_rev_core_monthly (select "+\
                "rev_user, min(rev_timestamp) lower_ts_month, max(rev_timestamp) upper_ts_month, "+\
                "count(*) as num_revs_month from (select rev_user, rev_timestamp from "+\
                "revision_logged where year(rev_timestamp)="+str(int(adate[0]))+\
                " and month(rev_timestamp)="+str(int(adate[1]))+")x group by rev_user "+\
                "order by num_revs_month desc limit "+str(count_users)+")" )
                print "Inserted monthly data for "+self.language+"\n"
        ####NOTE: WE ARE SUPPOSING THAT USERS DOES NOT LEAVE THE CORE SUBSEQUENTLY, TO COME BACK AGAIN, i.e.
        ####ONCE THEY JOIN THE CORE, WE ASSUME THAT THE DEFINITELY LEAVE IT AT max_ts_core
        ####BY THE MOMENT, WE WILL STICK TO THIS ASSUMPTION. LATER ON, WE CAN SEE HOW TO IDENTIFY BLANK PERIODS
        #Insert in table of core users values
        #users_core = top-10% most active authors in each month
        #users_rev_core = authors accumulating top-10% of tot. num. of revs. in that month
        #TABLE: Users in core by activity (user, min_ts, max_ts, min_ts_core, max_ts_core)
        print "Creating table users_core for "+ self.language+"\n"
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_core")
        dbaccess.raw_query_SQL(self.access[1], "create table users_core as (select x.*, "+\
        "(select min_ts from time_range_users r where r.rev_user=x.rev_user) min_ts, (select max_ts from "+\
        "time_range_users s where s.rev_user=x.rev_user) max_ts "+\
        "from (select rev_user, min(lower_ts_month) min_ts_core, "+\
        "max(upper_ts_month) max_ts_core from users_core_monthly group by rev_user) x)")
        #TABLE: Users in core by revisions (user, min_ts, max_ts, min_ts_core, max_ts_core)
        print "Creating table users_rev_core for "+ self.language+"\n"
        dbaccess.raw_query_SQL(self.access[1],"drop table if exists users_rev_core")
        dbaccess.raw_query_SQL(self.access[1], "create table users_rev_core as (select x.*, "+\
        "(select min_ts from time_range_users r where r.rev_user=x.rev_user) min_ts, (select max_ts from "+\
        "time_range_users s where s.rev_user=x.rev_user) max_ts "+\
        "from (select rev_user, min(lower_ts_month) min_ts_core, "+\
        "max(upper_ts_month) max_ts_core from users_rev_core_monthly group by rev_user) x)")
        print "All core_prepro tasks finished for"+ self.language+"\n"
        #Close DB connection
        dbaccess.close_Connection(self.access[0])
print "*************\n\n" try: print "Retrieving list of logged users..." users=dbaccess.raw_query_SQL(self.access[1],"select distinct(rev_user) from revision where rev_user!=0 "+\ "and rev_user not in (select ug_user from user_groups where ug_group='bot')") except Exception, e: print "An exception ocurred, the problem was the following:\n" print e print "*************\n\n" print "Composing lag info, and inserting in db table...\n" for user in users: history=[] try: print "User "+str(int(user[0]))+"..." history=dbaccess.raw_query_SQL(self.access[1],"select rev_user, rev_timestamp from revision "+\ "where rev_user="******" order by rev_timestamp") except Exception, e: print "An exception ocurred in user processing, the problem was the following:\n" print e print "*************\n\n" # It only makes sense to insert information if there are at least 2 editions for a certain user if length(history)>1: j=0 result=[] query="" for item in history: if (j+1)<len(history): result.append((item[0], item[1], history[j+1][1])) j=j+1 k=0 for item in result:
def surv_files(self): """ Creates all data files used as input for demography scripts in GNU R """ #Initialize all files headers #FILE: Survival data for all users (including editors out of MAIN) f=open("wkp_surv_all.dat",'w') f.write("Project,rev_user,min_ts,max_ts\n") f.close() #FILE: Survival data for all logged users who edited in MAIN f=open("wkp_surv_main_all.dat",'w') f.write("Project,rev_user,min_ts,max_ts\n") f.close() #FILE: Survival data for all logged editors until they join the core (activity) f=open("wkp_surv_join_core_all.dat",'w') f.write("Project,rev_user,min_ts,min_ts_core\n") f.close() #FILE: Survival data for logged editors since they join the core until they leave it (activity) f=open("wkp_surv_in_core_all.dat",'w') f.write("Project,rev_user,min_ts_core,max_ts_core\n") f.close() #FILE: Survival data for loged editors since they leave the core until death (activity) f=open("wkp_surv_core_to_max_ts_all.dat",'w') f.write("Project,rev_user,max_ts_core,max_ts\n") f.close() #FILE: Survival data for all logged editors until they join the core (revisions) f=open("wkp_surv_join_core_rev_all.dat",'w') f.write("Project,rev_user,min_ts,min_ts_core\n") f.close() #FILE: Survival data for logged editors since they join the core until they leave it (revisions) f=open("wkp_surv_in_core_rev_all.dat",'w') f.write("Project,rev_user,min_ts_core,max_ts_core\n") f.close() #FILE: Survival data for loged editors since they leave the core until death (revisions) f=open("wkp_surv_core_rev_to_max_ts_all.dat",'w') f.write("Project,rev_user,max_ts_core,max_ts\n") f.close() for self.language in self.languages: self.dbname="wx_"+self.language+"wiki_research" print "Starting language "+self.language+"\n" ##IN SYSTEM self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_authors "+\ " where rev_user!=0 and rev_user not in (select 
ug_user from user_groups where ug_group='bot')") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##IN MAIN self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(max_ts) from time_range_users ") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_main_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##CORE ##JOIN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_join_core_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##IN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_in_core_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##CORE TO DEATH self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_core_to_max_ts_all.dat",'a') for 
result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() print "Finished core users by activity for language "+self.language+"\n" ########################### ##REV CORE ##JOIN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts), date(min_ts_core) from users_rev_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_join_core_rev_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##IN CORE self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(min_ts_core), date(max_ts_core) from users_rev_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_in_core_rev_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() ##CORE TO DEATH self.access=dbaccess.get_Connection("localhost", 3306, self.dbuser, self.dbpassw, self.dbname) results=dbaccess.raw_query_SQL(self.access[1],"SELECT rev_user, date(max_ts_core), date(max_ts) from users_rev_core") #Close DB connection dbaccess.close_Connection(self.access[0]) f=open("wkp_surv_core_rev_to_max_ts_all.dat",'a') for result in results: f.write(self.language+","+str(int(result[0]))+",\""+str(result[1])+"\",\""+str(result[2])+"\""+"\n") f.close() print "Finished all surv_file tasks for "+self.language+"\n"
def decompress (self): """ Decompress the DB dumps into MySQL """ ##TODO: Ad-hoc acuerdate de quitar esto POR DIOSSS ##self.filename="mtwiki-latest-pages-meta-history.xml.7z" if self.dumptype=="research": program="dump_sax_research.py" elif self.dumptype=="standard": program="dump_sax.py" else: print "Error! Unexpected type of dump received" return -1 self.filename=self.filenameTemplate.safe_substitute(language=self.language,file=self.files[0]) #Then we call our parser "dump_sax_research.py" to load data into MySQL command_7z="7za e -so dumps/"+self.filename+" | "+"python "+program+\ " -u "+self.msqlu+" -p "+self.msqlp+" -d "+"wx_"+self.language+"_"+self.dumptype+\ " --log "+self.language+".log" success=os.system(command_7z) if success == 0: print "DB "+"wx_"+self.language+\ self.dumptype+" successfully decompressed...\n\n" else: print "Error! There was an error trying to decompress database --> "+\ "wx_"+self.language+self.dumptype return -1 #Loading into MySQL other interesting tables directly provided in SQL format #SQL code to generate the tables is embedded in the SQL file itself ## for index in range(1,len(self.files)): ## self.filename=self.filenameTemplate.safe_substitute(language=self.language, file=self.files[index]) ## command_gzip="gzip -d dumps/"+self.filename ## command_mysql="mysql -u "+self.msqlu+" -p"+self.msqlp+\ ## " wx_"+self.language+"_"+self.dumptype+\ ## " < dumps/"+self.filename.rstrip(".gz") ## command_comp="gzip dumps/"+self.filename.rstrip(".gz") ## print "Decompressing "+self.filename+"..." ## success=os.system(command_gzip) ## if success==0: ## print "Loading "+self.filename.rstrip(".gz")+" into MySQL database..." ## success=os.system(command_mysql) ## if success==0: ## print "Compressing again "+self.filename.rstrip(".gz")+"..." 
## success=os.system(command_comp) ## if success!=0: ## print "Error compressing again "+self.filename.rstrip(".gz") ## return -1 ## else: ## print "Error loading "+self.filename.rstrip(".gz") ## return -1 ## else: ## print "Error decompressing "+self.filename ## return -1 print "Generating indexes for tables page and revision...\n" print "Depending on the dump size this may take a while...\n" acceso = dbaccess.get_Connection("localhost", 3306, self.msqlu,\ self.msqlp, "wx_"+self.language+"_"+self.dumptype) #Generate adequate indexes and keys in tables page and revision print "Generating index for page_len...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE page ADD INDEX (page_len)") print "Modifying rev_timestamp to support DATETIME and creating index...\n" #dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision MODIFY rev_timestamp DATETIME") dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX timestamp (rev_timestamp)") print "Generating index for rev_page and rev_timestamp...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX page_timestamp(rev_page, rev_timestamp)") print "Generating index for rev_user and rev_timestamp...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX user_timestamp(rev_user, rev_timestamp)") print "Generating index for rev_user_text and timestamp...\n" dbaccess.raw_query_SQL(acceso[1],"ALTER TABLE revision ADD INDEX usertext_timestamp(rev_user_text(15), rev_timestamp)") dbaccess.close_Connection(acceso[0]) print "Database ready for quantitative analysis...\n" print "Let's go on... Cross your fingers... ;-) \n\n\n" return success
#We must finish and write currrent insert and begin a new one if self.options.fileout: self.loginsert+=";\n" # Write output to SQL file self.revfile = codecs.open(self.options.sqlfile,'a','utf_8') self.revfile.write(self.loginsert) self.revfile.close() elif self.options.streamout: # DON'T WRITE SQL TO FILES, GENERATE ENCONDED SQL STREAM FOR MYSQL self.loginsert+=";" print self.loginsert.encode('utf_8') elif self.options.monitor: chances=0 while chances<5: try: dbaccess.raw_query_SQL(self.acceso[1], self.loginsert.encode('utf_8')) except (Exception), e: self.printfile = codecs.open("error_"+self.options.database,'a','utf_8') self.printfile.write(str(e)+"\n") self.printfile.write(self.loginsert[0:30]+"\n**********************************") self.printfile.close() chances+=1 else: break self.loginsert="INSERT INTO logging VALUES"+newloginsert self.loginsertrows=1 #Conservative approach: assuming 2 bytes per UTF-8 character self.loginsertsize=len(self.loginsert)*2 ##################################################