def contributions(idiomas):
    """
    Plot Gini/Lorenz graphs of the per-author contribution tables

    For every language two tables are read from the <language>_stub database,
    stats_Contrib_NoAnnons_author_<language> and stats_Contrib_author_<language>.
    For each of them a PNG device is opened with r.png under
    graphics/<language>/ and the last column returned by __tup_to_list is
    handed to __lorenz_Curve.

    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    for idioma in idiomas:
        connection = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma + "_stub")
        cursor = connection[1]
        rows_noann = dbaccess.query_SQL(cursor, " * ", "stats_Contrib_NoAnnons_author_" + idioma)
        rows_author = dbaccess.query_SQL(cursor, " * ", "stats_Contrib_author_" + idioma)
        dbaccess.close_Connection(connection[0])

        # Only the last column of each table feeds the curve; the column
        # before it is popped as well but not used afterwards.
        columns = __tup_to_list(rows_noann)
        counts_noann = columns.pop()
        columns.pop()
        columns = __tup_to_list(rows_author)
        counts_author = columns.pop()
        columns.pop()

        graphics_dir = "graphics/" + idioma + "/"
        r.png(graphics_dir + "gini_TContrib_NoAnn_" + idioma + ".png")
        __lorenz_Curve(counts_noann)
        r.png(graphics_dir + "gini_TContrib_" + idioma + ".png")
        __lorenz_Curve(counts_author)
def UserNumContribsGini(self, cursor):
    """
    Plot a Gini graph from the per-author contributions table of self.language

    All rows of stats_Contrib_NoAnnons_author_<language> are read through
    cursor and passed, as a one-element tuple of data series, to
    giniGraph.createGraphic together with self.graphType.
    """
    sourceTable = "stats_Contrib_NoAnnons_author_" + self.language
    contribRows = dbaccess.query_SQL(cursor, select=" * ", table=sourceTable)
    graphTitle = "Gini graph for " + self.language
    giniGraph.createGraphic(graphTitle, (contribRows,), self.graphType)
def UserNumContribsCompGini(self, cursor, languages):
    """
    Plot a comparative Gini graph with one data series per language

    For every entry in languages, all rows of
    stats_Contrib_NoAnnons_author_<language> are fetched through cursor. The
    collected series are then drawn together by giniGraph.createGraphic under
    the name "Gini_Comparative", using self.graphType.

    @type  languages: list of strings
    @param languages: language versions whose contribution tables are compared
    """
    ## Retrieve info from DB: the table name is built from the language being
    ## iterated (not from self.language), so each series holds different data
    dataSeries = [
        dbaccess.query_SQL(cursor, select=" * ",
                           table="stats_Contrib_NoAnnons_author_" + language)
        for language in languages
    ]
    ## Plot Gini comparative graph
    giniGraph.createGraphic("Gini_Comparative", dataSeries, self.graphType)
def UserNumContribsCompGini(self, cursor, languages):
    """
    Plot a comparative Gini graph with one data series per language

    For every entry in languages, all rows of
    stats_Contrib_NoAnnons_author_<language> are fetched through cursor. The
    collected series are then drawn together by giniGraph.createGraphic under
    the name "Gini_Comparative", using self.graphType.

    @type  languages: list of strings
    @param languages: language versions whose contribution tables are compared
    """
    dataSeries = []
    for language in languages:
        ## The table name is built from the loop variable (not from
        ## self.language), so each series holds a different language's data
        tableName = "stats_Contrib_NoAnnons_author_" + language
        dataSeries.append(dbaccess.query_SQL(cursor, select=" * ", table=tableName))
    ## Plot Gini comparative graph
    giniGraph.createGraphic("Gini_Comparative", dataSeries, self.graphType)
def UserNumContribsGini(self, cursor):
    """
    Draw the Gini graph for the contributions table of self.language

    Reads every row of stats_Contrib_NoAnnons_author_<language> through
    cursor and gives it to giniGraph.createGraphic as the only data series,
    with self.graphType as the graph type.
    """
    ## Retrieve info from DB
    tableName = "stats_Contrib_NoAnnons_author_" + self.language
    series = (dbaccess.query_SQL(cursor, select=" * ", table=tableName),)
    ## Plot Gini graph
    title = "Gini graph for " + self.language
    giniGraph.createGraphic(title, series, self.graphType)
def articleSizeHistogram(self, cursor):
    """
    Plot the histogram of article sizes

    The (page_id, page_len) rows of table "aux" are read through cursor and
    passed as a single data series to splitHistGraph.createGraphic, which is
    asked for "eps" output with fixed axis labels and title.
    """
    ## Retrieve dataset with length of pages from DB
    pageLengths = dbaccess.query_SQL(cursor, select="page_id, page_len", table="aux")
    ## Labels and title shared by the plots drawn by splitHistGraph
    plotOptions = {
        "xlabst": "Page length (log)",
        "ylabst": "Probability densities",
        "mainTitle": "Histogram for length of articles",
    }
    splitHistGraph.createGraphic("Histogram", (pageLengths,), "eps", **plotOptions)
def articleSizeHistogram(self, cursor):
    """
    Histogram for the size of articles

    Fetches (page_id, page_len) from table "aux" through cursor and hands the
    rows, as a one-element tuple of series, to splitHistGraph.createGraphic
    with "eps" output and fixed labels.
    """
    ## Retrieve dataset with length of pages from DB
    series = (dbaccess.query_SQL(cursor, select="page_id, page_len", table="aux"),)
    xLabel = "Page length (log)"
    yLabel = "Probability densities"
    title = "Histogram for length of articles"
    ## Plot through the split-histogram helper
    splitHistGraph.createGraphic("Histogram", series, "eps",
                                 xlabst=xLabel, ylabst=yLabel, mainTitle=title)
def comparative_contributions():
    """
    Draw the Gini curves of a fixed list of language versions in one graphic

    A single PNG device (graphics/AAA/gini_comparative_top10.png) is opened
    with r.png. For every language the last column of
    stats_Contrib_NoAnnons_author_<language> is passed to _lorenz_Comp_Curves
    together with a flag that is 0 for the first language and 1 afterwards.
    The device is closed with r.dev_off() once all languages are processed.
    """
    listaidiomas = ["dewiki", "jawiki", "frwiki", "plwiki", "nlwiki",
                    "itwiki", "ptwiki", "eswiki", "svwiki"]
    r.png("graphics/AAA/gini_comparative_top10.png")
    flag = 0
    for idioma in listaidiomas:
        print("Starting comparative Gini analysis for language..." + idioma + "\n")
        connection = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma + "_stub")
        rows = dbaccess.query_SQL(connection[1], " * ", "stats_Contrib_NoAnnons_author_" + idioma)
        dbaccess.close_Connection(connection[0])
        columns = __tup_to_list(rows)
        counts = columns.pop()
        columns.pop()  # the column before the counts is popped but not used
        # The same call is made for every language; only the flag value differs
        # (0 on the first call, 1 on all later ones).
        _lorenz_Comp_Curves(counts, flag)
        flag = 1
    r.dev_off()
    print("Comparative graphic for Gini curves finished!!")
def histogram(idiomas):
    """
    Create histograms depicting article size distribution for a certain language version

    For every language:
      - write the output file names expected by the GNU R script histogram.R
        to ./data/hist_files_names.data (one path per line, in script order)
      - read (page_id, page_len) from table "aux" of the <language>_stub
        database and replace every non-zero length by its base-10 logarithm
      - write the resulting values to
        ./graphics/<language>/data/page_len_log.data
      - run histogram.R through the shell; a message is printed only when the
        command exits with status 0

    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    filenames = ["boxplot_log.png", "histogram_log.png", "histogram_log_low.png",
                 "histogram_log_high.png", "ecdf_log_low.png", "ecdf_log_high.png",
                 "data/page_len_log.data", "/data/histograms.info", "ecdf_total.png"]
    for idioma in idiomas:
        print("Creando histogramas para el idioma ... " + idioma)
        # Print to another file the names of graphics files, following the
        # order in the GNU R script histogram.R. The context manager closes
        # the handle even if a write fails.
        with open("./data/hist_files_names.data", 'w') as names_file:
            for name in filenames:
                names_file.write("./graphics/" + idioma + "/" + name + "\n")

        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma + "_stub")
        result = dbaccess.query_SQL(acceso[1], "page_id, page_len", "aux")
        dbaccess.close_Connection(acceso[0])

        # Last column returned by __tup_to_list holds the page lengths;
        # zero lengths are kept as they are because log10(0) is undefined.
        page_len = __tup_to_list(result).pop()
        log_lengths = [math.log10(length) if length != 0 else length
                       for length in page_len]

        # Print to another file a list with article sizes to plot histograms
        with open("./graphics/" + idioma + "/data/page_len_log.data", 'w') as data_file:
            for value in log_lengths:
                data_file.write(str(value) + "\n")

        # CALL THE GNU R SCRIPT histogram.R
        succ = os.system("R --vanilla < ./histogram.R > debug_R")
        if succ == 0:
            print("Funcion histogram ejecutada con exito para el lenguage... " + idioma)
def UserNumContribsGenerations(self):
    """
    Same 3D study as in LibreSoftware

    Loads (period, author, contribs) from contribs_period_author_<language>
    ordered by period and descending contribs, stores the rows in
    self.periodCommitsCommiter and the period of the last row in
    self.lastPeriod, then runs the per-period analysis methods.
    """
    cursor = self.acceso[1]
    sourceView = "contribs_period_author_" + self.language
    rows = dbaccess.query_SQL(cursor,
                              select="period, author, contribs",
                              tables=sourceView,
                              order="period, contribs DESC")
    self.periodCommitsCommiter = rows
    ## Rows are ordered by period, so the last row carries the last period
    self.lastPeriod = int(rows[-1][0])

    ## Perform all the analysis
    print('Performing analysis with period = months\n')
    self.commitsPerPeriod()
    self.largestCommiters()
    ## FIXME: repeat executions with different percentages
    self.topFractionCommits(0.1)
    ## TODO: add periodified plotbars for topFractionCommiters
    self.topFractionCommiters(0.05)
def UserNumContribsGenerations(self):
    """
    Same 3D study as in LibreSoftware

    Queries contribs_period_author_<language> for (period, author, contribs),
    ordered by period and descending contribs. The rows are kept in
    self.periodCommitsCommiter and the period of the final row in
    self.lastPeriod, after which the analysis steps are run in sequence.
    """
    queryOptions = {
        "select": "period, author, contribs",
        "tables": "contribs_period_author_" + self.language,
        "order": "period, contribs DESC",
    }
    self.periodCommitsCommiter = dbaccess.query_SQL(self.acceso[1], **queryOptions)
    lastRow = self.periodCommitsCommiter[-1]
    self.lastPeriod = int(lastRow[0])

    # Perform all the analysis
    print("Performing analysis with period = months\n")
    self.commitsPerPeriod()
    self.largestCommiters()
    ## FIXME: repeat executions with different percentages
    self.topFractionCommits(0.1)
    ## TODO: add periodified plotbars for topFractionCommiters
    self.topFractionCommiters(0.05)
def UserNumContribsGroup(self, cursor): """ A class to plot comparative graphics with contributions from different groups """ ###Reproduction of the article Power of the few... ## Admins and bots IDs can be retrieved from DB as subselects in the where clause ########################## ##Drop bots contribs from DB source view ########################## ##CREATE VIEW FOR PERIODS FROM 0 IN MONTHS minYear = dbaccess.query_SQL( cursor, select="MIN(year)", tables="stats_Contrib_NoAnnons_months_author_" + self.language ) minMonth = dbaccess.query_SQL( cursor, select="MIN(month)", tables="stats_Contrib_NoAnnons_months_author_" + self.language, where="year=" + str(int(minYear[0][0])), ) dbaccess.createView( cursor, view="contribs_period_author_" + self.language, columns="period, author, contribs", query="SELECT ((year*12)+month-(" + str(int(minYear[0][0])) + "*12)-" + str(int(minMonth[0][0])) + ") as period, author, theCount FROM " + "stats_Contrib_NoAnnons_months_author_" + self.language + " WHERE author NOT IN " + "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot') ORDER BY period", ) ## Retrieve number of revision made by admins per month revsAdminsPerMonth = dbaccess.query_SQL( cursor, select="period, SUM(contribs)", tables="contribs_period_author_" + self.language, where="author IN (SELECT ug_user FROM user_groups where ug_group='sysop')", group="period", order="period", ) ## Plot FIG 2 self.simpleGraph.createGraphic( "revs_admins_per_month", (revsAdminsPerMonth,), xlabst="Months", ylabst="Revisions", mainTitle="Revisions per month for admins " + self.language, graphType=self.graphType, log=False, ) contribsMonth = dbaccess.query_SQL( cursor, select="period, SUM(contribs)", tables="contribs_period_author_" + self.language, group="period", order="period", ) ## divide element by element ## Supposedly, there is at least one rev per month made by an admin percContribsAdmins = [] for totAdminContrib in revsAdminsPerMonth: for totContrib in contribsMonth: if 
totAdminContrib[0] == totContrib[0]: ## append (period, adminsContribs/totContribs) perc = float(totAdminContrib[1]) / float(totContrib[1]) percContribsAdmins.append((totAdminContrib[0], perc * 100)) break ## Plot FIG 1 % of total revs per month made by admins self.simpleGraph.createGraphic( "perc_revs_admins_per_month", (percContribsAdmins,), xlabst="Months", ylabst="% revisions", mainTitle="% of total revisions per month made by admins " + self.language, graphType=self.graphType, log=False, ) ## FIG 4 TOTAL EDITS MADE BY USERS WITH DIFFERENT EDIT LEVELS ## CREATE WHERE CLAUSES FOR CLUSTER OF USERS IDENTIFIED BY CONTRIBUTIONS LEVEL ## 5 LEVELS: <100, 100-1K, 1K-5K, 5K-10K, >10K usersLevel1 = ( "author IN (SELECT DISTINCT(author) FROM stats_Contrib_NoAnnons_author_" + self.language + " WHERE theCount<=100 AND author NOT IN " + "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot'))" ) usersLevel2 = ( "author IN (SELECT DISTINCT(author) FROM stats_Contrib_NoAnnons_author_" + self.language + " WHERE theCount BETWEEN 101 AND 1000 AND author NOT IN" + "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot'))" ) usersLevel3 = ( "author IN (SELECT DISTINCT(author) FROM stats_Contrib_NoAnnons_author_" + self.language + " WHERE theCount BETWEEN 1001 AND 5000 AND author NOT IN " + "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot'))" ) usersLevel4 = ( "author IN (SELECT DISTINCT(author) FROM stats_Contrib_NoAnnons_author_" + self.language + " WHERE theCount BETWEEN 5001 AND 10000 AND author NOT IN " + "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot'))" ) usersLevel5 = ( "author IN (SELECT DISTINCT(author) FROM stats_Contrib_NoAnnons_author_" + self.language + " WHERE theCount>10000 AND author NOT IN " + "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot'))" ) ## Some vars used in for iterations levels = (usersLevel1, usersLevel2, usersLevel3, usersLevel4, usersLevel5) listContribsLevel = [] listAvgContribsLevel 
= [] listPercContribsLevel = [] listUsersLevel = [] listPercUsersLevel = [] ## Retrieve tot num of users per month usersMonth = dbaccess.query_SQL( cursor, select="period, COUNT(DISTINCT(author))", tables="contribs_period_author_" + self.language, group="period", order="period", ) for level in levels: ## Retrieve contribs per month for this level contribsLevelMonth = dbaccess.query_SQL( cursor, select="period, SUM(contribs)", tables="contribs_period_author_" + self.language, where=level, group="period", order="period", ) listContribsLevel.append(contribsLevelMonth) percContribsLevel = [] ## Append (period, contribsLevel/totContribs) checking periods correspondence for totLevelContrib in contribsLevelMonth: for totContrib in contribsMonth: if totLevelContrib[0] == totContrib[0]: perc = float(totLevelContrib[1]) / float(totContrib[1]) percContribsLevel.append((totLevelContrib[0], perc * 100)) break listPercContribsLevel.append(percContribsLevel) ## Retrieve number of users per level per month usersLevelMonth = dbaccess.query_SQL( cursor, select="period, COUNT(DISTINCT(author))", tables="contribs_period_author_" + self.language, where=level, group="period", order="period", ) ##Append to the list of users per level per month listUsersLevel.append(usersLevelMonth) avgUsersLevel = [] ## Retrieve avg number of revs per user in each group, per month for contribs in contribsLevelMonth: for totUsers in usersLevelMonth: if contribs[0] == totUsers[0]: avg = float(contribs[1]) / float(totUsers[1]) avgUsersLevel.append((contribs[0], avg)) listAvgContribsLevel.append(avgUsersLevel) percUsersLevel = [] ## Append (period, usersLevel/totUsers) for users, totUsers in zip(usersLevelMonth, usersMonth): for totUsers in usersMonth: if users[0] == totUsers[0]: perc = float(users[1]) / float(totUsers[1]) percUsersLevel.append((users[0], perc * 100)) break listPercUsersLevel.append(percUsersLevel) ## 2D graph for FIG 4 self.multiGraph.createGraphic( "perc_revs_per_userlevel_month", 
listPercContribsLevel, xlabst="months", ylabst="% revisions", mainTitle="% of total revs per user level per month", graphType=self.graphType, format=[], log=False, ) ## Plot 2D multi graph (FIG 5) self.multiGraph.createGraphic( "revs_per_userlevel_month", listContribsLevel, xlabst="months", ylabst="revisions", mainTitle="Total revisions per user level per month", graphType=self.graphType, format=[], log=True, ) ## FIG 6 AVERAGE NUMBER OF EDITS PER USER PER MONTH FOR EACH LEVEL self.multiGraph.createGraphic( "avg_revs_per_userlevel_month", listAvgContribsLevel, xlabst="months", ylabst="avg. revisions", mainTitle="Avg revisions per user level per month", graphType=self.graphType, format=[], log=True, ) ## FIG 7 POPULATION GROWTH FOR EACH USER GROUP self.multiGraph.createGraphic( "users_per_level_month", listUsersLevel, xlabst="months", ylabst="log(num users)", mainTitle="Growth of each user group per month", graphType=self.graphType, format=[], log=True, ) ## FIG 8 % OF TOTAL POPULATION OF EACH USER GROUP self.multiGraph.createGraphic( "perc_users_per_level_month", listPercUsersLevel, xlabst="months", ylabst="% users", mainTitle="% of users in each user group per month", graphType=self.graphType, format=[], log=False, ) """
def community_contrib(idiomas):
    """
    Export monthly contribution data for admins and for five author activity
    levels, then run the GNU R script community_contrib.R

    For every language the function:
      - obtains the admins where-clause and count from
        test_admins.process_admins and resolves the admin author ids
      - reads, per (year, month), the edits made by admins and the total edits
      - splits authors of stats_Contrib_author_<language> in 5 levels by
        theCount (<=100, 100-1K, 1K-5K, 5K-10K, >10K) and, per level, reads
        monthly edits and monthly number of distinct authors
      - derives % of edits per level (through __process_contribs), average
        edits per user and % of users per level
      - writes every series to ./graphics/<language>/data/, writes the lists
        of data and output file names under ./data/, and calls R

    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    ## theCount ranges defining the 5 author levels
    level_filters = ("theCount<=100",
                     "theCount>100 AND theCount<=1000",
                     "theCount>1000 AND theCount<=5000",
                     "theCount>5000 AND theCount<=10000",
                     "theCount>10000")
    level_numbers = range(1, len(level_filters) + 1)
    filenames_out = ["Figure1.png", "Figure_2.png", "Figure4.png", "Figure5.png",
                     "Figure6.png", "Figure7.png", "Figure8.png"]

    for idioma in idiomas:
        list_admins = test_admins.process_admins(idioma)
        num_admins = list_admins.pop()
        where_clause1 = list_admins.pop()
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma + "_stub")
        cursor = acceso[1]
        months_table = "stats_Contrib_months_author_" + idioma

        admins_ids = dbaccess.raw_query_SQL(
            cursor,
            "SELECT DISTINCT(author) FROM stats_" + idioma + " WHERE " + where_clause1
            + " LIMIT " + str(num_admins))
        ## BUILD WHERE CLAUSE WITH ADMINS IDS
        ## (ids come from the query result; iterating the freshly created
        ## empty list would always yield no ids)
        list_admins_ids = [int(item[0]) for item in admins_ids]
        where_clause2 = test_admins.process_users_ids(list_admins_ids, idioma)

        ## Edits made by admins per month (FIG 2)
        edits_admin_month = dbaccess.query_SQL(
            cursor, select="year, month, SUM(theCount)",
            tables="stats_Contrib_NoAnnons_months_author_" + idioma + " ",
            where=where_clause2, group="year, month ", order="year, month")
        admins_contribs = [int(element[2]) for element in edits_admin_month]

        ## Total contributions per month
        total_edits_month = dbaccess.query_SQL(
            cursor, select="year, month, SUM(theCount)",
            tables=months_table, group="year, month ")
        dates_contribs = [list(element[0:2]) for element in total_edits_month]
        total_contribs = [int(element[2]) for element in total_edits_month]

        ## Divide the first list by the second, position by position (FIG 1)
        perc_contribs_admins = [float(admin_contrib) / total_contrib
                                for admin_contrib, total_contrib
                                in zip(admins_contribs, total_contribs)]

        ## FIG 4 TOTAL EDITS MADE BY USERS WITH DIFFERENT EDIT LEVELS
        ## CREATE CLUSTER OF USERS IDENTIFIED BY CONTRIBUTIONS LEVEL
        users_per_level = []
        for level_filter in level_filters:
            level_rows = dbaccess.query_SQL(
                cursor, select="DISTINCT(author)",
                tables="stats_Contrib_author_" + idioma, where=level_filter)
            users_per_level.append([int(userid[0]) for userid in level_rows])
        level_clauses = [test_admins.process_users_ids(user_ids, idioma)
                         for user_ids in users_per_level]

        contribs_rows_per_level = [
            dbaccess.query_SQL(cursor, select="year, month, SUM(theCount)",
                               tables=months_table, where=level_clause,
                               group="year, month")
            for level_clause in level_clauses]

        ## __process_contribs results are consumed from the end:
        ## percentages, then contributions, then dates. Each level pops from
        ## its OWN result list.
        dates_levels = []
        contribs_levels = []
        perc_contribs_levels = []
        for contribs_rows in contribs_rows_per_level:
            processed = __process_contribs(contribs_rows, total_contribs)
            perc_contribs_levels.append(processed.pop())
            contribs_levels.append(processed.pop())
            dates_levels.append(processed.pop())

        ## FIG 6 AVERAGE NUMBER OF EDITS PER USER PER MONTH FOR EACH LEVEL
        ## RETRIEVE NUM USERS FOR EACH MONTH IN EACH LEVEL WHO HAVE MADE AT
        ## LEAST ONE CONTRIB (also the data for FIG 7)
        users_levels = []
        for level_clause in level_clauses:
            count_rows = dbaccess.query_SQL(
                cursor, select="COUNT(DISTINCT author)",
                tables=months_table, where=level_clause, group="year, month")
            users_levels.append([int(element[0]) for element in count_rows])

        ## Last query done: release the connection like the sibling functions do
        dbaccess.close_Connection(acceso[0])

        ## DIVIDE TOT NUM CONTRIBS PER LEVEL PER MONTH BY THE NUM USERS FOR
        ## EACH MONTH IN EACH LEVEL
        avg_levels = [[float(contribmonth) / usermonth
                       for contribmonth, usermonth in zip(level_contribs, level_users)]
                      for level_contribs, level_users in zip(contribs_levels, users_levels)]

        ## FIG 8 % OF TOTAL POPULATION OF EACH USER GROUP
        perc_users_levels = [[] for level_filter in level_filters]
        for month_counts in zip(*users_levels):
            total_users_month = sum(month_counts)
            for level_shares, level_count in zip(perc_users_levels, month_counts):
                level_shares.append(float(level_count) / total_users_month)

        ###############################
        ## FINAL DUTIES, TRANSFER DATA AND EXECUTE R SCRIPT
        ## (file name, data) pairs, in the order expected by the R script
        ## NOTE(review): dates_admin_contrib.data receives the dates of the
        ## total-edits query, as before; the (year, month) pairs of
        ## edits_admin_month may have been intended -- confirm against
        ## community_contrib.R
        outputs = [("dates_admin_contrib.data", dates_contribs),
                   ("contribs_admins_months.data", admins_contribs),
                   ("perc_contribs_months.data", perc_contribs_admins)]
        for number, dates, contribs, perc in zip(level_numbers, dates_levels,
                                                 contribs_levels, perc_contribs_levels):
            outputs.append(("dates_level%d_contrib.data" % number, dates))
            outputs.append(("contribs_level%d_months.data" % number, contribs))
            outputs.append(("perc_contribs_level%d_months.data" % number, perc))
        per_level_series = (("avg_contribs_user_%d_month.data", avg_levels),
                            ("users_%d_month.data", users_levels),
                            ("perc_users_%d_months.data", perc_users_levels))
        for name_pattern, series in per_level_series:
            for number, values in zip(level_numbers, series):
                outputs.append((name_pattern % number, values))

        data_dir = "./graphics/" + idioma + "/data/"
        for filename, data in outputs:
            if 'date' in filename:
                ## Date series are written here, one str() per line
                with open(data_dir + filename, 'w') as dates_file:
                    for adate in data:
                        dates_file.write(str(adate) + "\n")
            else:
                __makeDataFile(idioma, filename, data)

        # Pass data filenames to the GNU R script with a file
        with open("./data/community_contrib_files_names.data", 'w') as names_file:
            for filename, data in outputs:
                names_file.write(data_dir + filename + "\n")
        # Idem with graphic output filenames
        with open("./data/community_contrib_files_out.data", 'w') as out_file:
            for line in filenames_out:
                out_file.write("./graphics/" + idioma + "/" + line + "\n")

        # CALL GNU R SCRIPT community_contrib.R
        succ = os.system("R --vanilla < ./community_contrib.R > debug_R")
        if succ == 0:
            print("Funcion community_contrib.R ejecutada con exito para el lenguage... " + idioma)
def measuring(idiomas):
    """
    Create some graphs following the research presented by Jakob Voss in his
    paper Measuring Wikipedia (ISSI 2005)

    For every language six series are read from the <language>_stub database
    (edits per author and distinct articles/authors counts, for all, logged
    and anonymous authors), sorted in descending order, written to data files
    and plotted by the GNU R script measuring_Wiki.R.

    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    ## Data files and output graphics, in the order used by measuring_Wiki.R
    filenames = ["total_edits.data", "noannons_edits.data", "annon_edits.data",
                 "authors_per_article_desc.data", "articles_per_logged_author_desc.data",
                 "articles_per_anonymous_author_desc.data"]
    filenames_out = ["total_edits_per_author.png", "total_edits_per_noannon_author.png",
                     "total_edits_per_annon_author.png", "diff_authors_per_article_descending.png",
                     "diff_articles_per_logged_author_descending.png",
                     "diff_articles_per_anonymous_author_descending.png"]

    for idioma in idiomas:
        connection = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma + "_stub")
        cursor = connection[1]

        # Number of distinct authors per article and of distinct articles per
        # author (logged and anonymous)
        rows_authors_article = dbaccess.query_SQL(
            cursor, "page_id, theCount", "stats_Article_NoAnnons_page_id_" + idioma)
        rows_articles_logged = dbaccess.query_SQL(
            cursor, "author, theCount", "stats_Article_NoAnnons_author_" + idioma)
        rows_articles_anon = dbaccess.query_SQL(
            cursor, "author_text, theCount", "stats_Article_Annons_author_text_" + idioma)
        lisdiffauthorartic = __tup_to_list(rows_authors_article).pop()
        lisdiffarticleaut = __tup_to_list(rows_articles_logged).pop()
        lisdiffarticleannon = __tup_to_list(rows_articles_anon, 2).pop()
        ## Sort the results in decreasing order so they can be fitted to a power law
        for series in (lisdiffauthorartic, lisdiffarticleaut, lisdiffarticleannon):
            series.sort(reverse=True)

        # Number of edits per author
        rows_edits_logged = dbaccess.query_SQL(
            cursor, " * ", "stats_Contrib_NoAnnons_author_" + idioma)
        rows_edits_all = dbaccess.query_SQL(
            cursor, " * ", "stats_Contrib_author_" + idioma)
        rows_edits_anon = dbaccess.query_SQL(
            cursor, " * ", "stats_Contrib_Annons_author_text_" + idioma)
        listcnoann = __tup_to_list(rows_edits_logged).pop()
        listcauthors = __tup_to_list(rows_edits_all).pop()
        # Only the last column is kept for anonymous authors as well
        listcann = __tup_to_list(rows_edits_anon, 2).pop()
        ## Arranging results in a decreasing way to adjust them to a power law
        for series in (listcnoann, listcauthors, listcann):
            series.sort(reverse=True)

        # Ingoing and outgoing number of links per article: STILL TO BE DEVELOPED
        dbaccess.close_Connection(connection[0])

        dataList = [listcauthors, listcnoann, listcann,
                    lisdiffauthorartic, lisdiffarticleaut, lisdiffarticleannon]
        for filename, data in zip(filenames, dataList):
            if 'date' in filename:
                __makeDatesFile(idioma, filename, data)
            else:
                __makeDataFile(idioma, filename, data)

        # Pass data filenames to the GNU R script with a file
        names_file = open("./data/measuring_files_names.data", 'w')
        names_file.writelines(["./graphics/" + idioma + "/data/" + name + "\n"
                               for name in filenames])
        names_file.close()
        # Idem with graphic output filenames
        out_file = open("./data/measuring_files_out.data", 'w')
        out_file.writelines(["./graphics/" + idioma + "/" + name + "\n"
                             for name in filenames_out])
        out_file.close()

        # CALL GNU R SCRIPT measuring_Wiki.R
        succ = os.system("R --vanilla < ./measuring_Wiki.R > debug_R")
        if succ == 0:
            print("Funcion measuring_Wiki.R ejecutada con exito para el lenguage... " + idioma)
def summary_evol(idiomas):
    """
    Create some graphs summarizing the evolution in time of critical
    quantitative parameters for each language version to explore

    For every language the function reads several statistics tables from the
    <language>_stub database, converts the query results to lists with
    __tup_to_list, writes one data file per entry of filenames (through
    __makeDatesFile / __makeDataFile), writes the lists of data and graphic
    file names under ./data/ and finally runs the GNU R script summary_evol.R.

    @type  idiomas: list of strings
    @param idiomas: list of strings indicating the language versions to process
    """
    ## WARNING!! Please be careful when selecting values from tables storing
    ## evolution in time of number of articles, size etc.
    ## You must always use a GROUP BY(pageCount, limitDate) clause, due to
    ## periods of inactivity that could generate duplicate entries in the graphics
    ## Data files, paired position by position with dataList below
    filenames=["page_dates.data", "page_Count_evol.data", "page_Len_Sum_log.data", "contribs_evol.data", "nspaces.data", "nspace_distrib.data", "diffArticles.data", "authors.data", "diff_authors_x_article.data", "authors_authors_per_pagelen.data", "pagelen_authors_per_pagelen.data"]
    ## Graphic files handed to the R script
    filenames_out=["Tot_num_articles_absx_absy.png", "Tot_num_articles_absx_logy.png", "Tot_num_articles_logx_logy.png", "Tot_pagelensum_absx_absy.png", "Tot_pagelensum_absx_logy.png", "Tot_pagelensum_logx_logy.png", "Tot_contribs_absx_absy.png", "Tot_contribs_absx_logy.png", "Tot_contribs_logx_logy.png", "Diffs_articles_per_author.png", "Diffs_authors_per_article.png", "Diff_authors_against_page_len.png"]
    for idioma in idiomas:
        acceso = dbaccess.get_Connection("localhost", 3306, "root", "phoenix", idioma+"_stub")
        ## All queries are issued first, then the connection is closed
        result=dbaccess.query_SQL(acceso[1], "pageCount, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        result2=dbaccess.query_SQL(acceso[1], "pageLenSum, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        result3=dbaccess.query_SQL(acceso[1], "contribs, limitDate", "stats_Evolution_Content_months_"+idioma, group="(limitDate)")
        resultnspace=dbaccess.query_SQL(acceso[1], "pages_nspace, namespace", "stats_nspace_"+idioma)
        diffArticlesNoann=dbaccess.query_SQL(acceso[1], "author, theCount", "stats_Article_NoAnnons_author_"+idioma)
        diffInitNoann=dbaccess.query_SQL(acceso[1], "author, theCount", "stats_Article_Init_NoAnnons_author_"+idioma)
        totRevperArticle=dbaccess.query_SQL(acceso[1], "page_id, theCount", "stats_Contrib_NoAnnons_page_id_"+idioma)
        diffAuthorperArticle=dbaccess.query_SQL(acceso[1], "page_id, theCount", "stats_Article_NoAnnons_page_id_"+idioma)
        dautxplen=dbaccess.query_SQL(acceso[1], "page_len, authors", "stats_pagelen_difauthors_"+idioma)
        dbaccess.close_Connection(acceso[0])
        ## Each __tup_to_list result is consumed from the end: the last list
        ## holds the second selected column, the one before it the first
        ## NOTE(review): the meaning of the second argument of __tup_to_list
        ## (1 or 2 here) is not visible in this block -- confirm
        data=__tup_to_list(result, 1)
        dates_x=data.pop()
        page_Count=data.pop()
        ## The frwiki-only condition that used to guard this branch is
        ## disabled, so it now runs for every language: the first two entries
        ## of both the dates and the page length sums are dropped
        data2=__tup_to_list(result2, 2)
        dates_x=data2.pop()
        dates_x.pop(0)
        dates_x.pop(0)
        page_Len_Sum=data2.pop()
        page_Len_Sum.pop(0)
        page_Len_Sum.pop(0)
        data3=__tup_to_list(result3, 1)
        ## dates_x is reassigned once more: the dates written to file are the
        ## ones of the contribs query
        ## NOTE(review): page_Len_Sum lost two leading entries above while
        ## these dates did not, so the two series may differ in length -- confirm
        dates_x=data3.pop()
        contribs=data3.pop()
        datanspace=__tup_to_list(resultnspace)
        namespaces=datanspace.pop()
        pages_nspace=datanspace.pop()
        dataDiffArticlesNoann=__tup_to_list(diffArticlesNoann)
        diffArticles=dataDiffArticlesNoann.pop()
        authors=dataDiffArticlesNoann.pop()
        dataDiffInitNoann=__tup_to_list(diffInitNoann)
        ## diffInitArticles is not used afterwards
        diffInitArticles=dataDiffInitNoann.pop()
        ## NOTE(review): authors is overwritten with the authors of the
        ## stats_Article_Init_NoAnnons_author table, while diffArticles still
        ## comes from stats_Article_NoAnnons_author -- confirm this pairing
        authors=dataDiffInitNoann.pop()
        datatotRevperArticle=__tup_to_list(totRevperArticle)
        ## totalRev and article are not used afterwards
        totalRev=datatotRevperArticle.pop()
        article=datatotRevperArticle.pop()
        datadiffAuthorperArticle=__tup_to_list(diffAuthorperArticle)
        diffAuthors=datadiffAuthorperArticle.pop()
        article=datadiffAuthorperArticle.pop()
        datadautxplen=__tup_to_list(dautxplen)
        autxplen=datadautxplen.pop()
        lenautxplen=datadautxplen.pop()
        ## Replace every non-zero page length sum by its base-10 logarithm
        ## (zeros are left untouched, log10(0) is undefined)
        for i in range(len(page_Len_Sum)):
            if page_Len_Sum[i]!=0:
                page_Len_Sum[i]=math.log10(page_Len_Sum[i])
        ## Introduce in data list results from queries in the proper order
        ## corresponding with the name files we pass to the GNU R script summary_evol.R
        dataList=[dates_x, page_Count, page_Len_Sum, contribs, namespaces, pages_nspace, diffArticles, authors, diffAuthors, autxplen, lenautxplen]
        ## File names containing 'date' go through __makeDatesFile, all others
        ## through __makeDataFile
        for filename, data in zip (filenames, dataList):
            if(filename.find('date')!=-1):
                __makeDatesFile(idioma, filename, data)
            else:
                __makeDataFile(idioma, filename, data)
        ######################################
        #Pass data filenames to the GNU R script with a file
        f=open("./data/summary_files_names.data",'w')
        for line in filenames:
            f.write("./graphics/"+idioma+"/data/"+line+"\n")
        f.close()
        #Idem with graphic output filenames
        f=open("./data/summary_files_out.data",'w')
        for line in filenames_out:
            f.write("./graphics/"+idioma+"/"+line+"\n")
        f.close()
        #CALL THE GNU R SCRIPT summary_evol.R
        ## A message is printed only when the command exits with status 0
        succ=os.system("R --vanilla < ./summary_evol.R > debug_R")
        if succ==0:
            print "Funcion summary_evol ejecutada con exito para el lenguage... "+idioma
def UserNumContribsGroup(self, cursor): """ A class to plot comparative graphics with contributions from different groups """ ###Reproduction of the article Power of the few... ## Admins and bots IDs can be retrieved from DB as subselects in the where clause ########################## ##Drop bots contribs from DB source view ########################## ##CREATE VIEW FOR PERIODS FROM 0 IN MONTHS minYear=dbaccess.query_SQL(cursor, select="MIN(year)",\ tables="stats_Contrib_NoAnnons_months_author_"+self.language) minMonth=dbaccess.query_SQL(cursor, select="MIN(month)",\ tables="stats_Contrib_NoAnnons_months_author_"+self.language,\ where="year="+str(int(minYear[0][0]))) dbaccess.createView(cursor, view="contribs_period_author_"+self.language,\ columns="period, author, contribs",\ query="SELECT ((year*12)+month-("+str(int(minYear[0][0]))+"*12)-"+str(int(minMonth[0][0]))\ +") as period, author, theCount FROM "+\ "stats_Contrib_NoAnnons_months_author_"+self.language+" WHERE author NOT IN "+\ "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot') ORDER BY period") ## Retrieve number of revision made by admins per month revsAdminsPerMonth=dbaccess.query_SQL(cursor,select="period, SUM(contribs)",\ tables="contribs_period_author_"+self.language,\ where="author IN (SELECT ug_user FROM user_groups where ug_group='sysop')",\ group="period",order="period") ## Plot FIG 2 self.simpleGraph.createGraphic("revs_admins_per_month", (revsAdminsPerMonth,),\ xlabst="Months", ylabst="Revisions",mainTitle="Revisions per month for admins "+\ self.language, graphType=self.graphType, log=False) contribsMonth=dbaccess.query_SQL(cursor,\ select="period, SUM(contribs)",\ tables="contribs_period_author_"+self.language,\ group="period",order="period") ## divide element by element ## Supposedly, there is at least one rev per month made by an admin percContribsAdmins = [] for totAdminContrib in revsAdminsPerMonth: for totContrib in contribsMonth: if totAdminContrib[0] == totContrib[0]: ## 
append (period, adminsContribs/totContribs) perc = float(totAdminContrib[1]) / float(totContrib[1]) percContribsAdmins.append((totAdminContrib[0], perc * 100)) break ## Plot FIG 1 % of total revs per month made by admins self.simpleGraph.createGraphic("perc_revs_admins_per_month", (percContribsAdmins,),\ xlabst="Months", ylabst="% revisions",\ mainTitle="% of total revisions per month made by admins "+self.language,\ graphType=self.graphType, log=False) ## FIG 4 TOTAL EDITS MADE BY USERS WITH DIFFERENT EDIT LEVELS ## CREATE WHERE CLAUSES FOR CLUSTER OF USERS IDENTIFIED BY CONTRIBUTIONS LEVEL ## 5 LEVELS: <100, 100-1K, 1K-5K, 5K-10K, >10K usersLevel1="author IN (SELECT DISTINCT(author) FROM stats_Contrib_NoAnnons_author_"+\ self.language+" WHERE theCount<=100 AND author NOT IN "+\ "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot'))" usersLevel2="author IN (SELECT DISTINCT(author) FROM stats_Contrib_NoAnnons_author_"+\ self.language+" WHERE theCount BETWEEN 101 AND 1000 AND author NOT IN" +\ "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot'))" usersLevel3="author IN (SELECT DISTINCT(author) FROM stats_Contrib_NoAnnons_author_"+\ self.language+" WHERE theCount BETWEEN 1001 AND 5000 AND author NOT IN "+\ "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot'))" usersLevel4="author IN (SELECT DISTINCT(author) FROM stats_Contrib_NoAnnons_author_"+\ self.language+" WHERE theCount BETWEEN 5001 AND 10000 AND author NOT IN "+\ "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot'))" usersLevel5="author IN (SELECT DISTINCT(author) FROM stats_Contrib_NoAnnons_author_"+\ self.language+" WHERE theCount>10000 AND author NOT IN "+\ "(SELECT DISTINCT(ug_user) FROM user_groups WHERE ug_group='bot'))" ## Some vars used in for iterations levels = (usersLevel1, usersLevel2, usersLevel3, usersLevel4, usersLevel5) listContribsLevel = [] listAvgContribsLevel = [] listPercContribsLevel = [] listUsersLevel = [] listPercUsersLevel = [] ## 
Retrieve tot num of users per month usersMonth=dbaccess.query_SQL(cursor,\ select="period, COUNT(DISTINCT(author))",\ tables="contribs_period_author_"+self.language,\ group="period",order="period") for level in levels: ## Retrieve contribs per month for this level contribsLevelMonth=dbaccess.query_SQL(cursor,\ select="period, SUM(contribs)",\ tables="contribs_period_author_"+self.language,\ where=level, group="period", order="period") listContribsLevel.append(contribsLevelMonth) percContribsLevel = [] ## Append (period, contribsLevel/totContribs) checking periods correspondence for totLevelContrib in contribsLevelMonth: for totContrib in contribsMonth: if totLevelContrib[0] == totContrib[0]: perc = float(totLevelContrib[1]) / float(totContrib[1]) percContribsLevel.append( (totLevelContrib[0], perc * 100)) break listPercContribsLevel.append(percContribsLevel) ## Retrieve number of users per level per month usersLevelMonth=dbaccess.query_SQL(cursor,\ select="period, COUNT(DISTINCT(author))",\ tables="contribs_period_author_"+self.language,\ where=level, group="period",order="period") ##Append to the list of users per level per month listUsersLevel.append(usersLevelMonth) avgUsersLevel = [] ## Retrieve avg number of revs per user in each group, per month for contribs in contribsLevelMonth: for totUsers in usersLevelMonth: if contribs[0] == totUsers[0]: avg = float(contribs[1]) / float(totUsers[1]) avgUsersLevel.append((contribs[0], avg)) listAvgContribsLevel.append(avgUsersLevel) percUsersLevel = [] ## Append (period, usersLevel/totUsers) for users, totUsers in zip(usersLevelMonth, usersMonth): for totUsers in usersMonth: if users[0] == totUsers[0]: perc = float(users[1]) / float(totUsers[1]) percUsersLevel.append((users[0], perc * 100)) break listPercUsersLevel.append(percUsersLevel) ## 2D graph for FIG 4 self.multiGraph.createGraphic("perc_revs_per_userlevel_month",\ listPercContribsLevel, xlabst="months", ylabst="% revisions",\ mainTitle="% of total revs per user 
level per month", graphType=self.graphType,\ format=[],log=False) ## Plot 2D multi graph (FIG 5) self.multiGraph.createGraphic("revs_per_userlevel_month",\ listContribsLevel, xlabst="months", ylabst="revisions",\ mainTitle="Total revisions per user level per month", graphType=self.graphType,\ format=[],log=True) ## FIG 6 AVERAGE NUMBER OF EDITS PER USER PER MONTH FOR EACH LEVEL self.multiGraph.createGraphic("avg_revs_per_userlevel_month",\ listAvgContribsLevel, xlabst="months", ylabst="avg. revisions",\ mainTitle="Avg revisions per user level per month", graphType=self.graphType,\ format=[],log=True) ## FIG 7 POPULATION GROWTH FOR EACH USER GROUP self.multiGraph.createGraphic("users_per_level_month",\ listUsersLevel, xlabst="months", ylabst="log(num users)",\ mainTitle="Growth of each user group per month", graphType=self.graphType,\ format=[],log=True) ## FIG 8 % OF TOTAL POPULATION OF EACH USER GROUP self.multiGraph.createGraphic("perc_users_per_level_month",\ listPercUsersLevel, xlabst="months", ylabst="% users",\ mainTitle="% of users in each user group per month", graphType=self.graphType,\ format=[],log=False) """