def processSQLrow(self, row):
    """Add one SQL result row to the counters of every activity level it reaches.

    The row is skipped when ``utils.isBot`` flags its ``user_id`` or when its
    ``rev_year``/``rev_month`` pair is not a key of ``self.time_stamps_index``.
    Otherwise the row's total edit count is compared with each threshold in
    ``self.cohorts`` and every threshold that is met has its ``edits``,
    ``editors``, ``added``, ``removed`` and ``net`` cells in ``self.data``
    updated for that month.

    Args:
        row: mapping with the keys ``user_id``, ``rev_year``, ``rev_month``,
            ``add_edits``, ``remove_edits``, ``noop_edits``, ``len_added``
            and ``len_removed``. The count and length values may be ``None``.
    """
    if utils.isBot(row['user_id']):
        return

    ym = '%d%02d' % (row['rev_year'], row['rev_month'])
    time_index = self.time_stamps_index.get(ym)
    if time_index is None:
        return

    # None counters (presumably SQL NULLs) contribute nothing to the total.
    totaledits = 0
    for column in ('add_edits', 'remove_edits', 'noop_edits'):
        if row[column] is not None:
            totaledits += int(row[column])

    # The length columns get the same None treatment as the edit counters;
    # int(None) would otherwise raise TypeError.
    added = None if row['len_added'] is None else int(row['len_added'])
    removed = None if row['len_removed'] is None else int(row['len_removed'])

    for i, al in enumerate(self.cohorts):
        # A row is counted under every activity level whose threshold it meets.
        if totaledits < al:
            continue
        self.data['edits'][i, time_index] += totaledits
        self.data['editors'][i, time_index] += 1
        if added is not None:
            self.data['added'][i, time_index] += added
        if removed is not None:
            # Stored with the sign flipped.
            self.data['removed'][i, time_index] += -removed
        if added is not None and removed is not None:
            self.data['net'][i, time_index] += added + removed
def processMongoDocument(self, editor):
    """Update the monthly edit histogram with one editor document.

    Documents without an ``edit_count`` entry, and editors flagged by
    ``utils.isBot``, are ignored. ``editor['edit_count']`` is walked as
    year -> month -> namespace -> count. For each month that is a key of
    ``self.time_stamps_index``, the counts of the namespaces listed in
    ``self.NS`` are summed and, when the sum is positive, the matching
    ``self.data['editsHistogram']`` bin is incremented by one.

    Args:
        editor: mapping holding ``user_id`` and (optionally) ``edit_count``.
    """
    if 'edit_count' not in editor:
        return
    if utils.isBot(editor['user_id']):
        return

    for year, months in editor['edit_count'].items():
        for month, per_namespace in months.items():
            # Month key: the year followed by the zero-padded month (20xxxx).
            month_key = '%s%02d' % (year, int(month))
            time_index = self.time_stamps_index.get(month_key, None)
            if time_index is None:
                continue

            # The bin depends on the aggregate over the selected namespaces.
            nedits = sum(
                count for ns, count in per_namespace.items() if ns in self.NS
            )

            # Sparse representation: an editor may have edits only in other
            # namespaces, so months with zero selected edits are not counted
            # at all (the histogram has no "zero edits" population).
            if nedits > 0:
                bin_index = self.getIndex(nedits)
                self.data['editsHistogram'][bin_index, time_index] += 1
def processSQLrow(self, row):
    """Add one SQL result row to the counters of every activity level it reaches.

    The row is skipped when ``utils.isBot`` flags its ``user_id`` or when its
    ``rev_year``/``rev_month`` pair is not a key of ``self.time_stamps_index``.
    Otherwise the row's total edit count is compared with each threshold in
    ``self.cohorts`` and every threshold that is met has its ``edits``,
    ``editors``, ``added``, ``removed`` and ``net`` cells in ``self.data``
    updated for that month.

    Args:
        row: mapping with the keys ``user_id``, ``rev_year``, ``rev_month``,
            ``add_edits``, ``remove_edits``, ``noop_edits``, ``len_added``
            and ``len_removed``. The count and length values may be ``None``.
    """
    if utils.isBot(row['user_id']):
        return

    ym = '%d%02d' % (row['rev_year'], row['rev_month'])
    time_index = self.time_stamps_index.get(ym)
    if time_index is None:
        return

    # None counters (presumably SQL NULLs) contribute nothing to the total.
    totaledits = 0
    for column in ('add_edits', 'remove_edits', 'noop_edits'):
        if row[column] is not None:
            totaledits += int(row[column])

    # The length columns get the same None treatment as the edit counters;
    # int(None) would otherwise raise TypeError.
    added = None if row['len_added'] is None else int(row['len_added'])
    removed = None if row['len_removed'] is None else int(row['len_removed'])

    for i, al in enumerate(self.cohorts):
        # A row is counted under every activity level whose threshold it meets.
        if totaledits < al:
            continue
        self.data['edits'][i, time_index] += totaledits
        self.data['editors'][i, time_index] += 1
        if added is not None:
            self.data['added'][i, time_index] += added
        if removed is not None:
            # Stored with the sign flipped.
            self.data['removed'][i, time_index] += -removed
        if added is not None and removed is not None:
            self.data['net'][i, time_index] += added + removed
def processSQLrow(self, row):
    """Count one SQL row of reverts in the cohort of the editor's first edit.

    Column names are prefixed with ``self.reverttype``. The row is dropped
    when the editor is flagged by ``utils.isBot``, when the revert month or
    the first-edit month is not a key of ``self.time_stamps_index``, or when
    ``row['reverts']`` is below ``self.activation``. Otherwise the
    ``<reverttype>_editors`` cell is incremented by one and the
    ``<reverttype>_edits`` cell by the number of reverts.

    Args:
        row: mapping with ``reverts`` plus the ``<reverttype>_``-prefixed
            ``user_id``, ``year``, ``month``, ``first_edit_year`` and
            ``first_edit_month`` columns.
    """
    kind = self.reverttype
    if utils.isBot(row['%s_user_id' % kind]):
        return

    # Month in which the reverts happened.
    ym = '%d%02d' % (row['%s_year' % kind], row['%s_month' % kind])
    time_index = self.time_stamps_index.get(ym, None)
    if time_index is None:
        return

    # Month of the editor's first edit selects the cohort.
    firstedit = '%d%02d' % (
        row['%s_first_edit_year' % kind],
        row['%s_first_edit_month' % kind],
    )
    fe_index = self.time_stamps_index.get(firstedit, None)
    if fe_index is None:
        return
    cohorts_index = self.getIndex(fe_index)

    reverts = row['reverts']
    if reverts < self.activation:
        return

    cell = (cohorts_index, time_index)
    self.data['%s_editors' % kind][cell] += 1
    self.data['%s_edits' % kind][cell] += reverts
def processSQLrow(self, row):
    """Add one SQL row to the byte and edit counters of a first-edit cohort.

    The row is skipped when ``utils.isBot`` flags its ``user_id`` or when its
    revision month or first-edit month is not a key of
    ``self.time_stamps_index``. The cohort index comes from
    ``self.getIndex(fe_index)``. Counters are only touched when the row's
    namespace (as a string) is in ``self.NS``.

    Args:
        row: mapping with the keys ``user_id``, ``rev_year``, ``rev_month``,
            ``namespace``, ``first_edit_year``, ``first_edit_month``,
            ``len_added``, ``len_removed``, ``add_edits`` and
            ``remove_edits``. The length and count values may be ``None``.
    """
    if utils.isBot(row['user_id']):
        return

    ns = str(row['namespace'])
    ym = '%d%02d' % (row['rev_year'], row['rev_month'])
    time_index = self.time_stamps_index.get(ym)
    if time_index is None:
        return

    firstedit = '%d%02d' % (row['first_edit_year'], row['first_edit_month'])
    fe_index = self.time_stamps_index.get(firstedit)
    if fe_index is None:
        return

    cohorts_index = self.getIndex(fe_index)
    if ns not in self.NS:
        return

    # None values (presumably SQL NULLs) are skipped rather than handed to
    # int(), which would raise TypeError.
    added = None if row['len_added'] is None else int(row['len_added'])
    removed = None if row['len_removed'] is None else int(row['len_removed'])

    if added is not None:
        self.data['added'][cohorts_index, time_index] += added
    if removed is not None:
        # Stored with the sign flipped.
        self.data['removed'][cohorts_index, time_index] += -removed
    if added is not None and removed is not None:
        self.data['net'][cohorts_index, time_index] += added + removed

    # Only adding and removing edits are counted here (no noop edits).
    edits = 0
    for column in ('add_edits', 'remove_edits'):
        if row[column] is not None:
            edits += int(row[column])
    self.data['edits'][cohorts_index, time_index] += edits
def processSQLrow(self, row):
    """Add one SQL row to the counters of a first-edit cohort.

    The row is skipped when ``utils.isBot`` flags its ``user_id`` or when its
    revision month or first-edit month is not a key of
    ``self.time_stamps_index``. Counters are only touched when the row's
    namespace (as a string) is in ``self.NS``.

    Args:
        row: mapping with the keys ``user_id``, ``rev_year``, ``rev_month``,
            ``namespace``, ``first_edit_year``, ``first_edit_month``,
            ``len_added``, ``len_removed``, ``add_edits`` and
            ``remove_edits``. The length and count values may be ``None``.

    Raises:
        Exception: any error raised while processing is replaced by an
            ``Exception`` whose message contains the offending row.
    """
    try:
        editor_id = row['user_id']
        if utils.isBot(editor_id):
            return

        ns = str(row['namespace'])
        ym = '%d%02d' % (row['rev_year'], row['rev_month'])
        time_index = self.time_stamps_index.get(ym)
        if time_index is None:
            return

        firstedit = '%d%02d' % (row['first_edit_year'],
                                row['first_edit_month'])
        fe_index = self.time_stamps_index.get(firstedit)
        if fe_index is None:
            return

        cohorts_index = self.getIndex(fe_index)
        if ns not in self.NS:
            return

        len_added = row['len_added']
        len_removed = row['len_removed']
        add_edits = row['add_edits']
        remove_edits = row['remove_edits']

        if len_added is not None:
            self.data['added'][cohorts_index, time_index] += int(len_added)
        if len_removed is not None:
            # Stored with the sign flipped.
            self.data['removed'][cohorts_index, time_index] += -int(len_removed)
        # Each guard checks exactly the columns it converts; the earlier
        # guards tested 'len_removed' in place of 'len_added'/'add_edits'.
        if len_added is not None and len_removed is not None:
            self.data['net'][cohorts_index, time_index] += (
                int(len_added) + int(len_removed))
        if add_edits is not None and remove_edits is not None:
            self.data['edits'][cohorts_index, time_index] += (
                int(add_edits) + int(remove_edits))

        # NOTE(review): the editor is counted only on the first row whose
        # user_id differs from the last counted one; presumably rows arrive
        # grouped by user -- confirm against the query.
        if editor_id != self.old_user_id:
            self.data['editors'][cohorts_index, time_index] += 1
            self.old_user_id = editor_id
    except Exception:
        # Surface the offending row; KeyboardInterrupt/SystemExit pass through.
        raise Exception('row:\n%s' % row)
def processSQLrow(self, row):
    """Add one SQL row to a cohort if its edit count is within the limits.

    The row is skipped when ``utils.isBot`` flags its ``user_id``, when its
    revision month or first-edit month is not a key of
    ``self.time_stamps_index``, or when its total edit count is below
    ``self.minedits`` or above ``self.maxedits`` (no upper limit when
    ``self.maxedits`` is ``None``). The cohort index comes from
    ``self.getIndex(time_index, fe_index)``.

    Args:
        row: mapping with the keys ``user_id``, ``rev_year``, ``rev_month``,
            ``first_edit_year``, ``first_edit_month``, ``add_edits``,
            ``remove_edits``, ``noop_edits``, ``len_added`` and
            ``len_removed``. The count and length values may be ``None``.
    """
    if utils.isBot(row['user_id']):
        return

    ym = '%d%02d' % (row['rev_year'], row['rev_month'])
    time_index = self.time_stamps_index.get(ym)
    if time_index is None:
        return

    firstedit = '%d%02d' % (row['first_edit_year'], row['first_edit_month'])
    fe_index = self.time_stamps_index.get(firstedit)
    if fe_index is None:
        return

    cohorts_index = self.getIndex(time_index, fe_index)

    # None counters contribute nothing to the total.
    edits = 0
    for column in ('add_edits', 'remove_edits', 'noop_edits'):
        if row[column] is not None:
            edits += int(row[column])

    if edits < self.minedits:
        return
    # Test for None first: comparing an int with None raises TypeError on
    # Python 3, so the None check must short-circuit the comparison.
    if self.maxedits is not None and edits > self.maxedits:
        return

    self.data['editors'][cohorts_index, time_index] += 1

    len_added = row['len_added']
    len_removed = row['len_removed']
    if len_added is not None:
        self.data['added'][cohorts_index, time_index] += int(len_added)
    if len_removed is not None:
        # Stored with the sign flipped.
        self.data['removed'][cohorts_index, time_index] += -int(len_removed)
    if len_added is not None and len_removed is not None:
        self.data['net'][cohorts_index, time_index] += (
            int(len_added) + int(len_removed))
    self.data['edits'][cohorts_index, time_index] += edits
def processSQLrow(self, row):
    """Add one SQL row to the counters of a first-edit cohort.

    The row is skipped when ``utils.isBot`` flags its ``user_id`` or when its
    revision month or first-edit month is not a key of
    ``self.time_stamps_index``. Counters are only touched when the row's
    namespace (as a string) is in ``self.NS``.

    Args:
        row: mapping with the keys ``user_id``, ``rev_year``, ``rev_month``,
            ``namespace``, ``first_edit_year``, ``first_edit_month``,
            ``len_added``, ``len_removed``, ``add_edits`` and
            ``remove_edits``. The length and count values may be ``None``.

    Raises:
        Exception: any error raised while processing is replaced by an
            ``Exception`` whose message contains the offending row.
    """
    try:
        editor_id = row['user_id']
        if utils.isBot(editor_id):
            return

        ns = str(row['namespace'])
        ym = '%d%02d' % (row['rev_year'], row['rev_month'])
        time_index = self.time_stamps_index.get(ym)
        if time_index is None:
            return

        firstedit = '%d%02d' % (row['first_edit_year'],
                                row['first_edit_month'])
        fe_index = self.time_stamps_index.get(firstedit)
        if fe_index is None:
            return

        cohorts_index = self.getIndex(fe_index)
        if ns not in self.NS:
            return

        len_added = row['len_added']
        len_removed = row['len_removed']
        add_edits = row['add_edits']
        remove_edits = row['remove_edits']

        if len_added is not None:
            self.data['added'][cohorts_index, time_index] += int(len_added)
        if len_removed is not None:
            # Stored with the sign flipped.
            self.data['removed'][cohorts_index, time_index] += -int(len_removed)
        # Each guard checks exactly the columns it converts; the earlier
        # guards tested 'len_removed' in place of 'len_added'/'add_edits'.
        if len_added is not None and len_removed is not None:
            self.data['net'][cohorts_index, time_index] += (
                int(len_added) + int(len_removed))
        if add_edits is not None and remove_edits is not None:
            self.data['edits'][cohorts_index, time_index] += (
                int(add_edits) + int(remove_edits))

        # NOTE(review): the editor is counted only on the first row whose
        # user_id differs from the last counted one; presumably rows arrive
        # grouped by user -- confirm against the query.
        if editor_id != self.old_user_id:
            self.data['editors'][cohorts_index, time_index] += 1
            self.old_user_id = editor_id
    except Exception:
        # Surface the offending row; KeyboardInterrupt/SystemExit pass through.
        raise Exception('row:\n%s' % row)
def processSQLrow(self, row):
    """Add one SQL row to a cohort if its edit count is within the limits.

    The row is skipped when ``utils.isBot`` flags its ``user_id``, when its
    revision month or first-edit month is not a key of
    ``self.time_stamps_index``, or when its total edit count is below
    ``self.minedits`` or above ``self.maxedits`` (no upper limit when
    ``self.maxedits`` is ``None``). The cohort index comes from
    ``self.getIndex(time_index, fe_index)``.

    Args:
        row: mapping with the keys ``user_id``, ``rev_year``, ``rev_month``,
            ``first_edit_year``, ``first_edit_month``, ``add_edits``,
            ``remove_edits``, ``noop_edits``, ``len_added`` and
            ``len_removed``. The count and length values may be ``None``.
    """
    if utils.isBot(row['user_id']):
        return

    ym = '%d%02d' % (row['rev_year'], row['rev_month'])
    time_index = self.time_stamps_index.get(ym)
    if time_index is None:
        return

    firstedit = '%d%02d' % (row['first_edit_year'], row['first_edit_month'])
    fe_index = self.time_stamps_index.get(firstedit)
    if fe_index is None:
        return

    cohorts_index = self.getIndex(time_index, fe_index)

    # None counters contribute nothing to the total.
    edits = 0
    for column in ('add_edits', 'remove_edits', 'noop_edits'):
        if row[column] is not None:
            edits += int(row[column])

    if edits < self.minedits:
        return
    # Test for None first: comparing an int with None raises TypeError on
    # Python 3, so the None check must short-circuit the comparison.
    if self.maxedits is not None and edits > self.maxedits:
        return

    self.data['editors'][cohorts_index, time_index] += 1

    len_added = row['len_added']
    len_removed = row['len_removed']
    if len_added is not None:
        self.data['added'][cohorts_index, time_index] += int(len_added)
    if len_removed is not None:
        # Stored with the sign flipped.
        self.data['removed'][cohorts_index, time_index] += -int(len_removed)
    if len_added is not None and len_removed is not None:
        self.data['net'][cohorts_index, time_index] += (
            int(len_added) + int(len_removed))
    self.data['edits'][cohorts_index, time_index] += edits
def processSQLrow(self, row):
    """Add one SQL row to the bin selected by its total edit count.

    Rows of editors flagged by ``utils.isBot`` and rows whose
    ``rev_year``/``rev_month`` is not a key of ``self.time_stamps_index`` are
    ignored. The bin is ``self.getIndex(edits)``, where ``edits`` is the sum
    of the non-None ``add_edits``, ``remove_edits`` and ``noop_edits``
    values. The ``editors``, ``added``, ``removed``, ``net`` and ``edits``
    cells of ``self.data`` are updated for that bin and month.

    Args:
        row: mapping with the keys ``user_id``, ``rev_year``, ``rev_month``,
            ``add_edits``, ``remove_edits``, ``noop_edits``, ``len_added``
            and ``len_removed``.
    """
    if utils.isBot(row['user_id']):
        return

    month_key = '%d%02d' % (row['rev_year'], row['rev_month'])
    time_index = self.time_stamps_index.get(month_key, None)
    if time_index is None:
        return

    # Sum whichever edit counters are present.
    edits = 0
    for column in ('add_edits', 'remove_edits', 'noop_edits'):
        count = row[column]
        if count is not None:
            edits += int(count)

    cell = (self.getIndex(edits), time_index)
    self.data['editors'][cell] += 1

    added = row['len_added']
    if added is not None:
        self.data['added'][cell] += int(added)

    removed = row['len_removed']
    if removed is not None:
        # Stored with the sign flipped.
        self.data['removed'][cell] += -int(removed)

    # The net change needs both lengths.
    if added is not None and removed is not None:
        self.data['net'][cell] += int(added) + int(removed)

    self.data['edits'][cell] += edits
def processSQLrow(self, row):
    """Add one SQL row to the per-day counters of a cohort.

    The row is skipped when ``utils.isBot`` flags its ``user_id`` or when its
    revision day (``rev_year``/``rev_month``/``rev_day``) or its first-edit
    key is not a key of ``self.time_stamps_index``. The cohort index comes
    from ``self.getIndex(time_index, fe_index)``. Counters are only touched
    when the row's namespace (as a string) is in ``self.NS``.

    Args:
        row: mapping with the keys ``user_id``, ``rev_year``, ``rev_month``,
            ``rev_day``, ``namespace``, ``first_edit_year``, ``len_added``,
            ``len_removed``, ``add_edits`` and ``remove_edits``. The length
            and count values may be ``None``.
    """
    if utils.isBot(row['user_id']):
        return

    ns = str(row['namespace'])
    ymd = '%d%02d%02d' % (row['rev_year'], row['rev_month'], row['rev_day'])
    time_index = self.time_stamps_index.get(ymd)
    if time_index is None:
        return

    # The first eight characters of 'first_edit_year' are used as the lookup
    # key. NOTE(review): despite its name the column presumably carries a
    # YYYYMMDDHHMMSS timestamp -- confirm against the query.
    firstedit = str(row['first_edit_year'])[:8]
    fe_index = self.time_stamps_index.get(firstedit)
    if fe_index is None:
        return

    cohorts_index = self.getIndex(time_index, fe_index)
    if ns not in self.NS:
        return

    # None values (presumably SQL NULLs) are skipped rather than handed to
    # int(), which would raise TypeError.
    added = None if row['len_added'] is None else int(row['len_added'])
    removed = None if row['len_removed'] is None else int(row['len_removed'])

    if added is not None:
        self.data['addedPerDay'][cohorts_index, time_index] += added
    if removed is not None:
        # Stored with the sign flipped.
        self.data['removedPerDay'][cohorts_index, time_index] += -removed
    if added is not None and removed is not None:
        self.data['netPerDay'][cohorts_index, time_index] += added + removed

    # Only adding and removing edits are counted here (no noop edits).
    edits = 0
    for column in ('add_edits', 'remove_edits'):
        if row[column] is not None:
            edits += int(row[column])
    self.data['editsPerDay'][cohorts_index, time_index] += edits
def processSQLrow(self, row):
    """Add one SQL row to the bin selected by its total edit count.

    Rows of editors flagged by ``utils.isBot`` and rows whose
    ``rev_year``/``rev_month`` is not a key of ``self.time_stamps_index`` are
    ignored. The bin is ``self.getIndex(edits)``, where ``edits`` is the sum
    of the non-None ``add_edits``, ``remove_edits`` and ``noop_edits``
    values. The ``editors``, ``added``, ``removed``, ``net`` and ``edits``
    cells of ``self.data`` are updated for that bin and month.

    Args:
        row: mapping with the keys ``user_id``, ``rev_year``, ``rev_month``,
            ``add_edits``, ``remove_edits``, ``noop_edits``, ``len_added``
            and ``len_removed``.
    """
    if utils.isBot(row['user_id']):
        return

    month_key = '%d%02d' % (row['rev_year'], row['rev_month'])
    time_index = self.time_stamps_index.get(month_key, None)
    if time_index is None:
        return

    # Sum whichever edit counters are present.
    edits = 0
    for column in ('add_edits', 'remove_edits', 'noop_edits'):
        count = row[column]
        if count is not None:
            edits += int(count)

    cell = (self.getIndex(edits), time_index)
    self.data['editors'][cell] += 1

    added = row['len_added']
    if added is not None:
        self.data['added'][cell] += int(added)

    removed = row['len_removed']
    if removed is not None:
        # Stored with the sign flipped.
        self.data['removed'][cell] += -int(removed)

    # The net change needs both lengths.
    if added is not None and removed is not None:
        self.data['net'][cell] += int(added) + int(removed)

    self.data['edits'][cell] += edits
def processSQLrow(self, row):
    """Add one SQL row to the per-day counters of a cohort.

    The row is skipped when ``utils.isBot`` flags its ``user_id`` or when its
    revision day (``rev_year``/``rev_month``/``rev_day``) or its first-edit
    key is not a key of ``self.time_stamps_index``. The cohort index comes
    from ``self.getIndex(time_index, fe_index)``. Counters are only touched
    when the row's namespace (as a string) is in ``self.NS``.

    Args:
        row: mapping with the keys ``user_id``, ``rev_year``, ``rev_month``,
            ``rev_day``, ``namespace``, ``first_edit_year``, ``len_added``,
            ``len_removed``, ``add_edits`` and ``remove_edits``. The length
            and count values may be ``None``.
    """
    if utils.isBot(row['user_id']):
        return

    ns = str(row['namespace'])
    ymd = '%d%02d%02d' % (row['rev_year'], row['rev_month'], row['rev_day'])
    time_index = self.time_stamps_index.get(ymd)
    if time_index is None:
        return

    # The first eight characters of 'first_edit_year' are used as the lookup
    # key. NOTE(review): despite its name the column presumably carries a
    # YYYYMMDDHHMMSS timestamp -- confirm against the query.
    firstedit = str(row['first_edit_year'])[:8]
    fe_index = self.time_stamps_index.get(firstedit)
    if fe_index is None:
        return

    cohorts_index = self.getIndex(time_index, fe_index)
    if ns not in self.NS:
        return

    # None values (presumably SQL NULLs) are skipped rather than handed to
    # int(), which would raise TypeError.
    added = None if row['len_added'] is None else int(row['len_added'])
    removed = None if row['len_removed'] is None else int(row['len_removed'])

    if added is not None:
        self.data['addedPerDay'][cohorts_index, time_index] += added
    if removed is not None:
        # Stored with the sign flipped.
        self.data['removedPerDay'][cohorts_index, time_index] += -removed
    if added is not None and removed is not None:
        self.data['netPerDay'][cohorts_index, time_index] += added + removed

    # Only adding and removing edits are counted here (no noop edits).
    edits = 0
    for column in ('add_edits', 'remove_edits'):
        if row[column] is not None:
            edits += int(row[column])
    self.data['editsPerDay'][cohorts_index, time_index] += edits
def processMongoDocument(self, editor):
    """Update the monthly edit histogram with one editor document.

    Documents without an ``edit_count`` entry, and editors flagged by
    ``utils.isBot``, are ignored. ``editor['edit_count']`` is walked as
    year -> month -> namespace -> count. For each month that is a key of
    ``self.time_stamps_index``, the counts of the namespaces listed in
    ``self.NS`` are summed and, when the sum is positive, the matching
    ``self.data['editsHistogram']`` bin is incremented by one.

    Args:
        editor: mapping holding ``user_id`` and (optionally) ``edit_count``.
    """
    if 'edit_count' not in editor:
        return
    if utils.isBot(editor['user_id']):
        return

    for year, months in editor['edit_count'].items():
        for month, per_namespace in months.items():
            # Month key: the year followed by the zero-padded month (20xxxx).
            month_key = '%s%02d' % (year, int(month))
            time_index = self.time_stamps_index.get(month_key, None)
            if time_index is None:
                continue

            # The bin depends on the aggregate over the selected namespaces.
            nedits = sum(
                count for ns, count in per_namespace.items() if ns in self.NS
            )

            # Sparse representation: an editor may have edits only in other
            # namespaces, so months with zero selected edits are not counted
            # at all (the histogram has no "zero edits" population).
            if nedits > 0:
                bin_index = self.getIndex(nedits)
                self.data['editsHistogram'][bin_index, time_index] += 1
def processSQLrow(self, row):
    """Accumulate one editor's rows and book the totals when the next editor starts.

    Rows of editors flagged by ``utils.isBot`` are ignored. When ``user_id``
    differs from ``self.old_user_id`` the totals gathered for the previous
    editor are added to ``self.data`` (bin ``self.getIndex(self.edits)``,
    column ``self.fe_index``) and the accumulators are reset for the new
    editor. Rows are then summed into ``self.edits``, ``self.added``,
    ``self.removed`` and ``self.net`` as long as their month does not lie
    after ``self.lastym``, i.e. ``self.period`` months after the first edit.

    NOTE(review): totals are only booked when a row of a *different* editor
    arrives, so nothing in this method books the final editor; presumably
    the caller does that after the last row -- confirm.

    Args:
        row: mapping with the keys ``user_id``, ``first_edit_year``,
            ``first_edit_month``, ``rev_year``, ``rev_month``, ``add_edits``,
            ``remove_edits``, ``noop_edits``, ``len_added`` and
            ``len_removed``. The count and length values may be ``None``.
    """
    editor_id = row['user_id']
    if utils.isBot(editor_id):
        return

    if editor_id != self.old_user_id:
        # Book the previous editor, but only if that editor had a usable
        # first-edit month. Otherwise the accumulators still hold stale
        # values and self.fe_index is None, which is not a valid column
        # (on a NumPy array a None index would even touch the whole row).
        if self.old_user_id is not None and self.fe_index is not None:
            cohorts_index = self.getIndex(self.edits)
            # Accumulate over all editors of the bin; plain assignment would
            # keep only the most recently booked editor's totals.
            self.data['editors'][cohorts_index, self.fe_index] += 1
            self.data['added'][cohorts_index, self.fe_index] += self.added
            self.data['removed'][cohorts_index, self.fe_index] += self.removed
            self.data['net'][cohorts_index, self.fe_index] += self.net
            self.data['edits'][cohorts_index, self.fe_index] += self.edits

        # Initialise the state for the editor that starts with this row.
        self.old_user_id = editor_id
        year = row['first_edit_year']
        month = row['first_edit_month']
        self.firstedit = '%d%02d' % (year, month)
        self.fe_index = self.time_stamps_index.get(self.firstedit)
        if self.fe_index is None:
            return

        # Last month of interest. Working on a zero-based month count with
        # floor division keeps the month within 01..12 and the year an
        # integer on both Python 2 and 3.
        months = month - 1 + self.period
        self.lastym = '%d%02d' % (year + months // 12, months % 12 + 1)
        self.edits = 0
        self.added = 0
        self.removed = 0
        self.net = 0

    # Remaining rows of an editor whose first-edit month is not tracked.
    if self.fe_index is None:
        return

    # Fixed-width 'YYYYMM' strings compare chronologically.
    ym = '%d%02d' % (row['rev_year'], row['rev_month'])
    if ym > self.lastym:
        return

    for column in ('add_edits', 'remove_edits', 'noop_edits'):
        if row[column] is not None:
            self.edits += int(row[column])

    len_added = row['len_added']
    len_removed = row['len_removed']
    if len_added is not None:
        self.added += int(len_added)
    if len_removed is not None:
        # Stored with the sign flipped.
        self.removed += -int(len_removed)
    if len_added is not None and len_removed is not None:
        self.net += int(len_added) + int(len_removed)
def processSQLrow(self, row):
    """Accumulate one editor's rows and book the totals when the next editor starts.

    Rows of editors flagged by ``utils.isBot`` are ignored. When ``user_id``
    differs from ``self.old_user_id`` the totals gathered for the previous
    editor are added to ``self.data`` (bin ``self.getIndex(self.edits)``,
    column ``self.fe_index``) and the accumulators are reset for the new
    editor. Rows are then summed into ``self.edits``, ``self.added``,
    ``self.removed`` and ``self.net`` as long as their month does not lie
    after ``self.lastym``, i.e. ``self.period`` months after the first edit.

    NOTE(review): totals are only booked when a row of a *different* editor
    arrives, so nothing in this method books the final editor; presumably
    the caller does that after the last row -- confirm.

    Args:
        row: mapping with the keys ``user_id``, ``first_edit_year``,
            ``first_edit_month``, ``rev_year``, ``rev_month``, ``add_edits``,
            ``remove_edits``, ``noop_edits``, ``len_added`` and
            ``len_removed``. The count and length values may be ``None``.
    """
    editor_id = row['user_id']
    if utils.isBot(editor_id):
        return

    if editor_id != self.old_user_id:
        # Book the previous editor, but only if that editor had a usable
        # first-edit month. Otherwise the accumulators still hold stale
        # values and self.fe_index is None, which is not a valid column
        # (on a NumPy array a None index would even touch the whole row).
        if self.old_user_id is not None and self.fe_index is not None:
            cohorts_index = self.getIndex(self.edits)
            # Accumulate over all editors of the bin; plain assignment would
            # keep only the most recently booked editor's totals.
            self.data['editors'][cohorts_index, self.fe_index] += 1
            self.data['added'][cohorts_index, self.fe_index] += self.added
            self.data['removed'][cohorts_index, self.fe_index] += self.removed
            self.data['net'][cohorts_index, self.fe_index] += self.net
            self.data['edits'][cohorts_index, self.fe_index] += self.edits

        # Initialise the state for the editor that starts with this row.
        self.old_user_id = editor_id
        year = row['first_edit_year']
        month = row['first_edit_month']
        self.firstedit = '%d%02d' % (year, month)
        self.fe_index = self.time_stamps_index.get(self.firstedit)
        if self.fe_index is None:
            return

        # Last month of interest. Working on a zero-based month count with
        # floor division keeps the month within 01..12 and the year an
        # integer on both Python 2 and 3.
        months = month - 1 + self.period
        self.lastym = '%d%02d' % (year + months // 12, months % 12 + 1)
        self.edits = 0
        self.added = 0
        self.removed = 0
        self.net = 0

    # Remaining rows of an editor whose first-edit month is not tracked.
    if self.fe_index is None:
        return

    # Fixed-width 'YYYYMM' strings compare chronologically.
    ym = '%d%02d' % (row['rev_year'], row['rev_month'])
    if ym > self.lastym:
        return

    for column in ('add_edits', 'remove_edits', 'noop_edits'):
        if row[column] is not None:
            self.edits += int(row[column])

    len_added = row['len_added']
    len_removed = row['len_removed']
    if len_added is not None:
        self.added += int(len_added)
    if len_removed is not None:
        # Stored with the sign flipped.
        self.removed += -int(len_removed)
    if len_added is not None and len_removed is not None:
        self.net += int(len_added) + int(len_removed)