Ejemplo n.º 1
0
def run(similar_titles, session):
    """Write similar-title rows to stdout as TSV, adding a lead excerpt.

    Rows are grouped by ``input_title`` using ``aggregate``.  For each group,
    ``session.revisions.query`` is asked for one revision of the title with
    its content; the ``'*'`` field of the first result is parsed with
    ``mwparserfromhell``, stripped of markup and truncated to 200 characters
    (always followed by ``"..."``).  One output row is written per input row.

    A ``.`` is written to stderr for each title as a progress indicator, and
    a newline once all groups are done.

    Args:
        similar_titles: iterable of rows exposing ``input_title``,
            ``similar_title``, ``rank`` and ``snippet`` attributes.
        session: object exposing ``revisions.query(titles=..., properties=...,
            limit=...)``.
    """
    # Compiled once here rather than on every output row.
    fragment_re = re.compile(r"\#.*")

    grouped_similar_titles = aggregate(similar_titles,
                                       by=lambda r: r.input_title)

    writer = tsv.Writer(sys.stdout, headers=HEADERS)
    for input_title, rows in grouped_similar_titles:

        try:
            revisions = session.revisions.query(
                titles={input_title}, properties={'content'}, limit=1)
            content = list(revisions)[0]['*']
        except Exception:
            # Deliberate best effort: any failure to fetch the content
            # (including an empty result) is logged and the title is still
            # emitted, with an empty excerpt.
            content = ""
            sys.stderr.write(traceback.format_exc())

        parsed = mwparserfromhell.parse(content)
        # NOTE(review): relies on strip_code() returning a string, as the
        # original per-character join did.
        lead_bit = str(parsed.strip_code())[:200] + "..."

        sys.stderr.write(".")
        sys.stderr.flush()
        for row in rows:
            writer.write([
                # Drop any "#fragment" suffix from the displayed title.
                fragment_re.sub("", row.input_title.replace("_", " ")),
                lead_bit,
                row.similar_title.replace("_", " "),
                row.rank,
                row.snippet,
            ])

    sys.stderr.write("\n")
    sys.stderr.flush()
Ejemplo n.º 2
0
def main():
    """Copy TSV rows from stdin to stdout, appending a within-group index.

    Rows are read with the first line as headers and grouped by their
    ``event_experimentId`` value using ``aggregate``.  Each row is written
    back with one extra trailing column: its zero-based position inside its
    group.  A ``.`` is written to stderr for every 100th group as a progress
    indicator, followed by a newline at the end.
    """
    reader = tsv.Reader(sys.stdin, headers=tsv.Reader.FIRST_LINE)
    writer = tsv.Writer(sys.stdout)

    groups = aggregate(reader, by=lambda row: row['event_experimentId'])
    # The group key itself is not needed, only the rows of each group.
    for i, (_, user_rows) in enumerate(groups):
        if i % 100 == 0:
            sys.stderr.write(".")
        for user_i, row in enumerate(user_rows):
            # list(...) keeps this working whether values() returns a list
            # or a view object that does not support "+".
            writer.write(list(row.values()) + [user_i])

    sys.stderr.write("\n")
def run(similar_titles, n):
    """Write up to ``n`` randomly ordered rows per input title as TSV.

    Rows are grouped by ``input_title`` using ``aggregate``.  Each group is
    materialized, shuffled in place with ``random.shuffle`` and its first
    ``n`` rows are written to stdout (fewer if the group is smaller).

    Args:
        similar_titles: iterable of rows exposing an ``input_title``
            attribute and a ``values()`` method.
        n: maximum number of rows to emit for each input title.
    """
    by_input_title = aggregate(similar_titles, by=lambda r: r.input_title)
    out = tsv.Writer(sys.stdout, headers=HEADERS)

    for _title, group in by_input_title:
        candidates = list(group)
        random.shuffle(candidates)

        for picked in candidates[:n]:
            out.write(picked.values())
Ejemplo n.º 4
0
def run(revs, radius, cutoff):
    """Annotate revisions with revert/archive status and write them as TSV.

    Revisions are grouped by their ``wiki`` attribute using ``aggregate``; a
    database connection is opened per wiki.  Each revision is looked up by
    ``rev_id`` and checked with ``reverts.database.check_row``.  The output
    row contains the revision's own fields followed by ``reverted`` and
    ``archived``.  The TSV header is taken from the first revision seen.

    One progress character per revision is written to stderr: ``r`` when a
    revert was found, ``.`` when none was found, ``a`` when the lookup raised
    ``KeyError``.

    Args:
        revs: iterable of mapping-like rows that also expose ``wiki`` and
            ``rev_id`` attributes.
        radius: passed through to ``check_row`` as ``radius``.
        cutoff: added to the revision's timestamp to form the ``before``
            bound given to ``check_row``.
    """
    extra_fields = ["reverted", "archived"]
    writer = None
    for wiki, wiki_revs in aggregate(revs, by=lambda r: r.wiki):

        sys.stderr.write("Conn({0}): ".format(wiki))
        db = DB.from_params(
            host="analytics-store.eqiad.wmnet", user="******", read_default_file="~/.my.research.cnf", db=wiki
        )
        for rev in wiki_revs:
            # list(...) keeps this working whether keys() returns a list or
            # a view object that does not support "+".
            fields = list(rev.keys()) + extra_fields
            if writer is None:
                writer = tsv.Writer(sys.stdout, headers=fields)

            rev_doc = dict(rev)

            try:
                rev_row = db.revisions.get(int(rev.rev_id))
                rev_doc["archived"] = False

                revert = reverts.database.check_row(
                    db, rev_row, radius=radius,
                    before=Timestamp(rev_row["rev_timestamp"]) + cutoff
                )
                rev_doc["reverted"] = revert is not None
                marker = "r" if rev_doc["reverted"] else "."
            except KeyError:
                # Presumably raised when the revision row cannot be found
                # (verify against the DB API).  The original stored False
                # here as well, which made the "archived" column constant
                # and contradicted the "a" progress marker.
                rev_doc["archived"] = True
                rev_doc["reverted"] = None
                marker = "a"

            sys.stderr.write(marker)
            sys.stderr.flush()

            writer.write([rev_doc[k] for k in fields])

        sys.stderr.write("\n")
        sys.stderr.flush()
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or None if the denominator is not positive."""
    return numerator / denominator if denominator > 0 else None


def run(wiki_editor_months, active_edits=5):
    """Write per-wiki, per-month active-editor statistics to stdout as TSV.

    Input rows are grouped by ``wiki`` and then by ``month`` using
    ``aggregate``.  Within a month every editor with a non-zero ``user_id``
    and at least ``active_edits`` revisions is placed in exactly one category
    of the current ``MonthlyActiveEditors``: new, surviving, old or
    reactivated.  One row is written per (wiki, month) with the category
    sizes, ratios against the previous month, the number of editors active in
    the previous month but not in this one, and the number of editors active
    for the first time within this wiki.  Ratios are None when their
    denominator is zero.

    A ``"<wiki>, <month>"`` line is written to stderr for each month.

    Args:
        wiki_editor_months: iterable of rows exposing ``wiki``, ``month``,
            ``user_id``, ``user_registration``, ``attached_method`` and
            ``revisions`` attributes.
        active_edits: minimum revisions in a month to count as active.
    """
    writer = tsv.Writer(sys.stdout, headers=HEADERS)

    for wiki, editor_months in aggregate(wiki_editor_months, by=lambda em: em.wiki):
        # Index 0 is the month being filled in, index 1 the month before it.
        mae = deque(
            [MonthlyActiveEditors(), MonthlyActiveEditors(), MonthlyActiveEditors()],
            maxlen=3,
        )
        previously_active = set()
        for month, editors in aggregate(editor_months, by=lambda em: em.month):
            sys.stderr.write("{0}, {1}\n".format(wiki, month))
            current, previous = mae[0], mae[1]
            first_actives = 0
            for editor in editors:
                user_id = editor.user_id
                user_registration = editor.user_registration
                attached_method = editor.attached_method
                revisions = editor.revisions or 0

                # Skip user id 0 and editors below the activity threshold.
                if user_id == 0 or revisions < active_edits:
                    continue

                if user_id not in previously_active:
                    first_actives += 1
                    previously_active.add(user_id)

                # The original compared against month[:4] + month[4:], which
                # just re-concatenates month.
                # NOTE(review): confirm that a plain comparison with the
                # month value is what was intended.
                if (user_registration is not None
                        and user_registration > month
                        and attached_method != 'login'):
                    # New active editor
                    current.new.add(user_id)
                elif user_id in previous.new:
                    # Surviving new active editor
                    current.surviving.add(user_id)
                elif user_id in previous:
                    # Old active editor
                    current.old.add(user_id)
                else:
                    # Other active editor
                    current.reactivated.add(user_id)

            previous_total = len(previous)
            previous_new = len(previous.new)
            inactivated = len(previous - current)
            writer.write([
                wiki,
                month,
                len(current),
                len(current.new),
                len(current.surviving),
                _safe_ratio(len(current.surviving), previous_new),
                len(current.old),
                _safe_ratio(len(current.old), previous_total - previous_new),
                len(current.reactivated),
                inactivated,
                _safe_ratio(inactivated, previous_total),
                first_actives,
            ])

            mae.appendleft(MonthlyActiveEditors())  # Updating current
def run(wiki_editor_months, active_edits=5):
    """Write per-wiki, per-month active-editor statistics to stdout as TSV.

    Input rows are grouped by ``wiki`` and then by ``month`` using
    ``aggregate``.  Within a month every editor with a non-zero ``user_id``
    and at least ``active_edits`` revisions is placed in exactly one category
    of the current ``MonthlyActiveEditors``: new, surviving, old or
    reactivated.  One row is written per (wiki, month) with the category
    sizes, ratios against the previous month, the number of editors active in
    the previous month but not in this one, and the number of editors active
    for the first time within this wiki.  Ratios are None when their
    denominator is zero.

    A ``"<wiki>, <month>"`` line is written to stderr for each month.

    Args:
        wiki_editor_months: iterable of rows exposing ``wiki``, ``month``,
            ``user_id``, ``user_registration``, ``attached_method`` and
            ``revisions`` attributes.
        active_edits: minimum revisions in a month to count as active.
    """
    writer = tsv.Writer(sys.stdout, headers=HEADERS)

    for wiki, editor_months in aggregate(wiki_editor_months,
                                         by=lambda em: em.wiki):
        # Index 0 is the month being filled in, index 1 the month before it.
        mae = deque(
            [
                MonthlyActiveEditors(),
                MonthlyActiveEditors(),
                MonthlyActiveEditors(),
            ],
            maxlen=3,
        )
        previously_active = set()
        for month, editors in aggregate(editor_months, by=lambda em: em.month):
            sys.stderr.write("{0}, {1}\n".format(wiki, month))
            current = mae[0]
            previous = mae[1]
            first_actives = 0
            for editor in editors:
                user_id = editor.user_id
                user_registration = editor.user_registration
                attached_method = editor.attached_method
                revisions = editor.revisions or 0

                # Skip user id 0 and editors below the activity threshold.
                if user_id == 0 or revisions < active_edits:
                    continue

                if user_id not in previously_active:
                    first_actives += 1
                    previously_active.add(user_id)

                # The original compared against month[:4] + month[4:], which
                # just re-concatenates month.
                # NOTE(review): confirm that a plain comparison with the
                # month value is what was intended.
                registered_after_month = (
                    user_registration is not None
                    and user_registration > month
                )

                if registered_after_month and attached_method != 'login':
                    # New active editor
                    current.new.add(user_id)
                elif user_id in previous.new:
                    # Surviving new active editor
                    current.surviving.add(user_id)
                elif user_id in previous:
                    # Old active editor
                    current.old.add(user_id)
                else:
                    # Other active editor
                    current.reactivated.add(user_id)

            # Denominators for the ratios, all taken from the previous month.
            previous_total = len(previous)
            previous_new = len(previous.new)
            previous_old = previous_total - previous_new

            surviving = len(current.surviving)
            old = len(current.old)
            inactivated = len(previous - current)

            writer.write([
                wiki,
                month,
                len(current),
                len(current.new),
                surviving,
                surviving / previous_new if previous_new > 0 else None,
                old,
                old / previous_old if previous_old > 0 else None,
                len(current.reactivated),
                inactivated,
                inactivated / previous_total if previous_total > 0 else None,
                first_actives,
            ])

            mae.appendleft(MonthlyActiveEditors())  # Updating current