Esempio n. 1
0
def main():
    """Append each row's position within its experiment group and echo it.

    Rows are read from stdin through ``tsv.Reader`` (constructed with
    ``headers=tsv.Reader.FIRST_LINE``), grouped by ``aggregate`` on the
    ``event_experimentId`` column, and written to stdout as the row's values
    followed by the row's zero-based index inside its group.

    A "." is written to stderr for every 100th group as a progress marker,
    and a final newline once all input is consumed.
    """
    reader = tsv.Reader(sys.stdin, headers=tsv.Reader.FIRST_LINE)
    writer = tsv.Writer(sys.stdout)

    groups = aggregate(reader, by=lambda row: row['event_experimentId'])
    for group_i, (_experiment_id, user_rows) in enumerate(groups):
        if group_i % 100 == 0:
            sys.stderr.write(".")

        for user_i, row in enumerate(user_rows):
            # list() keeps the concatenation valid when values() yields a
            # view or iterator instead of a real list.
            writer.write(list(row.values()) + [user_i])

    sys.stderr.write("\n")
Esempio n. 2
0
def main():
    """Emit browser, OS and device family for every input row's user agent.

    Rows come from stdin via ``tsv.Reader`` (column types ``int, str``); each
    row's ``event_userAgent`` is parsed with ``user_agents.parse`` and the
    row ``id`` plus the three ``family`` values are written to stdout.

    Progress goes to stderr: a ``"%6d: "`` prefix at every 10000th row, a
    "." at every 100th row, and a newline after each full block of 10000
    rows and once more at the end.
    """
    rows = tsv.Reader(sys.stdin, types=[int, str])
    out = tsv.Writer(sys.stdout)
    progress = sys.stderr.write

    for index, record in enumerate(rows):
        agent = user_agents.parse(record.event_userAgent)

        if index % 10000 == 0:
            progress("%6d: " % index)
        if (index + 1) % 10000 == 0:
            progress("\n")
        if index % 100 == 0:
            progress(".")

        families = [
            agent.browser.family,
            agent.os.family,
            agent.device.family,
        ]
        out.write([record.id] + families)

    progress("\n")
Esempio n. 3
0
def run(revisions, lang, move_re):
    """Extract page moves from revision comments and write them to stdout.

    ``move_re`` is matched against each revision's ``rev_comment``.  When it
    matches, the "from" and "to" groups are split into namespace and title
    with ``parse_page_name`` and one row is written.  When it does not, a
    diagnostic line is written to stderr and the revision is skipped.

    Args:
        revisions: Iterable of revision rows exposing ``page_id``,
            ``page_namespace``, ``page_title``, ``rev_id``,
            ``rev_timestamp`` and ``rev_comment``.
        lang: Passed through unchanged to ``parse_page_name``.
        move_re: Compiled pattern with the named groups ``from``, ``to``
            and ``comment``.
    """
    writer = tsv.Writer(sys.stdout)
    # Columns written, in order (the writer is created without headers):
    #   page_id, page_namespace, page_title, rev_id, timestamp,
    #   from_namespace, from_title, to_namespace, to_title, comment

    for rev in revisions:
        match = move_re.match(rev.rev_comment)

        if match is None:
            sys.stderr.write("Could not extract move from: " +
                             rev.rev_comment + "\n")
            continue

        from_ns, from_title = parse_page_name(match.group("from"), lang)
        to_ns, to_title = parse_page_name(match.group("to"), lang)
        comment = match.group("comment")

        writer.write([
            rev.page_id, rev.page_namespace, rev.page_title, rev.rev_id,
            rev.rev_timestamp, from_ns, from_title, to_ns, to_title,
            comment
        ])
def run(wiki_editor_months, active_edits=5):
    """Write one row of active-editor statistics per wiki and month.

    Input rows are grouped by ``wiki`` and then by ``month`` using
    ``aggregate``.  Within a month, an editor with a non-zero ``user_id`` and
    at least ``active_edits`` revisions is placed in exactly one bucket of
    the current ``MonthlyActiveEditors``:

    * ``new``         - registration value greater than the month value and
                        ``attached_method`` is not ``'login'``
    * ``surviving``   - was in the previous month's ``new`` bucket
    * ``old``         - was in the previous month at all
    * ``reactivated`` - none of the above

    The row written per month holds the bucket sizes, the ratios of
    surviving / old / inactivated editors relative to the previous month
    (``None`` when the denominator is zero), and the number of editors seen
    as active for the first time on that wiki.

    Args:
        wiki_editor_months: Iterable of rows exposing ``wiki``, ``month``,
            ``user_id``, ``user_registration``, ``attached_method`` and
            ``revisions``.
        active_edits: Minimum revisions in a month to count as active.
    """
    writer = tsv.Writer(sys.stdout, headers=HEADERS)

    for wiki, editor_months in aggregate(wiki_editor_months,
                                         by=lambda em: em.wiki):
        # Sliding window of monthly buckets: index 0 is the month being
        # filled, index 1 the month before it.
        mae = deque(
            [
                MonthlyActiveEditors(),
                MonthlyActiveEditors(),
                MonthlyActiveEditors()
            ],
            maxlen=3
        )
        previously_active = set()

        for month, editors in aggregate(editor_months, by=lambda em: em.month):
            sys.stderr.write("{0}, {1}\n".format(wiki, month))
            current, previous = mae[0], mae[1]
            first_actives = 0

            for editor in editors:
                user_id = editor.user_id
                user_registration = editor.user_registration
                attached_method = editor.attached_method
                revisions = editor.revisions or 0

                # user_id 0 is ignored, as is anyone below the threshold.
                is_active = user_id != 0 and revisions >= active_edits
                if not is_active:
                    continue

                if user_id not in previously_active:
                    first_actives += 1
                    previously_active.add(user_id)

                # NOTE(review): month[:4] + month[4:] reassembles `month`
                # unchanged for a string; a separator or suffix may have
                # been intended here - confirm before relying on it.
                is_new = (
                    user_registration is not None and
                    user_registration > (month[:4] + month[4:]) and
                    attached_method != 'login'
                )

                if is_new:
                    current.new.add(user_id)
                elif user_id in previous.new:
                    current.surviving.add(user_id)
                elif user_id in previous:
                    current.old.add(user_id)
                else:
                    current.reactivated.add(user_id)

            previous_total = len(previous)
            previous_new = len(previous.new)
            previous_old = previous_total - previous_new
            inactivated = len(previous - current)

            # NOTE(review): `/` is true division on Python 3 but floor
            # division on Python 2 without `from __future__ import division`.
            surviving_rate = None
            if previous_new > 0:
                surviving_rate = len(current.surviving) / previous_new

            old_rate = None
            if previous_old > 0:
                old_rate = len(current.old) / previous_old

            inactivated_rate = None
            if previous_total > 0:
                inactivated_rate = inactivated / previous_total

            writer.write([
                wiki, month,
                len(current),
                len(current.new),
                len(current.surviving),
                surviving_rate,
                len(current.old),
                old_rate,
                len(current.reactivated), inactivated,
                inactivated_rate,
                first_actives
            ])

            # Start a fresh bucket for the next month; the oldest one falls
            # off the right end because of maxlen.
            mae.appendleft(MonthlyActiveEditors())
def initialize_writer(headers):
    """Return a ``tsv.Writer`` bound to stdout, built with ``headers``."""
    stdout_writer = tsv.Writer(sys.stdout, headers=headers)
    return stdout_writer