def main():
    """Append a within-group row index to every row read from stdin.

    Rows are read with ``tsv.Reader`` (headers taken from the first line),
    passed through ``aggregate`` keyed on each row's ``event_experimentId``
    field, and re-emitted through ``tsv.Writer`` on stdout with one extra
    trailing column: the zero-based position of the row inside its group.

    A "." is written to stderr for every 100th group as a progress marker,
    and a newline is written once all input has been consumed.
    """
    source = tsv.Reader(sys.stdin, headers=tsv.Reader.FIRST_LINE)
    sink = tsv.Writer(sys.stdout)

    def experiment_key(row):
        return row['event_experimentId']

    grouped = aggregate(source, by=experiment_key)
    for group_index, (_experiment_id, group_rows) in enumerate(grouped):
        # Progress marker: one dot per hundred groups.
        if group_index % 100 == 0:
            sys.stderr.write(".")

        for position, row in enumerate(group_rows):
            sink.write(row.values() + [position])

    sys.stderr.write("\n")
def main():
    """Expand each row's user-agent string into browser, OS and device.

    Rows are read from stdin with ``tsv.Reader`` (column types ``int`` and
    ``str``).  For every row, ``row.event_userAgent`` is parsed with
    ``user_agents.parse`` and a four-column row is written to stdout:
    ``row.id``, browser family, OS family, device family.

    Progress is reported on stderr: a right-aligned row count every 10000
    rows, a "." every 100 rows, and a newline after each block of 10000
    rows and again when the input is exhausted.
    """
    source = tsv.Reader(sys.stdin, types=[int, str])
    sink = tsv.Writer(sys.stdout)

    for count, record in enumerate(source):
        agent = user_agents.parse(record.event_userAgent)

        # Progress display; the three checks are independent of each other.
        if count % 10000 == 0:
            sys.stderr.write("%6d: " % count)
        if (count + 1) % 10000 == 0:
            sys.stderr.write("\n")
        if count % 100 == 0:
            sys.stderr.write(".")

        output_row = [
            record.id,
            agent.browser.family,
            agent.os.family,
            agent.device.family,
        ]
        sink.write(output_row)

    sys.stderr.write("\n")
def run(revisions, lang, move_re):
    """Extract page moves from revision comments and write them to stdout.

    ``move_re`` is matched against each revision's ``rev_comment``.  When it
    matches, the ``from`` and ``to`` groups are each split into a
    (namespace, title) pair by ``parse_page_name`` and one row is written.
    When it does not match, a diagnostic line is written to stderr and the
    revision is skipped.

    The writer is created without headers.  Values are written in this
    order:
        page_id, page_namespace, page_title, rev_id, timestamp,
        from_namespace, from_title, to_namespace, to_title, comment

    Args:
        revisions: Iterable of objects exposing ``page_id``,
            ``page_namespace``, ``page_title``, ``rev_id``,
            ``rev_timestamp`` and ``rev_comment`` attributes.
        lang: Passed unchanged as the second argument to
            ``parse_page_name``.
        move_re: Object with a ``match`` method (presumably a compiled
            regular expression) whose matches provide the named groups
            ``from``, ``to`` and ``comment``.
    """
    writer = tsv.Writer(sys.stdout)

    for rev in revisions:
        match = move_re.match(rev.rev_comment)
        if match is None:
            # Report the comment that could not be parsed and move on.
            sys.stderr.write(
                "Could not extract move from: " + rev.rev_comment + "\n")
            continue

        from_ns, from_title = parse_page_name(match.group("from"), lang)
        to_ns, to_title = parse_page_name(match.group("to"), lang)
        comment = match.group("comment")

        writer.write([
            rev.page_id,
            rev.page_namespace,
            rev.page_title,
            rev.rev_id,
            rev.rev_timestamp,
            from_ns,
            from_title,
            to_ns,
            to_title,
            comment,
        ])
def run(wiki_editor_months, active_edits=5):
    """Write per-wiki, per-month active-editor statistics to stdout.

    The input is grouped by ``wiki`` and, within each wiki, by ``month``
    using ``aggregate``.  For every (wiki, month) a progress line is written
    to stderr and one row is written through a ``tsv.Writer`` created with
    ``HEADERS``.  The row values, in order, are:

        wiki, month,
        number of active editors this month,
        number of new active editors,
        number of surviving new active editors,
        surviving / previous month's new editors (or None),
        number of old active editors,
        old / previous month's non-new active editors (or None),
        number of reactivated editors,
        number of editors active last month but not this month,
        that number / previous month's active editors (or None),
        number of editors active for the first time for this wiki.

    Args:
        wiki_editor_months: Iterable of records exposing ``wiki``,
            ``month``, ``user_id``, ``user_registration``,
            ``attached_method`` and ``revisions`` attributes.
            NOTE(review): presumably pre-sorted by wiki, then month, so
            that ``aggregate`` yields each group once -- confirm.
        active_edits: Minimum number of revisions in a month for an editor
            to count as active.
    """
    writer = tsv.Writer(sys.stdout, headers=HEADERS)

    for wiki, editor_months in aggregate(wiki_editor_months,
                                         by=lambda em: em.wiki):
        # Sliding window of monthly editor sets, newest first: mae[0] is the
        # month currently being filled in, mae[1] is the month before it.
        mae = deque((MonthlyActiveEditors() for _ in range(3)), maxlen=3)
        previously_active = set()

        for month, editors in aggregate(editor_months,
                                        by=lambda em: em.month):
            sys.stderr.write("{0}, {1}\n".format(wiki, month))
            current, previous = mae[0], mae[1]
            first_actives = 0

            for editor in editors:
                user_id = editor.user_id
                user_registration = editor.user_registration
                attached_method = editor.attached_method
                revisions = editor.revisions or 0  # missing count means 0

                # Skip user_id 0 and editors below the activity threshold.
                if user_id == 0 or revisions < active_edits:
                    continue

                if user_id not in previously_active:
                    first_actives += 1
                    previously_active.add(user_id)

                # NOTE(review): for a string, month[:4] + month[4:] is just
                # month; a suffix may have been intended here -- confirm.
                if (user_registration is not None
                        and user_registration > (month[:4] + month[4:])
                        and attached_method != 'login'):
                    # New active editor
                    current.new.add(user_id)
                elif user_id in previous.new:
                    # Surviving new active editor
                    current.surviving.add(user_id)
                elif user_id in previous:
                    # Old active editor
                    current.old.add(user_id)
                else:
                    # Other active editor
                    current.reactivated.add(user_id)

            inactivated = len(previous - current)
            previous_total = len(previous)
            previous_new = len(previous.new)

            writer.write([
                wiki,
                month,
                len(current),
                len(current.new),
                len(current.surviving),
                _ratio(len(current.surviving), previous_new),
                len(current.old),
                _ratio(len(current.old), previous_total - previous_new),
                len(current.reactivated),
                inactivated,
                _ratio(inactivated, previous_total),
                first_actives,
            ])

            # Open a fresh slot for the next month; with maxlen=3 the oldest
            # entry falls off the right-hand end.
            mae.appendleft(MonthlyActiveEditors())


def _ratio(numerator, denominator):
    """Return numerator / denominator, or None if denominator is not > 0.

    NOTE(review): under Python 2 without ``from __future__ import division``
    the ``/`` here is integer division for int operands -- confirm which
    interpreter this script targets.
    """
    return numerator / denominator if denominator > 0 else None
def initialize_writer(headers):
    """Build and return a ``tsv.Writer`` on stdout configured with *headers*."""
    stdout_writer = tsv.Writer(sys.stdout, headers=headers)
    return stdout_writer