def main():
    """Parse a dump, collect counts in a child process and print them.

    Command line: ``current_dump rich_graph`` (exactly two positional
    arguments, otherwise the option parser reports an error and exits).

    Pages of the bz2-compressed dump are fed to ``process_page`` while a
    child process runs ``get_freq_dist``.  Once the dump is exhausted a
    ``None`` sentinel is queued, the resulting mapping is read back into
    the global ``templates`` and printed to stdout as ``value key`` lines,
    highest value first.

    Sets the module globals ``tag``, ``lang_user``, ``lang_user_talk`` and
    ``templates``.
    """
    from functools import partial
    from operator import itemgetter
    import optparse

    global lang_user_talk, lang_user, tag, templates

    parser = optparse.OptionParser(
        usage="usage: %prog [options] current_dump rich_graph"
    )
    _, files = parser.parse_args()
    if len(files) != 2:
        parser.error("Give me a file, please ;-)")
    xml_filename, rich_fn = files

    src = BZ2File(xml_filename)
    tag = mwlib.get_tags(src)

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']
    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    # NOTE(review): user_classes is not listed in the global statement, so
    # this binds a local that is never read again in this function --
    # confirm whether process_page expects a module-level user_classes.
    user_classes = dict(
        sg_load(rich_fn).get_user_class(
            'username', ('anonymous', 'bot', 'bureaucrat', 'sysop')
        )
    )

    # NOTE(review): queue and done_queue are not created in this function;
    # presumably they are module-level multiprocessing queues -- confirm.
    worker = Process(target=get_freq_dist, args=(queue, done_queue))
    worker.start()

    ## XML Reader Process
    page_handler = partial(process_page, queue=queue)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']), page_handler)

    sys.stderr.write("end of XML processing\n")

    queue.put(None)  ## this STOPS the process
    templates = done_queue.get()
    worker.join()

    ranked = sorted(templates.items(), key=itemgetter(1), reverse=True)
    for name, count in ranked:
        # same bytes as the statement form: print count, name.encode('utf-8')
        sys.stdout.write("%s %s\n" % (count, name.encode('utf-8')))
def _write_class_report(path, users, rows):
    """Write one per-class report file.

    The file starts with two header lines (number of users in the class
    and the sum of their ``weighted_indegree`` values) followed by one
    ``value key`` line for every ``(key, value)`` pair in *rows*.

    path  -- destination file name (opened in 'w' mode)
    users -- the users of the class; must support ``len()`` and
             ``users['weighted_indegree']``
    rows  -- iterable of ``(key, value)`` pairs
    """
    with open(path, 'w') as out:
        # '%s %s' with a label that already ends in a blank keeps the
        # two-space gap the original print statements produced.
        out.write('%s %s\n' % ('#users: ', len(users)))
        out.write('%s %s\n' % ('#msgs: ', sum(users['weighted_indegree'])))
        for key, value in rows:
            out.write('%s %s\n' % (value, key))


def main():
    """Parse a dump and write per-class word and smile report files.

    Command line: ``dump enriched_pickle`` (exactly two positional
    arguments, otherwise the option parser reports an error and exits).

    Pages of the dump are handed to ``process_page``, which gets the
    sending end of a pipe; a child process running ``get_freq_dist`` holds
    the receiving end.  After the dump is exhausted a ``0`` is sent to stop
    the child, and two payloads are read back from it.  Each payload is an
    iterable of ``(class name, rows)`` pairs and is written to
    ``<lang>wiki-<date>-words-<class>.dat`` and
    ``<lang>wiki-<date>-smile-<class>.dat`` respectively (blanks in the
    class name become underscores).

    Sets the module globals ``tag``, ``user_classes``, ``lang_user`` and
    ``lang_user_talk``.
    """
    import optparse

    global lang_user_talk, lang_user, tag, user_classes

    parser = optparse.OptionParser(
        usage="usage: %prog [options] dump enriched_pickle"
    )
    _, args = parser.parse_args()
    if len(args) != 2:
        parser.error("Too few or too many arguments")
    xml, rich_fn = args

    ## pipe to send data to the subprocess
    p_receiver, p_sender = Pipe(duplex=False)
    ## pipe to get elaborated data from the subprocess
    done_p_receiver, done_p_sender = Pipe(duplex=False)

    src = BZ2File(xml)
    tag = mwlib.get_tags(src)
    lang, date, _ = mwlib.explode_dump_filename(xml)

    g = sg_load(rich_fn)
    user_classes = dict(
        g.get_user_class(
            'username', ('anonymous', 'bot', 'bureaucrat', 'sysop')
        )
    )

    # Distinct name for the child process: the original rebound the option
    # parser's variable here.
    worker = Process(target=get_freq_dist, args=(p_receiver, done_p_sender))
    worker.start()

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']
    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    ## open with a faster decompressor (but that probably cannot seek)
    src.close()
    src = lib.BZ2FileExt(xml, parallel=False)

    page_handler = partial(process_page, send=p_sender)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']), page_handler)
    logging.info('Users missing in the rich file: %d', count_missing)

    p_sender.send(0)  # this STOPS the process

    sys.stderr.write("end of parsing\n")

    ## SAVE DATA
    g.set_weighted_degree()
    users_cache = {}

    # The child answers with two payloads, in this order: word frequency
    # distributions first, smile counters second.  Each one is a list of
    # pairs (class name, rows).
    report_templates = (
        "%swiki-%s-words-%s.dat",
        "%swiki-%s-smile-%s.dat",
    )
    for template in report_templates:
        for cls, rows in done_p_receiver.recv():
            # users in this group, looked up once and shared by both reports
            try:
                users = users_cache[cls]
            except KeyError:
                users = users_cache[cls] = get_class(g, cls)
            path = template % (lang, date, cls.replace(' ', '_'))
            _write_class_report(path, users, rows)

    worker.join()

    sys.stderr.write("end of FreqDist\n")
def _write_class_report(path, users, rows):
    """Write one per-class report file.

    The file starts with two header lines (number of users in the class
    and the sum of their ``weighted_indegree`` values) followed by one
    ``value key`` line for every ``(key, value)`` pair in *rows*.

    path  -- destination file name (opened in 'w' mode)
    users -- the users of the class; must support ``len()`` and
             ``users['weighted_indegree']``
    rows  -- iterable of ``(key, value)`` pairs
    """
    with open(path, 'w') as out:
        # '%s %s' with a label that already ends in a blank keeps the
        # two-space gap the original print statements produced.
        out.write('%s %s\n' % ('#users: ', len(users)))
        out.write('%s %s\n' % ('#msgs: ', sum(users['weighted_indegree'])))
        for key, value in rows:
            out.write('%s %s\n' % (value, key))


def main():
    """Parse a dump and write per-class word and smile report files.

    Command line: ``dump enriched_pickle`` (exactly two positional
    arguments, otherwise the option parser reports an error and exits).

    Pages of the dump are handed to ``process_page``, which gets the
    sending end of a pipe; a child process running ``get_freq_dist`` holds
    the receiving end.  After the dump is exhausted a ``0`` is sent to stop
    the child, and two payloads are read back from it.  Each payload is an
    iterable of ``(class name, rows)`` pairs and is written to
    ``<lang>wiki-<date>-words-<class>.dat`` and
    ``<lang>wiki-<date>-smile-<class>.dat`` respectively (blanks in the
    class name become underscores).

    Sets the module globals ``tag``, ``user_classes``, ``lang_user`` and
    ``lang_user_talk``.
    """
    import optparse

    global lang_user_talk, lang_user, tag, user_classes

    parser = optparse.OptionParser(
        usage="usage: %prog [options] dump enriched_pickle"
    )
    _, args = parser.parse_args()
    if len(args) != 2:
        parser.error("Too few or too many arguments")
    xml, rich_fn = args

    ## pipe to send data to the subprocess
    p_receiver, p_sender = Pipe(duplex=False)
    ## pipe to get elaborated data from the subprocess
    done_p_receiver, done_p_sender = Pipe(duplex=False)

    src = BZ2File(xml)
    tag = mwlib.get_tags(src)
    lang, date, _ = mwlib.explode_dump_filename(xml)

    g = sg_load(rich_fn)
    user_classes = dict(
        g.get_user_class(
            'username', ('anonymous', 'bot', 'bureaucrat', 'sysop')
        )
    )

    # Distinct name for the child process: the original rebound the option
    # parser's variable here.
    worker = Process(target=get_freq_dist, args=(p_receiver, done_p_sender))
    worker.start()

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']
    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    ## open with a faster decompressor (but that probably cannot seek)
    src.close()
    src = lib.BZ2FileExt(xml, parallel=False)

    page_handler = partial(process_page, send=p_sender)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']), page_handler)
    logging.info('Users missing in the rich file: %d', count_missing)

    p_sender.send(0)  # this STOPS the process

    sys.stderr.write("end of parsing\n")

    ## SAVE DATA
    g.set_weighted_degree()
    users_cache = {}

    # The child answers with two payloads, in this order: word frequency
    # distributions first, smile counters second.  Each one is a list of
    # pairs (class name, rows).
    report_templates = (
        "%swiki-%s-words-%s.dat",
        "%swiki-%s-smile-%s.dat",
    )
    for template in report_templates:
        for cls, rows in done_p_receiver.recv():
            # users in this group, looked up once and shared by both reports
            try:
                users = users_cache[cls]
            except KeyError:
                users = users_cache[cls] = get_class(g, cls)
            path = template % (lang, date, cls.replace(' ', '_'))
            _write_class_report(path, users, rows)

    worker.join()

    sys.stderr.write("end of FreqDist\n")