Example #1
0
def main():
    """Stream the pages of a compressed dump to a worker and print its result.

    Command line: ``current_dump rich_graph`` (exactly two positional
    arguments).  Every ``page`` element of the dump is passed to
    ``process_page`` together with ``queue``; a ``get_freq_dist`` worker
    process is started on ``queue``/``done_queue`` and is stopped by putting
    ``None`` on ``queue``.  The mapping read back from ``done_queue`` is
    printed one ``value key`` pair per line, highest value first.

    Side effects: rebinds the module globals ``lang_user_talk``,
    ``lang_user``, ``tag`` and ``templates``.

    Raises:
        SystemExit: (through ``optparse``) on a wrong number of arguments.
        AssertionError: if the 'User' or 'User talk' translation is empty.
    """
    from functools import partial
    import optparse
    from operator import itemgetter

    global lang_user_talk, lang_user, tag, templates

    parser = optparse.OptionParser(
        usage="usage: %prog [options] current_dump rich_graph"
    )
    _, files = parser.parse_args()

    # Two files are required, so do not ask for "a file" in the message.
    if len(files) != 2:
        parser.error("Too few or too many arguments")
    xml_filename, rich_fn = files

    src = BZ2File(xml_filename)
    try:
        tag = mwlib.get_tags(src)

        translations = mwlib.get_translations(src)
        lang_user = translations['User']
        lang_user_talk = translations['User talk']

        assert lang_user, "User namespace not found"
        assert lang_user_talk, "User Talk namespace not found"

        # NOTE(review): this stays a function local (it is not listed in the
        # ``global`` statement above) and is not read again in this function
        # -- confirm whether it was meant to be published as a global.
        user_classes = dict(sg_load(rich_fn).get_user_class(
            'username', ('anonymous', 'bot', 'bureaucrat', 'sysop')))

        # NOTE(review): ``queue`` and ``done_queue`` are not created here;
        # presumably they are module-level objects -- confirm.
        worker = Process(target=get_freq_dist, args=(queue, done_queue))
        worker.start()

        ## XML Reader Process
        partial_process_page = partial(process_page, queue=queue)
        mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
                        partial_process_page)
    finally:
        # Release the dump handle whether or not parsing succeeded.
        src.close()

    print >>sys.stderr, "end of XML processing"

    queue.put(None)  ## this STOPS the process
    templates = done_queue.get()
    worker.join()

    by_value_desc = sorted(templates.items(), key=itemgetter(1), reverse=True)
    for k, v in by_value_desc:
        print >>sys.stdout, v, k.encode('utf-8')
def main():
    """Parse a dump, feed its pages to a worker process and save the results.

    Command line: ``dump enriched_pickle`` (exactly two positional
    arguments).  Every ``page`` element of the dump is handed to
    ``process_page`` together with the sending end of a pipe; a
    ``get_freq_dist`` worker process holds the receiving end and is stopped
    by sending ``0``.  The worker then sends back two lists of
    ``(class name, pairs)`` which are written to
    ``<lang>wiki-<date>-words-<class>.dat`` and
    ``<lang>wiki-<date>-smile-<class>.dat`` respectively.

    Side effects: rebinds the module globals ``lang_user_talk``,
    ``lang_user``, ``tag`` and ``user_classes``; creates the ``.dat`` files
    in the current directory.

    Raises:
        SystemExit: (through ``optparse``) on a wrong number of arguments.
        AssertionError: if the 'User' or 'User talk' translation is empty.
    """
    import optparse

    global lang_user_talk, lang_user, tag, user_classes

    parser = optparse.OptionParser(
        usage="usage: %prog [options] dump enriched_pickle"
    )
    _, args = parser.parse_args()

    if len(args) != 2:
        parser.error("Too few or too many arguments")
    xml, rich_fn = args

    ## pipe to send data to the subprocess
    p_receiver, p_sender = Pipe(duplex=False)
    ## pipe to get elaborated data from the subprocess
    done_p_receiver, done_p_sender = Pipe(duplex=False)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)
    lang, date, _ = mwlib.explode_dump_filename(xml)
    g = sg_load(rich_fn)
    user_classes = dict(g.get_user_class('username',
                                  ('anonymous', 'bot', 'bureaucrat', 'sysop')))

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    # Spawn the worker only after the checks above: it runs until it is sent
    # the stop value below, so a failure before that point must not leave a
    # started worker that is never told to stop.
    worker = Process(target=get_freq_dist, args=(p_receiver, done_p_sender))
    worker.start()

    ## open with a faster decompressor (but that probably cannot seek)
    src.close()
    src = lib.BZ2FileExt(xml, parallel=False)

    partial_process_page = partial(process_page, send=p_sender)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
                    partial_process_page)
    logging.info('Users missing in the rich file: %d', count_missing)

    p_sender.send(0)  # this STOPS the process

    print >> sys.stderr, "end of parsing"

    ## SAVE DATA
    g.set_weighted_degree()
    users_cache = {}  # class name -> result of get_class(g, class name)

    def write_reports(kind, per_class):
        """Write one ``<lang>wiki-<date>-<kind>-<class>.dat`` file per class."""
        for cls, pairs in per_class:
            with open("%swiki-%s-%s-%s.dat" %
                      (lang, date, kind,
                       cls.replace(' ', '_')), 'w') as out:
                # users in this group (looked up once per class, then reused
                # by the second report)
                if cls not in users_cache:
                    users_cache[cls] = get_class(g, cls)
                users = users_cache[cls]
                print >> out, '#users: ', len(users)
                print >> out, '#msgs: ', sum(users['weighted_indegree'])
                for k, v in pairs:
                    print >> out, v, k

    # The worker sends two lists of pairs (class name, distribution), in
    # this order; each recv() is issued only after the previous list has
    # been written out.
    write_reports('words', done_p_receiver.recv())
    write_reports('smile', done_p_receiver.recv())

    worker.join()

    print >> sys.stderr, "end of FreqDist"
Example #3
0
def main():
    """Run the dump parser and a frequency worker, then dump per-class files.

    Takes two positional command-line arguments, ``dump`` and
    ``enriched_pickle``.  Pages of the dump are passed to ``process_page``
    with the sending end of a one-way pipe; the ``get_freq_dist`` worker
    reads from the other end until ``0`` is sent.  Two lists of
    ``(class name, pairs)`` are then received from the worker and saved as
    ``<lang>wiki-<date>-words-<class>.dat`` followed by
    ``<lang>wiki-<date>-smile-<class>.dat``.

    Side effects: rebinds the module globals ``lang_user_talk``,
    ``lang_user``, ``tag`` and ``user_classes``; writes the ``.dat`` files
    relative to the current directory.

    Raises:
        SystemExit: (through ``optparse``) on a wrong number of arguments.
        AssertionError: if the 'User' or 'User talk' translation is empty.
    """
    import optparse

    global lang_user_talk, lang_user, tag, user_classes

    option_parser = optparse.OptionParser(
        usage="usage: %prog [options] dump enriched_pickle")

    _, args = option_parser.parse_args()

    if len(args) != 2:
        option_parser.error("Too few or too many arguments")
    xml, rich_fn = args

    ## pipe to send data to the subprocess
    p_receiver, p_sender = Pipe(duplex=False)
    ## pipe to get elaborated data from the subprocess
    done_p_receiver, done_p_sender = Pipe(duplex=False)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)
    lang, date, _ = mwlib.explode_dump_filename(xml)
    g = sg_load(rich_fn)
    user_classes = dict(
        g.get_user_class('username',
                         ('anonymous', 'bot', 'bureaucrat', 'sysop')))

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    # Start the subprocess only once the namespace checks have passed: it is
    # stopped solely by the send(0) further down, so an assertion failure
    # after start() would leave it without its stop value.
    freq_process = Process(target=get_freq_dist,
                           args=(p_receiver, done_p_sender))
    freq_process.start()

    ## open with a faster decompressor (but that probably cannot seek)
    src.close()
    src = lib.BZ2FileExt(xml, parallel=False)

    partial_process_page = partial(process_page, send=p_sender)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
                    partial_process_page)
    logging.info('Users missing in the rich file: %d', count_missing)

    p_sender.send(0)  # this STOPS the process

    print >> sys.stderr, "end of parsing"

    ## SAVE DATA
    g.set_weighted_degree()
    users_cache = {}
    # One message per report kind is received, in this order; each message is
    # a list of pairs (class name, distribution).  The second recv() happens
    # only after every file of the first kind has been written.
    for kind in ('words', 'smile'):
        for cls, pairs in done_p_receiver.recv():
            filename = "%swiki-%s-%s-%s.dat" % (
                lang, date, kind, cls.replace(' ', '_'))
            with open(filename, 'w') as out:
                # users in this group, fetched once per class and shared
                # between the two report kinds
                try:
                    users = users_cache[cls]
                except KeyError:
                    users = users_cache[cls] = get_class(g, cls)
                print >> out, '#users: ', len(users)
                print >> out, '#msgs: ', sum(users['weighted_indegree'])
                for k, v in pairs:
                    print >> out, v, k

    freq_process.join()

    print >> sys.stderr, "end of FreqDist"