def setUp(self):
        """Build the test fixture: parse the bundled vecwiki test dump
        and construct the user-talk network used by the assertions.
        """
        # Path to the bz2-compressed stub-meta-history test dump
        xml = "tests/utpedits2graph/" + \
              "vecwiki-20100307-stub-meta-history-TEST.xml.bz2"
        self.lang, self.date_, self.type_ = mwlib.explode_dump_filename(xml)

        # deflate is an open()-like callable chosen from the file type;
        # _lineno signals whether it supports a "read only N lines" mode
        deflate, _lineno = find_open_for_this_file(xml)
        # Per-language welcome-message regex fragments; the defaultdict
        # yields '' for languages not listed here
        welcome = defaultdict(str)
        welcome.update({'it': r'Benvenut',
                        'en': r'Welcome'})
        if _lineno:
            src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
        else:
            src = deflate(xml)
        tag = mwlib.get_tags(src,
                        tags='page,title,revision,timestamp,contributor,'
                                  'username,ip,comment,id')
        translations = mwlib.get_translations(src)

        # Localised namespace names; fall back to smart_str when the
        # values cannot be decoded as unicode
        try:
            lang_user = unicode(translations['User'])
            lang_user_talk = unicode(translations['User talk'])
        except UnicodeDecodeError:
            lang_user = smart_str(translations['User'])
            lang_user_talk = smart_str(translations['User talk'])
        # Re-open the dump: the header scan above consumed the stream
        src.close()
        src = deflate(xml)
        self.processor = HistoryPageProcessor(tag=tag,
                         user_talk_names=(lang_user_talk, u"User talk"))
        self.processor.welcome_pattern = welcome[self.lang]
        self.processor.start(src)
        self.g = self.processor.get_network()
def main():
    """CLI entry point: dump selected pages' revision history to a file.

    usage: %prog [options] input_file desired_list output_file
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analize (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    desired_pages_fn = files[1]
    output = files[2]

    # Sanity-check the dump file name/type before doing any work
    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # header only: enough to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    # The header scan consumed the stream: re-open for the full pass
    src.close()
    src = deflate(xml)

    out = open(output, 'w')
    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=out,
                                              userns=translation['User'])
    processor.talkns = translation['Talk']
    # Restrict the pass to talk pages or content pages when requested
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr('Processing'):
        processor.start(src) ## PROCESSING
    processor.flush()
    out.close()
Exemple #3
0
def main():
    """CLI entry point: extract gender information for page editors.

    usage: %prog [options] input_file gender_file output_file
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file gender_file output_file")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    # Fixed help text: "MIN_EIDTS" typo, and %(default)s is argparse
    # syntax -- optparse expands %default instead, so the old string was
    # printed literally in --help output.
    p.add_option('-e',
                 '--min-edits',
                 default=0,
                 dest="min_edits",
                 metavar="MIN_EDITS",
                 type=int,
                 help="pages with less than MIN_EDITS edits "
                 "are skipped (default: %default)")

    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    gender_data = files[1]
    output = files[2]

    # Sanity-check the dump file name/type before doing any work
    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # header only: enough to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src,
                   tags="page,redirect,timestamp,ip,"
                   "contributor,title,username")
    # The header scan consumed the stream: re-open for the full pass
    src.close()
    src = deflate(xml)

    out = open(output, "w")
    processor = GenderPageProcessor(tag=tag,
                                    lang=lang,
                                    output=out,
                                    userns=translation['User'],
                                    gender_data=gender_data,
                                    min_edits=opts.min_edits)
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
    def setUp(self):
        """Parse the bundled vecwiki test dump and build the network."""
        xml = ("tests/utpedits2graph/"
               "vecwiki-20100307-stub-meta-history-TEST.xml.bz2")
        self.lang, self.date_, self.type_ = mwlib.explode_dump_filename(xml)

        deflate, _lineno = find_open_for_this_file(xml)

        # Regex fragments of welcome messages, per language; unknown
        # languages fall back to the empty string.
        welcome = defaultdict(str)
        welcome['it'] = r'Benvenut'
        welcome['en'] = r'Welcome'

        # A partial read of the header is enough to pick up namespaces.
        stream = deflate(xml, 51) if _lineno else deflate(xml)
        tag = mwlib.get_tags(stream,
                             tags='page,title,revision,timestamp,contributor,'
                             'username,ip,comment,id')
        translations = mwlib.get_translations(stream)

        # Localised namespace names, with a smart_str fallback.
        try:
            lang_user = unicode(translations['User'])
            lang_user_talk = unicode(translations['User talk'])
        except UnicodeDecodeError:
            lang_user = smart_str(translations['User'])
            lang_user_talk = smart_str(translations['User talk'])

        # Start over from the beginning of the dump for the real pass.
        stream.close()
        stream = deflate(xml)

        self.processor = HistoryPageProcessor(
            tag=tag, user_talk_names=(lang_user_talk, u"User talk"))
        self.processor.welcome_pattern = welcome[self.lang]
        self.processor.start(stream)
        self.g = self.processor.get_network()
def main():
    """Build the user-talk network from a history dump and save it."""
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)  # filename="graph_longiudinal_analysis.log",
    logging.info("---------------------START---------------------")

    opts, args = opt_parse()
    xml = args[0]

    # Language, dump date and dump type are encoded in the file name.
    lang, date_, type_ = mwlib.explode_dump_filename(xml)
    deflate, _lineno = find_open_for_this_file(xml)

    # Per-language welcome-message prefixes; unknown languages get ''.
    welcome = defaultdict(str)
    welcome["it"] = r"Benvenut"
    welcome["en"] = r"Welcome"

    # A partial read of the header is enough to pick up the namespaces.
    stream = deflate(xml, 51) if _lineno else deflate(xml)

    tag = mwlib.get_tags(stream, tags="page,title,revision,timestamp,contributor,username,ip,comment")
    namespaces = mwlib.get_translations(stream)

    # Localised namespace names, with a smart_str fallback.
    try:
        lang_user = unicode(namespaces["User"])
        lang_user_talk = unicode(namespaces["User talk"])
    except UnicodeDecodeError:
        lang_user = smart_str(namespaces["User"])
        lang_user_talk = smart_str(namespaces["User talk"])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    # Start over from the beginning of the dump for the real pass.
    stream.close()
    stream = deflate(xml)

    processor = HistoryPageProcessor(tag=tag, user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr("Processing"):
        processor.start(stream)  # the actual parsing happens here

    with Timr("Getting network"):
        g = processor.get_network()

    logging.info("Nodes: %d" % len(g.vs))
    logging.info("Edges: %d" % len(g.es))

    with Timr("Saving graph"):
        save_graph(g, lang, type_, date_)
Exemple #6
0
def main():
    """CLI entry point: detect edit events on desired pages.

    usage: %prog [options] file desired_list acceptance_ratio
    """
    import optparse
    import csv

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys, logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # Desired pages: first CSV column, skipping '#'-commented lines
    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [
            l[0].decode('latin-1') for l in csv.reader(f)
            if l and not l[0][0] == '#'
        ]

    if _lineno:
        src = deflate(xml, 51)  # header only: enough to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,ip,username')

    # The header scan consumed the stream: re-open for the full pass
    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired(desired_pages)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()
def main():
    """CLI entry point: extract gender information for page editors.

    usage: %prog [options] input_file gender_file output_file
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file gender_file output_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    # Fixed help text: "MIN_EIDTS" typo, and %(default)s is argparse
    # syntax -- optparse expands %default instead, so the old string was
    # printed literally in --help output.
    p.add_option('-e', '--min-edits', default=0, dest="min_edits",
                 metavar="MIN_EDITS", type=int,
                 help="pages with less than MIN_EDITS edits "
                      "are skipped (default: %default)")

    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    gender_data = files[1]
    output = files[2]

    # Sanity-check the dump file name/type before doing any work
    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # header only: enough to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src,
                   tags="page,redirect,timestamp,ip,"
                        "contributor,title,username")
    # The header scan consumed the stream: re-open for the full pass
    src.close()
    src = deflate(xml)

    out = open(output, "w")
    processor = GenderPageProcessor(tag=tag, lang=lang,
                                    output=out,
                                    userns=translation['User'],
                                    gender_data=gender_data,
                                    min_edits=opts.min_edits
                                   )
    with Timr('Processing'):
        processor.start(src) ## PROCESSING
    processor.flush()
    out.close()
def main():
    """Build the user-talk network from a dump and pickle it to disk."""
    opts, args = opt_parse()
    xml = args[0]

    # Language, dump date and dump type are encoded in the file name.
    lang, date_, type_ = mwlib.explode_dump_filename(xml)
    deflate, _lineno = find_open_for_this_file(xml)

    # A partial read of the header is enough to pick up the namespaces.
    stream = deflate(xml, 51) if _lineno else deflate(xml)

    tag = mwlib.get_tags(
        stream,
        tags='page,title,revision,timestamp,contributor,username,ip,comment')

    namespaces = mwlib.get_translations(stream)
    lang_user = unicode(namespaces['User'])
    lang_user_talk = unicode(namespaces['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    # Start over from the beginning of the dump for the real pass.
    stream.close()
    print >> sys.stderr, "BEGIN PARSING"
    stream = deflate(xml)

    processor = HistoryPageProcessor(
        tag=tag, user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    # NOTE: hard-coded Italian welcome prefix -- it.wikipedia.org only.
    processor.welcome_pattern = r'Benvenut'
    with Timr('Processing'):
        processor.start(stream)  # the actual parsing happens here

    with Timr('EdgeCache.get_network()'):
        g = processor.get_network()

    print >> sys.stderr, "Nodes:", len(g.vs)
    print >> sys.stderr, "Edges:", len(g.es)

    # Edge weight = number of messages exchanged along that edge.
    for e in g.es:
        e['weight'] = len(e['timestamp'])

    with Timr('Pickling'):
        g.write("%swiki-%s%s.pickle" % (lang, date_, type_), format="pickle")
def main():
    """CLI entry point: detect edit events on desired pages.

    usage: %prog [options] file desired_list acceptance_ratio
    """
    import optparse
    import csv

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys, logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # Desired pages: first CSV column, skipping '#'-commented lines
    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                                        if l and not l[0][0] == '#']

    if _lineno:
        src = deflate(xml, 51)  # header only: enough to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,ip,username')

    # The header scan consumed the stream: re-open for the full pass
    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired(desired_pages)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()
def main():
    """CLI entry point: build the user-talk network and pickle it."""
    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    # Language, date and type are encoded in the dump file name
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # header only: enough to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
        tags='page,title,revision,timestamp,contributor,username,ip,comment')

    translations = mwlib.get_translations(src)
    lang_user = unicode(translations['User'])
    lang_user_talk = unicode(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    # The header scan consumed the stream: re-open for the full pass
    src.close()
    print >>sys.stderr, "BEGIN PARSING"
    src = deflate(xml)

    processor = HistoryPageProcessor(tag=tag,
        user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'
    with Timr('Processing'):
        processor.start(src) ## PROCESSING

    with Timr('EdgeCache.get_network()'):
        g = processor.get_network()

    print >>sys.stderr, "Nodes:", len(g.vs)
    print >>sys.stderr, "Edges:", len(g.es)

    # Edge weight = number of messages exchanged along that edge
    for e in g.es:
        e['weight'] = len(e['timestamp'])
        #e['timestamp'] = str(e['timestamp'])
    with Timr('Pickling'):
        g.write("%swiki-%s%s.pickle" % (lang, date_, type_), format="pickle")
Exemple #11
0
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if not files:
        p.error("Give me a file, please ;-)")

    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,text')

    src.close()
    src = deflate(xml)

    processor = HistoryWordsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn)
    processor.words = desired_words

    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
def main():
    """CLI entry point: detect edit events on desired pages.

    usage: %prog [options] file desired_list acceptance_ratio
    """
    import optparse
    import csv

    p = optparse.OptionParser(usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option("-v", action="store_true", dest="verbose", default=False, help="Verbose output (like timings)")
    p.add_option(
        "-e", "--encoding", action="store", dest="encoding", default="latin-1", help="encoding of the desired_list file"
    )
    opts, files = p.parse_args()
    if opts.verbose:
        import sys, logging

        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags="page,title,revision," + "minor,timestamp,redirect,ip,username")

    # The header scan consumed the stream: re-open for the full pass
    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation["Talk"]
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr("Retrieving bots"):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr("Parsing"):
        processor.start(src)
    processor.flush()
Exemple #13
0
def main():
    """CLI entry point: count template usage with a worker process.

    usage: %prog [options] current_dump rich_graph
    """
    from functools import partial
    import optparse
    from operator import itemgetter

    p = optparse.OptionParser(
        usage="usage: %prog [options] current_dump rich_graph"
    )
    _, files = p.parse_args()

    if len(files) != 2:
        p.error("Give me a file, please ;-)")
    xml_filename = files[0]
    rich_fn = files[1]

    # Shared with process_page()/get_freq_dist() via module globals;
    # queue and done_queue are presumably module-level too -- confirm
    global lang_user_talk, lang_user, tag, templates

    src = BZ2File(xml_filename)

    tag = mwlib.get_tags(src)

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    # User classes from the rich graph
    # NOTE(review): user_classes is never used below -- dead store?
    user_classes = dict(sg_load(rich_fn).get_user_class('username',
                            ('anonymous', 'bot', 'bureaucrat','sysop')))

    # Consumer process: aggregates frequency counts read from queue
    # (note: p is re-bound here, shadowing the OptionParser above)
    p = Process(target=get_freq_dist, args=(queue, done_queue))
    p.start()

    ## XML Reader Process
    partial_process_page = partial(process_page, queue=queue)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
                    partial_process_page)

    print >>sys.stderr, "end of XML processing"

    queue.put(None) ## this STOPS the process
    templates = done_queue.get()
    p.join()

    # Print templates sorted by decreasing frequency
    for k, v in sorted(templates.items(), key=itemgetter(1), reverse=True):
        print v, k.encode('utf-8')
Exemple #14
0
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG)

    if not files:
        p.error("Give me a file, please ;-)")

    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)   # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,text')

    src.close()
    src = deflate(xml)

    processor = HistoryWordsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn)
    processor.words = desired_words

    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
Exemple #15
0
def main():
    """CLI entry point: extract revisions of desired pages from a dump."""
    logging.basicConfig(#filename="random_page_extractor.log",
                                stream=sys.stderr,
                                level=logging.DEBUG)

    op = create_option_parser()
    args = op.parse_args()

    # Desired pages: first CSV column, skipping '#'-commented lines
    with open(args.desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                                        if l and not l[0][0] == '#']

    lang, date_, type_ = explode_dump_filename(args.xml_fn)
    deflate, _lineno = lib.find_open_for_this_file(args.xml_fn)

    # Sanity-check the dump before any real work
    dumps_checker(args, type_)

    logging.info('---------------------START---------------------')

    if _lineno:
        src = deflate(args.xml_fn, 51)  # header only: namespaces
    else:
        src = deflate(args.xml_fn)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,redirect,text,username,ip,timestamp')

    # The header scan consumed the stream: re-open for the full pass
    src.close()
    src = deflate(args.xml_fn)

    # Output file is optional; None presumably means stdout or discard
    # -- confirm against HistoryRevisionsPageProcessor
    output = open(args.output, 'w') if args.output else None

    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=output,
                                              threshold=args.ratio,
                                              min_text=args.min_text_length,
                                              n_users=args.editors_number,
                                              start_revision=args.initial_revision)

    processor.talkns = translation['Talk']
    processor.desired_page_type = args.type
    processor.set_desired(desired_pages)
    with Timr('processing'):
        processor.start(src)
Exemple #16
0
def main():
    """CLI entry point: extract revisions of desired pages from a dump."""
    logging.basicConfig(#filename="random_page_extractor.log",
                                stream=sys.stderr,
                                level=logging.DEBUG)

    op = create_option_parser()
    args = op.parse_args()

    lang, date_, type_ = explode_dump_filename(args.xml_fn)
    deflate, _lineno = lib.find_open_for_this_file(args.xml_fn)

    # Sanity-check the dump before any real work
    dumps_checker(args, type_)

    logging.info('---------------------START---------------------')

    if _lineno:
        src = deflate(args.xml_fn, 51)  # header only: namespaces
    else:
        src = deflate(args.xml_fn)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,redirect,text,username,ip,timestamp')

    # The header scan consumed the stream: re-open for the full pass
    src.close()
    src = deflate(args.xml_fn)

    output = open(args.output, 'w') if args.output else None

    processor = HistoryRevisionsPageProcessor(
                    tag=tag,
                    lang=lang,
                    output=output,
                    threshold=args.ratio,
                    min_text=args.min_text_length,
                    min_revisions=args.revisions_number,
                    n_users=args.editors_number,
                    start_revision=args.initial_revision)

    processor.talkns = translation['Talk']
    processor.desired_page_type = args.type
    # Fixed NameError: desired_pages_fn was never defined here -- the
    # desired-pages file name comes from the parsed command line
    # (assumes create_option_parser defines desired_pages_fn and
    # encoding, as the sibling extractor script does -- TODO confirm).
    processor.set_desired_from_csv(args.desired_pages_fn,
                                   encoding=args.encoding)
    with Timr('processing'):
        processor.start(src)
def main():
    """CLI entry point: analyze edits of desired pages for desired words.

    usage: %prog [options] file desired_list desired_words acceptance_ratio
    """
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr,level=logging.DEBUG)

    # Fixed argument check: the body needs exactly 4 positional
    # arguments (files[0:3] plus files[3]); the old "if not files" test
    # let shorter lists through and crashed below instead of reporting
    # a usage error.
    if len(files) != 4:
        p.error("Error: No file received.")

    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # header only: enough to extract namespaces
    else:
        src = deflate(xml)

    # get_translations is kept (even though its result is unused) so the
    # stream position matches what get_tags expects
    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,minor,timestamp,redirect,text')

    # The header scan consumed the stream: re-open for the full pass
    src.close()
    src = deflate(xml)

    analyzer = EditsAnalyzer(tag=tag, lang=lang)
    analyzer.set_desired_from_csv(desired_pages_fn)
    analyzer.words = desired_words

    with Timr('Analyzing...'):
        analyzer.start(src)
def main():
    """Map editors' IP addresses to countries and dump the counts."""
    import optparse

    parser = optparse.OptionParser(usage="usage: %prog [options] input_file geoip_db output_file")
    parser.add_option("-v", action="store_true", dest="verbose", default=False, help="Verbose output (like timings)")
    opts, files = parser.parse_args()

    if len(files) != 3:
        parser.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(
            stream=sys.stderr,
            level=logging.DEBUG,
            format="%(asctime)s %(levelname)s %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )

    xml, geoip_db, output = files

    # Sanity-check the dump before any real work.
    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # A partial read of the header is enough to pick up the namespaces.
    stream = deflate(xml, 51) if _lineno else deflate(xml)

    translation = get_translations(stream)
    tag = get_tags(stream, tags="page,redirect,timestamp,ip,revision,title")

    # Start over from the beginning of the dump for the real pass.
    stream.close()
    stream = deflate(xml)

    processor = CountriesPageProcessor(
        tag=tag,
        lang=lang,
        output=output,
        userns=translation["User"],
        geoip=geoip_db,
    )
    with Timr("Processing"):
        processor.start(stream)  # the actual parsing happens here
    processor.flush()
Exemple #19
0
def main():
    """CLI entry point: dump revisions of desired pages.

    usage: %prog [options] input_file desired_list output_file
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analize (content|talk|all)")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    output = files[2]

    # Desired pages: first CSV column, skipping '#'-commented lines
    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                                        if l and not l[0][0] == '#']
    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # header only: enough to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    # The header scan consumed the stream: re-open for the full pass
    src.close()
    src = deflate(xml)

    # NOTE(review): output is passed as a file *name* here, while the
    # sibling scripts pass an open file object -- confirm the processor
    # accepts a path
    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=output)
    processor.talkns = translation['Talk']
    processor.desired_page_type = opts.type
    processor.set_desired(desired_pages)
    processor.start(src)
    processor.flush()
def main():
    """CLI entry point: build the user-talk network and save the graph."""
    opts, args = opt_parse()
    xml = args[0]
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('---------------------START---------------------')

    ## SET UP FOR PROCESSING
    # Language, date and type are encoded in the dump file name
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    # Per-language welcome-message prefixes; unknown languages get ''
    welcome = defaultdict(str)

    welcome.update({'it': r'Benvenut',
                    'en': r'Welcome'})

    if _lineno:
        src = deflate(xml, 51)   # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
                         tags='page,title,revision,timestamp,contributor,'
                              'username,ip,comment,id')

    translations = mwlib.get_translations(src)

    # Localised namespace names; smart_str fallback on decode errors
    try:
        lang_user = unicode(translations['User'])
        lang_user_talk = unicode(translations['User talk'])
    except UnicodeDecodeError:
        lang_user = smart_str(translations['User'])
        lang_user_talk = smart_str(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    # The header scan consumed the stream: re-open for the full pass
    src.close()
    src = deflate(xml)

    processor = HistoryPageProcessor(tag=tag,
        user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr('Processing'):
        processor.start(src) ## PROCESSING

    with Timr('Getting network'):
        g = processor.get_network()

    logging.info("Nodes: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    with Timr('Saving graph'):
        save_graph(g, lang, type_, date_)
Exemple #21
0
def main():
    """Build the user-talk network from a dump and save the graph."""
    opts, args = opt_parse()
    xml = args[0]
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('---------------------START---------------------')

    # Language, dump date and dump type are encoded in the file name.
    lang, date_, type_ = mwlib.explode_dump_filename(xml)
    deflate, _lineno = find_open_for_this_file(xml)

    # Per-language welcome-message prefixes; unknown languages get ''.
    welcome = defaultdict(str)
    welcome['it'] = r'Benvenut'
    welcome['en'] = r'Welcome'

    # A partial read of the header is enough to pick up the namespaces.
    stream = deflate(xml, 51) if _lineno else deflate(xml)

    tag = mwlib.get_tags(stream,
                         tags='page,title,revision,timestamp,contributor,'
                         'username,ip,comment,id')

    namespaces = mwlib.get_translations(stream)

    # Localised namespace names, with a smart_str fallback.
    try:
        lang_user = unicode(namespaces['User'])
        lang_user_talk = unicode(namespaces['User talk'])
    except UnicodeDecodeError:
        lang_user = smart_str(namespaces['User'])
        lang_user_talk = smart_str(namespaces['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    # Start over from the beginning of the dump for the real pass.
    stream.close()
    stream = deflate(xml)

    processor = HistoryPageProcessor(
        tag=tag, user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr('Processing'):
        processor.start(stream)  # the actual parsing happens here

    with Timr('Getting network'):
        g = processor.get_network()

    logging.info("Nodes: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    with Timr('Saving graph'):
        save_graph(g, lang, type_, date_)
def main():
    """Run PyWC word-category analysis over a full-history dump.

    Command line: input_file dictionary output_file. Page selection is
    controlled by -t (content|talk|all).
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file dictionary output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analize (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-C', '--charlimit', action="store", dest="charlimit",
                 type="int", default=100000,
                 help="Maximim characters per line (default=100000)")
    p.add_option('-r', action="store_true", dest="regex", default=False,
                 help="Use a dictionary composed by regex (default=false)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml, dic, output = files

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # only the dump header is needed to read tags/namespaces
    dump = deflate(xml, 51) if _lineno else deflate(xml)

    translation = get_translations(dump)
    tag = get_tags(dump, tags='page,title,revision,timestamp,text,redirect')
    # prepend the implicit main namespace before collecting the names
    namespaces = [ns_name for _, ns_name in
                  [(0, "Normal")] + mwlib.get_namespaces(dump)]
    dump.close()
    dump = deflate(xml)

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean

    with Timr('Processing'):
        processor.start(dump) ## PROCESSING
    processor.flush()
    out.close()
def main():
    """Compute per-user-class word/smiley frequency distributions.

    Command line: dump enriched_pickle. Streams every page of the bz2
    history dump to a worker process over a pipe, then writes one
    ``*wiki-*-words-*.dat`` and one ``*wiki-*-smile-*.dat`` file per
    user class with the aggregated counts.
    """
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] dump enriched_pickle"
    )

    _, args = p.parse_args()

    if len(args) != 2:
        p.error("Too few or too many arguments")
    xml, rich_fn = args

    # module-level state read by process_page() while iterating the dump
    global lang_user_talk, lang_user, tag, user_classes
    ## pipe to send data to the  subprocess
    p_receiver, p_sender = Pipe(duplex=False)
    ## pipe to get elaborated data from the subprocess
    done_p_receiver, done_p_sender = Pipe(duplex=False)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)
    lang, date, _ = mwlib.explode_dump_filename(xml)
    # enriched graph pickle: provides user -> class mapping and degrees
    g = sg_load(rich_fn)
    user_classes = dict(g.get_user_class('username',
                                  ('anonymous', 'bot', 'bureaucrat', 'sysop')))

    # start the worker BEFORE feeding pages so the pipe is drained
    p = Process(target=get_freq_dist, args=(p_receiver, done_p_sender))
    p.start()

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    ## open with a faster decompressor (but that probably cannot seek)
    src.close()
    src = lib.BZ2FileExt(xml, parallel=False)

    partial_process_page = partial(process_page, send=p_sender)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
                    partial_process_page)
    logging.info('Users missing in the rich file: %d', count_missing)

    p_sender.send(0)  # this STOPS the process

    print >> sys.stderr, "end of parsing"

    ## SAVE DATA
    g.set_weighted_degree()
    # cache class -> users lookups; each class appears in both recv() loops
    users_cache = {}
    # get a list of pair (class name, frequency distributions)
    for cls, fd in done_p_receiver.recv():
        with open("%swiki-%s-words-%s.dat" %
                  (lang, date,
                   cls.replace(' ', '_')), 'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in fd:
                print >> out, v, k
        del fd

    # second message on the pipe carries the smiley counters
    for cls, counters in done_p_receiver.recv():
        with open("%swiki-%s-smile-%s.dat" %
                  (lang, date,
                   cls.replace(' ', '_')), 'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in counters:
                print >> out, v, k
        del counters

    # worker has received the stop sentinel; wait for it to exit
    p.join()

    print >> sys.stderr, "end of FreqDist"
Exemple #24
0
def main():
    """Count editor countries (GeoIP lookup on anonymous IPs) per page.

    Command line: input_file geoip_db output_file. Optional filters skip
    pages with too few edits/anonymous edits and exclude given countries.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file geoip_db output_file")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    p.add_option('-p',
                 '--per-page',
                 action="store",
                 dest="per_page_stats",
                 help="Per page stats output")
    p.add_option('-e',
                 '--min-edits',
                 action="store",
                 type=int,
                 dest="min_edits",
                 help="Skip if page has less than min-edit edits")
    p.add_option('-a',
                 '--min-anon',
                 action="store",
                 type=int,
                 dest="min_anon",
                 help="Skip if page has less than min-anon anonymous edits")
    p.add_option('-E',
                 '--exclude',
                 action="store",
                 dest="exclude_countries",
                 # BUG FIX: ';' is a semicolon, not a colon — the split
                 # below uses ';' as the separator
                 help="Countries to exclude, semicolon (;) separated")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    geoip_db = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # a line-limited reader is enough to pull tags/namespaces from the
    # dump header when the opener supports it
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,redirect,timestamp,ip,revision,title')
    # reopen from the beginning for the full processing pass
    src.close()
    src = deflate(xml)

    processor = CountriesPageProcessor(tag=tag,
                                       lang=lang,
                                       output=output,
                                       userns=translation['User'],
                                       geoip=geoip_db)
    if opts.per_page_stats:
        processor.per_page_stats = opts.per_page_stats
    if opts.exclude_countries:
        # e.g. "France;Italy" — semicolon-separated list
        processor.exclude_countries = opts.exclude_countries.split(";")
    processor.min_edits = opts.min_edits
    processor.min_anon = opts.min_anon
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
Exemple #25
0
def main():
    """Build the current-talk-page signature network and pickle it.

    Reads a bz2 "current" dump, extracts user-talk interactions
    (optionally matching a language-specific signature keyword given
    with -s) and writes the resulting network as an igraph pickle.
    """
    import optparse

    p = optparse.OptionParser(usage="usage: %prog file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-s', action="store", dest="signature", default=None,
                 help="Signature in this language (e.g. sig, firma..)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    try:
        xml = files[0]
    except IndexError:
        p.error("Give me one file, please")

    en_user, en_user_talk = u"User", u"User talk"

    lang, date, type_ = mwlib.explode_dump_filename(xml)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)

    ns_translation = mwlib.get_translations(src)
    lang_user, lang_user_talk = ns_translation['User'], \
             ns_translation['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    lang_user = unicode(lang_user, "utf8")
    en_user = unicode(en_user)

    # open dump with an external process to use multiple cores
    _fast = True
    if _fast:
        src.close()
        src = lib.BZ2FileExt(xml)

    # build the processor once; pass `signature` only when the user gave
    # one so the processor's own default is preserved otherwise
    pp_kwargs = dict(ecache=EdgeCache(), tag=tag,
                     user_talk_names=(lang_user_talk, en_user_talk),
                     search=(lang_user, en_user), lang=lang)
    if opts.signature is not None:
        pp_kwargs['signature'] = opts.signature
    processor = CurrentPageProcessor(**pp_kwargs)

    with Timr('Processing'):
        processor.start(src)

    with Timr('Create network'):
        g = processor.ecache.get_network()

    # BUG FIX: logging requires %-style placeholders — the previous
    # logging.info("Len:", len(g.vs)) raised a formatting error inside
    # the logging module and the counts were never emitted
    logging.info("Len: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    g.write("%swiki-%s%s.pickle" % (lang, date, type_), format="pickle")
def main():
    """Count editor countries (GeoIP lookup on anonymous IPs) per page.

    Command line: input_file geoip_db output_file. Optional filters skip
    pages with too few edits/anonymous edits and exclude given countries.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file geoip_db output_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-p', '--per-page', action="store",
                 dest="per_page_stats", help="Per page stats output")
    p.add_option('-e', '--min-edits', action="store", type=int,
                 dest="min_edits",
                 help="Skip if page has less than min-edit edits")
    p.add_option('-a', '--min-anon', action="store", type=int,
                 dest="min_anon",
                 help="Skip if page has less than min-anon anonymous edits")
    p.add_option('-E', '--exclude', action="store",
                 dest="exclude_countries",
                 # BUG FIX: ';' is a semicolon, not a colon — the split
                 # below uses ';' as the separator
                 help="Countries to exclude, semicolon (;) separated")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    geoip_db = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # a line-limited reader is enough to pull tags/namespaces from the
    # dump header when the opener supports it
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src,
                   tags='page,redirect,timestamp,ip,revision,title')
    # reopen from the beginning for the full processing pass
    src.close()
    src = deflate(xml)

    processor = CountriesPageProcessor(tag=tag, lang=lang,
                                       output=output,
                                       userns=translation['User'],
                                       geoip=geoip_db
                                      )
    if opts.per_page_stats:
        processor.per_page_stats = opts.per_page_stats
    if opts.exclude_countries:
        # e.g. "France;Italy" — semicolon-separated list
        processor.exclude_countries = opts.exclude_countries.split(";")
    processor.min_edits = opts.min_edits
    processor.min_anon = opts.min_anon
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
Exemple #27
0
def main():
    """Extract per-revision history data for a desired list of pages.

    Command line: input_file desired_list output_file. Page selection is
    controlled by -t (content|talk|all).
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analize (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml, desired_pages_fn, output = files

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # only the dump header is needed to read tags/namespaces
    dump = deflate(xml, 51) if _lineno else deflate(xml)

    translation = get_translations(dump)
    tag = get_tags(dump, tags='page,title,revision,timestamp,text,redirect')
    dump.close()
    dump = deflate(xml)

    out = open(output, 'w')
    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=out,
                                              userns=translation['User'])
    processor.talkns = translation['Talk']
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr('Processing'):
        processor.start(dump)  # PROCESSING
    processor.flush()
    out.close()
Exemple #28
0
def main():
    """Compute per-user-class word/smiley frequency distributions.

    Command line: dump enriched_pickle. Streams every page of the bz2
    history dump to a worker process over a pipe, then writes one
    ``*wiki-*-words-*.dat`` and one ``*wiki-*-smile-*.dat`` file per
    user class with the aggregated counts.
    """
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] dump enriched_pickle")

    _, args = p.parse_args()

    if len(args) != 2:
        p.error("Too few or too many arguments")
    xml, rich_fn = args

    # module-level state read by process_page() while iterating the dump
    global lang_user_talk, lang_user, tag, user_classes
    ## pipe to send data to the  subprocess
    p_receiver, p_sender = Pipe(duplex=False)
    ## pipe to get elaborated data from the subprocess
    done_p_receiver, done_p_sender = Pipe(duplex=False)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)
    lang, date, _ = mwlib.explode_dump_filename(xml)
    # enriched graph pickle: provides user -> class mapping and degrees
    g = sg_load(rich_fn)
    user_classes = dict(
        g.get_user_class('username',
                         ('anonymous', 'bot', 'bureaucrat', 'sysop')))

    # start the worker BEFORE feeding pages so the pipe is drained
    p = Process(target=get_freq_dist, args=(p_receiver, done_p_sender))
    p.start()

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    ## open with a faster decompressor (but that probably cannot seek)
    src.close()
    src = lib.BZ2FileExt(xml, parallel=False)

    partial_process_page = partial(process_page, send=p_sender)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
                    partial_process_page)
    logging.info('Users missing in the rich file: %d', count_missing)

    p_sender.send(0)  # this STOPS the process

    print >> sys.stderr, "end of parsing"

    ## SAVE DATA
    g.set_weighted_degree()
    # cache class -> users lookups; each class appears in both recv() loops
    users_cache = {}
    # get a list of pair (class name, frequency distributions)
    for cls, fd in done_p_receiver.recv():
        with open(
                "%swiki-%s-words-%s.dat" % (lang, date, cls.replace(' ', '_')),
                'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in fd:
                print >> out, v, k
        del fd

    # second message on the pipe carries the smiley counters
    for cls, counters in done_p_receiver.recv():
        with open(
                "%swiki-%s-smile-%s.dat" % (lang, date, cls.replace(' ', '_')),
                'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in counters:
                print >> out, v, k
        del counters

    # worker has received the stop sentinel; wait for it to exit
    p.join()

    print >> sys.stderr, "end of FreqDist"
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
            usage="usage: %prog [options] input_file dictionary output_file",
            option_class=SonetOption
        )
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-S', '--detailed-start', action="store",
        dest='detailed_start', type="yyyymmdd", metavar="YYYYMMDD",
        default=None, help="Detailed output start date")
    p.add_option('-E', '--detailed-end', action="store",
        dest='detailed_end', type="yyyymmdd", metavar="YYYYMMDD", default=None,
        help="Detailed output end date")
    p.add_option('-n', '--detailed-namespace', action="store",
                 dest="detailed_ns", default="Normal",
                 help="Namespace of desired detailed data (default: Normal)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags=('page,title,revision,timestamp,text,redirect,'
                              'contributor,username,ip'))
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    if os.path.exists(output):
        logging.error("File %s already exists!", output)
        sys.exit(0)

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean
    if opts.detailed_start and opts.detailed_end:
        print """
        You are going to run the script with detailed output on %d days.
        This is going to produce some CSV files on your disk, one for each
        day. Is this want you really want to do? [press enter to continue]
        """ % (opts.detailed_end - opts.detailed_start).days
        raw_input()
        processor.pywc.detailed = True
        processor.detailed_start = opts.detailed_start
        processor.detailed_end = opts.detailed_end
        processor.detailed_ns = opts.detailed_ns

    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio",
        option_class=SonetOption
    )
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-E', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-d', '--delimiter', action="store", dest="delimiter",
                 default=",", help="CSV delimiter")
    p.add_option('-s', '--start', action="store", dest='start',
                 type="yyyymmdd", metavar="YYYYMMDD", default=None,
                 help="Look for revisions starting from this date")
    p.add_option('-e', '--end', action="store", dest='end', type="yyyymmdd",
                 metavar="YYYYMMDD", default=None,
                 help="Look for revisions until this date")

    opts, files = p.parse_args()
    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,' + \
                   'minor,timestamp,redirect,ip,username')

    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.start_date = opts.start
    processor.end_date = opts.end
    processor.set_desired_from_csv(desired_pages_fn,
                                   encoding=opts.encoding,
                                   delimiter=opts.delimiter)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()
Exemple #31
0
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
            usage="usage: %prog [options] input_file dictionary output_file",
            option_class=SonetOption
        )
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-S', '--detailed-start', action="store",
        dest='detailed_start', type="yyyymmdd", metavar="YYYYMMDD",
        default=None, help="Detailed output start date")
    p.add_option('-E', '--detailed-end', action="store",
        dest='detailed_end', type="yyyymmdd", metavar="YYYYMMDD", default=None,
        help="Detailed output end date")
    p.add_option('-n', '--detailed-namespace', action="store",
                 dest="detailed_ns", default="Normal",
                 help="Namespace of desired detailed data (default: Normal)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags=('page,title,revision,timestamp,text,redirect,'
                              'contributor,username,ip'))
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    if os.path.exists(output):
        logging.error("File %s already exists!", output)
        sys.exit(0)

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean
    if opts.detailed_start and opts.detailed_end:
        print """
        You are going to run the script with detailed output on %d days.
        This is going to produce some CSV files on your disk, one for each
        day. Is this want you really want to do? [press enter to continue]
        """ % (opts.detailed_end - opts.detailed_start).days
        raw_input()
        processor.pywc.detailed = True
        processor.detailed_start = opts.detailed_start
        processor.detailed_end = opts.detailed_end
        processor.detailed_ns = opts.detailed_ns

    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()