Esempio n. 1
0
def main():
    """Command-line entry point: parse a Wikipedia dump and export
    per-page gender statistics.

    Positional arguments: input_file gender_file output_file.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file gender_file output_file")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    p.add_option('-e',
                 '--min-edits',
                 default=0,
                 dest="min_edits",
                 metavar="MIN_EDITS",
                 type=int,
                 # BUG FIX: "MIN_EIDTS" typo, and optparse expands
                 # "%default" -- "%(default)s" is argparse syntax and was
                 # printed literally.
                 help="pages with less than MIN_EDITS edits "
                 "are skipped (default: %default)")

    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    gender_data = files[1]
    output = files[2]

    # Abort early on unsupported dump types
    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # Some openers accept a line limit: read only the first 51 lines to
    # extract namespaces/translations cheaply.
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    # NOTE: order matters -- both calls consume the same stream sequentially.
    translation = get_translations(src)
    tag = get_tags(src,
                   tags="page,redirect,timestamp,ip,"
                   "contributor,title,username")
    src.close()
    src = deflate(xml)  # reopen for the full parsing pass

    out = open(output, "w")
    processor = GenderPageProcessor(tag=tag,
                                    lang=lang,
                                    output=out,
                                    userns=translation['User'],
                                    gender_data=gender_data,
                                    min_edits=opts.min_edits)
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
Esempio n. 2
0
    def setUp(self):
        """Build a HistoryPageProcessor network from the bundled test dump.

        Reads the bzipped stub-meta-history dump twice: once (possibly
        truncated to 51 lines) to extract tags and namespace translations,
        then fully to build the graph stored in ``self.g``.
        """
        xml = "tests/utpedits2graph/" + \
              "vecwiki-20100307-stub-meta-history-TEST.xml.bz2"
        self.lang, self.date_, self.type_ = mwlib.explode_dump_filename(xml)

        deflate, _lineno = find_open_for_this_file(xml)
        # Per-language welcome-message prefixes; defaultdict(str) yields ''
        # for languages with no known pattern.
        welcome = defaultdict(str)
        welcome.update({'it': r'Benvenut', 'en': r'Welcome'})
        if _lineno:
            src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
        else:
            src = deflate(xml)
        # NOTE: get_tags and get_translations consume the same stream in
        # sequence; their order matters.
        tag = mwlib.get_tags(src,
                             tags='page,title,revision,timestamp,contributor,'
                             'username,ip,comment,id')
        translations = mwlib.get_translations(src)

        # Python 2: fall back to smart_str when the namespace names are not
        # decodable and unicode() raises.
        try:
            lang_user = unicode(translations['User'])
            lang_user_talk = unicode(translations['User talk'])
        except UnicodeDecodeError:
            lang_user = smart_str(translations['User'])
            lang_user_talk = smart_str(translations['User talk'])
        src.close()
        src = deflate(xml)  # reopen for the full parsing pass
        self.processor = HistoryPageProcessor(tag=tag,
                                              user_talk_names=(lang_user_talk,
                                                               u"User talk"))
        self.processor.welcome_pattern = welcome[self.lang]
        self.processor.start(src)
        self.g = self.processor.get_network()
def main():
    """Command-line entry point: extract page revision histories.

    Positional arguments: input_file desired_list output_file.  Pages are
    filtered by type (content/talk/all) and by the desired-pages CSV.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analize (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    desired_pages_fn = files[1]
    output = files[2]

    # Abort early on unsupported dump types
    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # Truncated first pass (51 lines) is enough to extract the metadata.
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    # NOTE: order matters -- both calls consume the same stream sequentially.
    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    src.close()
    src = deflate(xml)

    out = open(output, 'w')
    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=out,
                                              userns=translation['User'])
    processor.talkns = translation['Talk']
    # "all" (the default) keeps both article and talk pages.
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr('Processing'):
        processor.start(src) ## PROCESSING
    processor.flush()
    out.close()
Esempio n. 4
0
    def setUp(self):
        """Build a HistoryPageProcessor network from the bundled test dump.

        Two passes over the bzipped dump: a (possibly truncated) pass for
        tags/translations, then a full pass producing ``self.g``.
        """
        xml = "tests/utpedits2graph/" + \
              "vecwiki-20100307-stub-meta-history-TEST.xml.bz2"
        self.lang, self.date_, self.type_ = mwlib.explode_dump_filename(xml)

        deflate, _lineno = find_open_for_this_file(xml)
        # Per-language welcome prefixes; '' for unknown languages.
        welcome = defaultdict(str)
        welcome.update({'it': r'Benvenut',
                        'en': r'Welcome'})
        if _lineno:
            src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
        else:
            src = deflate(xml)
        # NOTE: get_tags and get_translations consume the same stream in
        # sequence; their order matters.
        tag = mwlib.get_tags(src,
                        tags='page,title,revision,timestamp,contributor,'
                                  'username,ip,comment,id')
        translations = mwlib.get_translations(src)

        # Python 2: fall back to smart_str when unicode() cannot decode.
        try:
            lang_user = unicode(translations['User'])
            lang_user_talk = unicode(translations['User talk'])
        except UnicodeDecodeError:
            lang_user = smart_str(translations['User'])
            lang_user_talk = smart_str(translations['User talk'])
        src.close()
        src = deflate(xml)  # reopen for the full parsing pass
        self.processor = HistoryPageProcessor(tag=tag,
                         user_talk_names=(lang_user_talk, u"User talk"))
        self.processor.welcome_pattern = welcome[self.lang]
        self.processor.start(src)
        self.g = self.processor.get_network()
Esempio n. 5
0
def main():
    from bz2 import BZ2File
    from csv import DictWriter

    logging.basicConfig(#filename="usercontributions_export.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    op = create_option_parser()
    args = op.parse_args()

    xml, out, threshold = args.dump, args.out, args.threshold

    lang, date_, _ = mwlib.explode_dump_filename(xml)
    deflate, _lineno = find_open_for_this_file(xml)

    date_ = yyyymmdd_to_datetime(date_, 1)

    if _lineno:
        src = deflate(xml, 51)   # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tmp = ["Normal"]+[v for _, (_, v) in enumerate(mwlib.get_namespaces(src))]
    namespaces = []
    # fix for quartiles
    for ns in tmp:
        for n in range(1, 5):
            namespaces.append("%s_%d" % (ns, n))
    print namespaces

    fout = BZ2File(out, 'w')

    fields = ['username', 'normal_edits', 'comments_count', 'comments_avg',
              'minor', 'revert', 'npov', 'welcome', 'please', 'thanks',
              'first_edit', 'last_edit', 'tot_edits', 'active_days',
              'days_since_first_edit', 'left_since', 'diversity_score',
              'first_edit_year', 'first_edit_month', 'first_edit_day',
              'last_edit_year', 'last_edit_month', 'last_edit_day', ]
    fields[2:2] = namespaces
    dw = DictWriter(fout, fields)
    dw.writeheader()

    ## to get only the first 1000 users:
    #from itertools import islice
    #data_iterator = islice(prepare_data(namespaces), 1000)
    data_iterator = prepare_data(namespaces, lang, date_, threshold)

    count = 0
    for user in data_iterator:
        for k, v in user.iteritems():
            if type(v) in [int, float]:
                assert v >= 0, "%s is negative" % (k,)
        dw.writerow(user)

        count += 1
        if not count % 5000:
            logging.info(count)
Esempio n. 6
0
def main():
    """Build and save a user-talk interaction network from a dump.

    Parses the dump named on the command line, extracts the user-talk
    edit graph in the configured time window and saves it.
    """

    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)  # filename="graph_longiudinal_analysis.log",
    logging.info("---------------------START---------------------")

    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    # Per-language welcome-message prefixes; '' for unknown languages.
    welcome = defaultdict(str)

    welcome.update({"it": r"Benvenut", "en": r"Welcome"})

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    # NOTE: get_tags and get_translations consume the same stream in
    # sequence; their order matters.
    tag = mwlib.get_tags(src, tags="page,title,revision,timestamp,contributor,username,ip,comment")

    translations = mwlib.get_translations(src)

    # Python 2: fall back to smart_str when unicode() cannot decode.
    try:
        lang_user = unicode(translations["User"])
        lang_user_talk = unicode(translations["User talk"])
    except UnicodeDecodeError:
        lang_user = smart_str(translations["User"])
        lang_user_talk = smart_str(translations["User talk"])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    src = deflate(xml)  # reopen for the full parsing pass

    processor = HistoryPageProcessor(tag=tag, user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr("Processing"):
        processor.start(src)  ## PROCESSING

    with Timr("Getting network"):
        g = processor.get_network()

    logging.info("Nodes: %d" % len(g.vs))
    logging.info("Edges: %d" % len(g.es))

    with Timr("Saving graph"):
        save_graph(g, lang, type_, date_)
Esempio n. 7
0
def main():
    """Scan a dump for events on the desired pages.

    Positional arguments: dump file, desired-pages CSV (latin-1,
    '#'-prefixed lines are comments) and the acceptance ratio.
    """
    import optparse
    import csv

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys, logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # First CSV column is the page title; skip blank and comment rows.
    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [
            l[0].decode('latin-1') for l in csv.reader(f)
            if l and not l[0][0] == '#'
        ]

    if _lineno:
        src = deflate(xml, 51)  # truncated pass: metadata only
    else:
        src = deflate(xml)

    # NOTE: order matters -- both calls consume the same stream sequentially.
    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,ip,username')

    src.close()
    src = deflate(xml)  # reopen for the full parsing pass

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired(desired_pages)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()
def main():
    """Command-line entry point: parse a Wikipedia dump and export
    per-page gender statistics.

    Positional arguments: input_file gender_file output_file.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file gender_file output_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-e', '--min-edits', default=0, dest="min_edits",
                 metavar="MIN_EDITS", type=int,
                 # BUG FIX: "MIN_EIDTS" typo, and optparse expands
                 # "%default" -- "%(default)s" is argparse syntax and was
                 # printed literally.
                 help="pages with less than MIN_EDITS edits "
                      "are skipped (default: %default)")

    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    gender_data = files[1]
    output = files[2]

    # Abort early on unsupported dump types
    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # Truncated first pass (51 lines) is enough to extract the metadata.
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    # NOTE: order matters -- both calls consume the same stream sequentially.
    translation = get_translations(src)
    tag = get_tags(src,
                   tags="page,redirect,timestamp,ip,"
                        "contributor,title,username")
    src.close()
    src = deflate(xml)  # reopen for the full parsing pass

    out = open(output, "w")
    processor = GenderPageProcessor(tag=tag, lang=lang,
                                    output=out,
                                    userns=translation['User'],
                                    gender_data=gender_data,
                                    min_edits=opts.min_edits
                                   )
    with Timr('Processing'):
        processor.start(src) ## PROCESSING
    processor.flush()
    out.close()
def main():
    """Scan a dump for events on the desired pages.

    Positional arguments: dump file, desired-pages CSV (latin-1,
    '#'-prefixed lines are comments) and the acceptance ratio.
    """
    import optparse
    import csv

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys, logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # First CSV column is the page title; skip blank and comment rows.
    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                                        if l and not l[0][0] == '#']

    if _lineno:
        src = deflate(xml, 51)  # truncated pass: metadata only
    else:
        src = deflate(xml)

    # NOTE: order matters -- both calls consume the same stream sequentially.
    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,ip,username')

    src.close()
    src = deflate(xml)  # reopen for the full parsing pass

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired(desired_pages)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()
Esempio n. 10
0
def main():
    """Build the user-talk network from a dump and pickle it.

    NOTE: the welcome pattern is hard-coded for it.wikipedia.org.
    """
    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # truncated pass: metadata only
    else:
        src = deflate(xml)

    # NOTE: get_tags and get_translations consume the same stream in
    # sequence; their order matters.
    tag = mwlib.get_tags(
        src,
        tags='page,title,revision,timestamp,contributor,username,ip,comment')

    translations = mwlib.get_translations(src)
    lang_user = unicode(translations['User'])
    lang_user_talk = unicode(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    print >> sys.stderr, "BEGIN PARSING"
    src = deflate(xml)  # reopen for the full parsing pass

    processor = HistoryPageProcessor(tag=tag,
                                     user_talk_names=(lang_user_talk,
                                                      u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'
    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    with Timr('EdgeCache.get_network()'):
        g = processor.get_network()

    print >> sys.stderr, "Nodes:", len(g.vs)
    print >> sys.stderr, "Edges:", len(g.es)

    # Edge weight = number of interactions (timestamps) on that edge.
    for e in g.es:
        e['weight'] = len(e['timestamp'])
        #e['timestamp'] = str(e['timestamp'])
    with Timr('Pickling'):
        g.write("%swiki-%s%s.pickle" % (lang, date_, type_), format="pickle")
Esempio n. 11
0
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if not files:
        p.error("Give me a file, please ;-)")

    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,text')

    src.close()
    src = deflate(xml)

    processor = HistoryWordsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn)
    processor.words = desired_words

    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
def main():
    """Build the user-talk network from a dump and pickle it.

    NOTE: the welcome pattern is hard-coded for it.wikipedia.org.
    """
    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # truncated pass: metadata only
    else:
        src = deflate(xml)

    # NOTE: get_tags and get_translations consume the same stream in
    # sequence; their order matters.
    tag = mwlib.get_tags(src,
        tags='page,title,revision,timestamp,contributor,username,ip,comment')

    translations = mwlib.get_translations(src)
    lang_user = unicode(translations['User'])
    lang_user_talk = unicode(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    print >>sys.stderr, "BEGIN PARSING"
    src = deflate(xml)  # reopen for the full parsing pass

    processor = HistoryPageProcessor(tag=tag,
        user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'
    with Timr('Processing'):
        processor.start(src) ## PROCESSING

    with Timr('EdgeCache.get_network()'):
        g = processor.get_network()

    print >>sys.stderr, "Nodes:", len(g.vs)
    print >>sys.stderr, "Edges:", len(g.es)

    # Edge weight = number of interactions (timestamps) on that edge.
    for e in g.es:
        e['weight'] = len(e['timestamp'])
        #e['timestamp'] = str(e['timestamp'])
    with Timr('Pickling'):
        g.write("%swiki-%s%s.pickle" % (lang, date_, type_), format="pickle")
Esempio n. 13
0
def main():
    """Stream per-user contribution counts to a consumer process.

    The parser runs in this process; rows are sent over a one-way Pipe to
    a worker started with multiprocessing.Process, which saves them.
    """
    logging.basicConfig(  #filename="usercontributions.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    # One-way pipe: this process sends, the worker receives.
    receiver, sender = Pipe(duplex=False)

    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, _, _ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    # NOTE: get_tags and get_namespaces consume the same stream in
    # sequence; their order matters.
    tag = mwlib.get_tags(src,
        tags='page,title,revision,timestamp,contributor,username,ip'+ \
             ',comment,id,minor')

    namespaces = [(0, "Normal")] + mwlib.get_namespaces(src)

    src.close()
    logging.info("BEGIN PARSING")
    src = deflate(xml)  # reopen for the full parsing pass

    processor = UserContributionsPageProcessor(tag=tag, lang=lang)
    processor.sender = sender
    processor.namespaces = namespaces
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'

    p = Process(target=use_contrib_dict,
                args=(receiver, processor.namespaces, lang))
    p.start()

    with Timr('PROCESSING'):
        processor.start(src)  ## PROCESSING

    # None is the sentinel telling the worker to stop.
    sender.send(None)
    p.join()  ## wait until save is complete
Esempio n. 14
0
def main():
    """Stream per-user contribution counts to a consumer process.

    The parser runs in this process; rows are sent over a one-way Pipe to
    a worker started with multiprocessing.Process, which saves them.
    """
    logging.basicConfig(#filename="usercontributions.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    # One-way pipe: this process sends, the worker receives.
    receiver, sender = Pipe(duplex=False)

    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, _, _ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)   # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    # NOTE: get_tags and get_namespaces consume the same stream in
    # sequence; their order matters.
    tag = mwlib.get_tags(src,
        tags='page,title,revision,timestamp,contributor,username,ip'+ \
             ',comment,id,minor')

    namespaces = [(0, "Normal")]+mwlib.get_namespaces(src)

    src.close()
    logging.info("BEGIN PARSING")
    src = deflate(xml)  # reopen for the full parsing pass

    processor = UserContributionsPageProcessor(tag=tag, lang=lang)
    processor.sender = sender
    processor.namespaces = namespaces
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'

    p = Process(target=use_contrib_dict, args=(receiver, processor.namespaces,
                                               lang))
    p.start()

    with Timr('PROCESSING'):
        processor.start(src) ## PROCESSING

    # None is the sentinel telling the worker to stop.
    sender.send(None)
    p.join() ## wait until save is complete
Esempio n. 15
0
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG)

    if not files:
        p.error("Give me a file, please ;-)")

    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)   # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,text')

    src.close()
    src = deflate(xml)

    processor = HistoryWordsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn)
    processor.words = desired_words

    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
Esempio n. 16
0
def main():
    """Scan a dump for events on the desired pages.

    Positional arguments: dump file, desired-pages CSV and the
    acceptance ratio; the CSV encoding is configurable via -e.
    """
    import optparse
    import csv

    p = optparse.OptionParser(usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option("-v", action="store_true", dest="verbose", default=False, help="Verbose output (like timings)")
    p.add_option(
        "-e", "--encoding", action="store", dest="encoding", default="latin-1", help="encoding of the desired_list file"
    )
    opts, files = p.parse_args()
    if opts.verbose:
        import sys, logging

        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    # NOTE: order matters -- both calls consume the same stream sequentially.
    translation = get_translations(src)
    tag = get_tags(src, tags="page,title,revision," + "minor,timestamp,redirect,ip,username")

    src.close()
    src = deflate(xml)  # reopen for the full parsing pass

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation["Talk"]
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr("Retrieving bots"):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr("Parsing"):
        processor.start(src)
    processor.flush()
Esempio n. 17
0
def main():
    """Extract revisions of the desired pages from a dump.

    The desired-pages list is a latin-1 CSV whose first column is the
    page title; '#'-prefixed lines are comments.
    """

    logging.basicConfig(#filename="random_page_extractor.log",
                                stream=sys.stderr,
                                level=logging.DEBUG)

    op = create_option_parser()
    args = op.parse_args()

    # First CSV column is the page title; skip blank and comment rows.
    with open(args.desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                                        if l and not l[0][0] == '#']

    lang, date_, type_ = explode_dump_filename(args.xml_fn)
    deflate, _lineno = lib.find_open_for_this_file(args.xml_fn)

    # Abort early on unsupported dump types
    dumps_checker(args, type_)

    logging.info('---------------------START---------------------')

    if _lineno:
        src = deflate(args.xml_fn, 51)  # truncated pass: metadata only
    else:
        src = deflate(args.xml_fn)

    # NOTE: order matters -- both calls consume the same stream sequentially.
    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,redirect,text,username,ip,timestamp')

    src.close()
    src = deflate(args.xml_fn)  # reopen for the full parsing pass

    output = open(args.output, 'w') if args.output else None

    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=output,
                                              threshold=args.ratio,
                                              min_text=args.min_text_length,
                                              n_users=args.editors_number,
                                              start_revision=args.initial_revision)

    processor.talkns = translation['Talk']
    processor.desired_page_type = args.type
    processor.set_desired(desired_pages)
    with Timr('processing'):
        processor.start(src)
Esempio n. 18
0
def main():
    """Extract revisions of the desired pages from a dump.

    The desired-pages CSV file name and its encoding come from the parsed
    command-line options.
    """

    logging.basicConfig(#filename="random_page_extractor.log",
                                stream=sys.stderr,
                                level=logging.DEBUG)

    op = create_option_parser()
    args = op.parse_args()

    lang, date_, type_ = explode_dump_filename(args.xml_fn)
    deflate, _lineno = lib.find_open_for_this_file(args.xml_fn)

    # Abort early on unsupported dump types
    dumps_checker(args, type_)

    logging.info('---------------------START---------------------')

    if _lineno:
        src = deflate(args.xml_fn, 51)  # truncated pass: metadata only
    else:
        src = deflate(args.xml_fn)

    # NOTE: order matters -- both calls consume the same stream sequentially.
    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,redirect,text,username,ip,timestamp')

    src.close()
    src = deflate(args.xml_fn)  # reopen for the full parsing pass

    output = open(args.output, 'w') if args.output else None

    processor = HistoryRevisionsPageProcessor(
                    tag=tag,
                    lang=lang,
                    output=output,
                    threshold=args.ratio,
                    min_text=args.min_text_length,
                    min_revisions=args.revisions_number,
                    n_users=args.editors_number,
                    start_revision=args.initial_revision)

    processor.talkns = translation['Talk']
    processor.desired_page_type = args.type
    # BUG FIX: `desired_pages_fn` was an undefined bare name (NameError at
    # runtime); the file name comes from the parsed options.
    processor.set_desired_from_csv(args.desired_pages_fn,
                                   encoding=args.encoding)
    with Timr('processing'):
        processor.start(src)
Esempio n. 19
0
def main():
    """Export per-user contribution statistics to a bzipped CSV."""
    from bz2 import BZ2File
    from csv import DictWriter

    logging.basicConfig(  #filename="usercontributions_export.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    xml, out = get_xml_file()

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # truncated pass: namespaces only
    else:
        src = deflate(xml)

    namespaces = [v for _, v in mwlib.get_namespaces(src)]

    fout = BZ2File(out, 'w')

    fields = [
        'username', 'normal_edits', 'comments_count', 'comments_avg', 'minor',
        'revert', 'npov', 'welcome', 'please', 'thanks', 'first_edit',
        'last_edit'
    ]
    fields[2:2] = namespaces  # namespace columns go after comments_count
    dw = DictWriter(fout, fields)
    dw.writeheader()

    ## to get only the first 1000 users:
    #from itertools import islice
    #data_iterator = islice(prepare_data(namespaces), 1000)
    data_iterator = prepare_data(namespaces)

    count = 0
    for user in data_iterator:
        dw.writerow(user)

        count += 1
        if not count % 5000:
            logging.info(count)
def main():
    """Export per-user contribution statistics to a bzipped CSV."""
    from bz2 import BZ2File
    from csv import DictWriter

    logging.basicConfig(#filename="usercontributions_export.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    xml, out = get_xml_file()

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # truncated pass: namespaces only
    else:
        src = deflate(xml)

    namespaces = [v for _,v in mwlib.get_namespaces(src)]

    fout = BZ2File(out, 'w')

    fields = ['username', 'normal_edits', 'comments_count', 'comments_avg',
              'minor', 'revert', 'npov', 'welcome', 'please', 'thanks',
              'first_edit', 'last_edit']
    fields[2:2] = namespaces  # namespace columns go after comments_count
    dw = DictWriter(fout, fields)
    dw.writeheader()

    ## to get only the first 1000 users:
    #from itertools import islice
    #data_iterator = islice(prepare_data(namespaces), 1000)
    data_iterator = prepare_data(namespaces)

    count = 0
    for user in data_iterator:
        dw.writerow(user)

        count += 1
        if not count % 5000:
            logging.info(count)
def main():
    """Run EditsAnalyzer over the desired pages/words of a dump.

    Positional arguments: dump file, desired-pages CSV, desired-words
    file and the acceptance threshold.
    """
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr,level=logging.DEBUG)

    # BUG FIX: the code below reads files[0:3] *and* files[3]; the old
    # `if not files` guard let fewer than four arguments crash with an
    # IndexError instead of a usage error.
    if len(files) != 4:
        p.error("Error: No file received.")

    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # truncated pass: metadata only
    else:
        src = deflate(xml)

    # NOTE: order matters -- both calls consume the same stream sequentially.
    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,minor,timestamp,redirect,text')

    src.close()
    src = deflate(xml)  # reopen for the full parsing pass

    analyzer = EditsAnalyzer(tag=tag, lang=lang)
    analyzer.set_desired_from_csv(desired_pages_fn)
    analyzer.words = desired_words

    with Timr('Analyzing...'):
        analyzer.start(src)
def main():
    """Entry point: extract per-country edit statistics from a dump."""
    import optparse

    parser = optparse.OptionParser(
        usage="usage: %prog [options] input_file geoip_db output_file")
    parser.add_option("-v", action="store_true", dest="verbose",
                      default=False, help="Verbose output (like timings)")
    opts, files = parser.parse_args()

    if len(files) != 3:
        parser.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format="%(asctime)s %(levelname)s %(message)s",
                            datefmt="%Y-%m-%d %H:%M:%S")

    xml, geoip_db, output = files

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # When a line hint is available only the dump prologue is read here
    src = deflate(xml, 51) if _lineno else deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags="page,redirect,timestamp,ip,revision,title")
    src.close()
    src = deflate(xml)  # reopen for the full pass

    processor = CountriesPageProcessor(tag=tag,
                                       lang=lang,
                                       output=output,
                                       userns=translation["User"],
                                       geoip=geoip_db)
    with Timr("Processing"):
        processor.start(src)  ## PROCESSING
    processor.flush()
Esempio n. 23
0
def main():
    """Entry point: dump revision history of the desired pages.

    Positional arguments: the dump file, a CSV listing the desired
    pages (first column; '#' rows are comments) and the output file.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analyze (content|talk|all)")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    output = files[2]

    # Read desired titles (first CSV column), skipping blank rows, rows
    # with an EMPTY first cell, and '#' comment rows.  BUG FIX: the old
    # `l[0][0] == '#'` raised IndexError when the first cell was ''.
    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                         if l and l[0] and not l[0].startswith('#')]

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # read only the dump prologue
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    src.close()
    src = deflate(xml)  # reopen for the full pass

    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=output)
    processor.talkns = translation['Talk']
    processor.desired_page_type = opts.type
    processor.set_desired(desired_pages)
    processor.start(src)
    processor.flush()
Esempio n. 24
0
def main():
    """Parse a history dump and save the user-talk interaction graph."""
    opts, args = opt_parse()
    xml = args[0]
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('---------------------START---------------------')

    # Language, dump date and dump type are encoded in the file name
    lang, dump_date, dump_type = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    # Welcome-message prefix per language; other languages fall back to ''
    welcome = defaultdict(str)
    welcome['it'] = r'Benvenut'
    welcome['en'] = r'Welcome'

    # Read first 51 lines to extract namespaces when a hint is available
    src = deflate(xml, 51) if _lineno else deflate(xml)

    tag = mwlib.get_tags(src,
                         tags='page,title,revision,timestamp,contributor,'
                              'username,ip,comment,id')

    namespaces = mwlib.get_translations(src)

    try:
        user_ns = unicode(namespaces['User'])
        user_talk_ns = unicode(namespaces['User talk'])
    except UnicodeDecodeError:
        # Fall back to byte strings when the names cannot be decoded
        user_ns = smart_str(namespaces['User'])
        user_talk_ns = smart_str(namespaces['User talk'])

    assert user_ns, "User namespace not found"
    assert user_talk_ns, "User Talk namespace not found"

    src.close()
    src = deflate(xml)  # reopen for the full pass

    processor = HistoryPageProcessor(
        tag=tag, user_talk_names=(user_talk_ns, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    with Timr('Getting network'):
        graph = processor.get_network()

    logging.info("Nodes: %d", len(graph.vs))
    logging.info("Edges: %d", len(graph.es))

    with Timr('Saving graph'):
        save_graph(graph, lang, dump_type, dump_date)
Esempio n. 25
0
def main():
    """Build the user-talk network from a history dump and save it."""
    opts, args = opt_parse()
    xml = args[0]
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('---------------------START---------------------')

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    # Unknown languages map to the empty pattern
    welcome = defaultdict(str, {'it': r'Benvenut', 'en': r'Welcome'})

    if _lineno:
        src = deflate(xml, 51)  # read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
                         tags='page,title,revision,timestamp,contributor,'
                         'username,ip,comment,id')

    translations = mwlib.get_translations(src)

    try:
        lang_user = unicode(translations['User'])
        lang_user_talk = unicode(translations['User talk'])
    except UnicodeDecodeError:
        # Could not decode: keep a byte-string representation instead
        lang_user = smart_str(translations['User'])
        lang_user_talk = smart_str(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    src = deflate(xml)  # reopen the dump for the full scan

    talk_names = (lang_user_talk, u"User talk")
    processor = HistoryPageProcessor(tag=tag, user_talk_names=talk_names)
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    with Timr('Getting network'):
        g = processor.get_network()

    logging.info("Nodes: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    with Timr('Saving graph'):
        save_graph(g, lang, type_, date_)
Esempio n. 26
0
def main():
    """Entry point: run PyWC dictionary counting over a Wikipedia dump.

    Positional arguments: the dump file, the dictionary file and the
    output file.
    """
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file dictionary output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analyze (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    # FIX: help text said "Maximim"
    p.add_option('-C', '--charlimit', action="store", dest="charlimit",
                 type="int", default=100000,
                 help="Maximum characters per line (default=100000)")
    p.add_option('-r', action="store_true", dest="regex", default=False,
                 help="Use a dictionary composed by regex (default=false)")
    opts, files = p.parse_args()

    # NOTE(review): opts.encoding, opts.charlimit and opts.regex are parsed
    # but never applied below -- presumably PyWCProcessor should consume
    # them; TODO confirm and wire them in.

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # read only the dump prologue
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    # Namespace 0 has no name in the dump; label it "Normal"
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)  # reopen for the full pass

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean

    with Timr('Processing'):
        processor.start(src)  ## PROCESSING
    processor.flush()
    out.close()
Esempio n. 27
0
def main():
    """Entry point: per-country edit statistics with filtering options."""
    import optparse
    parser = optparse.OptionParser(
        usage="usage: %prog [options] input_file geoip_db output_file")
    parser.add_option('-v', action="store_true", dest="verbose",
                      default=False, help="Verbose output (like timings)")
    parser.add_option('-p', '--per-page', action="store",
                      dest="per_page_stats", help="Per page stats output")
    parser.add_option('-e', '--min-edits', action="store", type=int,
                      dest="min_edits",
                      help="Skip if page has less than min-edit edits")
    parser.add_option('-a', '--min-anon', action="store", type=int,
                      dest="min_anon",
                      help="Skip if page has less than min-anon anonymous "
                           "edits")
    parser.add_option('-E', '--exclude', action="store",
                      dest="exclude_countries",
                      help="Countries to exclude, colon (;) separated")
    opts, files = parser.parse_args()

    if len(files) != 3:
        parser.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml, geoip_db, output = files

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # Prologue-only read when a line hint is available
    src = deflate(xml, 51) if _lineno else deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,redirect,timestamp,ip,revision,title')
    src.close()
    src = deflate(xml)  # reopen for the full pass

    processor = CountriesPageProcessor(tag=tag, lang=lang, output=output,
                                       userns=translation['User'],
                                       geoip=geoip_db)
    if opts.per_page_stats:
        processor.per_page_stats = opts.per_page_stats
    if opts.exclude_countries:
        # ';'-separated list on the command line
        processor.exclude_countries = opts.exclude_countries.split(";")
    processor.min_edits = opts.min_edits
    processor.min_anon = opts.min_anon
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
Esempio n. 28
0
def main():
    """Entry point: per-country statistics (filterable by edit counts)."""
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file geoip_db output_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-p', '--per-page', action="store", dest="per_page_stats",
                 help="Per page stats output")
    p.add_option('-e', '--min-edits', action="store", type=int,
                 dest="min_edits",
                 help="Skip if page has less than min-edit edits")
    p.add_option('-a', '--min-anon', action="store", type=int,
                 dest="min_anon",
                 help="Skip if page has less than min-anon anonymous edits")
    p.add_option('-E', '--exclude', action="store", dest="exclude_countries",
                 help="Countries to exclude, colon (;) separated")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    geoip_db = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # only the prologue is needed here
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,redirect,timestamp,ip,revision,title')
    src.close()
    src = deflate(xml)  # reopen for the full scan

    processor = CountriesPageProcessor(tag=tag, lang=lang, output=output,
                                       userns=translation['User'],
                                       geoip=geoip_db)
    if opts.per_page_stats:
        processor.per_page_stats = opts.per_page_stats
    if opts.exclude_countries:
        processor.exclude_countries = opts.exclude_countries.split(";")
    processor.min_edits = opts.min_edits
    processor.min_anon = opts.min_anon
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
Esempio n. 29
0
def main():
    """Entry point: extract revisions of the desired pages to a file."""
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analize (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml, desired_pages_fn, output = files

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    # Prologue-only read when a line hint is available
    src = deflate(xml, 51) if _lineno else deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    src.close()
    src = deflate(xml)  # reopen for the full pass

    out = open(output, 'w')
    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang, output=out,
                                              userns=translation['User'])
    processor.talkns = translation['Talk']
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
Esempio n. 30
0
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
            usage="usage: %prog [options] input_file dictionary output_file",
            option_class=SonetOption
        )
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-S', '--detailed-start', action="store",
        dest='detailed_start', type="yyyymmdd", metavar="YYYYMMDD",
        default=None, help="Detailed output start date")
    p.add_option('-E', '--detailed-end', action="store",
        dest='detailed_end', type="yyyymmdd", metavar="YYYYMMDD", default=None,
        help="Detailed output end date")
    p.add_option('-n', '--detailed-namespace', action="store",
                 dest="detailed_ns", default="Normal",
                 help="Namespace of desired detailed data (default: Normal)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags=('page,title,revision,timestamp,text,redirect,'
                              'contributor,username,ip'))
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    if os.path.exists(output):
        logging.error("File %s already exists!", output)
        sys.exit(0)

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean
    if opts.detailed_start and opts.detailed_end:
        print """
        You are going to run the script with detailed output on %d days.
        This is going to produce some CSV files on your disk, one for each
        day. Is this want you really want to do? [press enter to continue]
        """ % (opts.detailed_end - opts.detailed_start).days
        raw_input()
        processor.pywc.detailed = True
        processor.detailed_start = opts.detailed_start
        processor.detailed_end = opts.detailed_end
        processor.detailed_ns = opts.detailed_ns

    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
Esempio n. 31
0
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
            usage="usage: %prog [options] input_file dictionary output_file",
            option_class=SonetOption
        )
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-S', '--detailed-start', action="store",
        dest='detailed_start', type="yyyymmdd", metavar="YYYYMMDD",
        default=None, help="Detailed output start date")
    p.add_option('-E', '--detailed-end', action="store",
        dest='detailed_end', type="yyyymmdd", metavar="YYYYMMDD", default=None,
        help="Detailed output end date")
    p.add_option('-n', '--detailed-namespace', action="store",
                 dest="detailed_ns", default="Normal",
                 help="Namespace of desired detailed data (default: Normal)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags=('page,title,revision,timestamp,text,redirect,'
                              'contributor,username,ip'))
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    if os.path.exists(output):
        logging.error("File %s already exists!", output)
        sys.exit(0)

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean
    if opts.detailed_start and opts.detailed_end:
        print """
        You are going to run the script with detailed output on %d days.
        This is going to produce some CSV files on your disk, one for each
        day. Is this want you really want to do? [press enter to continue]
        """ % (opts.detailed_end - opts.detailed_start).days
        raw_input()
        processor.pywc.detailed = True
        processor.detailed_start = opts.detailed_start
        processor.detailed_end = opts.detailed_end
        processor.detailed_ns = opts.detailed_ns

    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
Esempio n. 32
0
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio",
        option_class=SonetOption
    )
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-E', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-d', '--delimiter', action="store", dest="delimiter",
                 default=",", help="CSV delimiter")
    p.add_option('-s', '--start', action="store", dest='start',
                 type="yyyymmdd", metavar="YYYYMMDD", default=None,
                 help="Look for revisions starting from this date")
    p.add_option('-e', '--end', action="store", dest='end', type="yyyymmdd",
                 metavar="YYYYMMDD", default=None,
                 help="Look for revisions until this date")

    opts, files = p.parse_args()
    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,' + \
                   'minor,timestamp,redirect,ip,username')

    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.start_date = opts.start
    processor.end_date = opts.end
    processor.set_desired_from_csv(desired_pages_fn,
                                   encoding=opts.encoding,
                                   delimiter=opts.delimiter)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()