def main():
    from bz2 import BZ2File
    from csv import DictWriter

    logging.basicConfig(  # filename="usercontributions_export.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    op = create_option_parser()
    args = op.parse_args()

    xml, out, threshold = args.dump, args.out, args.threshold

    lang, date_, _ = mwlib.explode_dump_filename(xml)
    deflate, _lineno = find_open_for_this_file(xml)
    date_ = yyyymmdd_to_datetime(date_, 1)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tmp = ["Normal"] + [v for _, v in mwlib.get_namespaces(src)]
    # fix for quartiles: one column per namespace per activity quartile
    namespaces = []
    for ns in tmp:
        for n in range(1, 5):
            namespaces.append("%s_%d" % (ns, n))
    print namespaces

    fout = BZ2File(out, 'w')

    fields = ['username', 'normal_edits', 'comments_count', 'comments_avg',
              'minor', 'revert', 'npov', 'welcome', 'please', 'thanks',
              'first_edit', 'last_edit', 'tot_edits', 'active_days',
              'days_since_first_edit', 'left_since', 'diversity_score',
              'first_edit_year', 'first_edit_month', 'first_edit_day',
              'last_edit_year', 'last_edit_month', 'last_edit_day']
    fields[2:2] = namespaces

    dw = DictWriter(fout, fields)
    dw.writeheader()

    ## to get only the first 1000 users:
    #from itertools import islice
    #data_iterator = islice(prepare_data(namespaces), 1000)
    data_iterator = prepare_data(namespaces, lang, date_, threshold)

    count = 0
    for user in data_iterator:
        for k, v in user.iteritems():
            if type(v) in [int, float]:
                assert v >= 0, "%s is negative" % (k,)
        dw.writerow(user)
        count += 1
        if not count % 5000:
            logging.info(count)
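# A minimal illustration of the quartile-column expansion performed above,
# using a hypothetical namespace list in place of mwlib.get_namespaces()
# output: each namespace becomes four CSV columns, one per activity quartile.
def _example_quartile_fields():
    tmp = ["Normal", "Talk", "User"]  # hypothetical namespace names
    namespaces = ["%s_%d" % (ns, n) for ns in tmp for n in range(1, 5)]
    # -> ['Normal_1', 'Normal_2', 'Normal_3', 'Normal_4', 'Talk_1', ...]
    return namespaces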
def main():
    logging.basicConfig(  # filename="usercontributions.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    receiver, sender = Pipe(duplex=False)

    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, _, _ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)
    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
                         tags='page,title,revision,timestamp,contributor,'
                              'username,ip,comment,id,minor')
    namespaces = [(0, "Normal")] + mwlib.get_namespaces(src)

    src.close()
    logging.info("BEGIN PARSING")
    src = deflate(xml)

    processor = UserContributionsPageProcessor(tag=tag, lang=lang)
    processor.sender = sender
    processor.namespaces = namespaces
    processor.time_end = opts.end
    ## TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'

    p = Process(target=use_contrib_dict,
                args=(receiver, processor.namespaces, lang))
    p.start()

    with Timr('PROCESSING'):
        processor.start(src)  ## PROCESSING

    sender.send(None)
    p.join()  ## wait until save is complete
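# A self-contained sketch (illustrative names only) of the producer/consumer
# pattern used in main() above: the parsing process pushes items through a
# one-way Pipe, and a None sentinel tells the consumer to stop, mirroring
# the sender.send(None) / p.join() shutdown sequence.
def _example_pipe_consumer(receiver):
    while True:
        item = receiver.recv()
        if item is None:  # sentinel: producer has finished
            break
        # ... accumulate per-user statistics here, as use_contrib_dict does ...

def _example_pipe_roundtrip():
    from multiprocessing import Pipe, Process
    receiver, sender = Pipe(duplex=False)
    p = Process(target=_example_pipe_consumer, args=(receiver,))
    p.start()
    for item in ['rev1', 'rev2', 'rev3']:  # stands in for parsed revisions
        sender.send(item)
    sender.send(None)  # signal end of stream
    p.join()           # wait until the consumer is done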
def main():
    from bz2 import BZ2File
    from csv import DictWriter

    logging.basicConfig(  # filename="usercontributions_export.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    xml, out = get_xml_file()

    deflate, _lineno = find_open_for_this_file(xml)
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    namespaces = [v for _, v in mwlib.get_namespaces(src)]

    fout = BZ2File(out, 'w')

    fields = ['username', 'normal_edits', 'comments_count', 'comments_avg',
              'minor', 'revert', 'npov', 'welcome', 'please', 'thanks',
              'first_edit', 'last_edit']
    fields[2:2] = namespaces

    dw = DictWriter(fout, fields)
    dw.writeheader()

    ## to get only the first 1000 users:
    #from itertools import islice
    #data_iterator = islice(prepare_data(namespaces), 1000)
    data_iterator = prepare_data(namespaces)

    count = 0
    for user in data_iterator:
        dw.writerow(user)
        count += 1
        if not count % 5000:
            logging.info(count)
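# Illustration of the slice assignment used above: fields[2:2] = namespaces
# splices the namespace columns into the CSV header right after the first
# two fixed columns. Values here are hypothetical.
def _example_field_splice():
    fields = ['username', 'normal_edits', 'first_edit']
    fields[2:2] = ['Talk', 'User']
    # fields is now ['username', 'normal_edits', 'Talk', 'User', 'first_edit']
    return fields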
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file dictionary output_file",
        option_class=SonetOption)
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout",
                 type=float, default=0.5,
                 help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-S', '--detailed-start', action="store",
                 dest='detailed_start', type="yyyymmdd", metavar="YYYYMMDD",
                 default=None, help="Detailed output start date")
    p.add_option('-E', '--detailed-end', action="store", dest='detailed_end',
                 type="yyyymmdd", metavar="YYYYMMDD", default=None,
                 help="Detailed output end date")
    p.add_option('-n', '--detailed-namespace', action="store",
                 dest="detailed_ns", default="Normal",
                 help="Namespace of desired detailed data (default: Normal)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags=('page,title,revision,timestamp,text,redirect,'
                              'contributor,username,ip'))
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    if os.path.exists(output):
        logging.error("File %s already exists!", output)
        sys.exit(0)

    out = open(output, 'w')

    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean
    if opts.detailed_start and opts.detailed_end:
        print """
        You are going to run the script with detailed output on %d days.
        This is going to produce some CSV files on your disk, one for
        each day. Is this what you really want to do?
        [press enter to continue]
        """ % (opts.detailed_end - opts.detailed_start).days
        raw_input()
        processor.pywc.detailed = True
        processor.detailed_start = opts.detailed_start
        processor.detailed_end = opts.detailed_end
        processor.detailed_ns = opts.detailed_ns

    with Timr('Processing'):
        processor.start(src)  # PROCESSING

    processor.flush()
    out.close()
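# Example invocation (script and file names are hypothetical; the parser
# above expects an input dump, a dictionary file and an output file, in
# that order, with -S/-E enabling per-day detailed CSV output):
#   python pywc.py -v -c -S 20100101 -E 20100201 \
#       itwiki-20100218-pages-meta-history.xml.bz2 lexicon.dic out.csv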
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file dictionary output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analyze (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout",
                 type=float, default=0.5,
                 help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-C', '--charlimit', action="store", dest="charlimit",
                 type="int", default=100000,
                 help="Maximum characters per line (default=100000)")
    p.add_option('-r', action="store_true", dest="regex", default=False,
                 help="Use a dictionary composed by regex (default=false)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    out = open(output, 'w')

    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean

    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    processor.flush()
    out.close()
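# A minimal sketch of how the -t/--type option above gates page selection:
# both page kinds are processed by default and each restricted mode disables
# the other one (attribute names mirror those set on PyWCProcessor; the
# True defaults are assumed).
def _example_type_gate(page_type):
    get_articles, get_talks = True, True
    if page_type == 'talk':
        get_articles = False
    elif page_type == 'content':
        get_talks = False
    return get_articles, get_talks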