Example 1
def main():
    import optparse
    import csv

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys, logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

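    # extract the wiki language from the dump filename and pick the
    # open() function able to read this (possibly compressed) dump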
    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

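    # read the list of desired pages, skipping '#'-commented lines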
    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [
            l[0].decode('latin-1') for l in csv.reader(f)
            if l and not l[0][0] == '#'
        ]

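    # when supported, read only the first 51 lines: enough to extract namespaces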
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,ip,username')

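    # reopen the dump from the beginning for the full parsing pass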
    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired(desired_pages)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()
Example 2
def save_graph(g, lang, type_, date_):

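    # each edge's weight is the number of timestamps (interactions) stored on it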
    counter = 0
    with Timr('Setting weight attribute on edges'):
        for e in g.es:
            e['weight'] = len(e['timestamp'])
            #e['timestamp'] = str(e['timestamp'])
            counter += 1
            if not counter % 10000:
                logging.debug(counter)

    with Timr('Pickling'):
        g.write("%swiki-%s%s.pickle" % (lang, date_, type_), format="pickle")
Example 3
def main():
    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(
        src,
        tags='page,title,revision,timestamp,contributor,username,ip,comment')

    translations = mwlib.get_translations(src)
    lang_user = unicode(translations['User'])
    lang_user_talk = unicode(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    print >> sys.stderr, "BEGIN PARSING"
    src = deflate(xml)

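    # the processor matches both the localised and the English "User talk" namespace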
    processor = HistoryPageProcessor(tag=tag,
                                     user_talk_names=(lang_user_talk,
                                                      u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'
    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    with Timr('EdgeCache.get_network()'):
        g = processor.get_network()

    print >> sys.stderr, "Nodes:", len(g.vs)
    print >> sys.stderr, "Edges:", len(g.es)

    for e in g.es:
        e['weight'] = len(e['timestamp'])
        #e['timestamp'] = str(e['timestamp'])
    with Timr('Pickling'):
        g.write("%swiki-%s%s.pickle" % (lang, date_, type_), format="pickle")
Example 4
def main():
    logging.basicConfig(  #filename="graph_longiudinal_analysis.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    op = create_option_parser()
    args = op.parse_args()

    ## explode dump filename in order to obtain wiki lang, dump date and type
    _, date_, _ = mwlib.explode_dump_filename(args.file_name)

    fn, start, tw = args.file_name, args.start, args.time_window
    ## if end argument is not specified, then use the dump date
    end = args.end if args.end else lib.yyyymmdd_to_datetime(date_)
    ## frequency not to be considered in case of cumulative analysis
    freq = args.frequency if (args.frequency and not args.cumulative) else tw

    if args.cumulative:
        logging.info("Cumulative longitudinal analysis chosen,"
                     "hence not considering following option: frequency")

    with Timr("RUNNING ANALYSIS"):
        if args.cumulative:
            cumulative_analysis(fn, start, end, freq)
        else:
            time_slice_analysis(fn, start, end, freq, tw)
Example 5
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file gender_file output_file")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    p.add_option('-e',
                 '--min-edits',
                 default=0,
                 dest="min_edits",
                 metavar="MIN_EDITS",
                 type=int,
                 help="pages with less than MIN_EIDTS edits "
                 "are skipped (default: %(default)s)")

    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    gender_data = files[1]
    output = files[2]

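    # check that the given dump is supported before processing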
    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src,
                   tags="page,redirect,timestamp,ip,"
                   "contributor,title,username")
    src.close()
    src = deflate(xml)

    out = open(output, "w")
    processor = GenderPageProcessor(tag=tag,
                                    lang=lang,
                                    output=out,
                                    userns=translation['User'],
                                    gender_data=gender_data,
                                    min_edits=opts.min_edits)
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
Example 6
def process_page(self, _):
    self.count += 1
    if not self.count % 1000:
        logging.info(' ### Processed %d pages', self.count)
    self.delattr("text")
    if not self._skip:
        with Timr('Flushing %s' % self._title):
            self.flush()
    self._skip = False
Example 7
def graph_loader(file_name):
    """
    Loads a sonet.graph object from a pickle/graphml/... file
    """
    try:
        with Timr("GRAPH LOADING"):
            return sg.load(file_name)
    except IOError:
        logging.exception("unable to load a graph from passed file: %s",
                          file_name)
        sys.exit()
Example 8
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dic_file input_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-p', action="store_true", dest="percentage", default=False,
                 help="Output results as percentages (like LIWC) "
                      "(default=false)")
    p.add_option('-c', '--charlimit', action="store", dest="charlimit",
                 type="int", default=100000,
                 help="Maximim characters per line (default=100000)")
    p.add_option('-i', '--ignorecols', action="store", dest="ignorecols",
                 help="Coulmns numbers of the source file to ignore" + \
                      "(comma separated and starting from 0)")
    p.add_option('-I', '--id', action="store", dest="id_col", type="int",
                 help="Id column number (starting from 0)", default=0)
    p.add_option('-r', action="store_true", dest="regex", default=False,
                 help="Use a dictionary composed by regex (default=false)")
    p.add_option('-f', "--flush", action="store", dest="flush", type="int",
                 default=100,
                 help="Flushing to output every N pieces of text")
    p.add_option("--clean", action="store_true", dest="clean",
                 default=False, help="Clean text from wiki syntax/HTML")
    p.add_option('-o', "--output", action="store", dest="output",
                 help="Output file (default=STDOUT)")
    opts, files = p.parse_args()
    if len(files) != 2:
        p.error("Wrong parameters")

    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

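    # configure the PyWC word counter from the command-line options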
    t = PyWC()
    t.max_char_limit = opts.charlimit
    t.clean_wiki = t.clean_html = opts.clean
    if opts.ignorecols:
        t.ignorecols = [int(x) for x in opts.ignorecols.split(",")]
    t.id_col = opts.id_col
    t.dic_regex = opts.regex
    t.flush_n = opts.flush
    if opts.output is not None:
        t.csv_out = open(opts.output, 'w')
    t.percentage = opts.percentage

    t.set_dic(files[0])
    src = open(files[1], 'r')
    with Timr("Processing"):
        t.start(src)

    t.flush()
Example 9
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 4:
        p.error("Give me a file, please ;-)")

    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,text')

    src.close()
    src = deflate(xml)

    processor = HistoryWordsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn)
    processor.words = desired_words

    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
Example 10
def main():
    logging.basicConfig(  #filename="usercontributions.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    receiver, sender = Pipe(duplex=False)

    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, _, _ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
        tags='page,title,revision,timestamp,contributor,username,ip'+ \
             ',comment,id,minor')

    namespaces = [(0, "Normal")] + mwlib.get_namespaces(src)

    src.close()
    logging.info("BEGIN PARSING")
    src = deflate(xml)

    processor = UserContributionsPageProcessor(tag=tag, lang=lang)
    processor.sender = sender
    processor.namespaces = namespaces
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'

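    # a separate process receives the per-user contributions from the pipe and saves them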
    p = Process(target=use_contrib_dict,
                args=(receiver, processor.namespaces, lang))
    p.start()

    with Timr('PROCESSING'):
        processor.start(src)  ## PROCESSING

    sender.send(None)
    p.join()  ## wait until save is complete
Example 11
def main():

    logging.basicConfig(#filename="random_page_extractor.log",
                                stream=sys.stderr,
                                level=logging.DEBUG)

    op = create_option_parser()
    args = op.parse_args()

    lang, date_, type_ = explode_dump_filename(args.xml_fn)
    deflate, _lineno = lib.find_open_for_this_file(args.xml_fn)

    dumps_checker(args, type_)

    logging.info('---------------------START---------------------')

    if _lineno:
        src = deflate(args.xml_fn, 51)
    else:
        src = deflate(args.xml_fn)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,redirect,text,username,ip,timestamp')

    src.close()
    src = deflate(args.xml_fn)

    output = open(args.output, 'w') if args.output else None

    processor = HistoryRevisionsPageProcessor(
                    tag=tag,
                    lang=lang,
                    output=output,
                    threshold=args.ratio,
                    min_text=args.min_text_length,
                    min_revisions=args.revisions_number,
                    n_users=args.editors_number,
                    start_revision=args.initial_revision)

    processor.talkns = translation['Talk']
    processor.desired_page_type = args.type
    processor.set_desired_from_csv(desired_pages_fn, encoding=args.encoding)
    with Timr('processing'):
        processor.start(src)
Example 12
def cumulative_analysis(fn, start, end, freq):

    logging.info("running cumulative analysis")

    graph = graph_loader(fn)  ## loading graph

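    # one analysis step every `freq` days: the start stays fixed while the end
    # moves back, so each step covers a progressively shorter (cumulative) span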
    freq_range = int(ceil(((end - start).days + 1) / float(freq)))
    for d in range(freq_range):
        s, e = start, end - timedelta(d * freq)

        if e <= s: continue

        ## processing
        if s != start or e != end:
            with Timr("SUBGRAPHING"):
                process_graph(graph, s, e)

        ## printing stats
        print_graph_stats(graph.g)
    del graph

    return
Example 13
def main():
    op = create_option_parser()

    (options, args) = op.parse_args()

    if len(args) != 1:
        print "Insert one (and only one) file to process\n"
        op.print_help()
        sys.exit(2)

    fn = args[0]
    lang, date, type_ = mwlib.explode_dump_filename(fn)

    g = sg.load(fn)
    g.time_slice_subgraph(start=options.start, end=options.end)
    g.invert_edge_attr('weight', 'length')

    vn = len(g.g.vs) # number of vertexes
    en = len(g.g.es) # number of edges

    timr = Timr()

    if options.as_table:
        tablr = Tablr()
        tablr.start(1024*32, lang)

    if options.group or options.users_role or options.histogram:
        for group_name, group_attr in groups.iteritems():
            g.defineClass(group_name, group_attr)
            print ' * %s : nodes number : %d' % (group_name,
                                                 len(g.classes[group_name]))
    else:
        g.defineClass('all', {})

    print " * filename: %s" % (fn,)
    print " * lang: %s" % (lang,)
    print " * date: %s" % (date,)

    if options.details:
        with Timr("details"):
            print " * nodes number: %d" % (vn,)
            print " * edges number: %d" % (en,)

            nodes_with_outdegree = len(g.g.vs.select(_outdegree_ge=1))
            nodes_with_indegree = len(g.g.vs.select(_indegree_ge=1))
            self_loop_edges = len([edge for edge in g.g.es \
                                   if edge.target == edge.source])

            print " * nodes with out edges number: %d (%6f%%)" % (
                nodes_with_outdegree, 100.*nodes_with_outdegree/vn)
            print " * nodes with in edges number: %d (%6f%%)" % (
                nodes_with_indegree, 100.*nodes_with_indegree/vn)
            print " * max weights on edges : %s" % top(g.g.es['weight'])

            print " * self-loop edges: %d" % self_loop_edges
            #print " * diameter : %6f" % g.g.diameter(weights='length')
            #print " * average weight : %6f" % numpy.average(g.g.es['weight'])


    if options.density or options.reciprocity:
        with Timr('density&reciprocity'):
            for cls, vs in g.classes.iteritems():
                if not len(vs) > 1:
                    continue

                subgraph = vs.subgraph()

                print " * %s : density : %.10f" % (cls, subgraph.density())
                print " * %s : reciprocity : %.10f" % (cls,
                                                       subgraph.reciprocity())


    if options.degree:
        with Timr('degree'):
            g.g.vs['indegree'] = g.g.degree(type=ig.IN)
            g.g.vs['outdegree'] = g.g.degree(type=ig.OUT)

            for cls, vs in g.classes.iteritems():
                if not vs:
                    continue

                ind = numpy.array(vs['indegree'])
                outd = numpy.array(vs['outdegree'])

                print " * %s : mean IN degree (no weights): %f" % (
                    cls, numpy.average(ind))
                print " * %s : mean OUT degree (no weights): %f" % (
                    cls, numpy.average(outd))
                print " * %s : max IN degrees (no weights): %s" % (cls,
                                                                   top(ind))
                print " * %s : max OUT degrees (no weights): %s" % (cls,
                                                                    top(outd))

                print " * %s : stddev IN degree (no weights): %f" % (
                    cls, numpy.sqrt(numpy.var(ind)))
                print " * %s : stddev OUT degree (no weights): %f" % (
                    cls, numpy.sqrt(numpy.var(outd)))

    if options.transitivity:
        ##print " * transitivity: %f" % (nx.transitivity(g), )
        pass

    if options.summary:
        # don't use with --as-table
        print " * summary: %s" % (g.g.summary(), )

    if options.distance:
        with Timr('split clusters'):
            vc = g.g.clusters()
            size_clusters = vc.sizes()
            giant = vc.giant()

            print " * length of 5 max clusters: %s" % top(size_clusters)
            #print " * #node in 5 max clusters/#all nodes: %s" % top(
            #    [1.*cluster_len/vn for cluster_len in size_clusters])


    if options.distance:
        with Timr('distance'):
            gg = sg.Graph(giant)
            print " * average distance in the giant component: %f" % \
                  gg.averageDistance(weight='length')
            print " * average hops in the giant component: %f" % \
                  gg.averageDistance()

            #print "Average distance 2: %f" % giant.average_path_length(True,
            #                                                           False)


    if options.efficiency:
        with Timr('efficiency'):
            print " * efficiency: %f" % g.efficiency(weight='length')


    ##TODO: compute for centrality only if "all" or "degree"
    if (options.plot or options.histogram or options.power_law or
        options.centrality):
        with Timr('set weighted indegree'):
            g.set_weighted_degree()


    if options.centrality:
        timr.start('centrality')
        centralities = options.centrality.split(',')
        if 'all' in centralities:
            centralities = 'betweenness,pagerank,degree'.split(',')

        if set(centralities).difference(
            'betweenness,pagerank,degree'.split(',')):
            logging.error('Unknown centrality')
            sys.exit(0)

        if "betweenness" in centralities:
            print >> sys.stderr, "betweenness"
            g.g.vs['bw'] = g.g.betweenness(weights='length', directed = True)

        #g.g.vs['ev'] = g.g.evcent(weights='weight') # eigenvector centrality

        if 'pagerank' in centralities:
            print >> sys.stderr, "pagerank"
            g.g.vs['pr'] = g.g.pagerank(weights='weight') # pagerank

        if 'degree' in centralities:
            print >> sys.stderr, "outdegree"
            g.set_weighted_degree(type=ig.OUT)
        #total_weights = sum(g.g.es['weight'])
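        # maximum possible number of directed edges, used below to normalise betweenness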
        max_edges = vn*(vn-1)

        for cls, vs in g.classes.iteritems():
            if not vs:
                continue

            if "betweenness" in centralities:
                norm_betweenness = numpy.array(g.classes[cls]['bw'])/max_edges
                print " * %s : average betweenness : %.10f" % (
                    cls, numpy.average(norm_betweenness))
                print " * %s : stddev betweenness : %.10f" % (
                    cls, numpy.sqrt(numpy.var(norm_betweenness)))
                print " * %s : max betweenness: %s" % (
                    cls, top(numpy.array(g.classes[cls]['bw'])/max_edges))

            #print " * Average eigenvector centrality : %6f" % numpy.average(
            #    g.vs['ev'])
            if 'pagerank' in centralities:
                print " * %s : average pagerank : %.10f" % (
                    cls, numpy.average(g.classes[cls]['pr']))
                print " * %s : stddev pagerank : %.10f" % (
                    cls, numpy.sqrt(numpy.var(g.classes[cls]['pr'])))
                print " * %s : max pagerank: %s" % (
                    cls, top(g.classes[cls]['pr']))

            if 'degree' in centralities:
                wi = g.classes[cls]['weighted_indegree']
                print " * %s : average IN degree centrality (weighted): %.10f" % (
                    cls, numpy.average(wi))
                print " * %s : stddev IN degree centrality (weighted): %.10f" % (
                    cls, numpy.sqrt(numpy.var(wi)))
                print " * %s : max IN degrees centrality (weighted): %s" % (
                    cls, top(wi))
                del wi

                wo = g.classes[cls]['weighted_outdegree']
                print " * %s : average OUT degree centrality (weighted) : %.10f" %\
                      (cls, numpy.average(wo))
                print " * %s : stddev OUT degree centrality (weighted) : %.10f" % \
                      (cls, numpy.sqrt(numpy.var(wo)))
                print " * %s : max OUT degrees centrality (weighted): %s" % (
                    cls, top(wo))
                del wo

        timr.stop('centrality')

    if options.power_law:
        with Timr('power law'):
            for cls, vs in g.classes.iteritems():
                if not vs:
                    continue

                indegrees = vs['weighted_indegree']

                try:
                    alpha_exp = ig.statistics.power_law_fit(indegrees, xmin=6)
                    print " * %s : alpha exp IN degree distribution : %10f " %\
                          (cls, alpha_exp)
                except ValueError:
                    print >> sys.stderr,\
                          " * %s : alpha exp IN degree distribution : ERROR" %\
                          (cls,)

    if options.histogram:
        list_with_index = lambda degrees, idx: [(degree, idx) for degree
                                                in degrees if degree]
        all_list = []

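        # pair each nonzero weighted indegree with a group index:
        # 1=no group, 2=sysop, 3=bureaucrat, 4=steward, 5=founder, 6=bot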
        nogrp_indegrees = g.g.vs.select(sysop_ne=True, bureaucrat_ne=True,
                                        steward_ne=True, founder_ne=True,
                                        bot_ne=True)['weighted_indegree']
        all_list += list_with_index(nogrp_indegrees, 1)

        sysops_indegrees = g.classes['sysop']['weighted_indegree']
        all_list += list_with_index(sysops_indegrees, 2)

        burs_indegrees = g.classes['bureaucrat']['weighted_indegree']
        all_list += list_with_index(burs_indegrees, 3)

        stewards_indegrees = g.classes['steward']['weighted_indegree']
        all_list += list_with_index(stewards_indegrees, 4)

        founders_indegrees = g.classes['founder']['weighted_indegree']
        all_list += list_with_index(founders_indegrees, 5)

        bots_indegrees = g.classes['bot']['weighted_indegree']
        all_list += list_with_index(bots_indegrees, 6)

        if options.gnuplot:
            f = open('hist.dat', 'w')
        else:
            f = open('%swiki-%s-hist.dat' % (lang, date), 'w')

        all_list.sort(reverse=True)

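        # write one row per value: the indegree in its group's column, zeros elsewhere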
        for indegree, grp in all_list:
            for _ in range(grp - 1):
                print >> f, 0,
            print >> f, indegree,
            for _ in range(grp, 6):
                print >> f, 0,
            print >> f, ""
        f.close()

    if options.gnuplot:
        from popen2 import Popen3

        process = Popen3('gnuplot hist.gnuplot')
        process.wait()

        os.rename('hist.png', '%swiki-%s-hist.png' % (lang, date))
        os.rename('hist.dat', '%swiki-%s-hist.dat' % (lang, date))

    if options.plot:
        ## TODO: evaluate if this can be done with
        ## http://bazaar.launchpad.net/~igraph/igraph/0.6-main/revision/2018
        with Timr('plot'):
            import math

            ## filter:
            #print len(g.g.vs), len(g.g.es)
            #g.set_weighted_degree(type=ig.OUT)
            #g.g = g.g.subgraph(g.g.vs.select(weighted_indegree_ge=10,
            #                           weighted_outdegree_ge=1))
            #g.g.write_graphml('itwiki-20100729-stub-meta-history_in10_out1.graphml')
            #print len(g.g.vs), len(g.g.es)

            bots = g.g.vs.select(bot=True)
            bots['color'] = ('purple',)*len(bots)
            logging.debug('bots: ok')

            anonyms = g.g.vs.select(anonymous=True)
            anonyms['color'] = ('blue',)*len(anonyms)

            sysops = g.g.vs.select(sysop=True)
            sysops['color'] = ('yellow',)*len(sysops)

            bur_sysops = g.g.vs.select(bureaucrat=True, sysop=True)
            bur_sysops['color'] = ('orange',)*len(bur_sysops)

            g.g.vs['size'] = [math.sqrt(v['weighted_indegree']+1)*10 for v
                              in g.g.vs]

            logging.debug('plot: begin')
            ig.plot(g.g, target=lang+"_general.png", bbox=(0, 0, 8000, 8000),
                    edge_color='grey', layout='drl')
            logging.debug('plot: end')
            weights = g.g.es['weight']
            max_weight = max(weights)

            g.g.es['color'] = [(255.*e['weight']/max_weight, 0., 0.) for e
                               in g.g.es]
            g.g.es['width'] = weights

            ig.plot(g.g, target=lang+"_weighted_edges.png", bbox=(0, 0, 4000,
                                                                  2400),
                    layout='fr', vertex_label=' ')


    if options.as_table:
        tablr.stop()

        #tablr.printHeader()
        #tablr.printData()
        tablr.saveInDjangoModel()


    if options.adjacency:
        giant = g.g.clusters().giant()
        #destAdj = "%s/%swiki-%s-adj.csv" % (os.path.split(fn)[0], lang, date)
        destAdj = "%swiki-%s-adj.csv" % (lang, date)
        #destRec = "%s/%swiki-%s-rec.csv" % (os.path.split(fn)[0], lang, date)
        destRec = "%swiki-%s-rec.csv" % (lang, date)
        sg.Graph(giant).writeAdjacencyMatrix(destAdj, 'username')
        sg.Graph(giant).writeReciprocityMatrix('username', destRec)


    if options.users_role:
        l = g.get_user_class('username', ('anonymous', 'bot', 'bureaucrat',
                                        'sysop'))

        #destUR = "%s/%swiki-%s-ur.csv" % (os.path.split(fn)[0], lang, date)
        destUR = "%swiki-%s-ur.csv" % (lang, date)
        with open(destUR, 'w') as f:
            for username, role in sorted(l):
                print >> f, "%s,%s" % (username, role)

        from random import shuffle
        #destCls = "%s/%swiki-%s-%%s.csv" % (os.path.split(fn)[0], lang, date)
        destCls = "%swiki-%s-%%s.csv" % (lang, date)
        for cls in ('anonymous', 'bot', 'bureaucrat', 'sysop', 'normal_user'):
            users = g.classes[cls]['username']
            shuffle(users)
            with open(destCls % cls, 'w') as f:
                for username in users:
                    print >> f, \
                          ("%s,http://vec.wikipedia.org/w/index.php?title="+\
                          "Discussion_utente:%s&action=history&offset="+\
                          "20100000000001") % (username, username)
Example 14
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file geoip_db output_file")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    p.add_option('-p',
                 '--per-page',
                 action="store",
                 dest="per_page_stats",
                 help="Per page stats output")
    p.add_option('-e',
                 '--min-edits',
                 action="store",
                 type=int,
                 dest="min_edits",
                 help="Skip if page has less than min-edit edits")
    p.add_option('-a',
                 '--min-anon',
                 action="store",
                 type=int,
                 dest="min_anon",
                 help="Skip if page has less than min-anon anonymous edits")
    p.add_option('-E',
                 '--exclude',
                 action="store",
                 dest="exclude_countries",
                 help="Countries to exclude, colon (;) separated")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    geoip_db = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,redirect,timestamp,ip,revision,title')
    src.close()
    src = deflate(xml)

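    # map editors' IP addresses to countries through the GeoIP database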
    processor = CountriesPageProcessor(tag=tag,
                                       lang=lang,
                                       output=output,
                                       userns=translation['User'],
                                       geoip=geoip_db)
    if opts.per_page_stats:
        processor.per_page_stats = opts.per_page_stats
    if opts.exclude_countries:
        processor.exclude_countries = opts.exclude_countries.split(";")
    processor.min_edits = opts.min_edits
    processor.min_anon = opts.min_anon
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
Example 15
def main():
    import optparse

    p = optparse.OptionParser(usage="usage: %prog file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-s', action="store", dest="signature", default=None,
                 help="Signature in this language (e.g. sig, firma..)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    try:
        xml = files[0]
    except IndexError:
        p.error("Give me one file, please")

    en_user, en_user_talk = u"User", u"User talk"

    lang, date, type_ = mwlib.explode_dump_filename(xml)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)

    ns_translation = mwlib.get_translations(src)
    lang_user, lang_user_talk = ns_translation['User'], \
             ns_translation['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    lang_user = unicode(lang_user, "utf8")
    en_user = unicode(en_user)

    # open dump with an external process to use multiple cores
    _fast = True
    if _fast:
        src.close()
        src = lib.BZ2FileExt(xml)

    if opts.signature is not None:
        processor = CurrentPageProcessor(ecache=EdgeCache(), tag=tag,
                              user_talk_names=(lang_user_talk, en_user_talk),
                              search=(lang_user, en_user), lang=lang,
                              signature=opts.signature)
    else:
        processor = CurrentPageProcessor(ecache=EdgeCache(), tag=tag,
                              user_talk_names=(lang_user_talk, en_user_talk),
                              search=(lang_user, en_user), lang=lang)

    with Timr('Processing'):
        processor.start(src)

    with Timr('Create network'):
        g = processor.ecache.get_network()

    logging.info("Len:", len(g.vs))
    logging.info("Edges:", len(g.es))

    g.write("%swiki-%s%s.pickle" % (lang, date, type_), format="pickle")
Example 16
def main():
    opts, args = opt_parse()
    xml = args[0]
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('---------------------START---------------------')

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

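    # per-language prefixes of the "welcome" message (used as welcome_pattern below)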
    welcome = defaultdict(str)

    welcome.update({'it': r'Benvenut', 'en': r'Welcome'})

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
                         tags='page,title,revision,timestamp,contributor,'
                         'username,ip,comment,id')

    translations = mwlib.get_translations(src)

    try:
        lang_user = unicode(translations['User'])
        lang_user_talk = unicode(translations['User talk'])
    except UnicodeDecodeError:
        lang_user = smart_str(translations['User'])
        lang_user_talk = smart_str(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    src = deflate(xml)

    processor = HistoryPageProcessor(tag=tag,
                                     user_talk_names=(lang_user_talk,
                                                      u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    with Timr('Getting network'):
        g = processor.get_network()

    logging.info("Nodes: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    with Timr('Saving graph'):
        save_graph(g, lang, type_, date_)
Example 17
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t',
                 '--type',
                 action="store",
                 dest="type",
                 default="all",
                 help="Type of page to analize (content|talk|all)")
    p.add_option('-e',
                 '--encoding',
                 action="store",
                 dest="encoding",
                 default="latin-1",
                 help="encoding of the desired_list file")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T',
                 "--timeout",
                 action="store",
                 dest="timeout",
                 type=float,
                 default=0.5,
                 help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c',
                 '--clean',
                 action="store_true",
                 dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    desired_pages_fn = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    src.close()
    src = deflate(xml)

    out = open(output, 'w')
    processor = HistoryRevisionsPageProcessor(tag=tag,
                                              lang=lang,
                                              output=out,
                                              userns=translation['User'])
    processor.talkns = translation['Talk']
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
Example 18
def main():
    op = create_option_parser()

    (options, args) = op.parse_args()

    if len(args) != 1:
        print "Insert one (and only one) file to process\n"
        op.print_help()
        sys.exit(2)

    fn = args[0]
    lang, date, type_ = mwlib.explode_dump_filename(fn)

    g = sg.load(fn)
    g.time_slice_subgraph(start=options.start, end=options.end)
    g.invert_edge_attr('weight', 'length')

    vn = len(g.g.vs)  # number of vertexes
    en = len(g.g.es)  # number of edges

    timr = Timr()

    if options.as_table:
        tablr = Tablr()
        tablr.start(1024 * 32, lang)

    if options.group or options.users_role or options.histogram:
        for group_name, group_attr in groups.iteritems():
            g.defineClass(group_name, group_attr)
            print ' * %s : nodes number : %d' % (group_name,
                                                 len(g.classes[group_name]))
    else:
        g.defineClass('all', {})

    print " * filename: %s" % (fn, )
    print " * lang: %s" % (lang, )
    print " * date: %s" % (date, )

    if options.details:
        with Timr("details"):
            print " * nodes number: %d" % (vn, )
            print " * edges number: %d" % (en, )

            nodes_with_outdegree = len(g.g.vs.select(_outdegree_ge=1))
            nodes_with_indegree = len(g.g.vs.select(_indegree_ge=1))
            self_loop_edges = len([edge for edge in g.g.es \
                                   if edge.target == edge.source])

            print " * nodes with out edges number: %d (%6f%%)" % (
                nodes_with_outdegree, 100. * nodes_with_outdegree / vn)
            print " * nodes with in edges number: %d (%6f%%)" % (
                nodes_with_indegree, 100. * nodes_with_indegree / vn)
            print " * max weights on edges : %s" % top(g.g.es['weight'])

            print " * self-loop edges: %d" % self_loop_edges
            #print " * diameter : %6f" % g.g.diameter(weights='length')
            #print " * average weight : %6f" % numpy.average(g.g.es['weight'])

    if options.density or options.reciprocity:
        with Timr('density&reciprocity'):
            for cls, vs in g.classes.iteritems():
                if not len(vs) > 1:
                    continue

                subgraph = vs.subgraph()

                print " * %s : density : %.10f" % (cls, subgraph.density())
                print " * %s : reciprocity : %.10f" % (cls,
                                                       subgraph.reciprocity())

    if options.degree:
        with Timr('degree'):
            g.g.vs['indegree'] = g.g.degree(type=ig.IN)
            g.g.vs['outdegree'] = g.g.degree(type=ig.OUT)

            for cls, vs in g.classes.iteritems():
                if not vs:
                    continue

                ind = numpy.array(vs['indegree'])
                outd = numpy.array(vs['outdegree'])

                print " * %s : mean IN degree (no weights): %f" % (
                    cls, numpy.average(ind))
                print " * %s : mean OUT degree (no weights): %f" % (
                    cls, numpy.average(outd))
                print " * %s : max IN degrees (no weights): %s" % (cls,
                                                                   top(ind))
                print " * %s : max OUT degrees (no weights): %s" % (cls,
                                                                    top(outd))

                print " * %s : stddev IN degree (no weights): %f" % (
                    cls, numpy.sqrt(numpy.var(ind)))
                print " * %s : stddev OUT degree (no weights): %f" % (
                    cls, numpy.sqrt(numpy.var(outd)))

    if options.transitivity:
        ##print " * transitivity: %f" % (nx.transitivity(g), )
        pass

    if options.summary:
        # don't use with --as-table
        print " * summary: %s" % (g.g.summary(), )

    if options.distance:
        with Timr('split clusters'):
            vc = g.g.clusters()
            size_clusters = vc.sizes()
            giant = vc.giant()

            print " * length of 5 max clusters: %s" % top(size_clusters)
            #print " * #node in 5 max clusters/#all nodes: %s" % top(
            #    [1.*cluster_len/vn for cluster_len in size_clusters])

    if options.distance:
        with Timr('distance'):
            gg = sg.Graph(giant)
            print " * average distance in the giant component: %f" % \
                  gg.averageDistance(weight='length')
            print " * average hops in the giant component: %f" % \
                  gg.averageDistance()

            #print "Average distance 2: %f" % giant.average_path_length(True,
            #                                                           False)

    if options.efficiency:
        with Timr('efficiency'):
            print " * efficiency: %f" % g.efficiency(weight='length')

    ##TODO: compute for centrality only if "all" or "degree"
    if (options.plot or options.histogram or options.power_law
            or options.centrality):
        with Timr('set weighted indegree'):
            g.set_weighted_degree()

    if options.centrality:
        timr.start('centrality')
        centralities = options.centrality.split(',')
        if 'all' in centralities:
            centralities = 'betweenness,pagerank,degree'.split(',')

        if set(centralities).difference(
                'betweenness,pagerank,degree'.split(',')):
            logging.error('Unknown centrality')
            sys.exit(0)

        if "betweenness" in centralities:
            print >> sys.stderr, "betweenness"
            g.g.vs['bw'] = g.g.betweenness(weights='length', directed=True)

        #g.g.vs['ev'] = g.g.evcent(weights='weight') # eigenvector centrality

        if 'pagerank' in centralities:
            print >> sys.stderr, "pagerank"
            g.g.vs['pr'] = g.g.pagerank(weights='weight')  # pagerank

        if 'degree' in centralities:
            print >> sys.stderr, "outdegree"
            g.set_weighted_degree(type=ig.OUT)
        #total_weights = sum(g.g.es['weight'])
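        # maximum possible number of directed edges, used below to normalise betweenness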
        max_edges = vn * (vn - 1)

        for cls, vs in g.classes.iteritems():
            if not vs:
                continue

            if "betweenness" in centralities:
                norm_betweenness = numpy.array(g.classes[cls]['bw']) \
                                   / max_edges
                print " * %s : average betweenness : %.10f" % (
                    cls, numpy.average(norm_betweenness))
                print " * %s : stddev betweenness : %.10f" % (
                    cls, numpy.sqrt(numpy.var(norm_betweenness)))
                print " * %s : max betweenness: %s" % (
                    cls, top(numpy.array(g.classes[cls]['bw']) / max_edges))

            #print " * Average eigenvector centrality : %6f" % numpy.average(
            #    g.vs['ev'])
            if 'pagerank' in centralities:
                print " * %s : average pagerank : %.10f" % (
                    cls, numpy.average(g.classes[cls]['pr']))
                print " * %s : stddev pagerank : %.10f" % (
                    cls, numpy.sqrt(numpy.var(g.classes[cls]['pr'])))
                print " * %s : max pagerank: %s" % (cls,
                                                    top(g.classes[cls]['pr']))

            if 'degree' in centralities:
                wi = g.classes[cls]['weighted_indegree']
                print " * %s : average IN degree centrality (weighted): %.10f" % (
                    cls, numpy.average(wi))
                print " * %s : stddev IN degree centrality (weighted): %.10f" % (
                    cls, numpy.sqrt(numpy.var(wi)))
                print " * %s : max IN degrees centrality (weighted): %s" % (
                    cls, top(wi))
                del wi

                wo = g.classes[cls]['weighted_outdegree']
                print " * %s : average OUT degree centrality (weighted) : %.10f" %\
                      (cls, numpy.average(wo))
                print " * %s : stddev OUT degree centrality (weighted) : %.10f" % \
                      (cls, numpy.sqrt(numpy.var(wo)))
                print " * %s : max OUT degrees centrality (weighted): %s" % (
                    cls, top(wo))
                del wo

        timr.stop('centrality')

    if options.power_law:
        with Timr('power law'):
            for cls, vs in g.classes.iteritems():
                if not vs:
                    continue

                indegrees = vs['weighted_indegree']

                try:
                    alpha_exp = ig.statistics.power_law_fit(indegrees, xmin=6)
                    print " * %s : alpha exp IN degree distribution : %10f " %\
                          (cls, alpha_exp)
                except ValueError:
                    print >> sys.stderr,\
                          " * %s : alpha exp IN degree distribution : ERROR" %\
                          (cls,)

    if options.histogram:
        list_with_index = lambda degrees, idx: [(degree, idx)
                                                for degree in degrees
                                                if degree]
        all_list = []

        nogrp_indegrees = g.g.vs.select(sysop_ne=True,
                                        bureaucrat_ne=True,
                                        steward_ne=True,
                                        founder_ne=True,
                                        bot_ne=True)['weighted_indegree']
        all_list += list_with_index(nogrp_indegrees, 1)

        sysops_indegrees = g.classes['sysop']['weighted_indegree']
        all_list += list_with_index(sysops_indegrees, 2)

        burs_indegrees = g.classes['bureaucrat']['weighted_indegree']
        all_list += list_with_index(burs_indegrees, 3)

        stewards_indegrees = g.classes['steward']['weighted_indegree']
        all_list += list_with_index(stewards_indegrees, 4)

        founders_indegrees = g.classes['founder']['weighted_indegree']
        all_list += list_with_index(founders_indegrees, 5)

        bots_indegrees = g.classes['bot']['weighted_indegree']
        all_list += list_with_index(bots_indegrees, 6)

        if options.gnuplot:
            f = open('hist.dat', 'w')
        else:
            f = open('%swiki-%s-hist.dat' % (lang, date), 'w')

        all_list.sort(reverse=True)

        for indegree, grp in all_list:
            for _ in range(grp - 1):
                print >> f, 0,
            print >> f, indegree,
            for _ in range(grp, 6):
                print >> f, 0,
            print >> f, ""
        f.close()

    if options.gnuplot:
        from popen2 import Popen3

        process = Popen3('gnuplot hist.gnuplot')
        process.wait()

        os.rename('hist.png', '%swiki-%s-hist.png' % (lang, date))
        os.rename('hist.dat', '%swiki-%s-hist.dat' % (lang, date))

    if options.plot:
        ## TODO: evaluate if this can be done with
        ## http://bazaar.launchpad.net/~igraph/igraph/0.6-main/revision/2018
        with Timr('plot'):
            import math

            ## filter:
            #print len(g.g.vs), len(g.g.es)
            #g.set_weighted_degree(type=ig.OUT)
            #g.g = g.g.subgraph(g.g.vs.select(weighted_indegree_ge=10,
            #                           weighted_outdegree_ge=1))
            #g.g.write_graphml('itwiki-20100729-stub-meta-history_in10_out1.graphml')
            #print len(g.g.vs), len(g.g.es)

            bots = g.g.vs.select(bot=True)
            bots['color'] = ('purple', ) * len(bots)
            logging.debug('bots: ok')

            anonyms = g.g.vs.select(anonymous=True)
            anonyms['color'] = ('blue', ) * len(anonyms)

            sysops = g.g.vs.select(sysop=True)
            sysops['color'] = ('yellow', ) * len(sysops)

            bur_sysops = g.g.vs.select(bureaucrat=True, sysop=True)
            bur_sysops['color'] = ('orange', ) * len(bur_sysops)

            g.g.vs['size'] = [
                math.sqrt(v['weighted_indegree'] + 1) * 10 for v in g.g.vs
            ]

            logging.debug('plot: begin')
            ig.plot(g.g,
                    target=lang + "_general.png",
                    bbox=(0, 0, 8000, 8000),
                    edge_color='grey',
                    layout='drl')
            logging.debug('plot: end')
            weights = g.g.es['weight']
            max_weight = max(weights)

            g.g.es['color'] = [(255. * e['weight'] / max_weight, 0., 0.)
                               for e in g.g.es]
            g.g.es['width'] = weights

            ig.plot(g.g,
                    target=lang + "_weighted_edges.png",
                    bbox=(0, 0, 4000, 2400),
                    layout='fr',
                    vertex_label=' ')

    if options.as_table:
        tablr.stop()

        #tablr.printHeader()
        #tablr.printData()
        tablr.saveInDjangoModel()

    if options.adjacency:
        giant = g.g.clusters().giant()
        #destAdj = "%s/%swiki-%s-adj.csv" % (os.path.split(fn)[0], lang, date)
        destAdj = "%swiki-%s-adj.csv" % (lang, date)
        #destRec = "%s/%swiki-%s-rec.csv" % (os.path.split(fn)[0], lang, date)
        destRec = "%swiki-%s-rec.csv" % (lang, date)
        sg.Graph(giant).writeAdjacencyMatrix(destAdj, 'username')
        sg.Graph(giant).writeReciprocityMatrix('username', destRec)

    if options.users_role:
        l = g.get_user_class('username',
                             ('anonymous', 'bot', 'bureaucrat', 'sysop'))

        #destUR = "%s/%swiki-%s-ur.csv" % (os.path.split(fn)[0], lang, date)
        destUR = "%swiki-%s-ur.csv" % (lang, date)
        with open(destUR, 'w') as f:
            for username, role in sorted(l):
                print >> f, "%s,%s" % (username, role)

        from random import shuffle
        #destCls = "%s/%swiki-%s-%%s.csv" % (os.path.split(fn)[0], lang, date)
        destCls = "%swiki-%s-%%s.csv" % (lang, date)
        for cls in ('anonymous', 'bot', 'bureaucrat', 'sysop', 'normal_user'):
            users = g.classes[cls]['username']
            shuffle(users)
            with open(destCls % cls, 'w') as f:
                for username in users:
                    print >> f, \
                          ("%s,http://vec.wikipedia.org/w/index.php?title=" + \
                          "Discussion_utente:%s&action=history&offset=" + \
                          "20100000000001") % (username, username)
Example 19
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
            usage="usage: %prog [options] input_file dictionary output_file",
            option_class=SonetOption
        )
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-S', '--detailed-start', action="store",
        dest='detailed_start', type="yyyymmdd", metavar="YYYYMMDD",
        default=None, help="Detailed output start date")
    p.add_option('-E', '--detailed-end', action="store",
        dest='detailed_end', type="yyyymmdd", metavar="YYYYMMDD", default=None,
        help="Detailed output end date")
    p.add_option('-n', '--detailed-namespace', action="store",
                 dest="detailed_ns", default="Normal",
                 help="Namespace of desired detailed data (default: Normal)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags=('page,title,revision,timestamp,text,redirect,'
                              'contributor,username,ip'))
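    # namespace names; "Normal" stands in for the main (id 0) namespace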
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    if os.path.exists(output):
        logging.error("File %s already exists!", output)
        sys.exit(0)

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean
    if opts.detailed_start and opts.detailed_end:
        print """
        You are going to run the script with detailed output on %d days.
        This is going to produce some CSV files on your disk, one for each
        day. Is this what you really want to do? [press enter to continue]
        """ % (opts.detailed_end - opts.detailed_start).days
        raw_input()
        processor.pywc.detailed = True
        processor.detailed_start = opts.detailed_start
        processor.detailed_end = opts.detailed_end
        processor.detailed_ns = opts.detailed_ns

    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
Example 20
def main():
    import optparse
    from sonet.lib import SonetOption
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file output_file",
        option_class=SonetOption)
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output")
    p.add_option('-i',
                 '--ignorecols',
                 action="store",
                 dest="ignorecols",
                 help="Columns numbers of the source file to ignore"
                 "(comma separated and starting from 0)")
    p.add_option('-I',
                 '--id',
                 action="store",
                 dest="id_col",
                 type="int",
                 help="Id column number (starting from 0)",
                 default=0)
    p.add_option('-o', '--onlycols', action="store", dest="onlycols",
                 help="Select only this set of columns" + \
                      "(comma separated and starting from 0)")
    p.add_option('-p',
                 '--percentages',
                 action="store_true",
                 dest="perc",
                 help="Use percentages instead of absolute value")
    p.add_option('-w',
                 '--window',
                 action="store",
                 dest="window",
                 type=int,
                 help="Collapse days")
    p.add_option('-g',
                 '--group',
                 action="store",
                 dest="group",
                 help="Group by weekday/month")
    p.add_option('-S',
                 '--sliding',
                 action="store",
                 dest="smooth",
                 type=int,
                 help="Sliding window")
    p.add_option('--exclude-less-than',
                 action="store",
                 dest="excludelessthan",
                 type=int,
                 help=("Exclude lines with totals (or dic if -d option is "
                       "used) smaller than this parameter"))
    p.add_option('--exclude-more-than',
                 action="store",
                 dest="excludemorethan",
                 type=int,
                 help=("Exclude lines with totals (or dic if -d option is "
                       "used) greater than this parameter"))
    p.add_option('-s',
                 '--start',
                 action="store",
                 dest='start',
                 type="yyyymmdd",
                 metavar="YYYYMMDD",
                 default=None,
                 help="Look for revisions starting from this date")
    p.add_option('-e',
                 '--end',
                 action="store",
                 dest='end',
                 type="yyyymmdd",
                 metavar="YYYYMMDD",
                 default=None,
                 help="Look for revisions until this date")
    p.add_option('-d',
                 '--dic',
                 action="store_true",
                 dest="dic",
                 default=False,
                 help="Calculate percentage over dic column instead of total")
    p.add_option('-n',
                 '--namespaces',
                 action="store",
                 dest="namespaces",
                 help="Output only selected namespaces (comma separated)")
    opts, files = p.parse_args()

    if len(files) != 2:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    csv_reader = csv.reader(open(files[0]), delimiter="\t")
    onlycols = None
    ignorecols = None
    if opts.onlycols:
        onlycols = [int(x) for x in opts.onlycols.split(",")]
    if opts.ignorecols:
        ignorecols = [int(x) for x in opts.ignorecols.split(",")]

    # content holds the whole CSV file as a list of rows
    content = [row for row in csv_reader]

    # columns to skip (namespace, text, total, ...)
    ns_index = 1
    len_line = len(content[0])
    to_skip = [ns_index, len_line - 1, len_line - 2, len_line - 3, opts.id_col]

    # CSV header, only of interesting columns
    header = [x for x in _gen_data(content[0], to_skip, ignorecols, onlycols)]
    namespaces = {}
    for row in content[1:]:
        try:
            namespaces[row[ns_index]].append(row)
        except KeyError:
            namespaces[row[ns_index]] = [row]

    pdf_pag = PdfPages(files[1])
    opts.namespaces = opts.namespaces.split(",") if opts.namespaces else None
    for ns in namespaces:
        if (not opts.namespaces) or (ns in opts.namespaces):
            logging.info("Processing namespace: %s", ns)
            # Build a matrix (list of lists) with the occurrences of every
            # category (turned into percentages later if -p is given).
            # Skip the id, total, text and ignored columns; if onlycols is
            # set, consider only those columns.
            mat = []
            timestamps = []
            totals = []
            tot_index = -3
            if opts.dic:
                tot_index = -5

            for line in namespaces[ns]:
                # keep only pages whose total (or dic column, if -d is used)
                # falls within the --exclude-less-than/--exclude-more-than bounds
                if opts.excludemorethan:
                    if float(line[tot_index]) > opts.excludemorethan:
                        continue
                if opts.excludelessthan:
                    if float(line[tot_index]) < opts.excludelessthan:
                        continue

                mat.append([
                    x for x in _gen_data(line, to_skip, ignorecols, onlycols)
                ])
                totals.append(float(line[tot_index]))
                timestamps.append(dt.strptime(line[opts.id_col], "%Y/%m/%d"))

            mat = np.array(mat, dtype=np.float).transpose()
            logging.info("Input file read. Ready to plot")

            with Timr("Plotting"):
                for i, series in enumerate(mat):
                    logging.info("Plotting page %d", i + 1)

                    # The commented-out variant below dropped zero values;
                    # the current code keeps every point and filters by the
                    # optional --start/--end dates instead.
                    #ser = [x for x in series if x != 0]
                    #time = [x for k, x in enumerate(timestamps) if series[k] != 0]
                    #tot = [x for k, x in enumerate(totals) if series[k] != 0]
                    ser = [x for k, x in enumerate(series) \
                           if (not opts.start or timestamps[k] >= opts.start) and \
                              (not opts.end or timestamps[k] <= opts.end)]
                    time = [x for k, x in enumerate(timestamps) \
                            if (not opts.start or x >= opts.start) and \
                               (not opts.end or x <= opts.end)]
                    tot = [x for k, x in enumerate(totals) \
                           if (not opts.start or timestamps[k] >= opts.start) and \
                              (not opts.end or timestamps[k] <= opts.end)]

                    if opts.smooth and len(time) and len(ser) and len(tot):
                        time, ser, tot = smooth_values(time, ser, tot,
                                                       opts.smooth)

                    if opts.window and len(time) and len(ser) and len(tot):
                        time, ser, tot = collapse_values(
                            time, ser, tot, opts.window)

                    if opts.group and len(time) and len(ser) and len(tot):
                        time, ser, tot = group_values(time, ser, tot,
                                                      opts.group)

                    try:
                        mean = float(sum(series)) / len(series)
                    except ZeroDivisionError:
                        continue
                    # rel_mean is the mean over the period [opts.start, opts.end]
                    try:
                        rel_mean = float(sum(ser)) / len(ser)
                    except ZeroDivisionError:
                        continue

                    if opts.perc:
                        try:
                            mean = float(sum(series)) / sum(totals)
                            rel_mean = float(sum(ser)) / sum(tot)
                        except ZeroDivisionError:
                            mean = 0
                            rel_mean = 0
                        # Calculate percentages
                        ser = [calc_perc(x, tot[k]) for k, x in enumerate(ser)]
                        # Optionally clamp the y axis to [0, 1] (left disabled)
                        #axis.set_ylim(0, 1)
                        plt.ylabel("%")

                    first_time = time[0].date()
                    last_time = time[-1].date()
                    plt.clf()
                    plt.subplots_adjust(bottom=0.25)
                    plt.xticks(rotation=90)
                    fig = plt.gcf()
                    fig.set_size_inches(11.7, 8.3)
                    axis = plt.gca()
                    axis.xaxis.set_major_formatter(
                        md.DateFormatter('%Y-%m-%d'))
                    axis.set_xlim(matplotlib.dates.date2num(first_time),
                                  matplotlib.dates.date2num(last_time))
                    if last_time - first_time < timedelta(days=30):
                        axis.xaxis.set_major_locator(md.DayLocator(interval=1))
                        axis.xaxis.set_minor_locator(md.DayLocator(interval=1))
                    else:
                        axis.xaxis.set_minor_locator(
                            md.MonthLocator(interval=1))
                        rule = md.rrulewrapper(md.MONTHLY, interval=4)
                        auto_loc = md.RRuleLocator(rule)
                        axis.xaxis.set_major_locator(auto_loc)
                    axis.tick_params(labelsize='x-small')
                    plt.xlabel("Revisions Timestamp")

                    if len(time) and len(ser):
                        if opts.window:
                            time = [t.date() for t in time]
                        logging.info("Mean: %f", mean)
                        logging.info("Relative Mean: %f", rel_mean)
                        plt.plot(matplotlib.dates.date2num(time), ser, "b.-")
                        plt.axhline(y=mean, color="r")
                        plt.title("%s - %s - Mean: %.5f - Relative mean: %.5f" % \
                                  (ns, header[i], round(mean, 5),
                                   round(rel_mean, 5)))
                    pdf_pag.savefig()
    pdf_pag.close()
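
The plotting code above calls a few helpers defined elsewhere in the project
and not shown here: calc_perc, smooth_values, collapse_values and
group_values. As orientation only, the sketch below gives plausible minimal
versions of calc_perc and a sliding-window smooth_values, under the assumption
that each helper returns (time, series, totals) lists of equal length; the
project's real implementations may differ.

def calc_perc(value, total):
    # Fraction of value over total, guarding against a zero total.
    try:
        return float(value) / total
    except ZeroDivisionError:
        return 0.0


def smooth_values(time, ser, tot, window):
    # Average ser and tot over a sliding window of `window` points,
    # keeping the timestamp at the end of each window.
    s_time, s_ser, s_tot = [], [], []
    for i in range(len(ser) - window + 1):
        s_time.append(time[i + window - 1])
        s_ser.append(sum(ser[i:i + window]) / float(window))
        s_tot.append(sum(tot[i:i + window]) / float(window))
    return s_time, s_ser, s_tot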
Esempio n. 21
0
def get_network(self):
    with Timr('Flushing'):
        self.ecache.flush()
    return self.ecache.get_network(edge_label='timestamp')
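
get_network() assumes a self.ecache object: an edge cache that accumulates
(writer, owner, timestamp) interactions while the dump is parsed and turns
them into an igraph graph whose edges carry the list of timestamps under the
requested edge_label. The class below is a deliberately simplified,
hypothetical stand-in, not the project's EdgeCache, meant only to show the
shape of that contract.

import igraph


class SimpleEdgeCache(object):
    # Hypothetical minimal edge cache: collect interactions, then build a
    # directed igraph graph with a timestamp list on every edge.

    def __init__(self):
        self.temp = []    # staging area, emptied by flush()
        self.edges = {}   # (writer, owner) -> list of timestamps

    def add(self, writer, owner, timestamp):
        self.temp.append((writer, owner, timestamp))

    def flush(self):
        for writer, owner, timestamp in self.temp:
            self.edges.setdefault((writer, owner), []).append(timestamp)
        self.temp = []

    def get_network(self, edge_label='timestamp'):
        users = sorted(set(u for pair in self.edges for u in pair))
        index = dict((u, i) for i, u in enumerate(users))
        pairs = list(self.edges.items())
        g = igraph.Graph(n=len(users), directed=True)
        g.vs['username'] = users
        g.add_edges([(index[w], index[o]) for (w, o), _ in pairs])
        g.es[edge_label] = [stamps for _, stamps in pairs]
        return g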