Esempio n. 1
0
def main():
    """Entry point: aggregate geo counts from squid logs, then plot and summarize.

    The expensive counting pass is cached on disk: if the cache file
    already exists it is loaded instead of re-scanning the raw files.
    """
    # NOTE(review): `keepers`, `criteria`, and `dates` are not defined in this
    # function -- presumably module-level globals; confirm against the full file.
    input_files = sys.argv[1:]
    cache_path = 'geo_counts.csv'
    if not os.path.exists(cache_path):
        # no cache yet: do the full counting pass and persist it
        counts = count_files(input_files, keepers, criteria)
        write_counts(counts, cache_path)
    else:
        counts = load_cache(cache_path)
    plot_error_vs_time(counts, dates)
    plot_country_error_vs_time(counts, dates)
    plot_mom_vs_time(counts, dates)
    basic_stats(counts)
Esempio n. 2
0
def main():
    """Count carrier-attributed requests and dump the aggregate as a TSV file."""
    options = parse_args()
    # keep only mobile ('M') / zero ('Z') site hits that are old-style init requests
    filters = (
        lambda rec: rec.site() in ['M', 'Z'],
        lambda rec: rec.old_init_request(),
    )
    group_fields = ('date', 'lang', 'project', 'site', 'country_code2', 'provider_from_fname')
    counts = count_files(
        options['glob'],
        group_fields,
        filters,
        count_event=10,
        fname='carrier.local.all.incremental',
        limit=10000,
    )
    # flatten {key_tuple: count} into rows of key fields + the count column
    rows = [key + (n,) for key, n in counts.items()]
    frame = pd.DataFrame(
        rows,
        columns=['date', 'lang', 'project', 'site', 'country', 'carrier', 'count'],
    )
    # render the date column as ISO 'YYYY-MM-DD' strings for the flat file
    frame.date = frame.date.apply(lambda d: datetime.datetime.strftime(d, '%Y-%m-%d'))
    logger.info('carriers: %s', pprint.pformat(frame.carrier.unique()))
    frame.to_csv('carrier.local.all', index=False, sep='\t')
Esempio n. 3
0
def main():
    """Count per-country mobile squid requests inside a date window, write CSV.

    Filters records to old-style init requests on the mobile site with a
    datetime strictly inside (args.start, args.end), aggregates counts over
    the `keepers` fields, and writes the result to 'country_counts.csv'.
    """
    parser = SquidArgumentParser()
    # type=int so a value given on the command line is usable as a worker count
    parser.add_argument('--nprocs', type=int, default=10)
    args = parser.parse_args()
    logger.info(pprint.pformat(args.__dict__))

    keepers = ['date', 'language', 'project', 'site', 'country', 'na']

    criteria = [
            lambda r : r.old_init_request(),
            lambda r : r.site() == 'M',
            lambda r : r.datetime() > args.start,
            lambda r : r.datetime() < args.end,
    ]

    counts = count_files(args.squid_files, 
            keepers, 
            criteria,
            count_event=1000,
            limit=args.max_lines,
            # BUG FIX: was hard-coded to nproc=15, silently ignoring --nprocs
            nproc=args.nprocs,
            fname='country_counts_incremental.csv')

    write_counts(counts, 'country_counts.csv')
Esempio n. 4
0
from squid.mapreduce import count_files
import sys
import pandas as pd
from operator import itemgetter

# Aggregate squid log counts for the single input file named on the command
# line and dump them, sorted, as "<input>.counts.csv".
fields = ["country", "x_cs_str", "provider", "date", "site", "lang", "project"]
# counts maps a tuple of field values -> occurrence count
counts = count_files([sys.argv[1]], fields)
lines = [key + (count,) for key, count in counts.items()]
# Tuples already compare element-wise left to right, so the default sort is
# identical to the original key=itemgetter(*range(len(fields) + 1)).
lines.sort()
df = pd.DataFrame(lines, columns=fields + ["count"])
df.to_csv("%s.counts.csv" % sys.argv[1], index=False)