def main():
    """Count top articles from squid logs for a single country.

    Parses CLI args (country code, date window, squid files), keeps only
    old-style init requests whose country code matches, counts rows grouped
    by language/project/site/title, and writes the result as
    '<country_code>_top_k_articles.csv' under --outdir.
    """
    import os  # local import: this chunk's top-level import lines are not visible here

    parser = squid.SquidArgumentParser('filters squid logs by provider ranges')
    squid.squidrow.load_cidr_ranges()
    parser.add_argument('country_code')
    parser.add_argument('-o', '--outdir', default='.',
                        help='directory in which to place the filtered files')
    args = parser.parse_args()
    logger.info(pprint.pformat(args))

    keepers = ['language', 'project', 'site', 'title']
    criteria = [
        lambda r: r.country_code2() == args.country_code,
        lambda r: r.old_init_request(),
        lambda r: r.datetime() > args.start,
        lambda r: r.datetime() < args.end,
    ]
    counts = squid.count_files(
        args.squid_files,
        keepers,
        criteria,
        count_event=1000,
        limit=args.max_lines,
        nproc=15,
        # BUG FIX: --outdir was accepted but ignored; honor it for both outputs.
        fname=os.path.join(args.outdir,
                           '%s_top_k_titles_incremental.csv' % args.country_code))
    squid.write_counts(
        counts,
        os.path.join(args.outdir, '%s_top_k_articles.csv' % args.country_code))
def main():
    """Count mobile/zero squid log rows for a single provider.

    Parses CLI args (provider name, date window, squid files), keeps only
    old-style init requests on the 'M'/'Z' sites from that provider within
    the window, counts rows grouped by date/language/project/site/country/na,
    and writes the result as '<provider>_counts.csv' under --outdir.
    """
    import os  # local import: this chunk's top-level import lines are not visible here

    parser = squid.SquidArgumentParser('filters squid logs by provider ranges')
    squid.squidrow.load_cidr_ranges()
    parser.add_argument('provider',
                        choices=squid.squidrow.cidr_ranges.keys(),
                        help='name of a provider to filter by')
    parser.add_argument('-o', '--outdir', default='.',
                        help='directory in which to place the filtered files')
    args = parser.parse_args()
    logger.info(pprint.pformat(args))

    keepers = ['date', 'language', 'project', 'site', 'country', 'na']
    criteria = [
        lambda r: r.site() in ['M', 'Z'],
        lambda r: r.old_init_request(),
        lambda r: r.provider() == args.provider,
        lambda r: r.datetime() > args.start,
        lambda r: r.datetime() < args.end,
    ]
    counts = squid.count_files(
        args.squid_files,
        keepers,
        criteria,
        count_event=1000,
        limit=args.max_lines,
        nproc=15,
        # BUG FIX: --outdir was accepted but ignored; honor it for both outputs.
        fname=os.path.join(args.outdir,
                           '%s_counts_incremental.csv' % args.provider))
    squid.write_counts(
        counts,
        os.path.join(args.outdir, '%s_counts.csv' % args.provider))
"""Count old-init mobile/zero Wikipedia requests from zero squid archives."""
from squid import count_files, write_counts
from squid.util import SquidArgumentParser
import logging
import pprint

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    parser = SquidArgumentParser()
    parser.set_defaults(basename='zero-.*', datadir='/a/squid/archive/zero')
    args = parser.parse_args()
    logger.info(pprint.pformat(vars(args)))

    criteria = [
        lambda r: r.site() in ['M', 'Z'],
        lambda r: r.old_init_request(),
        # BUG FIX: original wrote `r.project == 'wikipedia'`, comparing the
        # bound method object to a string (always False, so every row was
        # filtered out). Call it, as the sibling predicates do.
        lambda r: r.project() == 'wikipedia',
    ]
    fields = ['date', 'language', 'project', 'site', 'na', 'provider_from_file']
    counts = count_files(args.squid_files, fields, criteria,
                         nproc=10,
                         limit=args.max_lines,
                         fname='carrier_counts_cidr_all.incremental.csv')
    write_counts(counts, 'carrier_counts_cidr_all.counts.csv')
"""Count fundraising-related wikimediafoundation.org page views from sampled squid logs."""
import pprint
import datetime

from squid import count_files, write_counts, get_files

# Target URL paths, read from disk: each line becomes a tuple of path
# components with the leading (host) component stripped.
urls = set([tuple(line.strip().split('/')[1:]) for line in open('urls.txt')])
pprint.pprint(urls)

glob = '/a/squid/archive/sampled/sampled-1000.tab.log-20130301.gz'
fields = ['date', 'project', 'country', 'title']
criteria = [
    lambda r: r.status_code() < 300,
    lambda r: r.url_path() and r.url_path()[0] == 'wiki',
    lambda r: r.project() == 'wikimediafoundation',
    # NOTE(review): `urls` holds tuples, so this membership test assumes
    # r.url_path() returns a hashable value (tuple or str) -- confirm
    # against squidrow; an unhashable list here would raise TypeError.
    lambda r: (len(r.url_path()) > 1
               and r.url_path()[1] in ['FAQ', 'Ways_to_Give', 'Thank_You'])
              or r.url_path() in urls,
]

# BUG FIX: `datetime.date(2013, 01, 16)` used a leading-zero integer literal,
# which is a SyntaxError on Python 3 (legacy octal notation); value unchanged.
files = get_files(start=datetime.date(2012, 11, 15),
                  end=datetime.date(2013, 1, 16))
files.extend(get_files(start=datetime.date(2013, 2, 25),
                       end=datetime.date(2013, 4, 1)))
counts = count_files(files, criteria=criteria, fields=fields, count_event=1000)
write_counts(counts, 'fundraising_pv_custom_init.csv')