Beispiel #1
0
def main():
    """Count squid log rows for one country and write the counts to CSV.

    Command line: a positional ``country_code`` plus ``-o/--outdir``, on top
    of whatever options ``squid.SquidArgumentParser`` itself defines
    (presumably the source of ``args.start``, ``args.end``,
    ``args.squid_files`` and ``args.max_lines`` -- confirm against the
    parser).

    A row is counted only when every predicate in ``criteria`` holds; counts
    are grouped by the fields listed in ``keepers``.  Both the incremental
    file and the final counts file are placed in ``--outdir``.
    """
    # Local import so this function stays self-contained; only used to build
    # the output paths below.
    import os

    parser = squid.SquidArgumentParser('filters squid logs by provider ranges')
    squid.squidrow.load_cidr_ranges()
    parser.add_argument('country_code')
    parser.add_argument('-o', '--outdir', default='.', help='directory in which to place the filtered files')
    args = parser.parse_args()
    logger.info(pprint.pformat(args))

    # Fields the counts are grouped by.
    keepers = ['language', 'project', 'site', 'title']

    # Row predicates; the datetime bounds are exclusive on both ends.
    criteria = [
        lambda r: r.country_code2() == args.country_code,
        lambda r: r.old_init_request(),
        lambda r: r.datetime() > args.start,
        lambda r: r.datetime() < args.end,
    ]

    # Previously --outdir was accepted but ignored and everything landed in
    # the current directory.  With the default '.' the resulting paths still
    # point at the same files as before.
    incremental_path = os.path.join(
        args.outdir, '%s_top_k_titles_incremental.csv' % args.country_code)
    final_path = os.path.join(
        args.outdir, '%s_top_k_articles.csv' % args.country_code)

    counts = squid.count_files(
        args.squid_files,
        keepers,
        criteria,
        count_event=1000,
        limit=args.max_lines,
        nproc=15,  # presumably the number of worker processes -- confirm
        fname=incremental_path)

    squid.write_counts(counts, final_path)
def main():
    """Count mobile/zero squid log rows for one provider and write them to CSV.

    Command line: a positional ``provider`` (restricted to the keys of
    ``squid.squidrow.cidr_ranges``, which is why the CIDR ranges are loaded
    before the argument is declared) plus ``-o/--outdir``, on top of whatever
    options ``squid.SquidArgumentParser`` itself defines (presumably the
    source of ``args.start``, ``args.end``, ``args.squid_files`` and
    ``args.max_lines`` -- confirm against the parser).

    A row is counted only when every predicate in ``criteria`` holds; counts
    are grouped by the fields listed in ``keepers``.  Both the incremental
    file and the final counts file are placed in ``--outdir``.
    """
    # Local import so this function stays self-contained; only used to build
    # the output paths below.
    import os

    parser = squid.SquidArgumentParser('filters squid logs by provider ranges')
    squid.squidrow.load_cidr_ranges()
    parser.add_argument(
        'provider',
        choices=squid.squidrow.cidr_ranges.keys(),
        help='name of a provider to filter by')
    parser.add_argument('-o', '--outdir', default='.', help='directory in which to place the filtered files')
    args = parser.parse_args()
    logger.info(pprint.pformat(args))

    # Fields the counts are grouped by.
    keepers = ['date', 'language', 'project', 'site', 'country', 'na']

    # Row predicates; the datetime bounds are exclusive on both ends.
    criteria = [
        lambda r: r.site() in ['M', 'Z'],
        lambda r: r.old_init_request(),
        lambda r: r.provider() == args.provider,
        lambda r: r.datetime() > args.start,
        lambda r: r.datetime() < args.end,
    ]

    # Previously --outdir was accepted but ignored and everything landed in
    # the current directory.  With the default '.' the resulting paths still
    # point at the same files as before.
    incremental_path = os.path.join(
        args.outdir, '%s_counts_incremental.csv' % args.provider)
    final_path = os.path.join(args.outdir, '%s_counts.csv' % args.provider)

    counts = squid.count_files(
        args.squid_files,
        keepers,
        criteria,
        count_event=1000,
        limit=args.max_lines,
        nproc=15,  # presumably the number of worker processes -- confirm
        fname=incremental_path)

    squid.write_counts(counts, final_path)
from squid import count_files, write_counts
from squid.util import SquidArgumentParser
import logging
import pprint

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    # Script entry point: count wikipedia rows on the 'M' and 'Z' sites and
    # write the counts to CSV files in the current directory.
    parser = SquidArgumentParser()
    # Override the parser defaults so the zero log archive is read unless the
    # caller says otherwise.
    parser.set_defaults(basename='zero-.*', datadir='/a/squid/archive/zero')
    args = parser.parse_args()
    logger.info(pprint.pformat(vars(args)))

    # A row is counted only when every predicate holds.
    criteria = [
        lambda r: r.site() in ['M', 'Z'],
        lambda r: r.old_init_request(),
        # `project` is an accessor method like the others; the previous
        # `r.project == 'wikipedia'` compared the bound method itself to a
        # string, which is always False and therefore rejected every row.
        lambda r: r.project() == 'wikipedia',
    ]

    # Fields the counts are grouped by.
    fields = ['date', 'language', 'project', 'site', 'na', 'provider_from_file']

    counts = count_files(
        args.squid_files,
        fields,
        criteria,
        nproc=10,  # presumably the number of worker processes -- confirm
        limit=args.max_lines,
        fname='carrier_counts_cidr_all.incremental.csv')

    write_counts(counts, 'carrier_counts_cidr_all.counts.csv')
Beispiel #4
0
import pprint
import datetime
from squid import count_files, write_counts, get_files

# Load the URL paths of interest.  Each line of urls.txt is split on '/' and
# the first piece (the text before the first '/') is dropped, so every entry
# is a tuple of the remaining path segments.  The file is closed as soon as
# it has been read.
with open('urls.txt') as url_file:
    urls = set(tuple(line.strip().split('/')[1:]) for line in url_file)
pprint.pprint(urls)

# NOTE(review): `glob` is not used anywhere in the visible script; kept in
# case something outside this view reads it.
glob = '/a/squid/archive/sampled/sampled-1000.tab.log-20130301.gz'

# Fields the counts are grouped by.
fields = ['date', 'project', 'country', 'title']

# A row is counted only when every predicate holds.
criteria = [
    lambda r: r.status_code() < 300,
    lambda r: r.url_path() and r.url_path()[0] == 'wiki',
    lambda r: r.project() == 'wikimediafoundation',
    # Either the second path segment is one of the named pages, or the whole
    # path is listed in urls.txt.  `urls` holds tuples, so the path is
    # converted to a tuple before the membership test: if url_path() returns
    # a list, the bare `in` would raise TypeError (lists are unhashable).
    lambda r: ((len(r.url_path()) > 1
                and r.url_path()[1] in ['FAQ', 'Ways_to_Give', 'Thank_You'])
               or tuple(r.url_path()) in urls),
]

# Two separate date windows are concatenated into one list of input files.
# The month was previously written as `01`, which is a syntax error on
# Python 3 (and an octal literal equal to 1 on Python 2).
files = get_files(start=datetime.date(2012, 11, 15),
                  end=datetime.date(2013, 1, 16))
files.extend(get_files(start=datetime.date(2013, 2, 25),
                       end=datetime.date(2013, 4, 1)))

counts = count_files(files,
                     criteria=criteria,
                     fields=fields,
                     count_event=1000)

write_counts(counts, 'fundraising_pv_custom_init.csv')