# Example no. 1
# 0
def parse_args():
    """Parse the command line and attach the squid log files of each provider.

    Builds a ``SquidArgumentParser`` accepting a positional list of provider
    identifiers (defaulting to ``DEFAULT_PROVIDERS``) and a ``--name_format``
    option, with ``datadir`` defaulting to ``/a/squid/archive/zero``.  After
    parsing, ``SquidArgumentParser.get_files`` is called once per provider
    with ``args.basename`` set to that provider.

    Returns:
        The parsed namespace, extended with ``squid_files``: a dict mapping
        each provider identifier to the value ``get_files`` returned for it.
        ``args.basename`` is overwritten on every iteration, so it ends up
        holding the last provider processed.
    """
    parser = SquidArgumentParser(
        description='Process a collection of squid logs and write certain extracted metrics to file')
    parser.add_argument('providers',
                        metavar='PROVIDER_IDENTIFIER',
                        nargs='*',
                        default=DEFAULT_PROVIDERS,
                        help='list of provider identifiers used in squid log file names')
    # The default used to be '%s.log-%.gz'; '%.g' is a float conversion, so
    # formatting it with (provider_name, date_representation) could not
    # produce a file name.  Both placeholders are now '%s'.
    parser.add_argument('--name_format',
                        dest='name_format',
                        type=str,
                        default='%s.log-%s.gz',
                        help='a printf style format string which is formatted with the tuple: (provider_name, date_representation)')
    parser.set_defaults(datadir='/a/squid/archive/zero')

    args = parser.parse_args()

    # Custom logic for which files to grab: get_files receives the whole
    # namespace, so the provider is passed to it through args.basename.
    prov_files = {}
    for prov in args.providers:
        args.basename = prov
        logging.info('args prior to get_files: %s', pprint.pformat(args.__dict__))
        prov_files[prov] = SquidArgumentParser.get_files(args)
    args.squid_files = prov_files

    logging.info(pprint.pformat(args.__dict__))
    return args
#!/usr/bin/python
from squid import count_files, write_counts
from squid.util import SquidArgumentParser
import logging
import pprint

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    # Parse the command line, falling back to the "zero" archive defaults.
    arg_parser = SquidArgumentParser()
    arg_parser.set_defaults(datadir='/a/squid/archive/zero', basename='zero-.*')
    args = arg_parser.parse_args()
    logger.info(pprint.pformat(args.__dict__))

    # Record attributes handed to count_files as the fields to count over.
    fields = ['date', 'language', 'project', 'site', 'na', 'provider_from_file']

    # Per-record predicates handed to count_files.
    # NOTE(review): presumably a record must satisfy all of them to be
    # counted - confirm against count_files.
    criteria = [
        lambda r: r.site() in ['M', 'Z'],
        lambda r: r.old_init_request(),
        lambda r: r.project == 'wikipedia',
    ]

    incremental_csv = 'carrier_counts_cidr_all.incremental.csv'
    final_csv = 'carrier_counts_cidr_all.counts.csv'

    counts = count_files(
        args.squid_files,
        fields,
        criteria,
        nproc=10,
        limit=args.max_lines,
        fname=incremental_csv,
    )

    write_counts(counts, final_csv)