def main():
    today = datetime.datetime.today()
    one_day = datetime.timedelta(1)
    yesterday = today - one_day

    # find yesterday's log
    logfile = "/var/log/gogogon/consumer.log.%04d-%02d-%02d" % \
        (yesterday.year, yesterday.month, yesterday.day)
    if not os.path.exists(logfile):
        return

    # sort and uniq the log
    cmd = 'grep INFO %s | cut -f 4- -d " " | sort | uniq -c' % logfile
    pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

    # collect up global hashes and click counts
    details = dict()
    for line in pipe.stdout:
        (count, global_hash, url) = line.strip().split(' ', 2)
        details[global_hash] = dict(
            u=url,
            # cast the count to a number; sorting the raw string later
            # would order clicks lexicographically ("9" > "10")
            global_clicks=long(count),
            agency=domain(url),
            global_hash=global_hash,
        )

    # grab hashes in groups of GROUPSIZE size
    for i in xrange(1 + len(details) / GROUPSIZE):
        hashes = details.keys()[i * GROUPSIZE:i * GROUPSIZE + GROUPSIZE]
        # lookup titles
        for info in bitly.info(*hashes):
            if not info['title']:
                continue
            details[info['hash']]['title'] = info['title']

    # output files
    json_file = "/var/log/gogogon/ranks/%04d-%02d-%02d.json" % \
        (yesterday.year, yesterday.month, yesterday.day)
    csv_file = "/var/log/gogogon/ranks/%04d-%02d-%02d.csv" % \
        (yesterday.year, yesterday.month, yesterday.day)

    # sort by global clicks descending
    records = details.values()
    records.sort(key=lambda x: x["global_clicks"], reverse=True)

    # write json
    json.dump(records, file(json_file, 'w'))

    # write csv
    csv_writer = csv.writer(file(csv_file, 'w'))
    csv_writer.writerow(["Long URL", "Page Title", "Clicks",
                         "Agency Domain", "Global hash"])
    for record in records:
        if 'title' not in record:
            continue
        csv_writer.writerow([
            record['u'],
            record['title'].encode('utf8'),
            record['global_clicks'],
            record['agency'],
            record['global_hash'],
        ])
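# The domain() helper is called above but not defined in this listing.
# Here is a minimal sketch of what it is assumed to do: reduce a long
# URL to the agency's hostname for grouping. The behavior shown is an
# assumption, not the project's actual implementation.
from urlparse import urlparse

def domain(url):
    # e.g. "http://www.nasa.gov/page" -> "nasa.gov"
    host = urlparse(url).netloc.lower()
    if host.startswith('www.'):
        host = host[4:]
    return host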
def main():
    ymd = sys.argv[1]
    print "recovering %s" % ymd

    # gather global hashes and click counts
    details = dict()
    lines = os.popen("curl -s %s | grep %s" % (ARCHIVE_URL, ymd)).readlines()
    for line in lines:
        matches = LINK_RE.findall(line)
        if matches:
            link = ARCHIVE_URL + matches[0]
            for (global_hash, url) in read_data(link):
                if global_hash not in details:
                    details[global_hash] = dict(
                        u=url,
                        global_clicks=1,
                        agency=domain(url),
                        global_hash=global_hash,
                    )
                else:
                    details[global_hash]['global_clicks'] += 1

    print "getting titles"
    # grab hashes in groups of GROUPSIZE size
    for i in xrange(1 + len(details) / GROUPSIZE):
        hashes = details.keys()[i * GROUPSIZE:i * GROUPSIZE + GROUPSIZE]
        # lookup titles
        for item in bitly.info(hashes=hashes):
            if 'title' not in item:
                continue
            details[item['hash']]['title'] = item['title']

    # sort by global clicks descending
    records = details.values()
    records.sort(key=lambda x: x["global_clicks"], reverse=True)

    write_output_files(records=records, ymd=ymd, latest=False)
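# read_data() is not shown in this listing. Judging from the call site
# above, it yields one (global_hash, url) pair per click from an
# archived log fetched over HTTP. A minimal sketch under that
# assumption; the two-column line format is hypothetical:
import urllib2

def read_data(link):
    for line in urllib2.urlopen(link):
        parts = line.strip().split(' ', 1)
        if len(parts) == 2:
            yield parts[0], parts[1]  # (global_hash, url)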
def main():
    # setup logger
    formatter = logging.Formatter(
        '%(process)d %(levelname)s %(asctime)s %(message)s',
        '%Y-%m-%d %H:%M:%S')
    handler = logging.FileHandler("/var/log/gogogon/ranks.log")
    handler.setFormatter(formatter)
    logger = logging.getLogger()
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    logger.debug("starting up")

    today = datetime.datetime.today()
    one_day = datetime.timedelta(1)
    yesterday = today - one_day
    ymd = "%04d-%02d-%02d" % (yesterday.year, yesterday.month, yesterday.day)

    # find yesterday's log
    logfile = os.path.join(LOG_INPUT_DIR, "consumer.log.%s" % ymd)

    # but allow this to be overridden
    parser = optparse.OptionParser()
    parser.add_option('-f', '--file', dest="logfile", default=logfile)
    parser.add_option('-o', '--output-directory', dest="output_dir",
                      default=RANKS_OUTPUT_DIR)
    # store_true makes -a a boolean flag; without it, optparse would
    # expect the option to take a value
    parser.add_option('-a', '--agency', dest="use_agency_domain",
                      action="store_true", default=False)
    options, remainder = parser.parse_args()
    logfile = options.logfile
    output_dir = options.output_dir
    if not os.path.exists(logfile):
        raise RuntimeError('Log file does not exist: ' + logfile)
    if not os.path.exists(output_dir):
        raise RuntimeError('Output directory does not exist: ' + output_dir)

    # sort and uniq the log
    cmd = 'grep INFO %s | cut -f 5- -d " " | sort | uniq -c' % logfile
    pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

    # collect up global hashes and click counts
    details = dict()
    for line in pipe.stdout:
        (count, global_hash, url) = line.strip().split(' ', 2)
        details[global_hash] = dict(
            u=url,
            global_clicks=long(count),
            agency=domain(url),
            global_hash=global_hash,
        )

    # grab hashes in groups of GROUPSIZE size
    for i in xrange(1 + len(details) / GROUPSIZE):
        hashes = details.keys()[i * GROUPSIZE:i * GROUPSIZE + GROUPSIZE]
        # lookup titles
        for item in bitly.info(hashes=hashes):
            if 'title' not in item:
                continue
            details[item['hash']]['title'] = item['title']
        # lookup yesterday's clicks; days=2 returns today and yesterday,
        # so index 1 is yesterday's count
        for item in bitly.clicks_by_day(hashes=hashes, days=2):
            if 'clicks' not in item:
                continue
            clicks = int(item['clicks'][1]['clicks'])
            if clicks > details[item['hash']]['global_clicks']:
                details[item['hash']]['global_clicks'] = clicks

    # sort by global clicks descending
    records = details.values()
    records.sort(key=lambda x: x["global_clicks"], reverse=True)

    logger.debug("writing output files")
    write_output_files(records, ymd, output_dir)
    if options.use_agency_domain:
        write_agency_domain_files(records, output_dir, ymd)
    logger.debug("shutting down")
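# write_output_files() factors out the JSON/CSV writing that the first
# version of main() did inline. A minimal sketch, assuming it mirrors
# that inline code with the output directory made configurable. (The
# recovery script also passes a latest flag, whose behavior is not
# shown here and is left out of the sketch.)
def write_output_files(records, ymd, output_dir):
    # write json
    json.dump(records, file(os.path.join(output_dir, "%s.json" % ymd), 'w'))
    # write csv
    csv_writer = csv.writer(file(os.path.join(output_dir, "%s.csv" % ymd), 'w'))
    csv_writer.writerow(["Long URL", "Page Title", "Clicks",
                         "Agency Domain", "Global hash"])
    for record in records:
        if 'title' not in record:
            continue
        csv_writer.writerow([
            record['u'],
            record['title'].encode('utf8'),
            record['global_clicks'],
            record['agency'],
            record['global_hash'],
        ])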
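# write_agency_domain_files() is also not shown in this listing. Its
# name and call signature suggest per-agency output; this is a
# speculative sketch that groups records by agency domain and writes
# one ranked JSON file per domain, not the project's actual code.
def write_agency_domain_files(records, output_dir, ymd):
    by_agency = {}
    for record in records:
        by_agency.setdefault(record['agency'], []).append(record)
    for agency, agency_records in by_agency.items():
        out = os.path.join(output_dir, "%s.%s.json" % (agency, ymd))
        json.dump(agency_records, file(out, 'w'))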