Example #1
def main():
  today = datetime.datetime.today()
  one_day = datetime.timedelta(1)
  yesterday = today - one_day
  
  # find yesterday's log
  logfile = "/var/log/gogogon/consumer.log.%04d-%02d-%02d" % \
    (yesterday.year, yesterday.month, yesterday.day)
  if not os.path.exists(logfile): return
  
  # sort and uniq the log
  cmd = 'grep INFO %s | cut -f 4- -d " " | sort | uniq -c' % logfile
  pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
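  # the pipeline emits counted, de-duplicated lines such as
  #   "  12 GLOBALHASH http://example.gov/some/page"
  # i.e. a uniq -c count, then the global hash and the long URL,
  # which the loop below splits back apart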
  
  # collect up global hashes and click counts
  details = dict()
  for line in pipe.stdout:
    (count, global_hash, url) = line.strip().split(' ', 2)
    details[global_hash] = dict(
      u=url,
      global_clicks=long(count),  # cast so the sort below is numeric, not lexicographic
      agency=domain(url),
      global_hash=global_hash,
    )
  
  # grab hashes in groups of GROUPSIZE size
  for i in xrange(1+len(details)/GROUPSIZE):
    hashes = details.keys()[i*GROUPSIZE:i*GROUPSIZE+GROUPSIZE]
    # lookup titles
    for info in bitly.info(*hashes):
      if not info.get('title'): continue
      details[info['hash']]['title'] = info['title']
  
  # output files
  json_file = "/var/log/gogogon/ranks/%04d-%02d-%02d.json" % \
    (yesterday.year, yesterday.month, yesterday.day)
  csv_file = "/var/log/gogogon/ranks/%04d-%02d-%02d.csv" % \
    (yesterday.year, yesterday.month, yesterday.day)
  
  # sort by global clicks descending
  records = details.values()
  records.sort(key=lambda x: x["global_clicks"], reverse=True)  

  # write json
  with open(json_file, 'w') as f:
    json.dump(records, f)

  # write csv ('wb' keeps the Python 2 csv module from inserting blank rows)
  with open(csv_file, 'wb') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["Long URL", "Page Title", "Clicks", "Agency Domain", "Global hash"])
    for record in records:
      if 'title' not in record: continue
      csv_writer.writerow([
        record['u'],
        record['title'].encode('utf8'),
        record['global_clicks'],
        record['agency'],
        record['global_hash'],
      ])
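
All four examples lean on module-level names this page does not show: GROUPSIZE, the bitly API client, and a domain() helper. A minimal sketch of domain(), assuming it only extracts the host from the long URL (the real gogogon helper may normalize further, e.g. strip a leading "www."):

import urlparse

def domain(url):
  # hypothetical stand-in for the helper used above: take the host
  # part of the long URL, e.g. "http://www.nasa.gov/news" -> "www.nasa.gov"
  return urlparse.urlsplit(url).netloc.lower()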
Example #2
def main():
  ymd = sys.argv[1]
  print "recovering %s" % ymd
  
  # gather global hashes and click counts
  details = dict()
  lines = os.popen("curl -s %s | grep %s" % (ARCHIVE_URL, ymd)).readlines()
  for line in lines:
    matches = LINK_RE.findall(line)
    if matches:
      link = ARCHIVE_URL+matches[0]
      for (global_hash, url) in read_data(link):
        if global_hash not in details:
          details[global_hash] = dict(
            u=url,
            global_clicks=1,
            agency=domain(url),
            global_hash=global_hash,
          )
        else:
          details[global_hash]['global_clicks'] += 1
  
  print "getting titles"
  # grab hashes in groups of GROUPSIZE size
  for i in xrange(1+len(details)/GROUPSIZE):
    hashes = details.keys()[i*GROUPSIZE:i*GROUPSIZE+GROUPSIZE]
    # lookup titles
    for item in bitly.info(hashes=hashes):
      if 'title' not in item: continue
      details[item['hash']]['title'] = item['title']
  
  # sort by global clicks descending
  records = details.values()
  records.sort(key=lambda x: x["global_clicks"], reverse=True)  
  
  write_output_files(records=records, ymd=ymd, latest=False)
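
Examples #2 through #4 hand the sorted records to a write_output_files helper that is not shown on this page. A plausible sketch, reconstructed from the inline JSON/CSV writing in Example #1 (the signature, default path, and latest flag are guesses based on the call sites):

import csv
import json
import os

def write_output_files(records, ymd, output_dir="/var/log/gogogon/ranks",
                       latest=False):
  # hypothetical reconstruction mirroring Example #1; the real helper
  # presumably also refreshes a "latest" copy when latest=True
  with open(os.path.join(output_dir, ymd + ".json"), "w") as f:
    json.dump(records, f)
  with open(os.path.join(output_dir, ymd + ".csv"), "wb") as f:
    writer = csv.writer(f)
    writer.writerow(
      ["Long URL", "Page Title", "Clicks", "Agency Domain", "Global hash"])
    for record in records:
      if 'title' not in record: continue
      writer.writerow([
        record['u'],
        record['title'].encode('utf8'),
        record['global_clicks'],
        record['agency'],
        record['global_hash'],
      ])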
Example #3
def main():
    # setup logger
    formatter = logging.Formatter(
        '%(process)d %(levelname)s %(asctime)s %(message)s',
        '%Y-%m-%d %H:%M:%S')
    handler = logging.FileHandler("/var/log/gogogon/ranks.log")
    handler.setFormatter(formatter)
    logger = logging.getLogger()
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    logger.debug("starting up")

    today = datetime.datetime.today()
    one_day = datetime.timedelta(1)
    yesterday = today - one_day
    ymd = "%04d-%02d-%02d" % (yesterday.year, yesterday.month, yesterday.day)

    # find yesterday's log
    logfile = os.path.join(LOG_INPUT_DIR, "consumer.log.%s" % ymd)
    # But allow this to be overridden
    parser = optparse.OptionParser()
    parser.add_option('-f', '--file', dest="logfile", default=logfile)
    parser.add_option('-o',
                      '--output-directory',
                      dest="output_dir",
                      default=RANKS_OUTPUT_DIR)
    parser.add_option('-a',
                      '--agency',
                      action="store_true",
                      dest="use_agency_domain",
                      default=False)
    options, remainder = parser.parse_args()
    logfile = options.logfile
    output_dir = options.output_dir

    if not os.path.exists(logfile):
        raise RuntimeError('Log file does not exist: ' + logfile)
    if not os.path.exists(output_dir):
        raise RuntimeError('Output directory does not exist: ' + output_dir)

    # sort and uniq the log
    cmd = 'grep INFO %s | cut -f 5- -d " " | sort | uniq -c' % logfile
    pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

    # collect up global hashes and click counts
    details = dict()
    for line in pipe.stdout:
        (count, global_hash, url) = line.strip().split(' ', 2)
        details[global_hash] = dict(
            u=url,
            global_clicks=long(count),
            agency=domain(url),
            global_hash=global_hash,
        )

    # grab hashes in groups of GROUPSIZE size
    for i in xrange(1 + len(details) / GROUPSIZE):
        hashes = details.keys()[i * GROUPSIZE:i * GROUPSIZE + GROUPSIZE]
        # lookup titles
        for item in bitly.info(hashes=hashes):
            if 'title' not in item: continue
            details[item['hash']]['title'] = item['title']
        # lookup yesterday's clicks
        for item in bitly.clicks_by_day(hashes=hashes, days=2):
            if 'clicks' not in item: continue
            clicks = int(item['clicks'][1]['clicks'])
            if clicks > details[item['hash']]['global_clicks']:
                details[item['hash']]['global_clicks'] = clicks

    # sort by global clicks descending
    records = details.values()
    records.sort(key=lambda x: x["global_clicks"], reverse=True)

    logger.debug("writing output files")
    write_output_files(records, ymd, output_dir)
    if options.use_agency_domain:
        write_agency_domain_files(records, output_dir, ymd)
    logger.debug("shutting down")
Example #4
def main():
  # setup logger
  formatter = logging.Formatter('%(process)d %(levelname)s %(asctime)s %(message)s', '%Y-%m-%d %H:%M:%S')
  handler = logging.FileHandler("/var/log/gogogon/ranks.log")
  handler.setFormatter(formatter)
  logger = logging.getLogger()
  logger.addHandler(handler)
  logger.setLevel(logging.DEBUG)
  logger.debug("starting up")

  today = datetime.datetime.today()
  one_day = datetime.timedelta(1)
  yesterday = today - one_day
  ymd = "%04d-%02d-%02d" % (yesterday.year, yesterday.month, yesterday.day)
  
  # find yesterday's log
  logfile = os.path.join(LOG_INPUT_DIR, "consumer.log.%s" % ymd)
  # But allow this to be overridden
  parser = optparse.OptionParser()
  parser.add_option('-f', '--file', dest="logfile", 
                    default=logfile)
  parser.add_option('-o', '--output-directory', dest="output_dir",
                    default=RANKS_OUTPUT_DIR)
  parser.add_option('-a', '--agency', action="store_true",
                    dest="use_agency_domain", default=False)
  options, remainder = parser.parse_args()
  logfile = options.logfile
  output_dir = options.output_dir
            
  if not os.path.exists(logfile): 
      raise RuntimeError('Log file does not exist: ' + logfile)
  if not os.path.exists(output_dir):
      raise RuntimeError('Output directory does not exist: ' + output_dir)
  
  # sort and uniq the log
  cmd = 'grep INFO %s | cut -f 5- -d " " | sort | uniq -c' % logfile
  pipe = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
  
  # collect up global hashes and click counts
  details = dict()
  for line in pipe.stdout:
    (count, global_hash, url) = line.strip().split(' ', 2)
    details[global_hash] = dict(
      u=url,
      global_clicks=long(count),
      agency=domain(url),
      global_hash=global_hash,
    )
  
  # grab hashes in groups of GROUPSIZE size
  for i in xrange(1+len(details)/GROUPSIZE):
    hashes = details.keys()[i*GROUPSIZE:i*GROUPSIZE+GROUPSIZE]
    # lookup titles
    for item in bitly.info(hashes=hashes):
      if 'title' not in item: continue
      details[item['hash']]['title'] = item['title']
    # lookup yesterday's clicks
    for item in bitly.clicks_by_day(hashes=hashes, days=2):
      if 'clicks' not in item: continue
      clicks = int(item['clicks'][1]['clicks'])
      if clicks > details[item['hash']]['global_clicks']:
        details[item['hash']]['global_clicks'] = clicks
  
  # sort by global clicks descending
  records = details.values()
  records.sort(key=lambda x: x["global_clicks"], reverse=True)  

  logger.debug("writing output files")
  write_output_files(records, ymd, output_dir)
  if options.use_agency_domain:
    write_agency_domain_files(records, output_dir, ymd)
  logger.debug("shutting down")