print "Usage:"
    print "python test_yahoo_categories.py URLS [OPTIONS]";
    print ""
    print "Load urls and record Yahoo's categorization of them."
    print "[OPTIONS] is used to pass configuration parameters to PhantomJS"
    sys.exit(0);
    
checkArguments();
stats = Stats();
adExtractor = AdExtractor();

f = open(sys.argv[1]);
urls = f.readlines();
f.close();

categories = [];

for i in range(len(urls)):
  urls[i] = urls[i].replace('\n','');
  result = adExtractor.queryYahoo(urls[i]);
  try:
    for i in range(len(result['category'])):
      cat = result['category'][i]['category'];
      if not cat in categories:
        categories.append(cat);
  except:
    stats.increment('Category detection failed', 1);
    
categories.sort();
for i in range(len(categories)):
  print categories[i];
# Example no. 2 (separator text left by the code-sample listing; not part of the program)
# 0
class AdExtractor:
  """Ad-link and page-category helper backed by JSON resource files.

  The *_filename attributes hold the paths of the resource files under
  ~/Lab_TargetedAds/src/resources/.  The matching attributes without the
  suffix start out empty here and are replaced with the parsed file content
  by __init__.
  """

  # Where each resource file lives (relative to the current user's home).
  ad_db_filename = expanduser('~') + '/Lab_TargetedAds/src/resources/ad_db.json'
  redirection_db_filename = expanduser('~') + '/Lab_TargetedAds/src/resources/redirection_db.json'
  ad_providers_filename = expanduser('~') + '/Lab_TargetedAds/src/resources/ad_providers.json'
  category_mapping_alexa_filename = expanduser('~') + '/Lab_TargetedAds/src/resources/category_mapping_alexa.json'
  category_mapping_yahoo_filename = expanduser('~') + '/Lab_TargetedAds/src/resources/category_mapping_yahoo.json'
  category_mapping_alchemy_filename = expanduser('~') + '/Lab_TargetedAds/src/resources/category_mapping_alchemy.json'
  category_mapping_bluecoat_filename = expanduser('~') + '/Lab_TargetedAds/src/resources/category_mapping_bluecoat.json'

  # Empty placeholders for the loaded data.
  ad_db = {}                      # Ad category db
  redirection_db = {}             # Redirection DB
  ad_providers = []
  category_mapping_alexa = {}
  category_mapping_yahoo = {}
  category_mapping_alchemy = {}
  category_mapping_bluecoat = {}
  
  def __init__(self):
    """Load every JSON resource file and create the statistics collector.

    Parses the files named by the *_filename class attributes (ad category
    DB, redirection DB, ad-provider list and the four category-mapping
    tables) and stores the results on the instance.  Errors from open() or
    json.load() (missing or malformed file) propagate to the caller.
    """
    def load_json(filename):
      # 'with' closes the handle even when json.load() raises.
      with open(filename) as handle:
        return json.load(handle)

    self.ad_db = load_json(self.ad_db_filename)
    self.redirection_db = load_json(self.redirection_db_filename)
    self.ad_providers = load_json(self.ad_providers_filename)

    self.category_mapping_alexa = load_json(self.category_mapping_alexa_filename)
    self.category_mapping_yahoo = load_json(self.category_mapping_yahoo_filename)
    self.category_mapping_alchemy = load_json(self.category_mapping_alchemy_filename)
    self.category_mapping_bluecoat = load_json(self.category_mapping_bluecoat_filename)

    self.stats = Stats('Ad Extracting Statistics')
  
  def isAdProvider(self, url):
    host = url.split('?')[0];
    for i in range(len(self.ad_providers)):
      try: # In case of encode error
        if self.ad_providers[i] in host:
          return True;
      except:
        continue;
    return False;
  
  def getAdProviders(self, url):
    ret = [];
    for i in range(len(self.ad_providers)):
      try: # In case of encode error
        if self.ad_providers[i] in url and not self.ad_providers[i] in ret:
          ret.append(self.ad_providers[i]);
      except:
        continue;
    return ret;
  
  def getLandingUrl(self, url):
    if self.isAdProvider(url):
      result = self.detectRedirection(url);
      if result != "NONE":
        return result.lower();
      else:
        return "NONE"
    else:
      return url.lower();
    
  def getLandingDomain(self, url):
    """Return url2Domain() applied to the landing URL of `url`."""
    landing_url = self.getLandingUrl(url)
    return url2Domain(landing_url)
  
  def detectRedirection(self, url):
    # return "NONE"; # If we don't want to detect redirection
    if url[:4] != "http":
      return 'NONE' # TODO: Handle flashvars, may need decoding.
    
    if url in self.redirection_db:
      self.stats.increment('Redirection DB hit', 1);
      return self.redirection_db[url];
    self.stats.increment('Redirection DB miss', 1);
    
    command = "~/Lab_TargetedAds/phantomjs/phantomjs--linux-x86_64/bin/phantomjs ~/Lab_TargetedAds/src/bin/detect_redirection.js '";
    command += url + "'";
    results = runCommand(command);
    self.stats.increment('Ad link clicked', 1);
    for i in range(len(results)):
      results[i] = results[i].split('\t');
      if len(results[i]) >= 3 and results[i][0] == '<MSG><RESULT>' and results[i][1] == "Destination":
        destination = results[i][2];
        if not self.isAdProvider(destination):
          if not url in self.redirection_db:
            self.redirection_db[url] = destination;
          return results[i][2];
    return "NONE";
  
  def outputRedirectionDb(self):
    f = open(self.redirection_db_filename, 'w');
    f.write(json.dumps(self.redirection_db).replace('", "', '",\n"'));
    f.close();
    
  def outputAdDb(self):
    f = open(self.ad_db_filename, 'w');
    f.write(json.dumps(self.ad_db).replace('}, "http', '},\n"http'));
    f.close();
    
  def updateDb(self):
    self.outputRedirectionDb();
    self.outputAdDb();
  
  def getPageCategory(self, url):
    result = self.getPageRawCategory(url);
    if result == {}:
      return {};
    ret = {'source':'','category':[]};
    # Two levels for Alexa (e.g. Top/Shopping/Music)
    if result['source'] == 'Alexa':
      ret['source'] = 'Alexa';
      try:
        if type(result['category']) == list:
          for i in range(len(result['category'])):
            path = result['category'][i]['AbsolutePath'];
            refined_path = '/'.join(path.split('/')[:3]);
            # Ignore region-based categories
            if 'World' in refined_path or 'Region' in refined_path:
              continue;
            if not refined_path in ret['category']:
              ret['category'].append(refined_path);
        elif type(result['category']) == dict:
          path = result['category']['AbsolutePath'];
          refined_path = '/'.join(path.split('/')[:3]);
          ret['category'].append(refined_path);
      except:
        self.stats.increment('Category detection failed', 1);
        print 'ERROR PARSING:',result;
    # Ignore all scores for Yahoo
    elif result['source'] == 'Yahoo':
      ret['source'] = 'Yahoo';
      try:
        for i in range(len(result['category'])):
          cat = result['category'][i]['category'];
          if not cat in ret['category']:
            ret['category'].append(cat);
      except:
        self.stats.increment('Category detection failed', 1);
        print 'ERROR PARSING:',result;
    # Alchemy
    elif result['source'] == 'Alchemy':
      ret['source'] = 'Alchemy';
      ret['category'] = [result['category']];
    # Bluecoat
    elif result['source'] == 'Bluecoat':
      ret['source'] = 'Bluecoat';
      try:
        for i in range(len(result['category'])):
          cat = result['category'][i];
          if not cat in ret['category']:
            ret['category'].append(cat);
      except:
        self.stats.increment('Category detection failed', 1);
        print 'ERROR PARSING:',result;
    else:
      ret = {};
    if ret != {} and ret['source'] != '':
      self.stats.increment('Category detection succeeded', 1);
      ret['mapped_category'] = self.mapCategory(ret['source'], ret['category']);
    else:
      self.stats.increment('Category detection failed', 1);
    return ret;
  
  def getPageRawCategory(self, url):
    if url == None:
      return {};
    homepage = url2Homepage(url);
    if (url in self.ad_db) and (self.ad_db[url] != []) and (self.ad_db[url] != {}):
      self.stats.increment('Ad category DB hit', 1);
      return self.ad_db[url];
    if (homepage in self.ad_db) and (self.ad_db[homepage] != []) and (self.ad_db[homepage] != {}):
      self.stats.increment('Ad category DB hit', 1);
      return self.ad_db[homepage];
    self.stats.increment('Ad category DB miss', 1);
    # Alexa
    for i in range(3):
      self.stats.increment('Alexa queried', 1);
      ret = self.queryAlexa(url);
      if ret != {}:
        self.ad_db[url] = ret;
        return ret;
    for i in range(3):
      self.stats.increment('Alexa queried', 1);
      ret = self.queryAlexa(homepage);
      if ret != {}:
        self.ad_db[homepage] = ret;
        return ret;
    # Yahoo
    for i in range(2):
      self.stats.increment('Yahoo queried', 1);
      ret = self.queryYahoo(url);
      if ret != {}:
        self.ad_db[url] = ret;
        return ret;
    for i in range(2):
      self.stats.increment('Yahoo queried', 1);
      ret = self.queryYahoo(homepage);
      if ret != {}:
        self.ad_db[homepage] = ret;
        return ret;
    # Bluecoat
    for i in range(2):
      self.stats.increment('Bluecoat queried', 1);
      ret = self.queryBluecoat(url);
      if ret != {}:
        self.ad_db[url] = ret;
        return ret;
    return ret;
    # Alchemy
    #ret = self.queryAlchemy(url);
    #self.stats.increment('Alchemy queried', 1);
    #if ret != {}:
    #  self.ad_db[url] = ret;
    #  return ret;
    #ret = self.queryAlchemy(url);
    #self.stats.increment('Alchemy queried', 1);
    #if ret != {}:
    #  self.ad_db[homepage] = ret;
    #  return ret;
    #return ret;
    
  def queryAlexa(self, url):
    """Ask the local get_url_category.php endpoint for Alexa categories.

    Returns {'source': 'Alexa', 'category': <CategoryData>} where
    <CategoryData> is passed through as received (a dict for a single
    category, a list for several).  Returns {} when the request or the
    parsing fails, or when every category is region based, i.e. the first
    three components of its 'AbsolutePath' contain 'World' or 'Region'.
    """
    php_url = "http://localhost/ad_detect/get_url_category.php"
    php_url += '?site=' + urllib2.quote(url)

    def is_regional(entry):
      refined_path = '/'.join(entry['AbsolutePath'].split('/')[:3])
      return 'World' in refined_path or 'Region' in refined_path

    try:
      response = urllib2.urlopen(php_url)
      try:
        html = response.read()
      finally:
        # Release the connection even if reading fails.
        response.close()
      ret = json.loads(html)['Response']['UrlInfoResult']['Alexa']['Related']['Categories']['CategoryData']
      # A single category arrives as a dict, several as a list.
      if isinstance(ret, dict):
        entries = [ret]
      elif isinstance(ret, list):
        entries = ret
      else:
        entries = None
      if entries is not None:
        # Inspect every entry (no short-circuit) so that a malformed entry
        # anywhere in the list invalidates the whole answer.
        regional = [is_regional(entry) for entry in entries]
        if all(regional):
          return {}
      return {'source':'Alexa', 'category':ret}
    except Exception:
      # Best effort: network, JSON and schema problems all mean "no answer".
      return {}
  
  def queryYahoo(self, url):
    """Ask Yahoo's content-analysis YQL table for the categories of `url`.

    The response text is scanned for 'yctCategory score="<score>"' markers;
    the score is the text up to the closing quote and the category is the
    element text that follows, with HTML entities unescaped.

    Returns {'source': 'Yahoo',
             'category': [{'category': <str>, 'score': <float>}, ...]}
    or {} when nothing was found or any step failed.
    """
    marker = 'yctCategory score="'
    try:
      # NOTE(review): `url` is inserted into the YQL statement verbatim; a
      # URL containing '"' would alter the query - confirm whether it
      # needs escaping.
      query = 'SELECT * FROM contentanalysis.analyze WHERE url="' + url + '"'
      baseUrl = 'http://query.yahooapis.com/v1/public/yql?q='
      fullurl = baseUrl + urllib2.quote(query)
      response = urllib2.urlopen(fullurl)
      try:
        html = response.read()
      finally:
        # Release the connection even if reading fails.
        response.close()
      ret = {'source': 'Yahoo', 'category': []}
      parser = HTMLParser()
      # Find all categories
      remaining = html
      index = remaining.find(marker)
      while index >= 0:
        # Drop everything up to and including the marker; the text now
        # starts with the score value.
        remaining = remaining[index + len(marker):]
        category_bgn = remaining.find('>') + 1
        category_end = remaining.find('<')
        category = parser.unescape(remaining[category_bgn:category_end])
        score = float(remaining.split('"', 1)[0])
        ret['category'].append({'category': category, 'score': score})
        index = remaining.find(marker)
      if len(ret['category']) > 0:
        return ret
      else:
        return {}
    except Exception:
      # Best effort: network and parsing problems all mean "no answer".
      return {}
  
  def queryAlchemy(self, url):
    """Ask the AlchemyAPI URLGetCategory endpoint for the category of `url`.

    Returns {'source': 'Alchemy', 'category': <category string>}, or {} when
    the service answers 'unknown' or the request/parsing fails.
    """
    # $ALCHEMY_API_KEY$ is a placeholder, not valid Python; presumably it is
    # replaced with a quoted key string before the file is run - TODO confirm.
    alchemy_api_key = $ALCHEMY_API_KEY$;
    alchemy_api_url = "http://access.alchemyapi.com/calls/url/URLGetCategory";
    call_url = alchemy_api_url + "?apikey=" + alchemy_api_key;
    call_url += "&url=" + urllib2.quote(url);
    call_url += "&outputMode=json";
    try:
      response = urllib2.urlopen(call_url);
      html = response.read();
      ret = {};
      ret['source'] = 'Alchemy';
      ret['category'] = json.loads(html)['category'];
      # 'unknown' is treated the same as no answer at all.
      if ret['category'] == 'unknown':
        return {};
      else:
        return ret;
    # Bare except: every failure (network, JSON, missing key) yields {}.
    except:
      return {};
    
  def queryBluecoat(self, url):
    """Run the PhantomJS bluecoat_category.js script on `url`.

    Every output line of the form '<CATEGORY>\\t<name>' contributes one
    category.  Returns {'source': 'Bluecoat', 'category': [<name>, ...]},
    or {} when no category line was produced or the output could not be
    processed.
    """
    phantomjs_bin = "~/Lab_TargetedAds/phantomjs/phantomjs--linux-x86_64/bin/phantomjs"
    phantomjs_script = "~/Lab_TargetedAds/src/bin/bluecoat_category.js"
    # Single-quote the URL and escape embedded quotes ('\'') so that shell
    # metacharacters in it ($, backticks, quotes) are passed literally
    # instead of being interpreted.  Assumes runCommand passes the string to
    # a POSIX shell (the '~' paths already rely on shell expansion).
    quoted_url = "'" + url.replace("'", "'\\''") + "'"
    command = phantomjs_bin + ' ' + phantomjs_script + ' ' + quoted_url
    results = runCommand(command)
    try:
      categories = []
      for line in results:
        fields = line.split('\t')
        if len(fields) >= 2 and fields[0] == '<CATEGORY>':
          categories.append(fields[1])
    except Exception:
      # Best effort: unusable output means "no answer".
      return {}
    if len(categories) > 0:
      return {'source': 'Bluecoat', 'category': categories}
    return {}
    
  def mapCategory(self, source, category):
    ret = []
    for i in range(len(category)):
      cat = category[i];
      if source == 'Alexa':
        while not cat in self.category_mapping_alexa and '/' in cat:
          cat = '/'.join(cat.split('/')[:-1]);
        if cat in self.category_mapping_alexa:
          for j in range(len(self.category_mapping_alexa[cat])):
            if not self.category_mapping_alexa[cat][j] in ret:
              ret.append(self.category_mapping_alexa[cat][j])
        else:
          print "MAPPING FAILED\tAlexa\t",cat
      elif source == 'Yahoo':
        if cat in self.category_mapping_yahoo:
          for j in range(len(self.category_mapping_yahoo[cat])):
            if not self.category_mapping_yahoo[cat][j] in ret:
              ret.append(self.category_mapping_yahoo[cat][j])
        else:
          print "MAPPING FAILED\tYahoo\t",cat
      elif source == 'Alchemy':
        if cat in self.category_mapping_alchemy:
          for j in range(len(self.category_mapping_alchemy[cat])):
            if not self.category_mapping_alchemy[cat][j] in ret:
              ret.append(self.category_mapping_alchemy[cat][j])
        else:
          print "MAPPING FAILED\tAlchemy\t",cat
      elif source == 'Bluecoat':
        if cat in self.category_mapping_bluecoat:
          for j in range(len(self.category_mapping_bluecoat[cat])):
            if not self.category_mapping_bluecoat[cat][j] in ret:
              ret.append(self.category_mapping_bluecoat[cat][j])
        else:
          print "MAPPING FAILED\tBluecoat\t",cat
    return ret;
      
  def __test__(self):
    print self.getPageCategory("http://www.microsoft.com");