def extract_domain(url):
    """Extract the registered domain from a (possibly %-encoded) URL.

    The input may be percent-encoded up to three times (log pipelines often
    double/triple-encode), so it is unquoted three times before parsing.

    Returns the domain string on success, or the sentinel "-1" on any
    failure (an explanatory message is written to stderr in each case).
    """
    url = urllib.unquote(urllib.unquote(urllib.unquote(url)))
    if "http" not in url:
        sys.stderr.write("Error 4, http is not in url, invalid url: %s\n" % url)
        return "-1"
    # Keep everything after the FIRST "http" marker. The previous
    # split("http")[-1] used the LAST occurrence, so a URL that embeds
    # another http URL (e.g. a redirect parameter) picked the wrong host.
    url = url.split("http", 1)[1]
    # Remove the scheme remainder by slicing, not str.strip():
    # strip("://") removes the character set {':', '/'} from BOTH ends,
    # which also deleted trailing '/' (and 's' for the https case) and
    # corrupted URLs such as "https://news.coms" or "http://a.com/path/".
    if url.startswith("://"):
        url = url[len("://"):]
    elif url.startswith("s://"):
        url = url[len("s://"):]
    else:
        sys.stderr.write("Error 2, error http head, invalid url : %s\n" % url)
        return "-1"
    url = "http://" + url
    if not url_util.is_valid_url(url):
        sys.stderr.write("Error 1, invalid url: %s\n" % url)
        return "-1"
    url = url_util.regularize_url(url)
    # NOTE(review): Filter is used here but no import of it is visible in
    # this file — presumably imported elsewhere; verify before deploying.
    domain = Filter.domain_url(url)
    if domain == "NULL":
        sys.stderr.write("Error 3, domain is null, invalid url : %s\n" % url)
        return "-1"
    return domain
import sys import url_util import urllib for line in sys.stdin: vals = line.strip().split("\t") if len(vals) <= 11: tag = "2" index = vals[3] url = vals[1] query = vals[2] ori_val = "\1".join(vals) print "%s\t%s\t%s\t%s\t%s" % (index, tag, query, url, ori_val) elif len(vals) > 140: tag = "1" index = vals[0] query = url_util.regularize_str(vals[4]) url = urllib.unquote(vals[93]) if not url_util.is_valid_url(url): continue url = url_util.regularize_url(url) rig_q = vals[140].split("#")[0] click_q = vals[150].split("%")[3] try: rig_q_val = float(rig_q) click_q_val = float(click_q) except: print >> sys.stderr, "Error! rig q: %s or click q: %s is invalid." % (rig_q, click_q) continue print "\t".join([index, tag, query, url, rig_q, click_q])