# Streaming filter: reads tab-separated rows from stdin and, for every row
# whose first field is "4", writes one output row per usable URL:
#     bidword <TAB> url <TAB> SOURCE <TAB> winfoid
# The bidword and the URLs are normalised through url_util before output.
import sys

import url_util

SOURCE = "word_url_winfoid"

# Column positions read from a level-"4" row.
LEVEL_COL = 0
WINFOID_COL = 1
BIDWORD_COL = 6
PC_URL_COL = 13
WISE_URL_COL = 15
# Highest column read is WISE_URL_COL, so a usable row needs this many fields.
MIN_FIELDS = WISE_URL_COL + 1
# URLs equal to this marker are not emitted (presumably a "default"
# placeholder -- confirm against the producer of the input).
SKIPPED_URL = "DFT"

for line in sys.stdin:
    val = line.strip().split("\t")
    if val[LEVEL_COL] != "4":
        continue
    # Short rows used to raise IndexError and abort the whole stream; report
    # them on stderr and keep going instead.
    if len(val) < MIN_FIELDS:
        sys.stderr.write(
            "level 4 record is invalid, Error: len of val is %s\n" % len(val))
        continue
    winfoid = val[WINFOID_COL]
    bidword = url_util.regularize_str(val[BIDWORD_COL])
    # PC URL first, then the wise URL -- same output order as before.
    for raw_url in (val[PC_URL_COL], val[WISE_URL_COL]):
        if raw_url == SKIPPED_URL:
            continue
        url = url_util.regularize_url(raw_url)
        sys.stdout.write(
            "%s\t%s\t%s\t%s\n" % (bidword, url, SOURCE, winfoid))
return "-1" for line in sys.stdin: line = line.strip() if line == "": continue blocks = line.split(" ") if len(blocks) < 2: print >> sys.stderr, "shitu log is invalid, Error: len of vals is %s" % len(vals) continue vals = blocks[2].split("\t") if len(vals) < 23: print >> sys.stderr, "shitu log is invalid, Error: len of vals is %s" % len(vals) continue bidword = url_util.regularize_str(urllib.unquote(vals[23])) click = vals[1] if shitu_type == "wise": if len(vals) < 142: print >> sys.stderr, "shitu wise log is invalid, Error: len of vals is %s" % len(vals) continue target_url = vals[92] query_trade = vals[123] original_bid = vals[141] elif shitu_type == "pc": if len(vals) < 153: print >> sys.stderr, "shitu pc log is invalid, Error: len of vals is %s" % len(vals) continue target_url = vals[73] query_trade = vals[137] original_bid = vals[153]
# Streaming filter over tab-separated rows read from stdin.
#
#   * rows with 12 or more fields -> tag "2":
#         regularized vals[0], regularized vals[11], "2",
#         all original fields joined with the "\1" (0x01) character
#   * rows with exactly 3 fields  -> tag "1":
#         vals[0], vals[1], "1", vals[2]   (vals[2] must parse as a float)
#   * rows with 4-11 fields are reported on stderr and skipped
#   * rows with fewer than 3 fields produce no output
import sys

import url_util

# Index of the URL column in a tag-"2" row.
URL_COL = 11

for line in sys.stdin:
    vals = line.strip().split("\t")
    field_count = len(vals)
    if field_count > URL_COL:
        tag = "2"
        query = url_util.regularize_str(vals[0])
        url = url_util.regularize_url(vals[URL_COL])
        ori_val = "\1".join(vals)
        sys.stdout.write("%s\t%s\t%s\t%s\n" % (query, url, tag, ori_val))
    elif field_count >= 4:
        # These rows used to fall into the tag-"2" branch and raise
        # IndexError on vals[11]; report them and keep the stream alive.
        sys.stderr.write(
            "Error! record with %s fields has no url column.\n" % field_count)
        continue
    elif field_count == 3:
        tag = "1"
        string = vals[0]
        url = vals[1]
        rel_q = vals[2]
        # Validation only: the textual value is what gets emitted.
        try:
            float(rel_q)
        except ValueError:
            sys.stderr.write(
                "Error! relevance q: %s is invalid.\n" % rel_q)
            continue
        sys.stdout.write("%s\t%s\t%s\t%s\n" % (string, url, tag, rel_q))
# Per-record parsing of "shitu" log lines read from stdin.
# Each non-blank line is split on single spaces; the third space-separated
# block is then split on tabs into ``vals``, from which the fields below are
# picked by position.  Which positions are read depends on ``shitu_type``
# (defined outside this view).
for line in sys.stdin:
    line = line.strip()
    if line == "":
        continue
    blocks = line.split(" ")
    # blocks[2] is read next, so three blocks are required.  The message
    # reports len(blocks): ``vals`` has not been assigned yet at this point.
    if len(blocks) < 3:
        sys.stderr.write(
            "shitu log is invalid, Error: len of blocks is %s\n" % len(blocks))
        continue
    vals = blocks[2].split("\t")
    # vals[23] is read next, so 24 fields are required.
    if len(vals) < 24:
        sys.stderr.write(
            "shitu log is invalid, Error: len of vals is %s\n" % len(vals))
        continue
    bidword = url_util.regularize_str(urllib.unquote(vals[23]))
    click = vals[1]
    if shitu_type == "wise":
        # Highest index read in this branch is 141.
        if len(vals) < 142:
            sys.stderr.write(
                "shitu wise log is invalid, Error: len of vals is %s\n"
                % len(vals))
            continue
        target_url = vals[92]
        query_trade = vals[123]
        original_bid = vals[141]
    elif shitu_type == "pc":
        # NOTE(review): only vals[73] is read in the visible part of this
        # branch; if code past this view reads index 153, this guard must
        # become ``< 154`` -- confirm.
        if len(vals) < 153:
            sys.stderr.write(
                "shitu pc log is invalid, Error: len of vals is %s\n"
                % len(vals))
            continue
        target_url = vals[73]
# Streaming filter over tab-separated rows read from stdin.
#
#   * rows with 4-11 fields -> tag "2":
#         vals[3], "2", vals[2], vals[1],
#         all original fields joined with the "\1" (0x01) character
#   * rows with more than 150 fields -> tag "1":
#         vals[0], "1", regularized vals[4], regularized unquoted vals[93],
#         rig_q (vals[140] up to the first "#"),
#         click_q (4th "%"-separated part of vals[150])
#     Rows whose URL is not valid are dropped silently; rows whose rig_q or
#     click_q is not a float are reported on stderr and dropped.
#   * blank lines and rows with 12-140 fields produce no output
#   * rows with 1-3 or 141-150 fields are reported on stderr and skipped
import sys
import urllib

import url_util

# Highest column read by the tag-"2" branch.
SHORT_ROW_LAST_COL = 3
# Columns read by the tag-"1" branch.
RIG_Q_COL = 140
CLICK_Q_COL = 150
# Position of click_q inside the "%"-separated CLICK_Q_COL field.
CLICK_Q_PART = 3

for line in sys.stdin:
    vals = line.strip().split("\t")
    field_count = len(vals)
    if field_count <= 11:
        # vals[3] is read below; blank lines and short rows used to raise
        # IndexError here and abort the stream.
        if field_count <= SHORT_ROW_LAST_COL:
            if vals != [""]:
                sys.stderr.write(
                    "Error! record with %s fields is too short.\n"
                    % field_count)
            continue
        tag = "2"
        index = vals[3]
        url = vals[1]
        query = vals[2]
        ori_val = "\1".join(vals)
        sys.stdout.write(
            "%s\t%s\t%s\t%s\t%s\n" % (index, tag, query, url, ori_val))
    elif field_count > 140:
        # vals[150] is read below, so rows with 141-150 fields cannot be
        # processed; they used to raise IndexError.
        if field_count <= CLICK_Q_COL:
            sys.stderr.write(
                "Error! record with %s fields is too short.\n" % field_count)
            continue
        tag = "1"
        index = vals[0]
        query = url_util.regularize_str(vals[4])
        url = urllib.unquote(vals[93])
        if not url_util.is_valid_url(url):
            continue
        url = url_util.regularize_url(url)
        rig_q = vals[RIG_Q_COL].split("#")[0]
        click_parts = vals[CLICK_Q_COL].split("%")
        if len(click_parts) <= CLICK_Q_PART:
            sys.stderr.write(
                "Error! click q field: %s is invalid.\n" % vals[CLICK_Q_COL])
            continue
        click_q = click_parts[CLICK_Q_PART]
        # Validation only: the textual values are what get emitted.
        try:
            float(rig_q)
            float(click_q)
        except ValueError:
            sys.stderr.write(
                "Error! rig q: %s or click q: %s is invalid.\n"
                % (rig_q, click_q))
            continue
        sys.stdout.write(
            "\t".join([index, tag, query, url, rig_q, click_q]) + "\n")