コード例 #1
0
import sys
import url_util

SOURCE = "word_url_winfoid"

# Mapper: read tab-separated records from stdin and, for every record whose
# first column is "4", emit one line
#     bidword \t url \t SOURCE \t winfoid
# for each of the two URL columns (13 and 15) that is not the literal "DFT".
#
# Fixes relative to the previous revision:
#   * the loop body mixed space- and tab-indented lines, which Python rejects
#     with an IndentationError; indentation is now uniform;
#   * a stray `return "-1"` sat at module level (a SyntaxError outside a
#     function) and has been removed;
#   * records too short to contain column 15 are reported on stderr and
#     skipped instead of aborting the whole run with an IndexError.
for line in sys.stdin:
    val = line.strip().split("\t")
    level = val[0]
    if level != "4":
        continue

    # Column 15 is the highest index read below.
    if len(val) < 16:
        sys.stderr.write(
            "level 4 record is invalid, Error: len of val is %s\n" % len(val))
        continue

    winfoid = val[1]
    bidword = url_util.regularize_str(val[6])

    # Column 13 is handled first, then column 15, matching the original
    # output order. "DFT" is skipped as-is (presumably a "default/unset"
    # marker -- confirm against the data producer).
    for raw_url in (val[13], val[15]):
        if raw_url == "DFT":
            continue
        clean_url = url_util.regularize_url(raw_url)
        sys.stdout.write(
            "%s\t%s\t%s\t%s\n" % (bidword, clean_url, SOURCE, winfoid))
        

# Parse "shitu" log lines from stdin. Each non-empty line is split on a
# three-space separator; the third block is a tab-separated record from which
# the bidword, click and (depending on shitu_type) target URL, query trade and
# original bid columns are extracted.
#
# NOTE(review): `shitu_type` and `urllib` are not defined/imported in the
# visible part of this example, and the extracted values are not consumed
# here -- the remainder of the script appears to be outside this view.
#
# Fixes relative to the previous revision:
#   * blocks[2] is read, so three blocks are required (the old check only
#     guaranteed two);
#   * the "too few blocks" message referenced `vals` before it was assigned
#     (NameError on the first bad line, stale data afterwards); it now
#     reports the number of blocks;
#   * vals[23] and vals[153] are read, so the minimum lengths are 24 and 154
#     (the old checks were off by one and allowed an IndexError).
for line in sys.stdin:
    line = line.strip()
    if line == "":
        continue

    blocks = line.split("   ")
    if len(blocks) < 3:
        sys.stderr.write(
            "shitu log is invalid, Error: len of blocks is %s\n" % len(blocks))
        continue

    vals = blocks[2].split("\t")
    if len(vals) < 24:
        sys.stderr.write(
            "shitu log is invalid, Error: len of vals is %s\n" % len(vals))
        continue

    bidword = url_util.regularize_str(urllib.unquote(vals[23]))
    click = vals[1]

    if shitu_type == "wise":
        # Highest index read in this branch is 141.
        if len(vals) < 142:
            sys.stderr.write(
                "shitu wise log is invalid, Error: len of vals is %s\n"
                % len(vals))
            continue
        target_url = vals[92]
        query_trade = vals[123]
        original_bid = vals[141]
    elif shitu_type == "pc":
        # Highest index read in this branch is 153.
        if len(vals) < 154:
            sys.stderr.write(
                "shitu pc log is invalid, Error: len of vals is %s\n"
                % len(vals))
            continue
        target_url = vals[73]
        query_trade = vals[137]
        original_bid = vals[153]
コード例 #3
0
import sys
import url_util

# Mapper: read tab-separated records from stdin and emit tagged lines.
#
#   * Records with four or more columns are tagged "2": the regularized
#     column 0 and regularized column 11 are emitted, followed by the whole
#     record re-joined with the \x01 control character.
#   * Records with exactly three columns are tagged "1": the columns are
#     emitted unchanged, provided the third one parses as a float.
#   * Any other record is ignored.
#
# Fixes relative to the previous revision:
#   * `except e:` evaluated the undefined name `e`, so a bad relevance value
#     raised NameError instead of being reported; it now catches ValueError;
#   * the tag "2" branch read vals[11] after only checking for four columns;
#     records with 4..11 columns are now reported on stderr and skipped
#     instead of raising IndexError;
#   * the unused local holding the parsed float was dropped.
for line in sys.stdin:
    vals = line.strip().split("\t")
    if len(vals) >= 4:
        # Column 11 is the highest index read in this branch.
        if len(vals) < 12:
            sys.stderr.write(
                "Error! record is invalid, len of vals is %s\n" % len(vals))
            continue
        tag = "2"
        query = url_util.regularize_str(vals[0])
        url = url_util.regularize_url(vals[11])
        ori_val = "\x01".join(vals)
        sys.stdout.write("%s\t%s\t%s\t%s\n" % (query, url, tag, ori_val))
    elif len(vals) == 3:
        tag = "1"
        string, url, rel_q = vals
        try:
            # Validation only: the original text is what gets emitted.
            float(rel_q)
        except ValueError:
            sys.stderr.write(
                "Error! relevance q: %s is invalid.\n" % rel_q)
            continue
        sys.stdout.write("%s\t%s\t%s\t%s\n" % (string, url, tag, rel_q))
# Parse "shitu" log lines from stdin. Each non-empty line is split on a
# three-space separator; the third block is a tab-separated record from which
# the bidword, click and (depending on shitu_type) further columns are read.
#
# NOTE(review): `shitu_type` and `urllib` are not defined/imported in the
# visible part of this example, and the loop body appears to continue past
# what is visible here. If the unseen remainder of the "pc" branch reads
# vals[153], the `< 153` bound below must become `< 154`.
#
# Fixes relative to the previous revision:
#   * blocks[2] is read, so three blocks are required (the old check only
#     guaranteed two);
#   * the "too few blocks" message referenced `vals` before it was assigned
#     (NameError on the first bad line, stale data afterwards); it now
#     reports the number of blocks;
#   * vals[23] is read, so at least 24 columns are required (the old check
#     was off by one and allowed an IndexError).
for line in sys.stdin:
    line = line.strip()
    if line == "":
        continue

    blocks = line.split("   ")
    if len(blocks) < 3:
        sys.stderr.write(
            "shitu log is invalid, Error: len of blocks is %s\n" % len(blocks))
        continue

    vals = blocks[2].split("\t")
    if len(vals) < 24:
        sys.stderr.write(
            "shitu log is invalid, Error: len of vals is %s\n" % len(vals))
        continue

    bidword = url_util.regularize_str(urllib.unquote(vals[23]))
    click = vals[1]

    if shitu_type == "wise":
        # Highest index read in this branch is 141.
        if len(vals) < 142:
            sys.stderr.write(
                "shitu wise log is invalid, Error: len of vals is %s\n"
                % len(vals))
            continue
        target_url = vals[92]
        query_trade = vals[123]
        original_bid = vals[141]
    elif shitu_type == "pc":
        if len(vals) < 153:
            sys.stderr.write(
                "shitu pc log is invalid, Error: len of vals is %s\n"
                % len(vals))
            continue
        target_url = vals[73]
コード例 #5
0
import sys
import url_util
import urllib

# Mapper: read tab-separated records from stdin and emit tagged lines keyed
# by an index column.
#
#   * Records with at most 11 columns are tagged "2" and emitted as
#         vals[3] \t "2" \t vals[2] \t vals[1] \t <record joined with \x01>
#   * Records with more than 140 columns are tagged "1" and emitted as
#         vals[0] \t "1" \t query \t url \t rig_q \t click_q
#     where query/url are regularized via url_util, rig_q is the part of
#     column 140 before the first "#", and click_q is the fourth
#     "%"-separated field of column 150. Records whose URL fails
#     url_util.is_valid_url are dropped silently.
#   * Records with 12..140 columns are ignored.
#
# Fixes relative to the previous revision:
#   * the bare `except:` (which also swallowed KeyboardInterrupt/SystemExit)
#     is narrowed to ValueError, the only error float() raises for a string;
#   * vals[150] was read after only checking for more than 140 columns, and
#     the fourth "%" field was read without checking it exists; both cases
#     are now reported on stderr and skipped instead of raising IndexError;
#   * the tag "2" branch read vals[3] without checking for four columns.
for line in sys.stdin:
    vals = line.strip().split("\t")
    if len(vals) <= 11:
        # Column 3 is the highest index read in this branch.
        if len(vals) < 4:
            sys.stderr.write(
                "Error! record is invalid, len of vals is %s\n" % len(vals))
            continue
        tag = "2"
        index = vals[3]
        url = vals[1]
        query = vals[2]
        ori_val = "\x01".join(vals)
        sys.stdout.write(
            "%s\t%s\t%s\t%s\t%s\n" % (index, tag, query, url, ori_val))
    elif len(vals) > 140:
        # Column 150 is the highest index read in this branch.
        if len(vals) < 151:
            sys.stderr.write(
                "Error! record is invalid, len of vals is %s\n" % len(vals))
            continue
        tag = "1"
        index = vals[0]
        query = url_util.regularize_str(vals[4])
        url = urllib.unquote(vals[93])
        if not url_util.is_valid_url(url):
            continue
        url = url_util.regularize_url(url)
        rig_q = vals[140].split("#")[0]
        click_fields = vals[150].split("%")
        if len(click_fields) < 4:
            sys.stderr.write(
                "Error! click q field: %s is invalid.\n" % vals[150])
            continue
        click_q = click_fields[3]
        try:
            # Validation only: the original text is what gets emitted.
            float(rig_q)
            float(click_q)
        except ValueError:
            sys.stderr.write(
                "Error! rig q: %s or click q: %s is invalid.\n"
                % (rig_q, click_q))
            continue
        sys.stdout.write(
            "\t".join([index, tag, query, url, rig_q, click_q]) + "\n")