# assuming ndom are unique with open(sys.argv[1]) as f: aliaspair = [(normalize(k),set(v)) for k,v in json.loads(f.readline()).items()] aliasdict = {} for k,v in aliaspair: if not k in aliasdict: aliasdict[k] = set() aliasdict[k] = aliasdict[k].union(v) qlsPerDomain = 6 if len(sys.argv) < 3 else int(sys.argv[2]) keepFewQls = False if len(sys.argv) < 4 else eval(str(sys.argv[3])) pd = None printed_domain=set() for line in stdgen('lastdomain\tlastql\tlasttitle\t0.0'): tmp = line.strip('\r\t\n ').split('\t') domain, ql, title, score = tmp if not domain == pd: if not pd == None: ret = sorted(ret,key=lambda t:float(t[3]), reverse=True)[:qlsPerDomain] if keepFewQls or len(ret) == qlsPerDomain: aliases = aliasdict.get(ndom, [pd]) for alias in aliases: if alias not in printed_domain: print '\n'.join('\t'.join(map(fixstr,[alias] + r[1:4])) for r in ret) printed_domain.add(alias) ret = [] ndom = normalize(domain) pd = domain
# NOTE(review): fragment starts mid-list — the opening of this literal
# (presumably `ncs = [` given the `for nc in ncs` below) is outside this
# excerpt. Original was collapsed onto one physical line; indentation below
# is reconstructed and should be confirmed.
100, 200, 300, 500, 1000, 2000, 3000, 5000, 10000, 20000, 30000, 100000, 1000000000 ]
#nvs = [1,10,20,40,100, 250, 500,1000,10000,100000000000]
nvs = [1, 5, 10, 20, 30, 40, 60, 80, 100, 150, 200, 300, 400, 500, 1000]
cts = [ 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.2, 0.3, 0.5, 1.0, 100 ]
# Full cartesian grid of (min-views, min-clicks?, ctr) thresholds to sweep.
thresholds = [(nv, nc, ct) for nv in nvs for nc in ncs for ct in cts]
nths = len(thresholds)
if mode == 'map':
    pd = None  # previous parent_url (group key)
    for line in stdgen('\t'.join(['last'] * 11)):
        # input schema: parent_url url ql_txt bucid nviews nviews_cond nclicks nviews_dom nclicks_dom ctr ctr_cond
        parent_url, url, ql_txt, bucid, nviews, nviews_cond, nclicks, _, _, ctr, ctr_cond = line.strip(
            '\r\t\n ').split('\t')
        # Sentinel row appended by stdgen marks end of input.
        if parent_url == 'last':
            break
        if not pd == parent_url:
            # Group boundary: aggregate the previous parent_url's rows.
            # Domains with fewer than 6 quicklinks are dropped.
            if not pd == None and len(ret) >= 6:
                ret = sorted(ret, key=lambda t: t[-1], reverse=True)[:6]
                # Per-domain view average is over a fixed 6 slots, not len(ret).
                nviews_dom = sum(t[1] for t in ret) / 6.0
                nclicks_dom = sum(t[2] for t in ret)
                ctr_dom = nclicks_dom / nviews_dom
                # NOTE(review): fragment is truncated mid-expression below;
                # the remainder of this statement is outside this excerpt.
                binaries = map(
                    int, map(
# other choice is combinedCtrEditGbdtModel exec("modelfunc = %s" % sys.argv[5]) featgps = [t.split(',') for t in featgps.split('+')] logging.warning('featgps = %s' % (str(featgps))) featnames = [t for g in featgps for t in g] models = [GbdtModelJson(f) for f in gbdtxml] if len(sys.argv) > 6: nfeats = map(int, sys.argv[6].split(',')) pd = None for line in stdgen('\t'.join(['domain', 'ql', 'title', '-1.0'] + ['[]' for i, c in enumerate(featgps)])): tmp = line.strip('\n\t').split( '\t' ) # for hadoop streaming, stripping trailing tab is almost always a good idea domain, ql, title, sublinkscore = tmp[:4] try: sublinkscore = float(sublinkscore) except: logging.warning('float cast error: ' + str(sublinkscore)) if not pd == domain or domain == 'domain': if not pd == None: ret2 = sorted([t[1:4] + [ mildsigmoid( modelfunc( dict(
# NOTE(review): fragment starts mid-loop-body — the enclosing loop that sets
# `tmp` and `ndom` is outside this excerpt, so the true indentation of the
# first statements is unknown; reconstructed below and to be confirmed.
# Builds wl: normalized-domain -> {normalized-ql -> row}, then merges the
# whitelist into the main per-domain stream.
nql = normalize(tmp[1])
if not ndom in wl:
    wl[ndom] = {}
if not nql in wl[
        ndom] and not rankbyctr:  # or any other predefined wl score; for editorial whitelist, cannot rank by ctr
    # Rank-derived score: earlier whitelist entries get higher scores.
    score = 1.0 / (len(wl[ndom]) + 1.0
                   )  # assume wl results are ranked
    tmp[3] = score
    wl[ndom][nql] = tmp
elif rankbyctr and (not nql in wl[ndom] or wl[ndom][nql][3] < float(tmp[3])):
    # Keep the highest-ctr row seen for this (ndom, nql).
    tmp[3] = float(tmp[3])
    wl[ndom][nql] = tmp
pd = None  # previous domain (group key)
for line in stdgen('lastdomain\tlastql\tlasttitle\t0.0'
                   ):  # use first line as lastline
    tmp = line.strip('\n').split('\t')
    domain = tmp[0]
    if not pd == domain:
        # Group boundary: merge whitelist entries for the flushed domain.
        if not pd == None:
            # NOTE(review): `wldom`, `ret`, `edit`, `rankbyscore` are set
            # elsewhere in this script — presumably wldom is wl[ndom] for the
            # flushed group; verify against the surrounding code.
            for nql, v in wldom.items():
                # whitelist enhancement
                # first title enhancement
                if (ndom, nql) in ret and not edit:
                    title = ret[(ndom, nql)][2]
                else:
                    title = v[2]
                # next ranking enhancement
                # NOTE(review): fragment is truncated here — the body of this
                # conditional is outside this excerpt.
                if (ndom, nql) in ret and rankbyscore:
# NOTE(review): original was collapsed onto one physical line; indentation
# below is reconstructed and should be confirmed. Python 2 syntax (print
# statement). `stdgen`, `normalize`, `clickdict`, `only_roman_chars` and
# `ud` are defined elsewhere in this file.
# Reads tab-separated title rows and extracts the bing title either from an
# explicit column index (argv[3]) or from column 7 when present.
bingtitleindex = None
if len(sys.argv) > 3:
    bingtitleindex = int(sys.argv[3])
pd = None
printheaders = 0  # debug switch: emit a header row when truthy
if printheaders:
    print '\t'.join(['clicks','domain','ql','bingtitle','pagetitle','intanc','extanc','newpagetitle','newintancs','newextancs'])
pts = None
for line in stdgen('domain\tql\tredt\tpagetitle\twmdata\tdom_title\tdom_wmdata\tbingtitle'):
    # throw away dom_wmdata
    line = line.strip('\n')
    # More than 8 columns implies stray tabs; trim them from the edges.
    if line.count('\t') > 7:
        line = line.strip('\t')
    tmp = line.split('\t')
    domain, ql, redt, pagetitle, wmdata = tmp[:5]
    if not bingtitleindex == None:
        bingtitle = tmp[bingtitleindex]
    elif len(tmp) >= 8:
        bingtitle = tmp[7]
    else:
        bingtitle = None
    nurl = normalize(ql)
    clicks = str(clickdict.get(nurl,0.0))
    # Non-roman page titles are blanked out.
    if not only_roman_chars(pagetitle):
        pagetitle = ''
    # NOTE(review): indentation ambiguous in the collapsed original — this
    # statement may instead belong inside the `if` above; confirm. Fragment
    # is truncated after this line.
    bingtitle = ud(bingtitle)
# NOTE(review): original was collapsed onto one physical line; indentation
# below is reconstructed and should be confirmed. Python 2 syntax (print
# statement). `stdgen`, `normalize`, `reverseDict`, `QlPostProcess` and
# `mode` are defined elsewhere in this file.
beautify = True
if len(sys.argv
       ) > 3:
    # NOTE(review): likely bug — this evaluates `beautify` itself, so the
    # argv[3] override is a no-op; the pattern elsewhere in this file
    # (e.g. keepFewQls) suggests eval(str(sys.argv[3])) was intended.
    beautify = eval(str(beautify))
scoreidx = 6
if len(sys.argv
       )> 4:
    scoreidx = int(sys.argv[4])
if len(sys.argv) > 5:  # for debugging purpose
    mode = sys.argv[5]
# Column-swap maps to move the score column into/out of position 3.
swapDict = {scoreidx:3,3:scoreidx}
revSwapDict = reverseDict(swapDict)
if mode == 'debug':
    pd = None  # previous normalized domain (group key)
    for line in stdgen('None2\tNone2\tNone2\t0.0'):  # domain ql title score
        tmp = line.strip('\n').split('\t')
        domain, ql, title = tmp[:3]
        score = float(tmp[3])
        ndom = normalize(domain)
        if not pd == ndom:
            # Group boundary: post-process and print the previous group.
            if not pd == None:
                print '\n'.join('\t'.join(map(str,t)) for t in QlPostProcess(ndom,inputbag,strong=True,nitems=10,sortby=3, dedup=True, beautify=True))
            inputbag = []
            pd = ndom
        inputbag.append([domain, ql, title, score])
else:
    pd = None
    # 11-column sentinel ('None\t' * 11 minus the trailing tab).
    # NOTE(review): fragment is truncated here — this loop's body is outside
    # this excerpt.
    for line in stdgen(('None\t' * 11)[:-1]):