Ejemplo n.º 1
0
# assuming ndom are unique

# The first line of the file named by argv[1] is one JSON object; each key is
# normalized and each value (an iterable of aliases) is turned into a set.
with open(sys.argv[1]) as f:
    aliaspair = [
        (normalize(k), set(v))
        for k, v in json.loads(f.readline()).items()
    ]

# Several raw keys may normalize to the same value, so their alias sets are
# unioned instead of letting the later pair overwrite the earlier one.
aliasdict = {}
for k, v in aliaspair:
    aliasdict[k] = aliasdict.get(k, set()).union(v)

# Optional argv[2]: number of top-scoring rows kept per domain (default 6).
if len(sys.argv) >= 3:
    qlsPerDomain = int(sys.argv[2])
else:
    qlsPerDomain = 6

# Optional argv[3]: when true, a domain with fewer than qlsPerDomain rows is
# still emitted (default False).
# NOTE(review): eval() executes the argument as arbitrary Python code; this is
# only acceptable while the command line comes from a trusted caller.
if len(sys.argv) >= 4:
    keepFewQls = eval(str(sys.argv[3]))
else:
    keepFewQls = False

pd = None
printed_domain=set()
for line in stdgen('lastdomain\tlastql\tlasttitle\t0.0'):
    tmp = line.strip('\r\t\n ').split('\t')
    domain, ql, title, score = tmp
    if not domain == pd:
        if not pd == None:
            ret = sorted(ret,key=lambda t:float(t[3]), reverse=True)[:qlsPerDomain]
            if keepFewQls or len(ret) == qlsPerDomain:
                aliases = aliasdict.get(ndom, [pd])
                for alias in aliases:
                    if alias not in printed_domain:
                        print '\n'.join('\t'.join(map(fixstr,[alias] +  r[1:4])) for r in ret)
                        printed_domain.add(alias)
        ret = []
        ndom = normalize(domain)
        pd = domain
Ejemplo n.º 2
0
        100, 200, 300, 500, 1000, 2000, 3000, 5000, 10000, 20000, 30000,
        100000, 1000000000
    ]
    # Candidate values for the nv and ct components of a threshold triple
    # (presumably view-count and CTR cut-offs -- confirm against the consumer).
    # An earlier, coarser nv grid was:
    #   1, 10, 20, 40, 100, 250, 500, 1000, 10000, 100000000000
    nvs = [
        1, 5, 10, 20, 30, 40, 60, 80,
        100, 150, 200, 300, 400, 500, 1000,
    ]
    cts = [
        0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09,
        0.1, 0.11, 0.12, 0.13, 0.14, 0.15,
        0.2, 0.3, 0.5, 1.0, 100,
    ]

    # Full grid of (nv, nc, ct) triples; nv varies slowest and ct fastest.
    thresholds = [
        (nv, nc, ct)
        for nv in nvs
        for nc in ncs
        for ct in cts
    ]
    nths = len(thresholds)
    if mode == 'map':
        pd = None

        for line in stdgen('\t'.join(['last'] * 11)):
            # input schema: parent_url	url	ql_txt	bucid	nviews	nviews_cond	nclicks	nviews_dom	nclicks_dom	ctr	ctr_cond

            parent_url, url, ql_txt, bucid, nviews, nviews_cond, nclicks, _, _, ctr, ctr_cond = line.strip(
                '\r\t\n ').split('\t')
            if parent_url == 'last': break
            if not pd == parent_url:
                if not pd == None and len(ret) >= 6:
                    ret = sorted(ret, key=lambda t: t[-1], reverse=True)[:6]
                    nviews_dom = sum(t[1] for t in ret) / 6.0
                    nclicks_dom = sum(t[2] for t in ret)

                    ctr_dom = nclicks_dom / nviews_dom
                    binaries = map(
                        int,
                        map(
Ejemplo n.º 3
0
        # other choice is combinedCtrEditGbdtModel
        exec("modelfunc = %s" % sys.argv[5])

    # featgps arrives as a single string: '+' separates the groups and ','
    # separates the names inside one group.
    featgps = [t.split(',') for t in featgps.split('+')]
    logging.warning('featgps = %s', str(featgps))

    # Flatten the groups into one list, preserving group and in-group order.
    featnames = []
    for g in featgps:
        for t in g:
            featnames.append(t)

    # One model object per entry of gbdtxml.
    models = [GbdtModelJson(f) for f in gbdtxml]

    # Optional argv[6]: comma-separated integers.
    if len(sys.argv) > 6:
        nfeats = map(int, sys.argv[6].split(','))

    # Domain of the previously read row; None until the first row is seen.
    pd = None

    for line in stdgen('\t'.join(['domain', 'ql', 'title', '-1.0'] +
                                 ['[]' for i, c in enumerate(featgps)])):
        tmp = line.strip('\n\t').split(
            '\t'
        )  # for hadoop streaming, stripping trailing tab is almost always a good idea
        domain, ql, title, sublinkscore = tmp[:4]
        try:
            sublinkscore = float(sublinkscore)
        except:
            logging.warning('float cast error: ' + str(sublinkscore))
        if not pd == domain or domain == 'domain':
            if not pd == None:

                ret2 = sorted([t[1:4] + [
                                mildsigmoid(
                                        modelfunc(
                                                dict(
Ejemplo n.º 4
0
                nql = normalize(tmp[1])
                if ndom not in wl:
                    wl[ndom] = {}
                if rankbyctr:
                    # Ranking by ctr: for each (ndom, nql) keep the row whose
                    # tmp[3] is highest, stored as a float.
                    if nql not in wl[ndom] or wl[ndom][nql][3] < float(tmp[3]):
                        tmp[3] = float(tmp[3])
                        wl[ndom][nql] = tmp
                elif nql not in wl[ndom]:
                    # Or any other predefined wl score; an editorial whitelist
                    # cannot be ranked by ctr. The wl rows are assumed to be
                    # already ranked, so the n-th new entry of a domain gets
                    # the score 1/n and later duplicates are ignored.
                    score = 1.0 / (len(wl[ndom]) + 1.0)
                    tmp[3] = score
                    wl[ndom][nql] = tmp

    pd = None

    for line in stdgen('lastdomain\tlastql\tlasttitle\t0.0'
                       ):  # use first line as lastline
        tmp = line.strip('\n').split('\t')

        domain = tmp[0]
        if not pd == domain:

            if not pd == None:

                for nql, v in wldom.items():  # whitelist enhancement
                    # first title enhancement
                    if (ndom, nql) in ret and not edit:
                        title = ret[(ndom, nql)][2]
                    else:
                        title = v[2]
                    # next ranking enhancement
                    if (ndom, nql) in ret and rankbyscore:
Ejemplo n.º 5
0
    # Optional argv[3]: column index from which the bing title is read;
    # None means the reader falls back to its own default column.
    bingtitleindex = int(sys.argv[3]) if len(sys.argv) > 3 else None

    # Domain of the previously read row; None until the first row is seen.
    pd = None

    # Switch: set to a true value to emit a tab-separated header row first.
    printheaders = 0
    if printheaders:
        print('\t'.join([
            'clicks', 'domain', 'ql', 'bingtitle', 'pagetitle',
            'intanc', 'extanc',
            'newpagetitle', 'newintancs', 'newextancs',
        ]))

    pts = None
    for line in stdgen('domain\tql\tredt\tpagetitle\twmdata\tdom_title\tdom_wmdata\tbingtitle'):
        # throw away dom_wmdata
        
        line = line.strip('\n')
        if line.count('\t') > 7: line = line.strip('\t')
        tmp = line.split('\t')
        domain, ql, redt, pagetitle, wmdata = tmp[:5]
        if not bingtitleindex == None: bingtitle = tmp[bingtitleindex]
        elif len(tmp) >= 8:
            bingtitle = tmp[7]
        else:
            bingtitle = None
        nurl = normalize(ql)
        clicks  = str(clickdict.get(nurl,0.0))
        if not only_roman_chars(pagetitle): pagetitle = ''
        bingtitle = ud(bingtitle)
Ejemplo n.º 6
0
    # Optional argv[3]: beautify switch, on by default.
    # Bug fix: this used to run eval(str(beautify)), i.e. it evaluated the
    # default value itself, so the argument was silently ignored and beautify
    # was always True. The argument is now actually read. It is parsed by
    # plain string comparison rather than eval() so that a command-line value
    # can never execute code; only an explicit "false" spelling switches
    # beautify off, every other value keeps it on.
    beautify = True
    if len(sys.argv) > 3:
        beautify = sys.argv[3].strip().lower() not in (
            'false', '0', 'no', 'none', '')

    # Optional argv[4]: integer index stored in scoreidx (default 6);
    # presumably the position of the score column -- confirm with the reader.
    scoreidx = 6
    if len(sys.argv) > 4:
        scoreidx = int(sys.argv[4])

    # Optional argv[5]: overrides mode (for debugging purpose).
    if len(sys.argv) > 5:
        mode = sys.argv[5]

    # Maps index 3 and scoreidx onto each other; revSwapDict is whatever
    # reverseDict derives from it (presumably the inverse mapping).
    swapDict = {scoreidx: 3, 3: scoreidx}
    revSwapDict = reverseDict(swapDict)
    
    if mode == 'debug':
        # Debug mode: each row is "domain<TAB>ql<TAB>title<TAB>score". Rows
        # with the same normalized domain are collected in inputbag, and the
        # bag is post-processed and printed when the normalized domain changes.
        # NOTE(review): the string handed to stdgen looks like a sentinel final
        # row that flushes the last real group -- confirm against stdgen.
        pd = None
        for line in stdgen('None2\tNone2\tNone2\t0.0'):
            tmp = line.strip('\n').split('\t')
            domain, ql, title = tmp[:3]
            score = float(tmp[3])
            ndom = normalize(domain)
            if pd != ndom:
                if pd is not None:
                    # Bug fix: at this point ndom already belongs to the group
                    # that is just starting, while inputbag still holds the
                    # rows of the previous group. Pass pd, the normalized
                    # domain those rows were collected under.
                    print('\n'.join(
                        '\t'.join(map(str, t))
                        for t in QlPostProcess(pd, inputbag, strong=True,
                                               nitems=10, sortby=3,
                                               dedup=True, beautify=True)))
                inputbag = []
                pd = ndom
            inputbag.append([domain, ql, title, score])

    else:
        pd = None
        for line in stdgen(('None\t' * 11)[:-1]):