Example #1
    def select_top_k_words(self, texts, field, mode="count", k=10, slack=8):
        name = "{}/{}_top{}-{}_{}_words.p".format(self.flags.data_path,
                                                  self.name, k, slack, mode)
        selected = load_pickle(None, name, set())
        if len(selected):
            return selected
        print("gen", name)

        name = "{}/{}_stem_dic.p".format(self.flags.data_path, self.name)
        self.stem_dic = load_pickle(self.stem_dic, name, {})
        assert len(self.stem_dic)

        if mode == "count":
            self.get_per_sample_words_count(texts, field)
        elif mode in ("tfidf", "tf"):
            # get_per_sample_tfidf presumably fills both sample_tf and sample_tfidf
            self.get_per_sample_tfidf(texts, field)
        else:
            raise ValueError("unknown mode: {}".format(mode))

        for text in texts:
            if mode == "count":
                data = self.sample_words_count[text]
            elif mode == "tfidf":
                data = self.sample_tfidf[text]
            elif mode == "tf":
                data = self.sample_tf[text]

            for c, wd in enumerate(data):
                topk = sort_value(wd)[:k + slack]
                topk = set([i for i in topk if len(i) > 2])
                selected = selected.union(topk)
                if c > 0 and c % 1000 == 0:
                    print(
                        "{} documents done, mode {}, sample {}, num {}".format(
                            c, mode, topk, len(selected)))
        print("num of selected {} key words:".format(mode), len(selected))
        name = "{}/{}_top{}-{}_{}_words.p".format(self.flags.data_path,
                                                  self.name, k, slack, mode)
        save_pickle(selected, name)
        return selected
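The example above relies on three helpers that are not shown: load_pickle, save_pickle, and sort_value. A minimal sketch of plausible implementations, assuming load_pickle(obj, path, default) returns the in-memory object when it is already populated, the cached pickle when one exists on disk, and the default otherwise, and that sort_value orders a dict's keys by descending value:

import os
import pickle

def load_pickle(obj, path, default):
    # Return the in-memory object if already populated, else the cached
    # pickle on disk, else the supplied default.
    if obj:
        return obj
    if os.path.exists(path):
        with open(path, "rb") as f:
            return pickle.load(f)
    return default

def save_pickle(obj, path):
    # Cache obj on disk so later calls can skip the recomputation.
    with open(path, "wb") as f:
        pickle.dump(obj, f)

def sort_value(dic):
    # Keys sorted by descending value; sort_value(d)[:k] gives the top-k keys.
    return sorted(dic, key=dic.get, reverse=True)

Under these assumptions, the first load_pickle call acts as a cache check, which is why the function returns early when the selected set is non-empty.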
Example #2
    def select_top_k_tfidf_words(self, texts, k=10, slack=8):
        name = "{}/top{}-{}_tfidf_words.p".format(self.flags.data_path, k,
                                                  slack)
        selected = load_pickle(None, name, set())
        if len(selected):
            return selected
        # note: recomputes tf-idf over the fixed train/test texts,
        # not over the texts argument
        self.get_per_sample_tfidf(['training_text', 'test_text'], "Text")
        for text in texts:
            data = self.sample_tfidf[text]
            for c, tfidf in enumerate(data):
                topk = sort_value(tfidf)[:k + slack]
                topk = set([i for i in topk if len(i) > 2])
                selected = selected.union(topk)
                if c > 0 and c % 1000 == 0:
                    print("{} documents done, sample {}, num {}".format(
                        c, topk, len(selected)))
        print("num of selected tfidf key words:", len(selected))
        save_pickle(selected, name)
        return selected
Example #3
    def feature_importance(self):
        # get_fscore() maps each feature name to its split count in the booster
        fscore = self.bst.get_fscore()
        feas = sort_value(fscore)
        return [(fea, fscore[fea]) for fea in feas]
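feature_importance wraps xgboost's Booster.get_fscore(), which maps each feature name to the number of times that feature is used in a split across all trees; sort_value then orders features by that count. A small self-contained usage sketch (the synthetic data is made up for illustration):

import numpy as np
import xgboost as xgb

# Fit a tiny booster on synthetic data just to obtain an fscore dict.
X = np.random.rand(200, 5)
y = (X[:, 0] > 0.5).astype(int)
dtrain = xgb.DMatrix(X, label=y, feature_names=["f%d" % i for i in range(5)])
bst = xgb.train({"objective": "binary:logistic"}, dtrain, num_boost_round=10)

# Same ranking as Example #3: features sorted by descending split count.
fscore = bst.get_fscore()
feas = sorted(fscore, key=fscore.get, reverse=True)
print([(fea, fscore[fea]) for fea in feas])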
Example #4
import os
import csv
import pickle
from collections import defaultdict

# Assumes project helpers from the surrounding module: get_consecutive_start_dic,
# sort_value, geo_distance, float_coord, ave, and the pickled h2c/c2h mappings.
def sample_coord_data(data,
                      start_day,
                      end_day,
                      out,
                      min_distance=4000,
                      topk=5,
                      is_train=True,
                      counter={},  # mutable defaults: state persists across calls
                      scounter=defaultdict(int),
                      dist_dic={}):

    if os.path.exists(out):
        return None, None, None

    bsdic = get_consecutive_start_dic(
        ['../input/train_sort.csv', '../input/test_sort.csv'],
        'comps/mobike/sol_carl/data/bikes.p', 'bikeid')
    usdic = get_consecutive_start_dic(
        ['../input/train_sort.csv', '../input/test_sort.csv'],
        'comps/mobike/sol_carl/data/users.p', 'userid')

    h2c = pickle.load(open("comps/mobike/sol_carl/data/h2c.p", "rb"))
    c2h = pickle.load(open("comps/mobike/sol_carl/data/c2h.p", "rb"))
    candifeas = ["userid", "bikeid", 'geohashed_start_loc']
    feas = candifeas + ["hour", "dow", "coord_start"]  #,'next_bs','next_us']
    dist_feas = [
        'ave_dist_ratio', 'max_dist_ratio', 'over_ave_dist', 'over_max_dist'
    ]
    all_dist_feas = ["%s-%s" % (i, j) for i in feas for j in dist_feas]

    lenh, lenc, rate = [], [], []
    fo = open(out, 'w')
    head = ",".join(feas + all_dist_feas)
    fo.write("orderid,candidate_loc,label,%s,next_bsx,next_usx,tbsx,tusx\n" %
             head)
    with open(data) as f:
        for c, row in enumerate(csv.DictReader(f)):
            ymd, hms = row['starttime'].split()
            year, month, day = ymd.split('-')
            day = int(day)
            row["coord_start"] = h2c[row['geohashed_start_loc']]
            if is_train:
                row["coord_end"] = h2c[row['geohashed_end_loc']]
            if 'geohashed_end_loc' not in row:
                row['geohashed_end_loc'] = 'xx'
            row['hour'] = hms.split('-')[0]  # assumes times formatted as HH-MM-SS
            row['dow'] = str(((int(month) - 5) * 31 +
                              int(day)) % 7)  # rough day of week, anchored to May
            row['next_bs'] = h2c.get(
                bsdic.get(row['orderid'], ('', 100000))[0], '')
            row['next_us'] = h2c.get(
                usdic.get(row['orderid'], ('', 100000))[0], '')

            if day > end_day:
                break
            elif not is_train and day < start_day:
                continue
            elif day >= start_day:
                candidates = set()
                coords = set()
                for fea in candifeas:
                    val = "{}_{}".format(fea, row[fea])
                    dic = counter.get(val, {})
                    keys = sort_value(dic)
                    for d, coord in enumerate(keys):
                        if fea != "userid" and d > topk:
                            continue
                        if geo_distance(
                                float_coord(coord),
                                float_coord(
                                    row["coord_start"])) > min_distance:
                            continue
                        coords.add(coord)
                        points = c2h.get(coord, set())
                        candidates = candidates.union(points)
                lat, lon = float_coord(h2c[row['geohashed_start_loc']])
                for i in [0, 0.01, -0.01]:
                    for j in [0, 0.01, -0.01]:
                        if abs(i) == 0.01 and abs(j) == 0.01:
                            continue
                        coord = (lat + i, lon + j)
                        if coord not in c2h:
                            continue
                        if geo_distance(
                                float_coord(coord),
                                float_coord(
                                    row["coord_start"])) > min_distance:
                            continue
                        coords.add(coord)
                        ps = c2h.get(coord, [])
                        for p in ps:
                            candidates.add(p)
                tmp = {}
                for fea in feas:
                    val = "%s_%s" % (fea, row[fea])
                    dis = dist_dic.get(val, [0])
                    ad = ave(dis)
                    md = max(dis)
                    tmp[val] = [ad, md]

                if row['next_bs'] != '':
                    coords.add(row['next_bs'])
                if row['next_us'] != '':
                    coords.add(row['next_us'])

                for i in coords:
                    line = [
                        "%s,%s_%s,%d" %
                        (row['orderid'], i[0], i[1],
                         int(row['geohashed_end_loc'] in c2h.get(i, [])))
                    ]
                    for fea in feas:
                        val = "%s_%s" % (fea, row[fea])
                        if val in counter:
                            line.append(
                                "%.2f" %
                                (counter[val].get(i, 0) * 1.0 / scounter[val]))
                        else:
                            line.append('0')

                    for fea in feas:
                        val = "%s_%s" % (fea, row[fea])
                        ad, md = tmp[val]
                        if md == 0 or ad == 0:
                            line.extend([''] * 4)
                        else:
                            dx = geo_distance(float_coord(i),
                                              float_coord(row["coord_start"]))
                            line.extend([
                                "%.2f" % (dx / ad),
                                "%.2f" % (dx / md),
                                str(int(dx > ad)),
                                str(int(dx > md))
                            ])
                    if len(line) != 1 + len(feas) + len(all_dist_feas):
                        print(len(line), 1 + len(feas) + len(all_dist_feas))
                        print(line)
                        assert 0
                    line.append("%d" % (int(row['next_bs'] == i)))
                    line.append("%d" % (int(row['next_us'] == i)))
                    if row['next_bs'] == i:
                        line.append(
                            str(bsdic.get(row['orderid'], ('', 10000000))[1]))
                    else:
                        line.append("100000000")
                    if row['next_us'] == i:
                        line.append(
                            str(usdic.get(row['orderid'], ('', 10000000))[1]))
                    else:
                        line.append("100000000")
                    line = ",".join(line) + '\n'
                    fo.write(line)
                del tmp
                rate.append(int(row['geohashed_end_loc'] in candidates))
                lenh.append(len(candidates))
                lenc.append(len(coords))
            if c > 0 and c % 100000 == 0:
                print(
                    "samples %d day %d target rate %.2f ave coords %.2f ave hashes %.2f"
                    % (c, day, ave(rate), ave(lenc), ave(lenh)))
            if is_train:
                start = float_coord(row["coord_start"])
                end = float_coord(row["coord_end"])
                dis = geo_distance(start, end)
                for fea in feas:
                    val = "{}_{}".format(fea, row[fea])
                    if val not in counter:
                        counter[val] = defaultdict(int)
                    counter[val][row["coord_end"]] += 1
                    scounter[val] += 1

                    if val not in dist_dic:
                        dist_dic[val] = []
                    dist_dic[val].append(dis)

    fo.close()
    return counter, scounter, dist_dic
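Example #4 repeatedly calls geo_distance and float_coord to drop candidate coordinates farther than min_distance (presumably metres) from the trip's start. Neither helper is shown; a plausible sketch, assuming coordinates arrive either as "lat_lon" strings or as (lat, lon) pairs and that geo_distance is the haversine great-circle distance:

from math import radians, sin, cos, asin, sqrt

def float_coord(coord):
    # Accept either a "lat_lon" string or an already-split (lat, lon) pair.
    if isinstance(coord, str):
        lat, lon = coord.split('_')
        return float(lat), float(lon)
    return float(coord[0]), float(coord[1])

def geo_distance(p1, p2):
    # Haversine great-circle distance in metres (earth radius 6371 km).
    lat1, lon1 = map(radians, p1)
    lat2, lon2 = map(radians, p2)
    a = sin((lat2 - lat1) / 2) ** 2 + \
        cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    return 2 * 6371000 * asin(sqrt(a))

def ave(xs):
    # Mean that tolerates an empty list, as the progress prints require.
    return sum(xs) * 1.0 / len(xs) if xs else 0.0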
Example #5
# Assumes the same stdlib imports and project helpers as Example #4.
def sample_hash_data(inx,
                     out,
                     coord_pred,
                     counter={},  # mutable defaults: state persists across calls
                     scounter=defaultdict(int),
                     startday=16,
                     bads=set(),
                     xc=None,
                     xsc=None,
                     threshold=20,
                     max_loc=30,
                     isva=0,
                     min_count=0):
    """
    Normalized counts of {feature value, geohashed_end_loc} pairs in history.
    """

    if os.path.exists(out):
        return counter, scounter, xc, xsc
    assert 'sort' in inx
    if isva:
        assert "va" in inx or "test" in inx

    bsdic = get_consecutive_start_dic(
        ['../input/train_sort.csv', '../input/test_sort.csv'],
        'comps/mobike/sol_carl/data/bikes.p', 'bikeid')
    usdic = get_consecutive_start_dic(
        ['../input/train_sort.csv', '../input/test_sort.csv'],
        'comps/mobike/sol_carl/data/users.p', 'userid')

    if xc is None or xsc is None:
        xc, xsc = get_normalize_sloc_counter(
            ['../input/train.csv', '../input/test.csv'])
    print("loaded sloc")

    feas = 'gs6_user,gs5_user,coord_start_user,us,ub,userid,bikeid,biketype,dow,geohashed_start_loc,gs6,gs5,gs4,next_bs,next_us,coord_ss'
    candidate_feas = [
        'coord_start_user', 'gs5_user', 'userid', 'geohashed_start_loc',
        'bikeid'
    ]
    header = 'orderid,candidate_loc,label,%s' % feas
    feas = feas.split(',')

    sfeas = 'ub,userid,bikeid,biketype,dow'.split(',')
    header = "%s,%s,coord_score,next_bsx,next_usx,tbsx,tusx\n" % (
        header, ",".join(["s_%s" % i for i in sfeas]))

    fo = open(out, 'w')
    fo.write(header)

    rate = []
    coord_score = read_coord_prob(coord_pred)
    coord_candi = read_coord_candi(coord_pred)
    h2c = pickle.load(open("comps/mobike/sol_carl/data/h2c.p", "rb"))

    for c, row in enumerate(csv.DictReader(open(inx))):

        ymd, hms = row['starttime'].split()
        #row['hour'] = hms.split('-')[0]
        year, month, day = ymd.split('-')
        row['dow'] = str(((int(month) - 5) * 31 + int(day)) % 7)
        row['gs6'] = row['geohashed_start_loc'][:6]
        row['gs5'] = row['geohashed_start_loc'][:5]
        row['gs4'] = row['geohashed_start_loc'][:4]
        #row['ss'] = "%s_%s"%(row['geohashed_start_loc'],bsdic.get(row['orderid'],''))
        row['coord_ss'] = (h2c.get(row['geohashed_start_loc'], (0, 0)),
                           h2c.get(bsdic.get(row['orderid'], ''), (0, 0)))
        row['next_bs'] = bsdic.get(row['orderid'], ('', 100000))[0]
        row['next_us'] = usdic.get(row['orderid'], ('', 100000))[0]
        #row['bs'] = "%s_%s"%(row['geohashed_start_loc'],row['bikeid'])
        row['ub'] = "%s_%s" % (row['userid'], row['bikeid'])
        row['us'] = "%s_%s" % (row['geohashed_start_loc'], row['userid'])
        row["coord_start_user"] = "******".join(
            list(h2c[row['geohashed_start_loc']]) + [row['userid']])
        row['gs6_user'] = row['gs6'] + '_' + row['userid']
        row['gs5_user'] = row['gs5'] + '_' + row['userid']
        #row["coord_start_bike"] = "_".join(list(h2c[row['geohashed_start_loc']])+[row['bikeid']])
        if 'geohashed_end_loc' not in row:
            row['geohashed_end_loc'] = '-1'

        #if int(day)>13 and isva==0:
        #    break
        if int(day) in bads:
            continue
        if int(day) > startday or isva:
            candidates = set()
            coord_candis = coord_candi.get(row['orderid'], None)
            for fea in candidate_feas:
                val = "%s_%s" % (fea, row[fea])
                dicx = counter.get(val, {})
                keyx = sort_value(dicx)
                d = 0
                for j in keyx:
                    if fea[0] != 'u' and d > threshold:
                        break
                    if dicx[j] < min_count:
                        break
                    if coord_candis and h2c[j] not in coord_candis:
                        continue
                    if j != row['geohashed_start_loc']:
                        candidates.add(j)
                    if isva == 0 and len(candidates) > max_loc:
                        break
                    d += 1

            for fea in candidate_feas:
                val = "%s_%s" % (fea, row[fea])
                dicx = xc.get(val, {})
                keyx = sort_value(dicx)
                d = 0
                for j in keyx:
                    if fea[0] != 'u' and d > threshold:
                        break
                    if dicx[j] < min_count:
                        break
                    if coord_candis and h2c[j] not in coord_candis:
                        continue
                    if j != row['geohashed_start_loc']:
                        candidates.add(j)
                    if isva == 0 and len(candidates) > max_loc:
                        break
                    d += 1

            if row['next_bs'] != '':
                candidates.add(row['next_bs'])
            if row['next_us'] != '':
                candidates.add(row['next_us'])

            rate.append(int(row['geohashed_end_loc'] in candidates))
            #if row['geohashed_end_loc'] != '-1':
            #    candidates.add(row['geohashed_end_loc'])
            #if len(candidates)>threshold or isva:
            if row['geohashed_end_loc'] in candidates or isva:
                for candidate in candidates:
                    line = [
                        row['orderid'], candidate,
                        str(int(candidate == row['geohashed_end_loc']))
                    ]
                    for fea in feas:
                        val = "%s_%s" % (fea, row[fea])
                        if val in counter:
                            line.append(
                                str(counter[val].get(candidate, 0) * 1.0 /
                                    scounter[val]))
                        else:
                            line.append('0')
                    for fea in sfeas:
                        val = "%s_%s" % (fea, row[fea])
                        if val in xc:
                            line.append(
                                str(xc[val].get(candidate, 0) * 1.0 /
                                    xsc[val]))
                        else:
                            line.append('0')
                    line.append("%.4f" % (coord_score.get(
                        row['orderid'], {}).get(h2c[candidate], -1000)))
                    line.append("%d" % (int(row['next_bs'] == candidate)))
                    line.append("%d" % (int(row['next_us'] == candidate)))
                    if row['next_bs'] == candidate:
                        line.append(
                            str(bsdic.get(row['orderid'], ('', 10000000))[1]))
                    else:
                        line.append("100000000")
                    if row['next_us'] == candidate:
                        line.append(
                            str(usdic.get(row['orderid'], ('', 10000000))[1]))
                    else:
                        line.append("100000000")
                    #line.append(row['dow'])
                    line = ','.join(line) + '\n'
                    fo.write(line)
        if isva == 0:  #row['geohashed_end_loc'] != '-1':
            for fea in feas:
                val = "%s_%s" % (fea, row[fea])
                if val not in counter:
                    counter[val] = defaultdict(int)
                counter[val][row['geohashed_end_loc']] += 1
                scounter[val] += 1

        if c > 0 and c % 100000 == 0:
            print("%s %d day %s rows processed, cover rate %.3f" %
                  (inx, c, day, ave(rate)))
    fo.close()
    print("sample hash data done")
    return counter, scounter, xc, xsc
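The feature both Example #4 and Example #5 write out is a normalized historical count: for a feature value val, counter[val][loc] / scounter[val] is the fraction of past trips sharing that value which ended at location loc. A stripped-down illustration with made-up trips (the geohashes are arbitrary):

from collections import defaultdict

counter, scounter = {}, defaultdict(int)

# Accumulate history the way the examples do while streaming training rows.
history = [("userid_u1", "wx4g0b"), ("userid_u1", "wx4g0b"),
           ("userid_u1", "wx4g09")]
for val, end_loc in history:
    counter.setdefault(val, defaultdict(int))[end_loc] += 1
    scounter[val] += 1

# Score a candidate end location the way the feature lines are built.
val, candidate = "userid_u1", "wx4g0b"
score = counter[val].get(candidate, 0) * 1.0 / scounter[val] if val in counter else 0
print("%.2f" % score)  # 0.67: two of user u1's three past trips ended at wx4g0b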