Example #1
def load_movies(path, ignore_tags=False):
    # load movie basic info.
    movies = {}
    for line in file(path + '/movies.csv').readlines():
        # the line may contain more than 2 ',' because the title itself can contain commas.
        row = line.strip().split(',')
        movie_id = row[0]
        title = ','.join(row[1:-1])
        genres = row[-1]

        if movie_id == 'movieId':
            # ignore first line.
            continue

        movie = MovieInfo()
        movie.id = int(movie_id)
        movie.title = title
        movie.genres = genres.split('|')
        movie.process()
        movies[movie.id] = movie
    pydev.info('load movie basic info over.')

    if ignore_tags:
        return movies

    # load tag meta-info.
    tag_info = {}
    for tagid, tag in pydev.foreach_row(file(path + '/genome-tags.csv'),
                                        seperator=','):
        if tagid == 'tagId':
            continue
        tag_info[tagid] = tag.strip()
    pydev.info('load tags info over.')

    # load genome tags info.
    tag_match_count = 0
    for movieid, tagid, score in pydev.foreach_row(file(path +
                                                        '/genome-scores.csv'),
                                                   seperator=','):
        try:
            key = int(movieid)
            if key not in movies:
                continue
            movies[key].tags.append(
                (int(tagid), tag_info.get(tagid, ''), float(score)))
            tag_match_count += 1
        except Exception, e:
            pydev.err(e)

    # the early ignore_tags branch above returns movies, so do the same here.
    return movies
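Every example on this page revolves around `pydev.foreach_row`, which iterates a stream and yields each line split into fields. For readers without the library, here is a minimal stand-in sketch; the `seperator` spelling, `min_fields_num`, and the `format` mini-language ('s' = string, 'i' = int) are inferred from the call sites on this page, not from pydev's actual source.

import sys

def foreach_row(stream, seperator='\t', min_fields_num=None, format=None):
    # inferred behavior: yield each line as a list of fields.
    for line in stream:
        fields = line.rstrip('\n').split(seperator)
        if min_fields_num is not None and len(fields) < min_fields_num:
            continue
        if format is not None:
            # assumed mini-language: 's' keeps a string, 'i' casts to int.
            casts = {'s': str, 'i': int}
            fields = [casts[f](v) for f, v in zip(format, fields)]
        yield fields

def info(msg):
    # pydev.info / pydev.err behave like simple stderr loggers here.
    sys.stderr.write('INFO: %s\n' % str(msg))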
Example #2
    def __init__(self, stream):
        self.__users = {}

        cur_uid = None
        cur_queue = []
        for row in pydev.foreach_row(stream, seperator=',', min_fields_num=4):
            uid, iid, rating, ts = row
            if uid == 'userId':
                continue
            
            uid = int(uid)
            iid = int(iid)
            rating = float(rating)
            # make it binary.
            score = 0
            if rating >= 4:
                score = 1
            ts = int(ts)

            if uid != cur_uid:
                # flush the previous user's actions, sorted by timestamp.
                if len(cur_queue) > 0:
                    cur_queue = sorted(cur_queue, key=lambda x: x[2])
                    self.__users[cur_uid] = cur_queue

                cur_uid = uid
                cur_queue = []

            cur_queue.append( (iid, score, ts) )

        # flush the last user (guard against an empty stream).
        if cur_uid is not None:
            cur_queue = sorted(cur_queue, key=lambda x: x[2])
            self.__users[cur_uid] = cur_queue
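A short usage sketch; the class name `UserReader` and the file path are assumptions, and the ratings file is the standard 4-column MovieLens `userId,movieId,rating,timestamp` CSV.

reader = UserReader(file('data/ml-20m/ratings.csv'))
# internally, each user now maps to a timestamp-sorted action list:
#   { 1: [(movie_id, binary_score, ts), ...], ... }
# where binary_score is 1 for ratings >= 4 and 0 otherwise,
# e.g. rating 4.5 -> 1, rating 3.5 -> 0.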
Example #3
    def __init__(self, stream):
        self.data = []
        for row in pydev.foreach_row(stream, seperator=',', min_fields_num=4):
            uid, iid, rating, ts = row
            if uid == 'userId':
                continue

            uid = int(uid)
            iid = int(iid)
            rating = float(rating)
            # make it binary.
            score = 0
            if rating >= 4:
                score = 1
            ts = int(ts)

            self.data.append((uid, iid, score))
Example #4
    def load(self, fd):
        self.__slot_index = {}
        slot_info = fd.readline().strip().split('\t')
        for slot in slot_info:
            self.__slot_index[slot] = IndexCoder()
        pydev.info('%d slot info loaded' % len(self.__slot_index))

        for slot, key, idx in pydev.foreach_row(fd):
            slot_index = self.__slot_index.get(slot, None)
            if slot_index is None:
                raise Exception('Cannot get slot : %s' % slot)

            if int(idx) != len(slot_index.tags):
                raise Exception('Index not match : %s:%s:%s' %
                                (slot, idx, key))

            slot_index.index[key] = len(slot_index.tags)
            slot_index.tags.append(key)
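The `load` method implies a simple two-part file format: a header line of tab-separated slot names, then one `slot<TAB>key<TAB>index` row per feature, where each slot's indexes must arrive dense and in order. A hand-written sample it would accept; the slot and key names are invented, and `loader` stands for whatever instance owns this method.

from StringIO import StringIO   # Python 2; on Python 3: from io import StringIO

sample = ('user_id\titem_id\n'
          'user_id\tu_1001\t0\n'
          'user_id\tu_1002\t1\n'
          'item_id\ti_42\t0\n')
# loader.load(StringIO(sample)) would create one IndexCoder per slot and
# check that each slot's indexes arrive dense and in order (0, 1, 2, ...).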
Example #5
    def read(self, filename):
        self.__X = []
        self.__Y = []
        first_row = True
        for row in pydev.foreach_row(
                file(filename), 
                seperator=self.__seperator):

            if first_row and self.__ignore_first_row:
                first_row = False
                continue

            if self.__expect_column_count < 0:
                self.__expect_column_count = len(row)
                if self.__target_column < 0:
                    self.__target_column = self.__expect_column_count - 1
                print >> sys.stderr, 'columns set to %d, target:%d' % (
                    self.__expect_column_count, self.__target_column)
            elif len(row) != self.__expect_column_count:
                continue

            row = map(lambda x: x.strip(), row)

            for cid, trans in self.__name_to_id_dict.iteritems():
                row[cid] = trans.read(row[cid])

            row[self.__target_column] = self.__target_trans.read(
                row[self.__target_column])
            y = row[self.__target_column]

            # keep every column except the target and the ignored ones.
            filter_row = map(
                lambda (rid, value): float(value),
                filter(
                    lambda (rid, value): (rid not in self.__ignore_columns
                                          and rid != self.__target_column),
                    enumerate(row)))

            x = numpy.array(filter_row)
            x = x.astype(numpy.float32)

            self.__X.append(x)
            self.__Y.append(y)

        #self.__target_trans.debug()
        print >> sys.stderr, 'Data load (%d records)' % len(self.__X)
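Note the defaulting logic above: the expected column count is locked in from the first data row, and when no target column was configured the last column becomes the label; later rows with a different width are silently skipped. A worked case, with an invented width:

# first data row: '0.5,1.2,red,1'  ->  expect_column_count = 4, target = 3
# a later row:    '0.7,0.9,blue'   ->  3 != 4 columns, row is dropped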
Example #6
    def slot_dnn(self):
        import train_slot_dnn
        autoarg = pydev.AutoArg()

        EmbeddingSize = int(autoarg.option('emb', 32))
        slotinfo_filename = autoarg.option('s')
        model_path = autoarg.option('m')

        # temporarily load slot_info from file.
        slot_info = []
        for slot, slot_feanum in pydev.foreach_row(file(slotinfo_filename),
                                                   format='si'):
            slot_info.append((slot, slot_feanum))

        model = train_slot_dnn.SlotDnnRank(slot_info,
                                           EmbeddingSize).to(self.device)
        model.load_state_dict(torch.load(model_path))

        self.test_ins_data(model, slot_info)
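The slot-info file read with format='si' is one `slot<TAB>feature_count` row per slot, the second field cast to int. A sample, with invented names and counts:

# slotinfo file contents:
#   user_id\t140000
#   item_id\t27000
# parsed into slot_info == [('user_id', 140000), ('item_id', 27000)]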
Example #7
def algor_cooc(train, valid, test, topN, only1=False):
    # use the dict built by build_cooc.py.
    fd = file('temp/cooc.txt')

    cooc_dict = {}
    for key, items in pydev.foreach_row(fd):
        items = map(lambda x: (x[0], int(x[1])),
                    map(lambda x: x.split(':'), items.split(',')))
        cooc_dict[key] = items
    print >> sys.stderr, 'cooc load over'

    def predict(uid, items):
        local_stat = {}
        readset = set(map(lambda x: x[0], items))

        for item, score, _ in items:
            if only1 and score != 1:
                continue
            cooc_items = cooc_dict.get(item, [])
            for c_item, c_count in cooc_items:
                if c_item in readset:
                    continue
                local_stat[c_item] = local_stat.get(c_item, 0) + c_count

        ans = map(lambda x: x[0],
                  sorted(local_stat.iteritems(), key=lambda x: -x[1])[:topN])
        '''
        print 'items:'
        print items
        print 'local:'
        print sorted(local_stat.iteritems(), key=lambda x:-x[1])[:20]
        print 'ans:'
        print ans
        '''

        return ans

    utils.measure(predict, test, debug=False)
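The `temp/cooc.txt` dictionary parsed at the top is one `item<TAB>neighbor:count,neighbor:count,...` row per item. A worked line and what `predict` does with it, with invented ids and counts:

# 'A\tB:10,C:3'  ->  cooc_dict['A'] == [('B', 10), ('C', 3)]
# predict() sums neighbor counts over the user's history (only score == 1
# items when only1 is set), drops already-read items, and keeps the topN
# candidates by accumulated count.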
Example #9
            self.index.add_item(int(key), vec)
            valid_count += 1

        pydev.info('emb load over, begin to build index..')
        self.index.build(32)
        pydev.info('EmbeddingDict load over: valid_count=%d, line_count=%d' %
                   (valid_count, line_count))


if __name__ == '__main__':
    filename = sys.argv[1]
    index = EmbeddingDict(filename, contain_key=False, metric='dot')

    # load movielens movie_info.
    movie_info = {}
    for row in pydev.foreach_row(file('data/ml-20m/movies.csv'),
                                 seperator=','):
        movie_id = row[0]
        genres = row[-1]
        title = ','.join(row[1:-1])
        movie_info[movie_id] = title + ' : ' + genres

    while True:
        sys.stdout.write('Query: ')
        query_id = sys.stdin.readline().strip()
        try:
            if query_id.startswith('d'):
                d, a, b = query_id.split(':')
                a = int(a)
                b = int(b)
                dist = index.index.get_distance(a, b)
                print 'distance of [%d] and [%d] : %.3f' % (a, b, dist)
Example #10
#! /bin/env python
# encoding=utf-8
# author: nickgu 
# 

import sys
import pydev
import utils

if __name__ == '__main__':
    output_file = file('temp/word2vec.input', 'w')
    for uid, items in pydev.foreach_row(file('data/train')):
        actions = []
        for item in items.split(','):
            vals = item.split(':')
            if vals[1] == '0':
                continue

            actions.append(vals[0])

        print >> output_file, ' '.join(actions)
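Here `data/train` holds one `uid<TAB>item:score,item:score,...` row per user; items whose score is '0' are skipped, and each user becomes one space-separated sentence for word2vec. A worked row, with invented ids:

# input row:  '42\t7:1,9:0,13:1'
# output row: '7 13'   (written to temp/word2vec.input)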
Example #11
    TestNum = -1
    EmbeddingSize = int(autoarg.option('emb', 32))
    EpochCount = int(autoarg.option('epoch', 4))
    BatchSize = int(autoarg.option('batch', 10000))
    device_name = autoarg.option('device', 'cuda')
    input_filename = autoarg.option('f')
    slotinfo_filename = autoarg.option('s')
    model_save_path = autoarg.option('o')

    device = torch.device(device_name)

    reader = easy.slot_file.SlotFileReader(input_filename)

    # temporarily load slot_info from file.
    slot_info = []
    for slot, slot_feanum in pydev.foreach_row(file(slotinfo_filename),
                                               format='si'):
        slot_info.append((slot, slot_feanum))

    model = SlotDnnRank(slot_info, EmbeddingSize).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    loss_fn = nn.BCELoss()

    tester = model_tester.ModelTester()

    def fwbp():
        labels, slots = reader.next(BatchSize)

        # make pytorch data.
        clicks = torch.Tensor(labels).to(device)
        dct = {}
        for item in slots:
Example #12
    def read(self, filename):
        self.__X = []
        self.__Y = []
        self.__info = []
        first_row = True

        fd = file(filename)
        progress = pydev.FileProgress(fd, filename)
        raw_X = []
        for row in pydev.foreach_row(fd, seperator=self.__seperator):
            progress.check_progress()

            # whether to ignore first row.
            if first_row and self.__ignore_first_row:
                first_row = False
                continue

            # check column count.
            if self.__expect_column_count < 0:
                self.__expect_column_count = len(row)
                if self.__target_column < 0:
                    self.__target_column = self.__expect_column_count - 1
                print >> sys.stderr, 'columns set to %d, target:%d' % (
                    self.__expect_column_count, self.__target_column)
            elif len(row) != self.__expect_column_count:
                continue

            # strip each column.
            row = map(lambda x: x.strip(), row)

            # get x dict.
            id_value = []
            v_size = 0
            ignored_info = []
            for rid, value in enumerate(row):
                # skip the target column.
                if rid == self.__target_column:
                    continue
                # skip ignored columns (kept aside as info).
                if rid in self.__ignore_columns:
                    ignored_info.append(value)
                    continue

                # dense and id-value-sparse
                if self.__row_mode == DataReader.DenseValue:
                    cid = rid
                elif self.__row_mode == DataReader.IVSparse:
                    cid, value = value.split(':')
                    cid = int(cid)

                if cid in self.__concrete_ids:
                    # one-hot representation for key.
                    # feature = id-value : 1
                    fid, value = self.__feature_trans.allocate_id(
                        '#%03d:%s' % (cid, value)), 1
                else:
                    # feature = id : value
                    fid, value = self.__feature_trans.allocate_id(
                        '#%03d' % (cid)), float(value)

                id_value.append((fid, value))
                if v_size < fid + 1:
                    v_size = fid + 1

            x = numpy.ndarray(shape=(v_size, ))
            x.fill(0)
            for fid, value in id_value:
                x[fid] = float(value)

            raw_X.append(x)

            # get Y
            if self.__concrete_target:
                row[self.__target_column] = self.__target_trans.allocate_id(
                    row[self.__target_column])
            y = row[self.__target_column]
            self.__Y.append(y)

            self.__info.append(self.__seperator.join(ignored_info))

        progress.end_progress()

        # resize for each X.
        x_size = self.__feature_trans.size()
        for x in raw_X:
            new_x = numpy.ndarray(shape=(x_size, ), dtype=numpy.float32)
            new_x.fill(0)
            new_x[:x.shape[0]] = x
            self.__X.append(new_x)

        # resize Y if concrete label.
        if self.__target_one_hot:
            raw_Y = self.__Y
            self.__Y = []
            y_size = self.__target_trans.size()
            for y in raw_Y:
                new_y = numpy.ndarray(shape=(y_size, ), dtype=numpy.float32)
                new_y.fill(0)
                new_y[int(y)] = 1.
                self.__Y.append(new_y)

        # transform X to numpy.ndarray
        self.__X = numpy.array(self.__X)

        # preprocessing.
        if self.__maxabs_scale:
            print >> sys.stderr, 'Do maxabs_scale'
            self.__X = preprocessing.maxabs_scale(self.__X)

        # make Y as ndarray
        self.__Y = numpy.array(self.__Y).astype(numpy.float32)

        #self.__feature_trans.debug()
        #self.__target_trans.debug()

        print >> sys.stderr, 'Data load [ %d(records) x %d(features) ]' % (len(
            self.__X), len(self.__X[0]))
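The two row modes above differ only in where a column id comes from: DenseValue takes the position from enumerate(row), while IVSparse expects each field to be an `id:value` pair. Columns listed in `__concrete_ids` are one-hot encoded; everything else becomes a single float feature. A sketch of the two input shapes, with invented separator and values:

# DenseValue row: '0.5,1.2,red,1'     -> cid = 0, 1, 2, ... by position
# IVSparse row:   '3:0.5,17:1.2'      -> cid, value = field.split(':')
# concrete column 2 with value 'red'  -> feature key '#002:red', value 1
# plain column 0 with value '0.5'     -> feature key '#000', value 0.5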
Example #13
def check_answer(answer_fn, squad_fn, output_fn):
    import torchtext
    import sys
    import tqdm

    tokenizer = nlp_utils.init_tokenizer()
    answer_fd = file(answer_fn)
    answers = []
    for row in pydev.foreach_row(answer_fd):
        if len(row) != 6:
            break
        pred, tgt, tag, pratio, py_, py = row
        pratio = float(pratio)
        py_ = float(py_)
        py = float(py)
        ps, pe = map(lambda x: int(x), pred.split(','))
        ts, te = map(lambda x: int(x), tgt.split(','))
        answers.append(((ps, pe), (ts, te), tag, pratio, py_, py))
    print >> sys.stderr, 'answer loaded.'

    reader = SquadReader(squad_fn)
    output = file(output_fn, 'w')
    idx = 0
    n_EM = 0
    n_SM = 0
    n_SM_B = 0
    n_SM_E = 0
    bar = tqdm.tqdm(reader.iter_instance())
    for title, context, qid, question, ans, is_impossible in bar:
        if is_impossible:
            continue
        if idx >= len(answers):
            break

        qtoks = tokenizer(question)
        ctoks = tokenizer(context)
        ans_y_, ans_y, tag, p_ratio, p_y_, p_y = answers[idx]

        print >> output, '\n## ID=%d ##\n%s' % (idx, '=' * 100)
        print >> output, '== Context =='
        print >> output, context.encode('utf8')
        print >> output, '== Context Tokens =='
        print >> output, (u','.join(ctoks)).encode('utf8')
        print >> output, '== Question =='
        print >> output, question.encode('utf8')
        print >> output, '== Question Tokens =='
        print >> output, (u','.join(qtoks)).encode('utf8')
        print >> output, '== Expected answer =='
        print >> output, 'rec: ' + u' '.join(
            ctoks[ans_y[0]:ans_y[1]]).encode('utf8')
        print >> output, '(%d, %d)' % (ans_y[0], ans_y[1])
        for a in ans:
            print >> output, '%s (%d)' % (a['text'].encode('utf8'),
                                          a['answer_start'])
        print >> output, '== Predict output =='
        print >> output, u' '.join(ctoks[ans_y_[0]:ans_y_[1]]).encode('utf8')
        print >> output, '(%d, %d)' % (ans_y_[0], ans_y_[1])

        # exact match: identical span, or text equal to any candidate answer.
        em = False
        if ans_y_ == ans_y:
            em = True
        adjust_answer = u''.join(ctoks[ans_y_[0]:ans_y_[1]]).replace(u' ', u'')
        for a in ans:
            aa = a['text'].replace(u' ', u'')
            if aa == adjust_answer:
                em = True
                break

        if em:
            print >> output, ' ## ExactMatch!'
            n_EM += 1
        elif ans_y_[0] == ans_y[0] or ans_y_[1] == ans_y[1]:
            print >> output, (' ## SideMatch! [%s]' % tag)
            n_SM += 1
            if tag == 'SM_B': n_SM_B += 1
            if tag == 'SM_E': n_SM_E += 1
        else:
            print >> output, ' ## Wrong!'
        print >> output, 'p_ratio=%.3f, p_y_=%.5f, p_y=%.5f' % (p_ratio, p_y_,
                                                                p_y)

        idx += 1
        bar.set_description(
            'EM=%.1f%%(%d), SM=%.1f%%, B=%.1f%%, E=%.1f%%, N=%d' %
            (n_EM * 100. / idx, n_EM, n_SM * 100. / idx, n_SM_B * 100. / idx,
             n_SM_E * 100. / idx, idx))
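The answer file consumed at the top is six tab-separated fields per row: the predicted span, the target span, a tag (SM_B / SM_E appear to mark begin- or end-side matches, judging by the counters), and three probability scores. A sample row and its parsed form, with invented numbers:

# '3,5\t3,6\tSM_B\t0.87\t0.91\t0.64'
#   -> ((3, 5), (3, 6), 'SM_B', 0.87, 0.91, 0.64) appended to answers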