def load_movies(path, ignore_tags=False):
    # Load basic movie info.
    movies = {}
    for line in file(path + '/movies.csv').readlines():
        # A line may contain more than 2 ',' (titles can contain commas).
        row = line.strip().split(',')
        movie_id = row[0]
        title = ','.join(row[1:-1])
        genres = row[-1]
        if movie_id == 'movieId':
            # Skip the header line.
            continue

        movie = MovieInfo()
        movie.id = int(movie_id)
        movie.title = title
        movie.genres = genres.split('|')
        movie.process()
        movies[movie.id] = movie
    pydev.info('load movie basic info over.')

    if ignore_tags:
        return movies

    # Load tag meta-info.
    tag_info = {}
    for tagid, tag in pydev.foreach_row(file(path + '/genome-tags.csv'), seperator=','):
        if tagid == 'tagId':
            continue
        tag_info[tagid] = tag.strip()
    pydev.info('load tags info over.')

    # Load genome tag scores.
    tag_match_count = 0
    for movieid, tagid, score in pydev.foreach_row(file(path + '/genome-scores.csv'), seperator=','):
        try:
            key = int(movieid)
            if key not in movies:
                continue
            movies[key].tags.append((int(tagid), tag_info.get(tagid, ''), float(score)))
            tag_match_count += 1
        except Exception, e:
            pydev.err(e)
    pydev.info('load genome scores over: %d tags matched.' % tag_match_count)
    return movies
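# Usage sketch (hedged): assumes the standard MovieLens-20M layout, i.e. a
# directory containing movies.csv, genome-tags.csv and genome-scores.csv;
# the 'data/ml-20m' path mirrors the one used elsewhere in this repo.
#
#   movies = load_movies('data/ml-20m')
#   m = movies[1]
#   print m.title, '|'.join(m.genres)
#   # top-5 genome tags by relevance score:
#   for tagid, tag, score in sorted(m.tags, key=lambda t: -t[2])[:5]:
#       print '%s (%.3f)' % (tag, score)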
def __init__(self, stream):
    self.__users = {}
    cur_uid = None
    cur_queue = []
    for row in pydev.foreach_row(stream, seperator=',', min_fields_num=4):
        uid, iid, rating, ts = row
        if uid == 'userId':
            # Skip the header line.
            continue

        uid = int(uid)
        iid = int(iid)
        rating = float(rating)
        # Make the rating binary: >=4 stars counts as a positive.
        score = 0
        if rating >= 4:
            score = 1
        ts = int(ts)

        if uid != cur_uid:
            # Flush the previous user's queue, sorted by timestamp.
            if len(cur_queue) > 0:
                cur_queue = sorted(cur_queue, key=lambda x: x[2])
                self.__users[cur_uid] = cur_queue
            cur_uid = uid
            cur_queue = []

        cur_queue.append((iid, score, ts))

    # Flush the last user.
    if len(cur_queue) > 0:
        cur_queue = sorted(cur_queue, key=lambda x: x[2])
        self.__users[cur_uid] = cur_queue
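# Input format sketch (hedged, inferred from the parsing above): the stream is
# a MovieLens-style ratings.csv with a header and rows of
# userId,movieId,rating,timestamp, with each user's rows contiguous, e.g.:
#
#   userId,movieId,rating,timestamp
#   1,2,3.5,1112486027
#   1,29,3.5,1112484676
#   2,3,4.0,974820889
#
# Ratings >= 4.0 become positives (score 1); everything else becomes 0.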
def __init__(self, stream):
    self.data = []
    for row in pydev.foreach_row(stream, seperator=',', min_fields_num=4):
        uid, iid, rating, ts = row
        if uid == 'userId':
            # Skip the header line.
            continue

        uid = int(uid)
        iid = int(iid)
        rating = float(rating)
        # Make the rating binary: >=4 stars counts as a positive.
        score = 0
        if rating >= 4:
            score = 1
        ts = int(ts)

        self.data.append((uid, iid, score))
def load(self, fd):
    self.__slot_index = {}
    slot_info = fd.readline().strip().split('\t')
    for slot in slot_info:
        self.__slot_index[slot] = IndexCoder()
    pydev.info('%d slot info loaded' % len(self.__slot_index))

    for slot, key, idx in pydev.foreach_row(fd):
        slot_index = self.__slot_index.get(slot, None)
        if slot_index is None:
            raise Exception('Cannot get slot : %s' % slot)
        if int(idx) != len(slot_index.tags):
            raise Exception('Index mismatch : %s:%s:%s' % (slot, idx, key))
        slot_index.index[key] = len(slot_index.tags)
        slot_index.tags.append(key)
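# File format sketch (hedged, inferred from load() above): the first line is a
# tab-separated list of slot names; every following row is <slot>\t<key>\t<idx>,
# and idx must equal the number of keys already seen for that slot, i.e. keys
# of one slot appear in index order, e.g.:
#
#   user_id	item_id
#   user_id	u_10001	0
#   user_id	u_10002	1
#   item_id	i_20001	0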
def read(self, filename):
    self.__X = []
    self.__Y = []
    first_row = True
    for row in pydev.foreach_row(file(filename), seperator=self.__seperator):
        if first_row and self.__ignore_first_row:
            first_row = False
            continue

        if self.__expect_column_count < 0:
            self.__expect_column_count = len(row)
            if self.__target_column < 0:
                self.__target_column = self.__expect_column_count - 1
            print >> sys.stderr, 'columns set to %d, target:%d' % (
                self.__expect_column_count, self.__target_column)
        elif len(row) != self.__expect_column_count:
            continue

        row = map(lambda x: x.strip(), row)
        for cid, trans in self.__name_to_id_dict.iteritems():
            row[cid] = trans.read(row[cid])
        row[self.__target_column] = self.__target_trans.read(row[self.__target_column])

        y = row[self.__target_column]
        filter_row = map(
            lambda (rid, value): float(value),
            filter(
                lambda (rid, value): rid not in self.__ignore_columns and rid != self.__target_column,
                enumerate(row)))
        x = numpy.array(filter_row)
        x = x.astype(numpy.float32)

        self.__X.append(x)
        self.__Y.append(y)

    print >> sys.stderr, 'Data load (%d records)' % len(self.__X)
def slot_dnn(self):
    import train_slot_dnn

    autoarg = pydev.AutoArg()
    EmbeddingSize = int(autoarg.option('emb', 32))
    slotinfo_filename = autoarg.option('s')
    model_path = autoarg.option('m')

    # Temp: read slot_info (slot name, feature count) from file.
    slot_info = []
    for slot, slot_feanum in pydev.foreach_row(file(slotinfo_filename), format='si'):
        slot_info.append((slot, slot_feanum))

    model = train_slot_dnn.SlotDnnRank(slot_info, EmbeddingSize).to(self.device)
    model.load_state_dict(torch.load(model_path))
    self.test_ins_data(model, slot_info)
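# Slot-info file sketch (hedged): format='si' parses one string and one int per
# row, so the file presumably looks like
#
#   user_id	120000
#   item_id	45000
#
# giving each slot's name and feature count, used to size the embedding tables.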
def algor_cooc(train, valid, test, topN, only1=False):
    # Use the co-occurrence dict built by build_cooc.py.
    fd = file('temp/cooc.txt')
    cooc_dict = {}
    for key, items in pydev.foreach_row(fd):
        items = map(lambda x: (x[0], int(x[1])),
                    map(lambda x: x.split(':'), items.split(',')))
        cooc_dict[key] = items
    print >> sys.stderr, 'cooc load over'

    def predict(uid, items):
        local_stat = {}
        readset = set(map(lambda x: x[0], items))
        for item, score, _ in items:
            if only1 and score != 1:
                continue
            cooc_items = cooc_dict.get(item, [])
            for c_item, c_count in cooc_items:
                if c_item in readset:
                    continue
                local_stat[c_item] = local_stat.get(c_item, 0) + c_count

        ans = map(lambda x: x[0],
                  sorted(local_stat.iteritems(), key=lambda x: -x[1])[:topN])
        return ans

    utils.measure(predict, test, debug=False)
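# temp/cooc.txt format sketch (hedged, inferred from the loader above): each
# line maps an item to a comma-separated list of item:count pairs, e.g.
#
#   item_42	item_7:120,item_19:85,item_3:41
#
# predict() then sums co-occurrence counts of candidates over the user's
# (optionally positive-only) history and returns the topN unseen items.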
def read(self, filename):
    self.__X = []
    self.__Y = []
    self.__info = []
    first_row = True
    fd = file(filename)
    progress = pydev.FileProgress(fd, filename)
    raw_X = []
    for row in pydev.foreach_row(fd, seperator=self.__seperator):
        progress.check_progress()

        # Whether to ignore the first row.
        if first_row and self.__ignore_first_row:
            first_row = False
            continue

        # Check the column count.
        if self.__expect_column_count < 0:
            self.__expect_column_count = len(row)
            if self.__target_column < 0:
                self.__target_column = self.__expect_column_count - 1
            print >> sys.stderr, 'columns set to %d, target:%d' % (
                self.__expect_column_count, self.__target_column)
        elif len(row) != self.__expect_column_count:
            continue

        # Strip each column.
        row = map(lambda x: x.strip(), row)

        # Build the sparse (id, value) list for x.
        id_value = []
        v_size = 0
        ignored_info = []
        for rid, value in enumerate(row):
            # Skip the target column.
            if rid == self.__target_column:
                continue
            # Skip filtered columns (but keep their raw text as info).
            if rid in self.__ignore_columns:
                ignored_info.append(value)
                continue

            # Dense vs. id-value-sparse row modes.
            if self.__row_mode == DataReader.DenseValue:
                cid = rid
            elif self.__row_mode == DataReader.IVSparse:
                cid, value = value.split(':')
                cid = int(cid)

            if cid in self.__concrete_ids:
                # One-hot representation for the key: feature = id-value : 1
                fid, value = self.__feature_trans.allocate_id('#%03d:%s' % (cid, value)), 1
            else:
                # feature = id : value
                fid, value = self.__feature_trans.allocate_id('#%03d' % cid), float(value)

            id_value.append((fid, value))
            if v_size < fid + 1:
                v_size = fid + 1

        x = numpy.ndarray(shape=(v_size,))
        x.fill(0)
        for fid, value in id_value:
            x[fid] = float(value)
        raw_X.append(x)

        # Get Y.
        if self.__concrete_target:
            row[self.__target_column] = self.__target_trans.allocate_id(row[self.__target_column])
        y = row[self.__target_column]
        self.__Y.append(y)

        self.__info.append(self.__seperator.join(ignored_info))

    progress.end_progress()

    # Resize each X to the final feature count.
    x_size = self.__feature_trans.size()
    for x in raw_X:
        new_x = numpy.ndarray(shape=(x_size,), dtype=numpy.float32)
        new_x.fill(0)
        new_x[:x.shape[0]] = x
        self.__X.append(new_x)

    # Resize Y to one-hot if the label is concrete.
    if self.__target_one_hot:
        raw_Y = self.__Y
        self.__Y = []
        y_size = self.__target_trans.size()
        for y in raw_Y:
            new_y = numpy.ndarray(shape=(y_size,), dtype=numpy.float32)
            new_y.fill(0)
            new_y[int(y)] = 1.
            self.__Y.append(new_y)

    # Transform X to a numpy.ndarray.
    self.__X = numpy.array(self.__X)

    # Preprocessing.
    if self.__maxabs_scale:
        print >> sys.stderr, 'Do maxabs_scale'
        self.__X = preprocessing.maxabs_scale(self.__X)

    # Make Y an ndarray.
    self.__Y = numpy.array(self.__Y).astype(numpy.float32)

    print >> sys.stderr, 'Data load [ %d(records) x %d(features) ]' % (
        len(self.__X), len(self.__X[0]))
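# Row format sketch (hedged): in DataReader.IVSparse mode each non-target
# column is an 'id:value' pair; in DenseValue mode the column position is the
# id. Columns listed in __concrete_ids are one-hot encoded through the feature
# coder ('#%03d:%s' -> 1); all others stay numeric ('#%03d' -> float(value)).
# For example, with 7 in __concrete_ids and the last column as the label, a
# sparse row
#
#   3:0.5	7:red	1
#
# sets x[id('#003')] = 0.5 and x[id('#007:red')] = 1.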
            self.index.add_item(int(key), vec)
            valid_count += 1

        pydev.info('emb load over, begin to build index..')
        self.index.build(32)
        pydev.info('EmbeddingDict load over: valid_count=%d, line_count=%d' % (
            valid_count, line_count))


if __name__ == '__main__':
    filename = sys.argv[1]
    index = EmbeddingDict(filename, contain_key=False, metric='dot')

    # Load movielens movie_info.
    movie_info = {}
    for row in pydev.foreach_row(file('data/ml-20m/movies.csv'), seperator=','):
        movie_id = row[0]
        genres = row[-1]
        title = ','.join(row[1:-1])
        movie_info[movie_id] = title + ' : ' + genres

    while True:
        sys.stdout.write('Query: ')
        query_id = sys.stdin.readline().strip()
        try:
            if query_id.startswith('d'):
                # 'd:<a>:<b>' queries the distance between two item ids.
                d, a, b = query_id.split(':')
                a = int(a)
                b = int(b)
                dist = index.index.get_distance(a, b)
                print 'distance of [%d] and [%d] : %.3f' % (a, b, dist)
#! /bin/env python
# encoding=utf-8
# author: nickgu
#

import sys

import pydev
import utils

if __name__ == '__main__':
    output_file = file('temp/word2vec.input', 'w')
    for uid, items in pydev.foreach_row(file('data/train')):
        actions = []
        for item in items.split(','):
            vals = item.split(':')
            # Keep only positive (clicked) items.
            if vals[1] == '0':
                continue
            actions.append(vals[0])
        print >> output_file, ' '.join(actions)
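# data/train format sketch (hedged, inferred from the loop above): each line is
# <uid>\t<item:click,...>; items with click '0' are dropped, so every output
# line is one user's space-joined positive-item sequence, ready to be fed to
# word2vec as a "sentence", e.g.
#
#   input : u1	i3:1,i9:0,i5:1
#   output: i3 i5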
TestNum = -1
EmbeddingSize = int(autoarg.option('emb', 32))
EpochCount = int(autoarg.option('epoch', 4))
BatchSize = int(autoarg.option('batch', 10000))
device_name = autoarg.option('device', 'cuda')
input_filename = autoarg.option('f')
slotinfo_filename = autoarg.option('s')
model_save_path = autoarg.option('o')

device = torch.device(device_name)
reader = easy.slot_file.SlotFileReader(input_filename)

# Temp: read slot_info (slot name, feature count) from file.
slot_info = []
for slot, slot_feanum in pydev.foreach_row(file(slotinfo_filename), format='si'):
    slot_info.append((slot, slot_feanum))

model = SlotDnnRank(slot_info, EmbeddingSize).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.BCELoss()

tester = model_tester.ModelTester()

def fwbp():
    labels, slots = reader.next(BatchSize)

    # Make pytorch data.
    clicks = torch.Tensor(labels).to(device)
    dct = {}
    for item in slots:
def check_answer(answer_fn, squad_fn, output_fn):
    import torchtext
    import sys
    import tqdm

    tokenizer = nlp_utils.init_tokenizer()

    answers = []
    for row in pydev.foreach_row(file(answer_fn)):
        if len(row) != 6:
            break
        pred, tgt, tag, pratio, py_, py = row
        pratio = float(pratio)
        py_ = float(py_)
        py = float(py)
        ps, pe = map(lambda x: int(x), pred.split(','))
        ts, te = map(lambda x: int(x), tgt.split(','))
        answers.append(((ps, pe), (ts, te), tag, pratio, py_, py))
    print >> sys.stderr, 'answer loaded.'

    reader = SquadReader(squad_fn)
    output = file(output_fn, 'w')

    idx = 0
    n_EM = 0
    n_SM = 0
    n_SM_B = 0
    n_SM_E = 0
    bar = tqdm.tqdm(reader.iter_instance())
    for title, context, qid, question, ans, is_impossible in bar:
        if is_impossible:
            continue
        if idx >= len(answers):
            break

        qtoks = tokenizer(question)
        ctoks = tokenizer(context)
        ans_y_, ans_y, tag, p_ratio, p_y_, p_y = answers[idx]

        print >> output, '\n## ID=%d ##\n%s' % (idx, '=' * 100)
        print >> output, '== Context =='
        print >> output, context.encode('utf8')
        print >> output, '== Context Tokens =='
        print >> output, (u','.join(ctoks)).encode('utf8')
        print >> output, '== Question =='
        print >> output, question.encode('utf8')
        print >> output, '== Question Tokens =='
        print >> output, (u','.join(qtoks)).encode('utf8')
        print >> output, '== Expected answer =='
        print >> output, 'rec: ' + u' '.join(ctoks[ans_y[0]:ans_y[1]]).encode('utf8')
        print >> output, '(%d, %d)' % (ans_y[0], ans_y[1])
        for a in ans:
            print >> output, '%s (%d)' % (a['text'].encode('utf8'), a['answer_start'])
        print >> output, '== Predict output =='
        print >> output, u' '.join(ctoks[ans_y_[0]:ans_y_[1]]).encode('utf8')
        print >> output, '(%d, %d)' % (ans_y_[0], ans_y_[1])

        # Exact span match, or text match against any candidate answer.
        em = False
        if ans_y_ == ans_y:
            em = True
        adjust_answer = u''.join(ctoks[ans_y_[0]:ans_y_[1]]).replace(u' ', u'')
        for a in ans:
            aa = a['text'].replace(u' ', u'')
            if aa == adjust_answer:
                em = True
                break

        if em:
            print >> output, ' ## ExactMatch!'
            n_EM += 1
        elif ans_y_[0] == ans_y[0] or ans_y_[1] == ans_y[1]:
            print >> output, ' ## SideMatch! [%s]' % tag
            n_SM += 1
            if tag == 'SM_B':
                n_SM_B += 1
            if tag == 'SM_E':
                n_SM_E += 1
        else:
            print >> output, ' ## Wrong!'
        print >> output, 'p_ratio=%.3f, p_y_=%.5f, p_y=%.5f' % (p_ratio, p_y_, p_y)

        idx += 1
        bar.set_description('EM=%.1f%%(%d), SM=%.1f%%, B=%.1f%%, E=%.1f%%, N=%d' % (
            n_EM * 100. / idx, n_EM,
            n_SM * 100. / idx,
            n_SM_B * 100. / idx,
            n_SM_E * 100. / idx,
            idx))
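# Answer-file sketch (hedged, from the loader above): each row carries exactly
# 6 fields: pred span ('start,end'), target span ('start,end'), tag
# (e.g. 'SM_B' / 'SM_E'), p_ratio, p_y_, p_y.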