def load():
    """Load the user-results and user-info mappings from their pickle files.

    Reads the files named by the module-level ``g_userResF`` and
    ``g_userInfoF`` via ``util.loadPickle``.

    Returns:
        tuple: ``(user_results, user_info)``; either element falls back to
        ``{}`` when its pickle file is missing or unreadable (loadPickle
        signals failure by returning None).
    """
    # The original declared `global` for these names, but nothing is
    # assigned here, so the declaration was unnecessary for read access.
    ret = util.loadPickle(g_userResF)
    if ret is None:  # identity check, not `== None` (PEP 8)
        return {}, {}
    ret2 = util.loadPickle(g_userInfoF)
    if ret2 is None:
        return ret, {}
    return ret, ret2
def load():
    """Load the user-results mapping and the done-list from pickle files.

    Reads the files named by the module-level ``g_userResF`` and
    ``g_userDoneF`` via ``util.loadPickle``.

    Returns:
        tuple: ``(user_results, done_list)``; falls back to ``{}`` / ``[]``
        when the corresponding pickle file is missing or unreadable
        (loadPickle signals failure by returning None).
    """
    # The original declared `global` for these names, but nothing is
    # assigned here, so the declaration was unnecessary for read access.
    ret = util.loadPickle(g_userResF)
    if ret is None:  # identity check, not `== None` (PEP 8)
        return {}, []
    ret2 = util.loadPickle(g_userDoneF)
    if ret2 is None:
        return ret, []
    return ret, ret2
def loadData(self):
    """Load both pickled datasets, flatten them into 2-D arrays of
    ``self.shape[0]`` rows, and record the usable common length.

    Side effects: sets ``self.data_A``, ``self.data_B`` and
    ``self.data_len``.
    """
    # Each pickle holds a sequence of arrays in its first element;
    # flatten every piece and concatenate into one long vector.
    raw_a = util.loadPickle(self.dataset_A)[0]
    raw_b = util.loadPickle(self.dataset_B)[0]
    flat_a = np.concatenate([piece.flatten() for piece in raw_a])
    flat_b = np.concatenate([piece.flatten() for piece in raw_b])
    # Reshape to (rows, -1) so both datasets share the same row count.
    self.data_A = np.reshape(flat_a, (self.shape[0], -1))
    self.data_B = np.reshape(flat_b, (self.shape[0], -1))
    shorter = min(self.data_A.shape[1], self.data_B.shape[1])
    # NOTE(review): subtracting `len // shape[1]` looks odd — truncating to
    # a multiple of shape[1] would be `len - len % shape[1]`. Behavior kept
    # as-is; confirm the intent with the author.
    self.data_len = shorter - (shorter // self.shape[1])
def loadRules(DIR, FILENAME, ruleset, method, ITER):
    """Load pickled rules for one side or both.

    Args:
        DIR, FILENAME, ruleset, ITER: path components; the rule file lives
            at ``DIR/FILENAME/ruleset/ITER/rules_<side>.pkl``.
        method: ``'A'`` or ``'B'`` to load a single side, or ``'both'`` to
            return the concatenation ``rules_A + rules_B``.

    Returns:
        The loaded rules (a list when 'both' concatenates two lists).

    Raises:
        ValueError: if ``method`` is not one of 'A', 'B', 'both'.
    """
    # Hoist the duplicated path prefix out of the branches.
    base = DIR + '/' + FILENAME + '/' + ruleset + '/' + ITER
    if method == 'A' or method == 'B':
        rules = loadPickle(base + '/rules_' + method + '.pkl')
    elif method == 'both':
        rules_A = loadPickle(base + '/rules_A.pkl')
        rules_B = loadPickle(base + '/rules_B.pkl')
        rules = rules_A + rules_B
    else:
        # BUG FIX: the original printed "no method" and then fell through to
        # `return rules`, raising NameError on the undefined name. Fail
        # loudly with a meaningful exception instead.
        raise ValueError("no method: %r" % (method,))
    return rules
def sample_save(self, epoch, batch_i):
    """Run one fixed sample (suzuki/a01.wav) through the A->B generator and
    pickle the raw prediction for inspection.

    Args:
        epoch: current training epoch, used only in the output filename.
        batch_i: current batch index, used only in the output filename.
    """
    # Unpack cached normalization statistics for speaker A.
    _, coded_sps_mean_A, coded_sps_std_A, coded_sps_max_A, _, _ = util.loadPickle('./cache36_suzuki.pkl')
    wave = util.loadWave(f'./datasets/suzuki/a01.wav')
    pwav = util.wavePadding(wave)
    # WORLD vocoder analysis: f0 contour, spectral envelope, aperiodicity.
    f0, sp, ap = util.worldDecompose(pwav)
    coded_sp = util.worldEncodeSpectralEnvelop(sp)
    coded_sp_t = coded_sp.T
    # NOTE(review): normalizes by the max although a std is also unpacked
    # (coded_sps_std_A is unused) — confirm this is intentional.
    coded_sp_norm = (coded_sp_t - coded_sps_mean_A) / coded_sps_max_A
    # Keep the first 6 windows of 128 frames and batch them as (6, 36, 128).
    coded_sp_norm = coded_sp_norm[:,:128*6]
    coded_sp_norm = coded_sp_norm.reshape(6, 36, 128)
    dist = self.g_AB.predict(coded_sp_norm)
    # Stitch the 6 predicted windows back into one (36, 768) array.
    dist = dist.reshape((36, 128*6))
    util.savePickle(f'./predict/log_a01_{epoch}_{batch_i}.pkl', dist)
def countIdf(cWcDict):
    """Compute base-2 IDF for every word seen across categories.

    Args:
        cWcDict: mapping of category -> {word: count}.

    Returns:
        dict: word -> log2(D / df), where D is the number of categories and
        df is the number of categories whose word-count dict contains the
        word.
    """
    D = len(cWcDict)
    # Single pass to build document frequencies. The original rescanned
    # every category for each word (O(words x categories^2)); counting each
    # word once per category it appears in is equivalent and O(total words).
    df = {}
    for c in cWcDict:
        for w in cWcDict[c]:
            df[w] = df.get(w, 0) + 1
    return {w: math.log(float(D) / cnt, 2) for w, cnt in df.items()}


if __name__ == '__main__':
    # Load per-user word-count pickles.
    wbWc = util.loadPickle('wbWc.pickle')
    infoWc = util.loadPickle('infoWc.pickle')
    cList = loadCirecle('report/circles', 10)
    cInfoWcDict = {}
    cWbWcDict = {}
    # Aggregate per-user counts into per-circle counts, keyed by the
    # space-joined member list.
    for c in cList:
        cKey = " ".join(c)
        cInfoWcDict[cKey] = {}
        cWbWcDict[cKey] = {}
        for u in c:
            addDict(cInfoWcDict[cKey], infoWc[u])
            addDict(cWbWcDict[cKey], wbWc[u])
    anaCircle(cInfoWcDict, cWbWcDict)
def loadRules(DIR, FILENAME, ruleset, method, ITER):
    """Load both rule pickles (A and B) for one run.

    The files live at ``DIR/FILENAME/ruleset/ITER/rules_{A,B}.pkl``.
    Note: ``method`` is accepted for signature compatibility with the
    single-side variant but is not used here.

    Returns:
        tuple: ``(rules_A, rules_B)`` as loaded by ``loadPickle``.
    """
    prefix = DIR + '/' + FILENAME + '/' + ruleset + '/' + ITER + '/rules_'
    return (loadPickle(prefix + 'A.pkl'), loadPickle(prefix + 'B.pkl'))
def load():
    """Load the user-results mapping from its pickle file.

    Reads the file named by the module-level ``g_userResF``.

    Returns:
        dict: the stored results, or ``{}`` when the pickle file is missing
        or unreadable (loadPickle signals failure by returning None).
    """
    # The original declared `global g_userRes, g_userResF`, but nothing is
    # assigned here, so the declaration was unnecessary for read access.
    ret = util.loadPickle(g_userResF)
    if ret is None:  # identity check, not `== None` (PEP 8)
        return {}
    return ret
# Python 2 script: builds a CSR (compressed sparse row) representation of
# the term-document matrix from a scrolled search index.
import util
import numpy as np
from scipy.sparse import csr_matrix


def termVectorFromCSR(row_offsets, indices, data):
    """Yield one (doc_id, term_indices, values) triple per CSR row.

    Args:
        row_offsets: CSR row-offset array; row i spans [offsets[i], offsets[i+1]).
        indices: CSR column-index array (term ids).
        data: CSR value array, parallel to `indices`.
    """
    # Pair each offset with its successor to get (start, end) per row.
    offsets = zip(row_offsets[::], row_offsets[1::])
    doc_id = 0
    for (start, end) in offsets:
        yield doc_id, indices[start:end], data[start:end]
        doc_id += 1


# Vocabulary and document-id mappings produced by an earlier indexing step.
term_map = util.loadPickle("vocab.pkl")
doc_map = util.loadPickle("doc_index_map.pkl")
no_of_terms = len(term_map)
no_of_docs = len(doc_map)

# Creating a Compressed Row Sparse Format of the Term-Document Matrix
ROW_OFFSETS = [0]
COLUMN_INDICES = []
VALUES = []
for doc, vector in util.scrollIndex():
    prev_offset = ROW_OFFSETS[-1]
    ROW_OFFSETS.append(prev_offset + len(vector))
    # NOTE(review): side-effecting list comprehension — appends the term's
    # column index and its 'tf' value for every term in this doc's vector.
    [(COLUMN_INDICES.append(term_map[term]), VALUES.append(count['tf'])) for (term, count) in vector.iteritems()]
print ROW_OFFSETS
print COLUMN_INDICES
def loadM(fName):
    """Thin wrapper: unpickle and return the object stored at *fName*."""
    return util.loadPickle(fName)
def main(_):
    """Ensemble-evaluation entry point for SQuAD dev predictions.

    Pipeline: process the dev json into token-id files, load the padded
    test set, load cached per-model start/end probability pickles
    (optionally re-running one model to refresh them), combine the model
    probabilities per question, pick the max-probability span, and write
    the answers to dev-prediction.json.
    """
    vocab, rev_vocab = initialize_vocab(FLAGS.vocab_path)
    embed_path = FLAGS.embed_path or pjoin("data", "squad", "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    global_train_dir = '/tmp/cs224n-squad-train'
    # Adds symlink to {train_dir} from /tmp/cs224n-squad-train to canonicalize the
    # file paths saved in the checkpoint. This allows the model to be reloaded even
    # if the location of the checkpoint files has moved, allowing usage with CodaLab.
    # This must be done on both train.py and qa_answer.py in order to work.
    if not os.path.exists(FLAGS.train_dir):
        os.makedirs(FLAGS.train_dir)
    if os.path.exists(global_train_dir):
        os.unlink(global_train_dir)
    #os.symlink(os.path.abspath(FLAGS.train_dir), global_train_dir)
    train_dir = global_train_dir
    if not os.path.exists(FLAGS.log_dir):
        os.makedirs(FLAGS.log_dir)
    file_handler = logging.FileHandler(pjoin(FLAGS.log_dir, "log.txt"))
    logging.getLogger().addHandler(file_handler)
    print(vars(FLAGS))
    with open(os.path.join(FLAGS.log_dir, "flags.json"), 'w') as fout:
        json.dump(FLAGS.__flags, fout)

    # ========= Download Dataset json =========
    # You can change this code to load dataset in your own way
    #dev_dirname = os.path.dirname(os.path.abspath(FLAGS.dev_path))
    #dev_filename = os.path.basename(FLAGS.dev_path)
    #_, _, _ = prepare_dev(dev_dirname, dev_filename, vocab)

    # ========= Process input json =========
    # for codalab
    prefix = os.path.join("data", "squad")
    # writes dev.answer, dev.context, dev.question, dev.span
    dev_path = FLAGS.dev_path
    dev_filename = FLAGS.dev_path.split("/")[-1]
    if FLAGS.download:
        dev_data = data_from_json(os.path.join(prefix, dev_filename))
    else:
        dev_data = data_from_json(dev_filename)
    dev_num_questions, dev_num_answers = read_write_dataset(dev_data, 'dev', prefix="")
    print("Processed {} questions and {} answers in dev".format(dev_num_questions, dev_num_answers))
    # writes dev.ids.context, dev.ids.question
    vocab_path = pjoin(os.path.join("data", "squad"), "vocab.dat")
    # NOTE(review): deposit path resolves to "./dev" (cwd), not data/squad.
    dev_deposit_path = pjoin(os.path.join("", ""), "dev") #pjoin(os.path.join("data", "squad"), "dev")
    x_dis_path = dev_deposit_path + ".ids.context"
    y_ids_path = dev_deposit_path + ".ids.question"
    data_to_token_ids(dev_deposit_path + ".context", x_dis_path, vocab_path)
    data_to_token_ids(dev_deposit_path + ".question", y_ids_path, vocab_path)

    # load data sets
    #Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data(os.path.join("data", "squad"), "dev") # for our purposes this is as test set.
    Q_test, P_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, Q_len_test, P_len_test = load_data_home(dev_deposit_path) # for our purposes this is as test set.
    # One uuid per line; newlines are stripped later at use sites.
    question_uuid_data = []
    with open(dev_deposit_path + ".quid") as f:
        for line in f:
            question_uuid_data.append((line))

    # pad the data at load-time. So, we don't need to do any masking later!!!
    # ref: https://keras.io/preprocessing/sequence/
    # if len < maxlen, pad with specified val
    # elif len > maxlen, truncate
    QMAXLEN = FLAGS.QMAXLEN
    PMAXLEN = FLAGS.PMAXLEN
    Q_test = pad_sequences(Q_test, maxlen=QMAXLEN, value=PAD_ID, padding='post')
    P_test = pad_sequences(P_test, maxlen=PMAXLEN, value=PAD_ID, padding='post')
    A_start_test = pad_sequences(A_start_test, maxlen=PMAXLEN, value=0, padding='post')
    A_end_test = pad_sequences(A_end_test, maxlen=PMAXLEN, value=0, padding='post')
    test_data = zip(P_test, Q_test, P_len_test, Q_len_test, A_start_test, A_end_test, A_len_test, P_raw_test, A_raw_test, question_uuid_data)

    # ========= Model-specific =========
    # You must change the following code to adjust to your model
    # NOTE(review): the triple-quoted block below is dead (disabled) code
    # that originally ran the full model ensemble and cached its
    # predictions to the pickles loaded right after it; kept verbatim.
    """models = [ 'MPCM', 'COATT', 'COATT_fixed', 'COATT_mix','COATT_fixed_mix', 'COATT_fixed_200_mix'] # 'COATT_fixed_200', leave out to save time
    predictions_start = {}; predictions_end = {}
    with open("preds_dev.txt", "a") as f:
        f.write("model" + "," + "pred_raw" + "," + "a_raw")
        for model in models:
            FLAGS.model_type = model
            FLAGS.train_dir = "train/ensemble_train_" + model
            train_dir = "train/ensemble_train_" + model
            # define sizes etc. for different models.
            if model == 'COATT_fixed_200' or model == 'COATT_fixed_200_mix' :
                FLAGS.embedding_size = 200
                FLAGS.lstm_units = 200
            elif model == "MPCM_p100":
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 100
            else:
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 50
            with tf.Graph().as_default():
                with tf.Session() as sess:
                    embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' + str(FLAGS.embedding_size) + '.npz')
                    pretrained_embeddings = embeddings['glove']
                    qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))
                    initialize_model(sess, qa, train_dir)
                    # get predicted start-end indices
                    a_s_l = []
                    a_e_l = []
                    f1 = exact_match = total = 0; answers = {}; prob_start = {}; prob_end = {}; p_raw_mapping= {}
                    prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
                    for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle = False)):
                        batch_test = batch[:4]
                        (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                        a_s = (np.argmax(ys, axis=1))
                        a_e = (np.argmax(ye, axis=1))
                        a_s_l = a_s_l + list(a_s)
                        a_e_l = a_e_l + list(a_e)
                        print(len(a_s))
                        for j in range(len(a_s)):
                            p_raw = batch[7][j]
                            a_raw = batch[8][j]
                            s = a_s[j]
                            e = a_e[j]
                            pred_raw = ' '.join(p_raw.split()[s:e + 1])
                            p_raw_mapping[batch[9][j].strip("\n")] = p_raw
                            #answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                            prob_start[batch[9][j].strip("\n")] = ys[j]
                            prob_end[batch[9][j].strip("\n")] = ye[j]
                            f.write(model + "," + pred_raw + "," + a_raw )
                        prog.update(i + 1, [("processed", i + 1)])
            predictions_start[model] = prob_start
            predictions_end[model] = prob_end
        f.close()
    # save
    dropPickle(predictions_start, "preds_start.pkl")
    dropPickle(predictions_end, "preds_end.pkl")
    dropPickle(p_raw_mapping, "p_raw_mapping.pkl")"""
    # Load the cached ensemble predictions produced by the disabled block.
    predictions_start = loadPickle("preds_start.pkl")
    predictions_end = loadPickle("preds_end.pkl")
    p_raw_mapping = loadPickle("p_raw_mapping.pkl")
    # Re-run only this model and overwrite its cached entry.
    models = ['COATT_fixed_200']
    #predictions_start = {}; predictions_end = {}
    with open("preds_dev.txt", "a") as f:
        f.write("model" + "," + "pred_raw" + "," + "a_raw")
        for model in models:
            FLAGS.model_type = model
            FLAGS.train_dir = "train/ensemble_train_" + model
            train_dir = "train/ensemble_train_" + model
            # Per-model hyperparameter overrides (mutates global FLAGS).
            if model == 'COATT_fixed_200' or model == 'COATT_fixed_200_mix' :
                FLAGS.embedding_size = 200
                FLAGS.lstm_units = 200
            elif model == "MPCM_p100":
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 100
            else:
                FLAGS.embedding_size = 100
                FLAGS.lstm_units = 100
                FLAGS.perspective_units = 50
            # Fresh graph/session per model so variables don't collide.
            with tf.Graph().as_default():
                with tf.Session() as sess:
                    embeddings = np.load(FLAGS.data_dir + '/glove.trimmed.' + str(FLAGS.embedding_size) + '.npz')
                    pretrained_embeddings = embeddings['glove']
                    qa = QASystem(FLAGS, pretrained_embeddings, vocab_dim=len(vocab.keys()))
                    initialize_model(sess, qa, train_dir)
                    # get predicted start-end indices
                    a_s_l = []
                    a_e_l = []
                    f1 = exact_match = total = 0; answers = {}; prob_start = {}; prob_end = {}; p_raw_mapping= {}
                    prog = Progbar(target=1 + int(len(test_data) / FLAGS.batch_size))
                    for i, batch in enumerate(minibatches(test_data, FLAGS.batch_size, shuffle = False)):
                        batch_test = batch[:4]
                        (ys, ye) = qa.predict_on_batch(sess, *batch_test)
                        a_s = (np.argmax(ys, axis=1))
                        a_e = (np.argmax(ye, axis=1))
                        a_s_l = a_s_l + list(a_s)
                        a_e_l = a_e_l + list(a_e)
                        print(len(a_s))
                        for j in range(len(a_s)):
                            p_raw = batch[7][j]
                            a_raw = batch[8][j]
                            s = a_s[j]
                            e = a_e[j]
                            print(s,e)# comment this out
                            pred_raw = ' '.join(p_raw.split()[s:e + 1])
                            # Key everything by the question uuid (newline-stripped).
                            p_raw_mapping[batch[9][j].strip("\n")] = p_raw
                            #answers[batch[9][j].strip("\n")] = pred_raw.strip("\n")
                            prob_start[batch[9][j].strip("\n")] = ys[j]
                            prob_end[batch[9][j].strip("\n")] = ye[j]
                            f.write(model + "," + pred_raw + "," + a_raw )
                        prog.update(i + 1, [("processed", i + 1)])
            predictions_start[model] = prob_start
            predictions_end[model] = prob_end
    f.close()
    # Refresh the caches with the updated model entry.
    dropPickle(predictions_start, "preds_start.pkl")
    dropPickle(predictions_end, "preds_end.pkl")
    dropPickle(p_raw_mapping, "p_raw_mapping.pkl")

    # combine the predictions of the two models (while making independent start, end predictions)
    # NOTE(review): disabled alternative that picks start/end independently.
    """answers = {}
    for qkey in predictions_start['MPCM'].keys():
        ys = predictions_start['MPCM'][qkey]*predictions_start['COATT'][qkey]*predictions_start['COATT_fixed'][qkey]
        ye = predictions_end['MPCM'][qkey]*predictions_end['COATT'][qkey]*predictions_end['COATT_fixed'][qkey]
        s = (np.argmax(ys))
        arr = ye.copy()
        arr[0:s] = 0
        e = (np.argmax(arr))
        #e = (np.argmax(ye))
        pred_raw = ' '.join(p_raw_mapping[qkey].split()[s:e + 1])
        answers[qkey] = pred_raw.strip("\n")"""
    # predict span with max predicted probability (make joint prediction rather than indepenedntly predicitng start and end indices)
    answers = {}
    for qkey in predictions_start['MPCM'].keys():
        # Multiply the per-model probabilities elementwise (product ensemble).
        ys = predictions_start['MPCM'][qkey]*predictions_start['COATT'][qkey]*predictions_start['COATT_fixed'][qkey]\
            *predictions_start['COATT_mix'][qkey]*predictions_start['COATT_fixed_mix'][qkey]\
            *predictions_start['COATT_fixed_200_mix'][qkey]*predictions_start['COATT_fixed_200'][qkey] #to save time
        ye = predictions_end['MPCM'][qkey]*predictions_end['COATT'][qkey]*predictions_end['COATT_fixed'][qkey]\
            *predictions_end['COATT_mix'][qkey]*predictions_end['COATT_fixed_mix'][qkey]\
            *predictions_end['COATT_fixed_200_mix'][qkey]*predictions_end['COATT_fixed_200'][qkey] #to save time
        s = 0; e = 0; prodmax = 0
        # Exhaustive scan over all spans si <= ei for max ys[si]*ye[ei].
        for si in range(0, len(ys)):
            for ei in range(si, len(ye)):
                prod = ys[si]*ye[ei]
                if prod > prodmax:
                    s = si
                    e = ei
                    prodmax = prod
        print(s,e, prodmax)
        pred_raw = ' '.join(p_raw_mapping[qkey].split()[s:e + 1]); print(pred_raw)
        answers[qkey] = pred_raw.strip("\n")

    # write to json file to root dir
    with io.open('dev-prediction.json', 'w', encoding='utf-8') as f:
        f.write(unicode(json.dumps(answers, ensure_ascii=False)))
def loadInfo(fn):
    """Thin wrapper: unpickle and return the info object stored at *fn*."""
    return util.loadPickle(fn)
# -*- coding: utf-8 -*- import sys import json from operator import add reload(sys) sys.setdefaultencoding('utf-8') from config import sc from util import loadPickle date = sys.argv[1] gender = loadPickle("/home/hadoop/chen.cheng/moa/gender24.pkl") b = sc.broadcast(gender) data = sc.textFile("hdfs://antispam/user/hadoop/output/chencheng/crux/data/rawData/%s-24/" % (date)) data.cache() out_male = data.map(lambda x : json.loads(x)).filter(lambda x: x[0][0] and x[0][1])\ .filter(lambda x: b.value[int(x[0][0])] == "M").map(lambda x: json.dumps(x)) out_male.saveAsTextFile("hdfs://antispam/user/hadoop/output/chencheng/crux/data/male/%s/"% (date)) out_female = data.map(lambda x : json.loads(x)).filter(lambda x: x[0][0] and x[0][1])\ .filter(lambda x: b.value[int(x[0][0])] == "F")\ .map(lambda x: json.dumps(x)) ''' with open('/home/hadoop/chen.cheng/Chronos/momoid', 'w') as f: for item in out: f.write("%s\n" %(item ) ) '''
idf = {} D = len(cWcDict.keys()) for c in cWcDict: for w in cWcDict[c]: if w not in idf: cnt = 0 for cc in cWcDict: if w in cWcDict[cc]: cnt += 1 idf[w] = math.log(float(D) / cnt, 2) return idf if __name__ == '__main__': #load wbWc = util.loadPickle('wbWc.pickle') infoWc = util.loadPickle('infoWc.pickle') cList = loadCirecle('report/circles', 10) cInfoWcDict = {} cWbWcDict = {} for c in cList: cKey = " ".join(c) cInfoWcDict[cKey] = {} cWbWcDict[cKey] = {} for u in c: addDict(cInfoWcDict[cKey], infoWc[u]) addDict(cWbWcDict[cKey], wbWc[u]) anaCircle(cInfoWcDict, cWbWcDict) """ print "=" * 50