def loadCachedData(self, name):
    try:
        if self.filename == "remote":
            return None
        return load_pickle(self.filename + "." + name)
    except:
        return None

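# Nearly every snippet in this collection goes through a small load_pickle
# helper (variously imported as util.load_pickle, ut.load_pickle, or bare
# load_pickle), but none of the snippets define it. A minimal sketch of such
# a helper, assuming it simply wraps the standard pickle module; signatures
# vary across the snippets (some accept a typ='json' flag or a fallback
# default value), so this shows only the common core:
import pickle

def load_pickle(path):
    """Deserialize and return the object stored in the pickle file at path."""
    with open(path, 'rb') as f:
        return pickle.load(f)
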
def setup_DUC_sentences(task, parser=None, reload=False):

    ## load problems quickly from pickle file
    if (not reload) and os.path.isfile(task.data_pickle):
        sys.stderr.write('Loading [%s] problem data from [%s]\n' % (task.name, task.data_pickle))
        task.problems = util.load_pickle(task.data_pickle)
        return

    ## parse sentences
    text.text_processor.load_splitta_model('/u/dgillick/sbd/splitta/model_nb/')
    for problem in task.problems:
        sys.stderr.write('%s\n' % problem.id)
        problem.load_documents()
        if parser:
            for doc in problem.new_docs:
                doc.parse_sentences(parser)
            problem.parsed = True

    if parser:
        parser.run()
        for sentence, parsetree in parser.parsed.items():
            sentence.parsed = parsetree

    ## save pickled version for faster loading later
    sys.stderr.write('Saving [%s] problem data in [%s]\n' % (task.name, task.data_pickle))
    util.save_pickle(task.problems, task.data_pickle)

def get_repo_frequencies():
    """ Returns a map of repo id to (frequency, relative_freq) tuples. """
    path = os.path.join(config.CALC_DATA_PATH, 'repo_frequencies1.pickle')
    global _repo_freqs
    repo_frequencies = _repo_freqs or util.load_pickle(path)
    if repo_frequencies:
        _repo_freqs = repo_frequencies
        return repo_frequencies

    user_watches = get_user_watches()
    total_watches = sum(len(w) for w in user_watches.values())
    logger.debug("Total watches is {0}".format(total_watches))

    repo_frequencies = dict()
    for repos in user_watches.values():
        for watch in repos:
            if watch not in repo_frequencies:
                repo_frequencies[watch] = (1, 1 / total_watches)
            else:
                freq = repo_frequencies[watch][0] + 1
                repo_frequencies[watch] = (freq, freq / total_watches)

    util.store_pickle(repo_frequencies, path, debug=True)
    _repo_freqs = repo_frequencies
    return repo_frequencies

def __init__(self):
    self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
    self._stopwords = set(open(STOPWORDS).read().splitlines())
    self._stopwords.add('said')
    self._porter_stemmer = nltk.stem.porter.PorterStemmer()
    self._sent_tokenizer = util.load_pickle('%s%s' % (STATIC_DATA_ROOT, 'punkt/english.pickle'))
    self._sent_split_ABBR_LIST = set(['Mr.', 'Mrs.', 'Sen.', 'No.', 'Dr.', 'Gen.', 'St.', 'Lt.', 'Col.'])
    self._sent_split_PUNCT_LIST = set(['\" ', '\")', ') ', '\' ', '\"\''])

def read_mapper(self):
    path = os.path.join(self.inverted_mapper_dir, '%s.inv.pickle' % self.name)
    try:
        self.inverted_id_mapper_list = util.load_pickle(path)
    except EOFError, e:
        print e
        print '!!!!! Fail to read inverted id map of %s' % self.name
        self.exit = True

def load(self, path, filename):
    self.filename = filename
    self.path = path
    #try:
    dict = ut.load_pickle(self.path + '/' + self.filename)
    #except:
    #    print 'loading of ' + self.path + '/' + filename + ' failed. WARNING: it will be overwritten on save()!'
    #    return
    self.datasets = dict['datasets']

def __init__(self, goal_path=None):
    self.plan = RawPlan()
    self.goal_list = []
    self.goal_vector = []
    self.stemmer = nltk.PorterStemmer()
    if goal_path is None:
        self.plan.populate_goal_actions_map()
        self.goal_actions_map = self.plan.goal_actions_map
        self.goal_list = self.goal_actions_map.keys()
    else:
        self.goal_actions_map = load_pickle(goal_path)
        self.goal_list = self.goal_actions_map.keys()

def test(self, feature_data=None):
    #test on current scan:
    print ut.getTime(), 'test on:', self.processor.scan_dataset.id
    if feature_data == None:
        filename = self.processor.get_features_filename()
        dict = ut.load_pickle(filename)
    else:
        dict = feature_data

    baseline_labels = self.classify_baseline_code()
    return baseline_labels, self.test_results(dict, baseline_labels)

def run(selected_feature_dir, extra_feature_dir, matrix_dir):
    feature_dir = selected_feature_dir
    util.makedir(matrix_dir)
    for file_name in os.listdir(feature_dir):
        name = file_name.split('.')[0]
        feature_path = os.path.join(feature_dir, file_name)
        feature_dict = util.load_pickle(feature_path, typ='json')

        # word features matrix
        word_matrix_dir = os.path.join(matrix_dir, 'word/')
        to_vector(name, feature_dict, word_matrix_dir)

        # extra features matrix
        extra_feature_path = os.path.join(extra_feature_dir, file_name)
        print extra_feature_path
        extra_feature_dict = util.load_pickle(extra_feature_path, typ='json')
        for cls in get_extra_feature_class(extra_feature_dict):
            cls_matrix_dir = os.path.join(matrix_dir, cls)
            cls_feature_dict = {}
            for rank in extra_feature_dict:
                cls_feature_dict[rank] = extra_feature_dict[rank][cls]
            to_vector(name, cls_feature_dict, cls_matrix_dir)
    return None

def run(category_dir, result_dir, config):
    result_file_extension = config["result_file_extension"]
    for file_name in os.listdir(category_dir):
        name = file_name.split('.')[0]
        category_path = os.path.join(category_dir, file_name)
        category = util.load_pickle(category_path)
        ps = PeopleSet(name, category)
        clustering_result_path = os.path.join(result_dir, '%s.%s' % (name, result_file_extension))
        ps.dump_xml(clustering_result_path)
        del ps
    return None

def test(self, feature_data=None):
    #test on current scan:
    print ut.getTime(), 'test on:', self.processor.scan_dataset.id

    if feature_data == None:
        filename = self.processor.get_features_filename()
        print 'loading', filename
        dict = ut.load_pickle(filename)
    else:
        dict = feature_data

    #print ut.getTime(), dict
    current_set_size = dict['set_size']
    feature_vector_length = len(self.processor.features.get_indexvector(self.features))
    print ut.getTime(), feature_vector_length
    labels = np.array(np.zeros(len(self.processor.map_polys)))
    print 'test: length of labels vector:', len(labels)
    test = cv.cvCreateMat(1, feature_vector_length, cv.CV_32FC1)

    if current_set_size == 0:
        print ut.getTime(), 'ERROR: test dataset is empty!'
        return labels, 1, 1, 1

    count = 0
    for index in dict['point_indices']:
        fv = (dict['features'][count])[self.processor.features.get_indexvector(self.features)]
        #print ut.getTime(), fv, dict['features'][count]

        for fv_index, fv_value in enumerate(fv):
            test[fv_index] = fv_value

        #print 'class', self.cv_classifier
        label = self.cv_classifier.predict(test)
        #print label.value
        labels[index] = label.value
        #print 'tdone'
        if count % 4096 == 0:
            print ut.getTime(), 'testing:', count, 'of', current_set_size, '(', (float(count) / float(current_set_size) * 100.0), '%)'
        count += 1

    #save for later use for postprocessing:
    self.test_feature_dict = dict
    self.test_labels = labels
    #cv.cvReleaseMat(test)
    return labels, self.test_results(dict, labels)

def load_config(app, config_file):
    """Loads the configuration from the specified file and sets the properties of
    ```app```, ```db``` and ```machine``` application objects

    :param app: the flask application object
    :param config_file: the absolute path to the configuration file
    """
    global db, machine, category_classifier
    config = ConfigParser.SafeConfigParser()
    try:
        config.readfp(open(config_file))
    except IOError as e:
        app.logger.error("An error occurred while reading '%s': %s" % (config_file, e.strerror))

    # Initialize the database
    try:
        database_uri = config.get('database', 'sqlalchemy.url')
        pool_size = config.get('database', 'sqlalchemy.pool_size')

        # SQLAlchemy configuration
        app.config['SQLALCHEMY_DATABASE_URI'] = database_uri
        app.config['SQLALCHEMY_POOL_SIZE'] = int(pool_size)
    except ConfigParser.NoSectionError as e:
        logger.error("The specified section does not exist", e)

    db = SQLAlchemy(app)

    # Initialize the machine
    classifier_file = config.get("classifier", "classifier.file")
    if classifier_file is not None:
        if os.path.exists(classifier_file):
            _dict = util.load_pickle(classifier_file)
            category_classifier = _dict['categoryClassifier']
            if not isinstance(category_classifier, DssgCategoryClassifier):
                app.logger.error("Invalid classifier object type: %s" % type(category_classifier))
                category_classifier = None
                return
            # Proceed
            machine = Machine(category_classifier)
        else:
            app.logger.info("The classifier file '%s' does not exist" % classifier_file)

def get_user_watches():
    """ Returns a dict of user id keys mapped to a set of repo ids being watched by that user """
    path = os.path.join(config.CALC_DATA_PATH, 'user_watches.pickle')
    global _user_watches
    user_watches = _user_watches or util.load_pickle(path)
    if user_watches:
        _user_watches = user_watches
        return user_watches

    user_watches = collections.defaultdict(set)
    for line in open(os.path.join(config.SRC_DATA_PATH, 'data.txt')):
        k, v = line.rstrip().split(':')
        user_watches[int(k)].add(int(v))

    util.store_pickle(user_watches, path, debug=True)
    _user_watches = user_watches
    return user_watches

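# get_repo_frequencies() and get_user_watches() above share the same
# memoize-to-pickle pattern: check a module-level cache, fall back to a pickle
# on disk, and recompute only when neither exists. A minimal sketch of that
# pattern as a reusable decorator; the decorator itself is hypothetical and
# not part of this codebase, and it reuses the util.load_pickle/store_pickle
# helpers seen in the snippets:
import functools
import os

def pickle_cached(path):
    def decorator(fn):
        cache = {}

        @functools.wraps(fn)
        def wrapper():
            if 'value' not in cache:
                # prefer the on-disk pickle; otherwise compute and persist it
                value = util.load_pickle(path) if os.path.isfile(path) else None
                if value is None:
                    value = fn()
                    util.store_pickle(value, path)
                cache['value'] = value
            return cache['value']

        return wrapper
    return decorator
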
def worker(proc_num, queue):
    while True:
        # time.sleep(random.random()*10)
        try:
            name = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        if name + ".pkl" in os.listdir(POLARITIES):
            continue
        print proc_num, "Running", name
        subredditgen.main(name)
        word_dict = util.load_pickle(DICTS.format(name))
        word_dict.filter_extremes(no_above=0.1, no_below=100)
        to_keep = sorted(word_dict.dfs, key=lambda w: word_dict.dfs[w], reverse=True)[:5000]
        word_dict.filter_tokens(good_ids=to_keep)
        sub_vecs = create_representation("SVD", constants.SUBREDDIT_EMBEDDINGS.format(name))
        pos_seeds, neg_seeds = seeds.twitter_seeds()
        sub_vecs = sub_vecs.get_subembed(set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))
        pols = polarity_induction_methods.bootstrap(
            sub_vecs, pos_seeds, neg_seeds, return_all=True,
            nn=25, beta=0.9, num_boots=50, n_procs=10)
        util.write_pickle(pols, POLARITIES + name + ".pkl")

def extra_extract(self, name, name_body_text):
    version = self.config['version']
    id_mapper_path = os.path.join(util.ROOT, self.id_mapper_pickle_dir, '%s.json' % name)
    id_mapper = util.load_pickle(id_mapper_path, typ='json')
    extra_features = {}
    metadata_path = os.path.join(self.metadata_dir, '%s.xml' % name)
    with open(metadata_path) as f:
        content = f.read()
    corpus = etree.XML(content)
    for doc in corpus:
        rank = doc.get('rank')
        try:
            mapped_rank = id_mapper[rank]
        except KeyError:
            continue
        if version == '2007test':
            # The 2007 description file swaps the snippet and title fields
            title = doc.xpath('./snippet')[0].xpath('string()')
            snippet = doc.get('title')
        elif version == '2008test':
            title = doc.get('title')
            try:
                snippet = doc.xpath('./snippet')[0].xpath('string()')
            # snippet may not exist, e.g. /data/weps-2/data/test/metadata/FRANZ_MASEREEL.xml, rank="26"
            except IndexError:
                snippet = ''
        url = doc.get('url')
        title_freq = self.title_tokenize(title)
        url_freq = self.url_tokenize(url)
        snippet_freq = self.snippet_tokenize(snippet)
        body_text_path = os.path.join(name_body_text, '%s.txt' % mapped_rank)
        with open(body_text_path) as f:
            email_freq = self.email_detect(f.read())
        extra_features[mapped_rank] = {'title': title_freq, 'url': url_freq,
                                       'snippet': snippet_freq, 'emails': email_freq}
    return extra_features

def setup_DUC_sentences(task, parser=None, reload=False):

    ## load problems quickly from pickle file
    if (not reload) and os.path.isfile(task.data_pickle):
        sys.stderr.write('Loading [%s] problem data from [%s]\n' % (task.name, task.data_pickle))
        task.problems = util.load_pickle(task.data_pickle)
        return

    ## only parse sentences if needed
    for problem in task.problems:
        print problem.id
        problem.load_documents()
        if parser:
            for doc in problem.new_docs:
                doc.parse_sentences(parser)

    if parser:
        parser.run()
        for sentence, parsetree in parser.parsed.items():
            sentence.parsed = parsetree

    ## save pickled version for faster loading later
    sys.stderr.write('Saving [%s] problem data in [%s]\n' % (task.name, task.data_pickle))
    util.save_pickle(task.problems, task.data_pickle)

tokenized_datasets = [[[token.lower() for token in request]
                       for request in dataset]
                      for dataset in tokenized_datasets_original]

"""
Build the whole vocabulary

Vocab lists:
    • special token: "UNK_TOKEN"
    • vocab_shared: intersection of word2vec vocab and politeness vocab
    • vocab_freq: frequent vocab that is not in word2vec vocab
"""
UNK = "UNK_TOKEN"

if use_existing_vocab:
    vocab_politeness = load_pickle(
        "data/Stanford_politeness_corpus/vocab_politeness.pkl")
else:
    # Load word embedding model
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    model = KeyedVectors.load_word2vec_format(fname=word2vec, binary=True)

    freq_threshold = 2
    all_tokens = [
        token
        for dataset in tokenized_datasets
        for request in dataset
        for token in request
    ]
    fdist = FreqDist(all_tokens)
    fdist_lst = fdist.most_common()

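# The snippet above stops right after building the frequency distribution.
# Per its docstring, vocab_shared and vocab_freq would then be derived
# roughly as follows; this is a sketch under those assumptions, not the
# original continuation (KeyedVectors supports `word in model` membership
# tests, which is used here to check word2vec coverage):
vocab_corpus = [word for (word, freq) in fdist_lst if freq >= freq_threshold]
# words covered by the pre-trained word2vec model
vocab_shared = [word for word in vocab_corpus if word in model]
# frequent words that word2vec does not cover
vocab_freq = [word for word in vocab_corpus if word not in model]
vocab_politeness = [UNK] + vocab_shared + vocab_freq
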
def load_trial_data(ind_obj1, ind_obj2, fn):
    fmat1 = util.load_pickle(fn + '/' + str(ind_obj1) + '.pkl')
    fmat2 = util.load_pickle(fn + '/' + str(ind_obj2) + '.pkl')
    return fmat1, fmat2

def load_sbd_model(model_path='model_nb/'):
    sys.stderr.write('loading model from [%s]... ' % model_path)
    model = util.load_pickle(model_path + 'model.pkl')
    model.path = model_path
    sys.stderr.write('done!\n')
    return model

def make_mpii_yolo():
    joint_info_full = JointInfo(
        'rank,rkne,rhip,lhip,lkne,lank,pelv,thor,neck,head,rwri,relb,rsho,lsho,lelb,lwri',
        'lsho-lelb-lwri,rsho-relb-rwri,lhip-lkne-lank,rhip-rkne-rank,neck-head,pelv-thor')
    joint_info_used = JointInfo(
        'rank,rkne,rhip,lhip,lkne,lank,rwri,relb,lelb,lwri',
        'lelb-lwri,relb-rwri,lhip-lkne-lank,rhip-rkne-rank')
    selected_joints = [joint_info_full.ids[name] for name in joint_info_used.names]

    mat_path = f'{paths.DATA_ROOT}/mpii/mpii_human_pose_v1_u12_1.mat'
    s = matlabfile.load(mat_path).RELEASE
    annolist = np.atleast_1d(s.annolist)
    all_boxes = util.load_pickle(f'{paths.DATA_ROOT}/mpii/yolov3_detections.pkl')

    examples = []
    with util.BoundedPool(None, 120) as pool:
        for anno_id, (anno, is_train, rect_ids) in enumerate(
                zip(annolist, util.progressbar(s.img_train), s.single_person)):
            if not is_train:
                continue

            image_path = f'{paths.DATA_ROOT}/mpii/images/{anno.image.name}'
            annorect = np.atleast_1d(anno.annorect)
            gt_people = []
            for rect_id, rect in enumerate(annorect):
                if 'annopoints' not in rect or len(rect.annopoints) == 0:
                    continue

                coords = np.full(
                    shape=[joint_info_full.n_joints, 2], fill_value=np.nan, dtype=np.float32)
                for joint in np.atleast_1d(rect.annopoints.point):
                    coords[joint.id] = [joint.x, joint.y]

                bbox = boxlib.expand(boxlib.bb_of_points(coords), 1.25)
                coords = coords[selected_joints]
                ex = Pose2DExample(image_path, coords, bbox=bbox)
                gt_people.append(ex)

            if not gt_people:
                continue

            image_relpath = os.path.relpath(f'images/{anno.image.name}')
            boxes = [box for box in all_boxes[image_relpath] if box[-1] > 0.5]
            if not boxes:
                continue

            iou_matrix = np.array(
                [[boxlib.iou(gt_person.bbox, box[:4]) for box in boxes]
                 for gt_person in gt_people])
            gt_indices, box_indices = scipy.optimize.linear_sum_assignment(-iou_matrix)

            for i_gt, i_det in zip(gt_indices, box_indices):
                if iou_matrix[i_gt, i_det] > 0.1:
                    ex = gt_people[i_gt]
                    ex.bbox = np.array(boxes[i_det][:4])
                    new_im_path = image_path.replace('mpii', 'mpii_downscaled_yolo')
                    without_ext, ext = os.path.splitext(new_im_path)
                    new_im_path = f'{without_ext}_{rect_id:02d}{ext}'
                    pool.apply_async(make_efficient_example, (ex, new_im_path),
                                     callback=examples.append)

    examples.sort(key=lambda ex: ex.image_path)

    def n_valid_joints(example):
        return np.count_nonzero(np.all(~np.isnan(example.coords), axis=-1))

    examples = [ex for ex in examples if n_valid_joints(ex) > 6]
    return Pose2DDataset(joint_info_used, examples)

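# The matching step above pairs ground-truth people with YOLO detections by
# maximizing total IoU via the Hungarian algorithm (hence the negated matrix
# passed to scipy.optimize.linear_sum_assignment). boxlib.iou itself is not
# shown in these snippets; a minimal sketch for boxes in the
# (left, top, width, height) layout used here:
def iou_xywh(box1, box2):
    # intersection rectangle in (left, top, right, bottom) coordinates
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[0] + box1[2], box2[0] + box2[2])
    y2 = min(box1[1] + box1[3], box2[1] + box2[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    union = box1[2] * box1[3] + box2[2] * box2[3] - inter
    return inter / union if union > 0 else 0.0
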
def load_punkt_model(self, path):
    self._sent_tokenizer = util.load_pickle(path)

def setUp(self):
    config_path = util.abs_path('configure/2007test.NN.nltk.json')
    config = util.load_pickle(config_path, typ='json')
    self.flt = FeatureExtractor(config)

import util, concept_mapper
import gflags
import sys

if __name__ == '__main__':
    FLAGS = gflags.FLAGS
    gflags.DEFINE_string('task', None, 'Which task (tac08)', short_name='t')
    gflags.MarkFlagAsRequired('task')
    gflags.DEFINE_string('load', None, 'Path to pickled task data')
    gflags.MarkFlagAsRequired('load')
    try:
        argv = FLAGS(sys.argv)
    except gflags.FlagsError, e:
        sys.stderr.write('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)

    sys.stderr.write('Loading [%s] problem data in [%s]\n' % (task.name, task.data_pickle))
    task.problems = util.load_pickle(FLAGS.load)

    for problem in task.problems:
        sents = problem.get_new_sentences()
        gold_sents = problem.get_training_sentences()
        values = []
        for sent in sents:

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', type=str, required=True)
    # parser.add_argument('--emotion_file', type=str, required=True)
    parser.add_argument('--conceptnet_path', type=str, required=True)
    parser.add_argument('--dataset_vocab_path', type=str, required=True)
    parser.add_argument('--top_k', type=int, default=100)
    args = parser.parse_args()

    input_file = args.input_file
    # emotion_file = args.emotion_file
    conceptnet_path = args.conceptnet_path
    dataset_vocab_path = args.dataset_vocab_path
    top_k = args.top_k

    concept_words = load_pickle(input_file)
    # emotions = read_file(emotion_file)
    CN = load_pickle(conceptnet_path)

    # load vocab
    print("Loading dataset vocab from ", dataset_vocab_path)
    vocab_ckpt = torch.load(dataset_vocab_path)
    word2id = vocab_ckpt["src"].base_field.vocab.stoi
    id2word = vocab_ckpt["src"].base_field.vocab.itos
    print("dataset vocab size: ", len(word2id))

    associated_concepts = defaultdict(list)
    for h, r, t, w in tqdm(CN):
        associated_concepts[h].append((r, t, w))

    # to clean concept words and save as np.array to save space

input_labels = []
for l in opt.in_labels.split('_'):
    input_labels.append(l)

# Output labels
if opt.out_labels is None:
    output_labels = input_labels
else:
    output_labels = []
    for l in opt.out_labels.split('_'):
        output_labels.append(l)

# Data
if opt.preprocessing.find('true') >= 0:
    d = extract_data(data_path, input_labels, opt.state_topic)
    ut.save_pickle(d, os.path.join(data_path, opt.saved_filename))
else:
    d = ut.load_pickle(os.path.join(data_path, opt.saved_filename))

if opt.plot.find('true') >= 0 or opt.debug.find('true') >= 0:
    plot_raw_data(d)
    ## cross_eval(d)
else:
    rospy.init_node('obj_state_classifier_node')
    wse = obj_state_classifier(input_labels, output_labels, opt.state_topic,
                               opt.srv_topic, debug=opt.debug)
    wse.run(d)

    # Predict 'soft' voting with probabilities
    pred1 = np.asarray([clf.predict(csc_matrix(X_list)) for clf in estimators])
    pred2 = np.average(pred1, axis=0, weights=weights)
    pred = np.argmax(pred2, axis=1)
    # Convert integer predictions to original labels:
    return pred

#x = export_classifiers()
#print('x', size(x))
#export(x)

trained = util.load_pickle(name='fs_1', path='..\\pickles\\feature_sets\\')
print('trained', size(trained))
test = util.load_pickle(name='fs_test_1', path='..\\pickles\\test_features\\')
print('test', size(test))

test_data = test['data_set']
featureset = 'fs_words_bigrams_pos'
X_train, y_train = trained[featureset], trained['labels']
X_test, y_test = test[featureset], test['labels']
feat_size = X_train.shape[1]

x = load_from_file()
svm = x['svm']
xgb = x['xgb']
knn = x['knn']
nb = x['nb']

def main(args):
    """create word vector

    :param file_path: path of corpus
    :param window_size: window size
    :param shift: num of samples in w2v skip-gram negative-sampling(sgns)
    :param dim: the size of wordvec WV = [vocab_size, dim]
    """
    logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)
    logging.info(f"[INFO] args: {args}")

    logging.info("[INFO] Loading dictionary...")
    id_to_word, word_to_id = load_pickle(args.pickle_id2word)
    vocab_size = len(id_to_word)
    logging.debug(f"[DEBUG] vocab: {vocab_size} words")

    if args.cooccur_pretrained is not None:
        logging.info("[INFO] Loading pre-trained co-occur matrix...")
        C = load_matrix(args.cooccur_pretrained, vocab_size)
    else:
        logging.info("[INFO] Creating co-occur matrix...")
        C = create_co_matrix(args.file_path, word_to_id, vocab_size, args.window_size)

    # threshold by min_count
    if args.threshold:
        C = threshold_cooccur(C, threshold=args.threshold)

    os.makedirs("model", exist_ok=True)
    c_name = "model/C_w-{}".format(args.window_size)
    with open(c_name, "w") as wp:
        for id, cooccur_each in enumerate(C):
            cooccur_nonzero = [f"{id}:{c}" for id, c in enumerate(cooccur_each) if c > 0]
            wp.write(f"{id}\t{' '.join(cooccur_nonzero)}\n")

    if args.sppmi_pretrained is not None:
        logging.info("[INFO] Loading pre-trained sppmi matrix...")
        M = load_matrix(args.sppmi_pretrained, vocab_size)
    else:
        logging.info("[INFO] Computing sppmi matrix...")
        # use smoothing or not in computing sppmi
        M = sppmi(C, args.shift, has_abs_dis=args.has_abs_dis, has_cds=args.has_cds)

    m_name = "model/SPPMI_w-{}_s-{}".format(args.window_size, args.shift)
    with open(m_name, "w") as wp:
        for id, sppmi_each in enumerate(M):
            sppmi_nonzero = [f"{id}:{m}" for id, m in enumerate(sppmi_each) if m > 0]
            wp.write(f"{id}\t{' '.join(sppmi_nonzero)}\n")

    logging.info("[INFO] Calculating word vector...")
    try:
        from scipy.sparse.linalg import svds
        U, S, V = svds(coo_matrix(M), k=args.dim)
    except Exception:
        # fall back to a dense full SVD
        U, S, V = np.linalg.svd(M)
    word_vec = np.dot(U, np.sqrt(np.diag(S)))

    wv_name = "model/WV_d-{}_w-{}_s-{}".format(args.dim, args.window_size, args.shift)
    np.save(wv_name, word_vec[:, :args.dim])
    return

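# sppmi() above is not shown in the snippet. Shifted positive PMI with shift
# k is defined as SPPMI(w, c) = max(PMI(w, c) - log k, 0); a minimal dense
# sketch of that computation, ignoring the has_abs_dis/has_cds smoothing
# options the real function takes:
import numpy as np

def sppmi_basic(C, shift):
    """Compute SPPMI from a dense co-occurrence count matrix C."""
    C = np.asarray(C, dtype=np.float64)
    total = C.sum()
    word_counts = C.sum(axis=1, keepdims=True)
    ctx_counts = C.sum(axis=0, keepdims=True)
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi = np.log(C * total / (word_counts * ctx_counts))
    pmi[~np.isfinite(pmi)] = 0.0  # zero counts give -inf/nan; treat as 0
    return np.maximum(pmi - np.log(shift), 0.0)
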
# ent_emb = torch.cat([ent_emb, F.normalize(ent_emb.mean(dim=0, keepdim=True), p=2, dim=-1)], dim=0)
ent_emb = torch.cat(
    [ent_emb, F.normalize(torch.randn((1, ent_emb_dim)), p=2, dim=-1)], dim=0)
print("ent embedding shape: ", ent_emb.shape)

# load vocab
print("Loading dataset vocab from ", dataset_vocab_path)
vocab_ckpt = torch.load(dataset_vocab_path)
word2id = vocab_ckpt["src"].base_field.vocab.stoi
id2word = vocab_ckpt["src"].base_field.vocab.itos
print("dataset vocab size: ", len(word2id))

# load stopwords
stopwords = load_pickle("./data/KB/stopwords.pkl")

# concept_VAD_strength_softmax = torch.ones(len(concept_embedding_dict)+1)/(len(concept_embedding_dict)+1)
print("Loading concept VAD strength dict from ", concept_VAD_strength_dict_path)
concept_VAD_strength_dict = load_pickle(concept_VAD_strength_dict_path)
concept_VAD_strength_embedding = torch.zeros(len(concept_VAD_strength_dict) + 1)
for k, v in concept_VAD_strength_dict.items():
    concept_VAD_strength_embedding[ent2id[k]] = v
concept_VAD_strength_embedding[ent2id["<pad>"]] = 0
# concept_VAD_strength_softmax = torch.softmax(concept_VAD_strength_embedding, dim=-1)

smaller_suffix = "-smaller" if smaller else ""
method_suffix = "-topk"
value_suffix = "{0}".format(top_k)

    estimator.fit(train_data, train_target)
    test_predict = estimator.predict(test_data)
    f1 = f1_score(test_target, test_predict, average='weighted')
    return f1


if __name__ == '__main__':
    print('Loading 20newsgroup dataset for all categories')

    ############################# Load train data
    train = fetch_20newsgroups(subset='train')
    print('Train data:\n')
    print('%d documents' % len(train.filenames))
    print('%d categories' % len(train.target_names))

    train_data = load_pickle('dataset/train-data.pkl')[:100]
    train_target = train.target[:100]
    D_train = len(train_target)

    ############################# Tune LDA
    V = 1000
    kappa = 0.5
    tau0 = 64
    var_i = 100
    num_topics = 20
    sizes = [512, 256]
    alphas = [.1, .05, .01]

    pool = Pool(processes=3)
    works = []
    kf = KFold(n_splits=3)

def make_mupots_yolo():
    all_short_names = (
        'thor,spi4,spi2,spin,pelv,neck,head,htop,lcla,lsho,lelb,lwri,lhan,rcla,rsho,relb,rwri,'
        'rhan,lhip,lkne,lank,lfoo,ltoe,rhip,rkne,rank,rfoo,rtoe'.split(','))

    # originally: [7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 4, 3, 6]
    selected_joints = [7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 3, 6, 4]
    order_joints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 14]
    joint_names = [all_short_names[j] for j in selected_joints]
    j = p3ds.JointInfo.make_id_map(joint_names)
    edges = [
        (j.htop, j.head), (j.head, j.neck), (j.neck, j.lsho), (j.lsho, j.lelb),
        (j.lelb, j.lwri), (j.neck, j.rsho), (j.rsho, j.relb), (j.relb, j.rwri),
        (j.neck, j.spin), (j.spin, j.pelv), (j.pelv, j.lhip), (j.lhip, j.lkne),
        (j.lkne, j.lank), (j.pelv, j.rhip), (j.rhip, j.rkne), (j.rkne, j.rank)]
    joint_info = p3ds.JointInfo(j, edges)

    root = f'{paths.DATA_ROOT}/mupots'
    intrinsic_matrices = util.load_json(f'{root}/camera_intrinsics.json')

    dummy_coords = np.ones((joint_info.n_joints, 3))
    detections_all = util.load_pickle(f'{root}/yolov3_detections.pkl')

    examples_val = []
    examples_test = []
    for i_seq in range(1, 21):
        annotations = matlabfile.load(f'{root}/TS{i_seq}/annot.mat')['annotations']
        intrinsic_matrix = intrinsic_matrices[f'TS{i_seq}']
        camera = cameralib.Camera(
            np.zeros(3), np.eye(3), intrinsic_matrix, distortion_coeffs=None,
            world_up=(0, -1, 0))

        n_people = annotations.shape[1]
        n_frames = annotations.shape[0]

        for i_frame in range(n_frames):
            image_relpath = f'TS{i_seq}/img_{i_frame:06d}.jpg'
            detections_frame = detections_all[image_relpath]
            image_path = f'{root}/{image_relpath}'
            for detection in detections_frame:
                if detection[4] > 0.1:
                    ex = p3ds.Pose3DExample(
                        image_path, dummy_coords, detection[:4], camera,
                        mask=None, univ_coords=dummy_coords, scene_name=f'TS{i_seq}')
                    examples_test.append(ex)

            gt_people = []
            for i_person in range(n_people):
                world_coords = np.array(
                    annotations[i_frame, i_person].annot3.T[order_joints], dtype=np.float32)
                univ_world_coords = np.array(
                    annotations[i_frame, i_person].univ_annot3.T[order_joints], dtype=np.float32)
                im_coords = camera.world_to_image(world_coords)
                gt_box = boxlib.expand(boxlib.bb_of_points(im_coords), 1.1)
                ex = p3ds.Pose3DExample(
                    image_path, world_coords, gt_box, camera,
                    mask=None, univ_coords=univ_world_coords, scene_name=f'TS{i_seq}')
                gt_people.append(ex)

            confident_detections = [det for det in detections_frame if det[-1] > 0.1]
            if confident_detections:
                iou_matrix = np.array(
                    [[boxlib.iou(gt_person.bbox, box[:4]) for box in confident_detections]
                     for gt_person in gt_people])
                gt_indices, detection_indices = scipy.optimize.linear_sum_assignment(-iou_matrix)
                for i_gt, i_det in zip(gt_indices, detection_indices):
                    if iou_matrix[i_gt, i_det] > 0.1:
                        ex = gt_people[i_gt]
                        ex.bbox = np.array(confident_detections[i_det][:4])
                        examples_val.append(ex)

    return p3ds.Pose3DDataset(
        joint_info, valid_examples=examples_val, test_examples=examples_test)

def __init__(self):
    self.target_dir = './upload/'
    self.frame_ids_path = './data/state/frameids.pickle'
    self.frame_ids = load_pickle(self.frame_ids_path, [])

def load_svm_sbd_model(model_file_path):
    model = util.load_pickle(model_file_path)
    sys.stderr.write('done!\n')
    return model

def load(self):
    self.input_normalizers = load_pickle('cache/models/%s-norm.pickle' % self.name)
    self.load_weights(self.name)

                    type=str, required=True)
parser.add_argument('--top_k', type=int, default=100)
args = parser.parse_args()

input_file = args.input_file
emotion_file = args.emotion_file
emotion_lexicon_file = args.emotion_lexicon_file
# conceptnet_path = args.conceptnet_path
dataset_vocab_path = args.dataset_vocab_path
dataset_vocab_embedding_path = args.dataset_vocab_embedding_path
top_k = args.top_k
num_emotional_words = top_k // 4

concept_words = load_pickle(input_file)  # tuples of np.array
emotions = read_file(emotion_file)
# CN = load_pickle(conceptnet_path)
emotion_lexicon = load_pickle(emotion_lexicon_file)
# vocab_embedding = torch.load(dataset_vocab_embedding_path)  # (vocab, emb_dim)

# load vocab
print("Loading dataset vocab from ", dataset_vocab_path)
vocab_ckpt = torch.load(dataset_vocab_path)
word2id = vocab_ckpt["src"].base_field.vocab.stoi
id2word = vocab_ckpt["src"].base_field.vocab.itos
print("dataset vocab size: ", len(word2id))

new_word_ids = []
new_word_scores = []
new_word_VAD_scores = []

def create_train_datastructures(self):
    #loop through all marked datasets
    self.processor.scan_dataset = self.processor.scans_database.get_dataset(0)

    training_set_size = 0

    data = []
    #get size of training set in total
    while False != self.processor.scan_dataset:
        if self.processor.scan_dataset.is_training_set:
            filename = self.processor.get_features_filename(True)
            print 'loading', filename
            dict = ut.load_pickle(filename)

            # make an equal size of points for each class: use object labels more often:
            difference = np.sum(dict['labels'] == processor.LABEL_SURFACE) - np.sum(dict['labels'] == processor.LABEL_CLUTTER)
            #print ut.getTime(), filename
            #print ut.getTime(), 'surface', np.sum(dict['labels'] == LABEL_SURFACE)
            #print ut.getTime(), 'clutter', np.sum(dict['labels'] == LABEL_CLUTTER)
            #print ut.getTime(), difference, "difference = np.sum(dict['labels'] == LABEL_SURFACE) - np.sum(dict['labels'] == LABEL_CLUTTER)"
            #print ut.getTime(), ''
            if difference > 0:
                clutter_features = (dict['features'])[np.nonzero(dict['labels'] == processor.LABEL_CLUTTER)]
                if len(clutter_features) > 0:  #if there are none, do nothin'
                    dict['set_size'] += difference
                    dict['features'] = np.vstack((dict['features'], clutter_features[np.random.randint(0, len(clutter_features), size=difference)]))
                    dict['labels'] = np.hstack((dict['labels'], np.ones(difference) * processor.LABEL_CLUTTER))
            elif difference < 0:
                surface_features = (dict['features'])[np.nonzero(dict['labels'] == processor.LABEL_SURFACE)]
                if len(surface_features) > 0:  #if there are none, do nothin'
                    difference = -difference
                    dict['set_size'] += difference
                    dict['features'] = np.vstack((dict['features'], surface_features[np.random.randint(0, len(surface_features), size=difference)]))
                    dict['labels'] = np.hstack((dict['labels'], np.ones(difference) * processor.LABEL_SURFACE))

            training_set_size += dict['set_size']
            data.append(dict)
        #get next one
        self.processor.scan_dataset = self.processor.scans_database.get_next_dataset()
        #print ut.getTime(), self.scan_dataset

    #create training set:
    self.processor.scan_dataset = self.processor.scans_database.get_dataset(0)
    current_training_set_index = 0

    feature_vector_length = len(self.processor.features.get_indexvector(self.features))
    print ut.getTime(), feature_vector_length
    #create dataset matrices:
    print ut.getTime(), '#training set size ', training_set_size

    #deactivate for now:
    max_traning_size = 1800000  #2040000
    #if training_set_size < max_traning_size:
    #if True:
    train_data = cv.cvCreateMat(training_set_size, feature_vector_length, cv.CV_32FC1)  #CvMat* cvCreateMat(int rows, int cols, int type)
    train_labels = cv.cvCreateMat(training_set_size, 1, cv.CV_32FC1)

    for dict in data:
        for index in range(dict['set_size']):
            #only train on surface and clutter
            if dict['labels'][index] == processor.LABEL_SURFACE or dict['labels'][index] == processor.LABEL_CLUTTER:
                #print ut.getTime(), point3d
                #print ut.getTime(), 'fvindexv', self.get_features_indexvector(features)
                #print ut.getTime(), 'len', len(self.get_features_indexvector(features))
                fv = (dict['features'][index])[self.processor.features.get_indexvector(self.features)]
                #print ut.getTime(), 'fv', fv
                #print ut.getTime(), np.shape(fv)
                for fv_index, fv_value in enumerate(fv):
                    train_data[current_training_set_index][fv_index] = fv_value
                train_labels[current_training_set_index] = dict['labels'][index]
                #for fv_index, fv_value in enumerate(fv):
                #    print ut.getTime(), train_data[current_training_set_index][fv_index]
                #print ut.getTime(), '##', train_labels[current_training_set_index], '##'
                #print ut.getTime(), 'fv ', fv
                #print ut.getTime(), 'tr ', train_data[index]
                current_training_set_index = current_training_set_index + 1

                #if current_training_set_index % 4096 == 0:
                #    print ut.getTime(), 'label', dict['labels'][index], 'fv', fv
                if current_training_set_index % 16384 == 0:
                    print ut.getTime(), 'reading features:', current_training_set_index, 'of', training_set_size, '(', (float(current_training_set_index) / float(training_set_size) * 100.0), '%)'

    ##subsample from the features, NOT USED/NOT WORKING?
    #else:
    #    print ut.getTime(), 'more than', max_traning_size, 'features, sample from them...'
    #    #select 2040000 features:
    #    all_data = []
    #    all_labels = []
    #    for dict in data:
    #        for index in range(dict['set_size']):
    #            if dict['labels'][index] == processor.LABEL_SURFACE or dict['labels'][index] == processor.LABEL_CLUTTER:
    #                fv = (dict['features'][index])[self.processor.features.get_indexvector(self.features)]
    #                all_data += [fv]
    #                all_labels += [dict['labels'][index]]
    #
    #                current_training_set_index = current_training_set_index + 1
    #                if current_training_set_index % 16384 == 0:
    #                    print ut.getTime(), 'reading features:', current_training_set_index, 'of', training_set_size, '(', (float(current_training_set_index)/float(training_set_size)*100.0), '%)'
    #
    #    del data
    #    indices = np.array(random.sample(xrange(len(all_labels)), max_traning_size))
    #    all_data = np.asarray(all_data)
    #    all_labels = np.asarray(all_labels)
    #
    #    all_data = all_data[indices]
    #    all_labels = all_labels[indices]
    #
    #    train_data = cv.cvCreateMat(max_traning_size, feature_vector_length, cv.CV_32FC1)
    #    train_labels = cv.cvCreateMat(max_traning_size, 1, cv.CV_32FC1)
    #
    #    for index in range(max_traning_size):
    #        for fv_index, fv_value in enumerate(all_data[index]):
    #            train_data[index][fv_index] = fv_value
    #        train_labels[index] = all_labels[index]
    #        if index % 16384 == 0:
    #            print ut.getTime(), 'setting features:', (float(index)/float(max_traning_size))

    print ut.getTime(), 'start training Classifier'

    type_mask = cv.cvCreateMat(1, feature_vector_length + 1, cv.CV_8UC1)
    cv.cvSet(type_mask, cv.CV_VAR_NUMERICAL, 0)
    type_mask[feature_vector_length] = cv.CV_VAR_CATEGORICAL

    return (train_data, train_labels, type_mask)

def make_mpi_inf_3dhp(camera_ids=(0, 1, 2, 4, 5, 6, 7, 8)):
    all_short_names = (
        'spi3,spi4,spi2,spin,pelv,neck,head,htop,lcla,lsho,lelb,lwri,lhan,rcla,rsho,relb,rwri,'
        'rhan,lhip,lkne,lank,lfoo,ltoe,rhip,rkne,rank,rfoo,rtoe'.split(','))
    test_set_selected_joints = [*range(14), 15, 16, 14]
    selected_joints = [7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 3, 6, 4]
    joint_names = [all_short_names[j] for j in selected_joints]
    edges = (
        'htop-head-neck-lsho-lelb-lwri,neck-rsho-relb-rwri,neck-spin-pelv-lhip-lkne-lank,'
        'pelv-rhip-rkne-rank')
    joint_info = p3ds.JointInfo(joint_names, edges)

    root_3dhp = f'{paths.DATA_ROOT}/3dhp'
    detections_all = util.load_pickle(f'{paths.DATA_ROOT}/3dhp/yolov3_person_detections.pkl')

    #################################
    # TRAINING AND VALIDATION SET
    #################################
    num_frames = np.asarray(
        [[6416, 12430], [6502, 6081], [12488, 12283], [6171, 6675],
         [12820, 12312], [6188, 6145], [6239, 6320], [6468, 6054]])

    train_subjects = [0, 1, 2, 3, 4, 5, 6]
    valid_subjects = [7]  # this is my own arbitrary split for validation (Istvan Sarandi)
    train_examples = []
    valid_examples = []
    pool = util.BoundedPool(None, 120)

    for i_subject, i_seq, i_cam in itertools.product(
            train_subjects + valid_subjects, range(2), camera_ids):
        seqpath = f'{root_3dhp}/S{i_subject + 1}/Seq{i_seq + 1}'
        print(f'Processing {seqpath} camera {i_cam}')

        cam3d_coords = [
            ann.reshape([ann.shape[0], -1, 3])[:, selected_joints]
            for ann in matlabfile.load(f'{seqpath}/annot.mat')['annot3']]
        univ_cam3d_coords = [
            ann.reshape([ann.shape[0], -1, 3])[:, selected_joints]
            for ann in matlabfile.load(f'{seqpath}/annot.mat')['univ_annot3']]
        cameras = load_cameras(f'{seqpath}/camera.calibration')

        examples_container = train_examples if i_subject in train_subjects else valid_examples
        frame_step = 5
        prev_coords = None
        camera = cameras[i_cam]
        n_frames = num_frames[i_subject, i_seq]
        if i_subject == 5 and i_seq == 1 and i_cam == 2:
            # This video is shorter for some reason
            n_frames = 3911

        for i_frame in util.progressbar(range(0, n_frames, frame_step)):
            image_relpath = (
                f'3dhp/S{i_subject + 1}/Seq{i_seq + 1}/'
                f'imageSequence/img_{i_cam}_{i_frame:06d}.jpg')
            cam_coords = cam3d_coords[i_cam][i_frame]
            world_coords = cameras[i_cam].camera_to_world(cam_coords)
            univ_camcoords = univ_cam3d_coords[i_cam][i_frame]
            univ_world_coords = cameras[i_cam].camera_to_world(univ_camcoords)

            # Check if the joints are within the image frame bounds
            if not np.all(camera.is_visible(world_coords, [2048, 2048])):
                continue

            im_coords = camera.camera_to_image(cam_coords)
            bbox = get_bbox(im_coords, image_relpath, detections_all)

            # Adaptive temporal sampling
            if (prev_coords is not None and
                    np.all(np.linalg.norm(world_coords - prev_coords, axis=1) < 100)):
                continue
            prev_coords = world_coords

            mask_path = image_relpath.replace('imageSequence', 'FGmasks')
            new_image_relpath = image_relpath.replace('3dhp', '3dhp_downscaled')
            ex = p3ds.Pose3DExample(
                image_relpath, world_coords, bbox, camera,
                mask=mask_path, univ_coords=univ_world_coords)
            pool.apply_async(
                make_efficient_example, (ex, new_image_relpath, 1, True),
                callback=examples_container.append)

    print('Waiting for tasks...')
    pool.close()
    pool.join()
    print('Done...')

    #################################
    # TEST SET
    #################################
    test_examples = []
    cam1_4 = get_test_camera_subj1_4()
    cam5_6 = get_test_camera_subj5_6()

    activity_names = [
        'Stand/Walk', 'Exercise', 'Sit on Chair', 'Reach/Crouch', 'On Floor', 'Sports', 'Misc.']
    for i_subject in range(1, 7):
        seqpath = f'{root_3dhp}/TS{i_subject}'
        annotation_path = f'{seqpath}/annot_data.mat'
        with h5py.File(annotation_path, 'r') as m:
            cam3d_coords = np.array(m['annot3'])[:, 0, test_set_selected_joints]
            univ_cam3d_coords = np.array(m['univ_annot3'])[:, 0, test_set_selected_joints]
            valid_frames = np.where(m['valid_frame'][:, 0])[0]
            activity_ids = m['activity_annotation'][:, 0].astype(int) - 1

        camera = cam1_4 if i_subject <= 4 else cam5_6
        scene = ['green-screen', 'no-green-screen', 'outdoor'][(i_subject - 1) // 2]

        for i_frame in valid_frames:
            image_relpath = f'3dhp/TS{i_subject}/imageSequence/img_{i_frame + 1:06d}.jpg'
            cam_coords = cam3d_coords[i_frame]
            univ_camcoords = univ_cam3d_coords[i_frame]
            activity = activity_names[activity_ids[i_frame]]
            world_coords = camera.camera_to_world(cam_coords)
            univ_world_coords = camera.camera_to_world(univ_camcoords)
            im_coords = camera.camera_to_image(cam_coords)
            bbox = get_bbox(im_coords, image_relpath, detections_all)
            ex = p3ds.Pose3DExample(
                image_relpath, world_coords, bbox, camera,
                activity_name=activity, scene_name=scene, univ_coords=univ_world_coords)
            test_examples.append(ex)

    train_examples.sort(key=lambda x: x.image_path)
    valid_examples.sort(key=lambda x: x.image_path)
    test_examples.sort(key=lambda x: x.image_path)
    return p3ds.Pose3DDataset(joint_info, train_examples, valid_examples, test_examples)

def run_demo(best_path, record_save_path, model_type):
    print("============Begin Testing============")
    test_record_path = f'{record_save_path}/test_record.csv'

    dataloader = util.load_dataset(device, args.data_path, args.batch_size,
                                   args.batch_size, args.batch_size)
    g_temp = util.add_nodes_edges(adj_filename=args.adj_path,
                                  num_of_vertices=args.num_nodes)
    scaler = dataloader['scaler']
    run_gconv = 1
    lr_decay_rate = 0.97

    sensor_ids, sensor_id_to_ind, adj_mx = util.load_adj(args.adj_path_forbase, args.adjtype)
    supports = [torch.tensor(i).to(device) for i in adj_mx]

    _, _, A = util.load_pickle(args.adj_path_forbase)
    A_wave = util.get_normalized_adj(A)
    A_wave = torch.from_numpy(A_wave).to(device)
    # print("A_wave:", A_wave.shape, type(A_wave))

    best_mae = 100

    if args.randomadj:
        adjinit = None
    else:
        adjinit = supports[0]
    if args.aptonly:
        supports = None

    if model_type == "GWaveNet":
        print("=========Model:GWaveNet=========")
        print("with scaler")
        model = GWNET(device, args.num_nodes, args.dropout, supports=supports,
                      gcn_bool=args.gcn_bool, addaptadj=args.addaptadj,
                      aptinit=adjinit, in_dim=args.in_dim, out_dim=args.seq_length,
                      residual_channels=args.nhid, dilation_channels=args.nhid,
                      skip_channels=args.nhid * 8, end_channels=args.nhid * 16)
    if model_type == "STGCN":
        print("=========Model:STGCN=========")
        print("with scaler")
        model = STGCN(A_wave.shape[0], 2, num_timesteps_input=12, num_timesteps_output=12)
    if model_type == "LSTM":
        print("=========Model:LSTM=========")
        input_dim = 2
        hidden_dim = 2
        output_dim = 2
        model = LSTM(input_dim, hidden_dim, output_dim)

    model.to(device)
    model.zero_grad()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    optimizer.zero_grad()
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=lambda epoch: lr_decay_rate ** epoch)

    if torch.cuda.is_available():
        model.load_state_dict(torch.load(best_path))
    else:
        model.load_state_dict(torch.load(best_path, map_location='cpu'))

    outputs = []
    target = torch.Tensor(dataloader['y_test']).to(device)
    target = target[:, :, :, 0]
    print("201 y_test:", target.shape)

    for iter, (x, y) in enumerate(dataloader['test_loader'].get_iterator()):
        testx = torch.Tensor(x).to(device).transpose(1, 3)
        testx = nn.functional.pad(testx, (1, 0, 0, 0))
        with torch.no_grad():
            pred = model.forward(testx).squeeze(3)
        print("iter: ", iter)
        print("pred: ", pred.shape)
        outputs.append(pred)

    yhat = torch.cat(outputs, dim=0)
    yhat = yhat[:target.size(0), ...]

    test_record, amape, armse, amae = [], [], [], []
    pred = scaler.inverse_transform(yhat)
    for i in range(12):
        pred_t = pred[:, i, :]
        real_target = target[:, i, :]
        evaluation = evaluate_all(pred_t, real_target)
        log = 'test for horizon {:d}, Test MAPE: {:.4f}, Test RMSE: {:.4f}, Test MAE: {:.4f}'
        print(log.format(i + 1, evaluation[0], evaluation[1], evaluation[2]))
        amape.append(evaluation[0])
        armse.append(evaluation[1])
        amae.append(evaluation[2])
        test_record.append([x for x in evaluation])

    test_record_df = pd.DataFrame(test_record, columns=['mape', 'rmse', 'mae']).rename_axis('t')
    test_record_df.round(3).to_csv(test_record_path)
    log = 'On average over 12 horizons, Test MAE: {:.4f}, Test MAPE: {:.4f}, Test RMSE: {:.4f}'
    print(log.format(np.mean(amae), np.mean(amape), np.mean(armse)))
    print("=" * 10)

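# evaluate_all() above is not shown in these snippets. Judging from how its
# return value is unpacked (MAPE, RMSE, MAE, in that order, per the log
# string), a plausible sketch is the following; the epsilon guard against
# division by zero is an assumption:
import torch

def evaluate_all(pred, target, eps=1e-8):
    """Return (MAPE, RMSE, MAE) for two tensors of the same shape."""
    mape = torch.mean(torch.abs((pred - target) / (target + eps))).item()
    rmse = torch.sqrt(torch.mean((pred - target) ** 2)).item()
    mae = torch.mean(torch.abs(pred - target)).item()
    return mape, rmse, mae
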
    sent_length = max(bfeat['sent_length']) / avg_sent_len
    sent_title_sim = max(bfeat['sent-title_sim'])
    sent_query_sim = max(bfeat['sent-query_sim'])
    bfeat['sent_pos'] = sent_pos
    bfeat['sent_length'] = sent_length
    bfeat['sent-title_sim'] = sent_title_sim
    bfeat['sent-query_sim'] = sent_query_sim
    bfeat['sent_ratio'] /= num_sents


if __name__ == '__main__':
    import concept_mapper, sys
    #problems = util.load_pickle('dat/tac09_data.pickle')
    #util.save_pickle(problems[0], 'dat/tac09_prob_1.pickle')
    problems = [util.load_pickle('dat/tac09_prob_1.pickle')]
    for problem in problems:
        sys.stderr.write("%s %d %d\n" % (problem.id, len(problem.new_docs),
                                         sum([len(doc.sentences) for doc in problem.new_docs])))
        mapper = concept_mapper.HeuristicMapper(problem)
        mapper.map_concepts()
        feature = Feature(mapper.concepts, problem)
        feature.get_bigram_feat()
        for bigram in mapper.concepts:
            print feature.feat_to_string(bigram)
    sys.exit(0)

def make_mpi_inf_3dhp(camera_ids=(0, 1, 2, 4, 5, 6, 7, 8)):
    all_short_names = (
        'spi3,spi4,spi2,spin,pelv,neck,head,htop,lcla,lsho,lelb,lwri,lhan,rcla,rsho,relb,rwri,'
        'rhan,lhip,lkne,lank,lfoo,ltoe,rhip,rkne,rank,rfoo,rtoe'.split(','))

    # originally: [7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 4, 3, 6]
    test_set_selected_joints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 14]
    selected_joints = [7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 3, 6, 4]
    joint_names = [all_short_names[j] for j in selected_joints]
    edges = (
        'htop-head-neck-lsho-lelb-lwri,neck-rsho-relb-rwri,neck-spin-pelv-lhip-lkne-lank,'
        'pelv-rhip-rkne-rank')
    joint_info = p3ds.JointInfo(joint_names, edges)

    root_3dhp = f'{paths.DATA_ROOT}/3dhp'
    detections_all = util.load_pickle(f'{paths.DATA_ROOT}/3dhp/yolov3_person_detections.pkl')

    #################################
    # TRAINING AND VALIDATION SET
    #################################
    num_frames = np.asarray(
        [[6416, 12430], [6502, 6081], [12488, 12283], [6171, 6675],
         [12820, 12312], [6188, 6145], [6239, 6320], [6468, 6054]])

    train_subjects = [0, 1, 2, 3, 4, 5, 6]
    valid_subjects = [7]  # this is my own arbitrary split for validation (Istvan Sarandi)
    train_examples = []
    valid_examples = []
    pool = util.BoundedPool(None, 120)

    for i_subject, i_seq, i_cam in itertools.product(
            train_subjects + valid_subjects, range(2), camera_ids):
        seqpath = f'{root_3dhp}/S{i_subject + 1}/Seq{i_seq + 1}'
        print(seqpath, i_cam)

        cam3d_coords = [
            ann.reshape([ann.shape[0], -1, 3])[:, selected_joints]
            for ann in matlabfile.load(f'{seqpath}/annot.mat')['annot3']]
        univ_cam3d_coords = [
            ann.reshape([ann.shape[0], -1, 3])[:, selected_joints]
            for ann in matlabfile.load(f'{seqpath}/annot.mat')['univ_annot3']]
        cameras = load_cameras(f'{seqpath}/camera.calibration')

        examples_container = train_examples if i_subject in train_subjects else valid_examples
        frame_step = 5
        prev_coords = None
        camera = cameras[i_cam]
        n_frames = num_frames[i_subject, i_seq]
        if i_subject == 5 and i_seq == 1 and i_cam == 2:
            # This video is shorter for some reason
            n_frames = 3911

        for i_frame in util.progressbar(range(0, n_frames, frame_step)):
            image_relpath = (
                f'3dhp/S{i_subject + 1}/Seq{i_seq + 1}/Images/video_{i_cam}/' +
                f'frame_{i_frame:06d}.jpg')
            cam_coords = cam3d_coords[i_cam][i_frame]
            world_coords = cameras[i_cam].camera_to_world(cam_coords)
            univ_camcoords = univ_cam3d_coords[i_cam][i_frame]
            univ_world_coords = cameras[i_cam].camera_to_world(univ_camcoords)

            # Check if the joints are within the image frame bounds
            if not np.all(camera.is_visible(world_coords, [2048, 2048])):
                continue

            im_coords = camera.camera_to_image(cam_coords)
            bbox = get_bbox(im_coords, image_relpath, detections_all)

            # Adaptive temporal sampling
            if (prev_coords is not None and
                    np.all(np.linalg.norm(world_coords - prev_coords, axis=1) < 100)):
                continue
            prev_coords = world_coords

            ex = p3ds.Pose3DExample(
                image_relpath, world_coords, bbox, camera,
                mask=None, univ_coords=univ_world_coords)
            pool.apply_async(make_efficient_example, (ex,), callback=examples_container.append)

    print('Waiting for tasks...')
    pool.close()
    pool.join()
    print('Done...')

    #################################
    # TEST SET
    #################################
    test_examples = []
    cam1_4 = make_3dhp_test_camera(
        sensor_size=np.array([10, 10]), im_size=np.array([2048, 2048]),
        focal_length=7.32506, pixel_aspect=1.00044,
        center_offset=np.array([-0.0322884, 0.0929296]), distortion=None,
        origin=np.array([3427.28, 1387.86, 309.42]),
        up=np.array([-0.208215, 0.976233, 0.06014]),
        right=np.array([0.000575281, 0.0616098, -0.9981]))

    cam5_6 = make_3dhp_test_camera(
        sensor_size=np.array([10, 5.625]), im_size=np.array([1920, 1080]),
        focal_length=8.770747185, pixel_aspect=0.993236423,
        center_offset=np.array([-0.104908645, 0.104899704]),
        distortion=np.array([-0.276859611, 0.131125256, -0.000360494,
                             -0.001149441, -0.049318332]),
        origin=np.array([-2104.3074, 1038.6707, -4596.6367]),
        up=np.array([0.025272345, 0.995038509, 0.096227370]),
        right=np.array([-0.939647257, -0.009210289, 0.342020929]))

    activity_names = [
        'Stand/Walk', 'Exercise', 'Sit on Chair', 'Reach/Crouch', 'On Floor', 'Sports', 'Misc.']
    for i_subject in range(1, 7):
        seqpath = f'{root_3dhp}/TS{i_subject}'
        annotation_path = f'{seqpath}/annot_data.mat'
        with h5py.File(annotation_path, 'r') as m:
            cam3d_coords = np.array(m['annot3'])[:, 0, test_set_selected_joints]
            univ_cam3d_coords = np.array(m['univ_annot3'])[:, 0, test_set_selected_joints]
            valid_frames = np.where(m['valid_frame'][:, 0])[0]
            activity_ids = m['activity_annotation'][:, 0].astype(int) - 1

        camera = cam1_4 if i_subject <= 4 else cam5_6
        scene = ['green-screen', 'no-green-screen', 'outdoor'][(i_subject - 1) // 2]

        for i_frame in valid_frames:
            image_relpath = f'3dhp/TS{i_subject}/imageSequence/img_{i_frame + 1:06d}.jpg'
            cam_coords = cam3d_coords[i_frame]
            univ_camcoords = univ_cam3d_coords[i_frame]
            activity = activity_names[activity_ids[i_frame]]
            world_coords = camera.camera_to_world(cam_coords)
            univ_world_coords = camera.camera_to_world(univ_camcoords)
            im_coords = camera.camera_to_image(cam_coords)
            bbox = get_bbox(im_coords, image_relpath, detections_all)
            ex = p3ds.Pose3DExample(
                image_relpath, world_coords, bbox, camera,
                activity_name=activity, scene_name=scene, univ_coords=univ_world_coords)
            test_examples.append(ex)

    train_examples.sort(key=lambda x: x.image_path)
    valid_examples.sort(key=lambda x: x.image_path)
    test_examples.sort(key=lambda x: x.image_path)
    return p3ds.Pose3DDataset(joint_info, train_examples, valid_examples, test_examples)

def __load_index__(self):
    self._bcode_off_map = util.load_pickle(self.index_path)

def prepare(self, features_k_nearest_neighbors, nonzero_indices=None,
            all_save_load=False, regenerate_neightborhood_indices=False):
    #print np.shape(self.processor.pts3d_bound), 'shape pts3d_bound'

    imgTmp = cv.cvCloneImage(self.processor.img)
    self.imNP = ut.cv2np(imgTmp, format='BGR')
    ###self.processor.map2d = np.asarray(self.processor.camPts_bound)  #copied from laser to image mapping

    if features_k_nearest_neighbors == None or features_k_nearest_neighbors == False:  #use range
        self.kdtree2d = kdtree.KDTree(self.processor.pts3d_bound.T)

        #print len(nonzero_indices)
        #print np.shape(np.asarray((self.processor.pts3d_bound.T)[nonzero_indices]))

        if nonzero_indices != None:
            print ut.getTime(), 'query ball tree for ', len(nonzero_indices), 'points'
            kdtree_query = kdtree.KDTree((self.processor.pts3d_bound.T)[nonzero_indices])
        else:
            print ut.getTime(), 'query ball tree'
            kdtree_query = kdtree.KDTree(self.processor.pts3d_bound.T)

        filename = self.processor.config.path + '/data/' + self.processor.scan_dataset.id + '_sphere_neighborhood_indices_' + str(self.processor.feature_radius) + '.pkl'
        if all_save_load == True and os.path.exists(filename) and regenerate_neightborhood_indices == False:
            #if its already there, load it:
            print ut.getTime(), 'loading', filename
            self.kdtree_queried_indices = ut.load_pickle(filename)
        else:
            self.kdtree_queried_indices = kdtree_query.query_ball_tree(self.kdtree2d, self.processor.feature_radius, 2.0, 0.2)  #approximate
            print ut.getTime(), 'queried kdtree: ', len(self.kdtree_queried_indices), 'points, radius:', self.processor.feature_radius
            if all_save_load == True:
                ut.save_pickle(self.kdtree_queried_indices, filename)

        #make dict out of list for faster operations? (doesn't seem to change speed significantly):
        #self.kdtree_queried_indices = dict(zip(xrange(len(self.kdtree_queried_indices)), self.kdtree_queried_indices))

    else:  #experiemental: use_20_nearest_neighbors == True
        #TODO: exclude invalid values in get_featurevector (uncomment code there)
        self.kdtree2d = kdtree.KDTree(self.processor.pts3d_bound.T)
        self.kdtree_queried_indices = []
        print ut.getTime(), 'kdtree single queries for kNN start, k=', features_k_nearest_neighbors
        count = 0
        for point in ((self.processor.pts3d_bound.T)[nonzero_indices]):
            count = count + 1
            result = self.kdtree2d.query(point, features_k_nearest_neighbors, 0.2, 2, self.processor.feature_radius)
            #existing = result[0][0] != np.Inf
            #print existing
            #print result[1]
            self.kdtree_queried_indices += [result[1]]  #[existing]
            if count % 4096 == 0:
                print ut.getTime(), count
        print ut.getTime(), 'kdtree singe queries end'

        #convert to numpy array -> faster access
        self.kdtree_queried_indices = np.asarray(self.kdtree_queried_indices)

    #print self.kdtree_queried_indices
    #takes long to compute:
    #avg_len = 0
    #minlen = 999999
    #maxlen = 0
    #for x in self.kdtree_queried_indices:
    #    avg_len += len(x)
    #    minlen = min(minlen, len(x))
    #    maxlen = max(maxlen, len(x))
    #avg_len = avg_len / len(self.kdtree_queried_indices)
    #print ut.getTime(), "range neighbors: avg_len", avg_len, 'minlen', minlen, 'maxlen', maxlen

    #create HSV numpy images:
    # compute the hsv version of the image
    image_size = cv.cvGetSize(self.processor.img)
    img_h = cv.cvCreateImage(image_size, 8, 1)
    img_s = cv.cvCreateImage(image_size, 8, 1)
    img_v = cv.cvCreateImage(image_size, 8, 1)
    img_hsv = cv.cvCreateImage(image_size, 8, 3)
    cv.cvCvtColor(self.processor.img, img_hsv, cv.CV_BGR2HSV)
    cv.cvSplit(img_hsv, img_h, img_s, img_v, None)

    self.imNP_h = ut.cv2np(img_h)
    self.imNP_s = ut.cv2np(img_s)
    self.imNP_v = ut.cv2np(img_v)

    textures = texture_features.eigen_texture(self.processor.img)
    self.imNP_tex1 = textures[:, :, 0]
    self.imNP_tex2 = textures[:, :, 1]

    self.debug_before_first_featurevector = True

    self.generate_voi_histogram(self.processor.point_of_interest, self.processor.voi_width)

    if ilp_path is None:
        ilp_path = 'dat/%s/ilps' % (task_name)
    sent_path = args.sentpath
    if sent_path is None:
        sent_path = 'dat/%s/sents' % (task_name)
    maxrouge_path = args.maxrougepath
    if maxrouge_path is None:
        maxrouge_path = 'dat/%s/solutions/maxrouge' % (task_name)
    bigram_path = 'dat/%s/features' % (task_name)

    task = Task(task_name, topic_file, doc_path, man_path)

    # Get documents, split into sentences, tokenize and stem
    if args.load is not None:
        start_time = time.time()
        sys.stderr.write('Loading [%s] problem data in [%s]\n' % (task.name, task.data_pickle))
        task.problems = util.load_pickle(args.load)
        sys.stderr.write('Done [%.2f s]\n' % (time.time() - start_time))
    else:
        text.text_processor.load_splitta_model('lib/splitta/model_nb/')
        # Skip update data
        if task_name[:3] == 'tac':
            framework.setup_TAC08(task, True)
        elif task_name[:3] == 'duc':
            framework.setup_DUC_basic(task, True)
        elif task_name[:3] == 'new':
            framework.setup_news(task)
        else:
            raise Exception('Unknown task %s' % task)

        if task_name[:3] != 'new':
            for problem in task.problems:
                problem.load_documents()

from sklearn.model_selection import learning_curve, validation_curve, \
    cross_val_score, GridSearchCV
import matplotlib.pyplot as plt
import json

############################# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

print('Loading 20newsgroup dataset for all categories')

############################# Load train data
train = fetch_20newsgroups(subset='train')
# train_data = [preprocessor(doc) for doc in train.data]
# save_pickle(train_data, 'dataset/train-data.pkl')
train_data = load_pickle('dataset/train-data.pkl')
train_target = train.target

print('Train data:\n')
print('%d documents' % len(train.filenames))
print('%d categories' % len(train.target_names))

# print(train.target_names[0])
# print(np.where(train.target == 0))
# print(train_target)
# print(train.filenames)

############################# Preprocess
preprocess = Pipeline([('count', CountVectorizer(stop_words='english',
                                                 max_df=.75,
                                                 ngram_range=(1, 1),

def make_efficient_example(ex):
    image_relpath = ex.image_path
    max_rotate = np.pi / 6
    padding_factor = 1 / 0.85
    scale_up_factor = 1 / 0.85
    scale_down_factor = 1 / 0.85
    shift_factor = 1.2
    base_dst_side = 256

    box_center = boxlib.center(ex.bbox)
    s, c = np.sin(max_rotate), np.cos(max_rotate)
    w, h = ex.bbox[2:]
    rot_bbox_side = max(c * w + s * h, c * h + s * w)
    rot_bbox = boxlib.box_around(box_center, rot_bbox_side)

    scale_factor = min(base_dst_side / np.max(ex.bbox[2:]) * scale_up_factor, 1)
    expansion_factor = padding_factor * shift_factor * scale_down_factor
    expanded_bbox = boxlib.expand(rot_bbox, expansion_factor)
    expanded_bbox = boxlib.intersect(expanded_bbox, np.array([0, 0, 2048, 2048]))

    new_camera = ex.camera.copy()
    new_camera.intrinsic_matrix[:2, 2] -= expanded_bbox[:2]
    new_camera.scale_output(scale_factor)
    new_camera.undistort()
    dst_shape = improc.rounded_int_tuple(scale_factor * expanded_bbox[[3, 2]])

    new_im_relpath = ex.image_path.replace('3dhp', f'3dhp_downscaled')
    new_im_path = os.path.join(paths.DATA_ROOT, new_im_relpath)
    if not (util.is_file_newer(new_im_path, "2019-11-14T23:32:07")
            and improc.is_image_readable(new_im_path)):
        im = improc.imread_jpeg(f'{paths.DATA_ROOT}/{image_relpath}')
        new_im = cameralib.reproject_image(im, ex.camera, new_camera, dst_shape)
        util.ensure_path_exists(new_im_path)
        imageio.imwrite(new_im_path, new_im)

    new_bbox_topleft = cameralib.reproject_image_points(ex.bbox[:2], ex.camera, new_camera)
    new_bbox = np.concatenate([new_bbox_topleft, ex.bbox[2:] * scale_factor])

    mask_rle_relpath = new_im_path.replace('Images', 'FGmaskImages').replace('.jpg', '.pkl')
    mask_rle_path = os.path.join(paths.DATA_ROOT, mask_rle_relpath)
    if util.is_file_newer(mask_rle_path, "2020-03-11T20:46:46"):
        mask_runlength = util.load_pickle(mask_rle_path)
    else:
        mask_relpath = ex.image_path.replace('Images', 'FGmaskImages').replace('.jpg', '.png')
        mask = imageio.imread(os.path.join(paths.DATA_ROOT, mask_relpath))
        mask_reproj = cameralib.reproject_image(mask, ex.camera, new_camera, dst_shape)
        mask_runlength = get_mask_with_highest_iou(mask_reproj, new_bbox)
        util.dump_pickle(mask_runlength, mask_rle_path)

    return p3ds.Pose3DExample(
        new_im_relpath, ex.world_coords, new_bbox, new_camera,
        mask=mask_runlength, univ_coords=ex.univ_coords)

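# The foreground masks above are cached as run-length encodings rather than
# raw images, which keeps the pickles small. The encoding helper is not shown
# in these snippets; a minimal sketch of run-length encoding a binary mask,
# using (start, length) runs over the flattened array:
import numpy as np

def mask_to_runlength(mask):
    """Encode a binary mask as (start, length) runs plus the flattened size."""
    flat = np.asarray(mask, dtype=bool).ravel()
    # positions where the value flips, framed by virtual zeros at both ends
    padded = np.concatenate([[False], flat, [False]])
    changes = np.flatnonzero(padded[1:] != padded[:-1])
    starts, ends = changes[0::2], changes[1::2]
    return np.stack([starts, ends - starts], axis=1), flat.size
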
# mat2 = sys.argv[2]
exp_dir = '../0710_data'
svm_out = '../0710_out'

add_slope(exp_dir)
materials = [m[:-4] for m in os.listdir('../0710_data') if m.endswith('.pkl')]
print materials

accum = 0
buff = ''
for pair in list(itertools.combinations(materials, 2)):
    mat1, mat2 = pair
    # print "Loading Data"
    vec1 = util.load_pickle(exp_dir + '/' + mat1 + '.pkl')
    vec2 = util.load_pickle(exp_dir + '/' + mat2 + '.pkl')
    # vec1 = transform_erfc(vec1, T_amb1)
    # vec2 = transform_erfc(vec2, T_amb2)
    # print len(vec1), len(vec2)
    data_dic = create_binary_dataset(vec1, vec2, mat1, mat2, 2)
    # print "created"
    score = run_crossvalidation_new(data_dic, num_folds)
    if not os.path.exists(svm_out):
        os.makedirs(svm_out)
    buff += '%s, %s, %f\n' % (mat1, mat2, score)
    print '%s, %s, %s' % (mat1, mat2, str(score))
    print accum

with open(svm_out + "/out.csv", "w") as text_file:

from DBConnection import db
import numpy as np
from util import load_pickle
import os

output_dir = 'final_data/PEMS-BAY/medium/estimation/'
num_dates = 8404
no_features = 11
num_sensors = 87

# The first element of the adjacency pickle holds the sensor ids.
sensor_ids = load_pickle('final_data/PEMS-BAY/medium/adj_mx_medium.pkl')[0]
# Render the ids as a "(id), (id), ..." VALUES list for the SQL query.
values_sensor_ids = str(['(' + str(x) + ')' for x in sensor_ids]).replace("'", "")[1:-1]

select_query = "SELECT time, sensor_id, bucket_0::float, bucket_1::float, bucket_2::float, bucket_3::float," \
               " bucket_4::float, bucket_5::float, bucket_6::float, bucket_7::float, bucket_8::float, bucket_9::float," \
               " bucket_10::float from {0} where sensor_id = ANY(VALUES {1}) " \
               "order by time, sensor_id asc"

data = db.execute_query(
    select_query.format('pems_final_normalized', values_sensor_ids))
data = np.array([x[2:] for x in data])
data = data.reshape([num_dates, num_sensors, no_features])
# NOTE: np.save appends '.npy', so this writes 'data.npz.npy' in .npy format.
np.save(os.path.join(output_dir, "%s.npz" % 'data'), data)

# Compute the average distribution
data = db.execute_query(select_query.format('pems_final', values_sensor_ids))
data = np.array([x[2:] for x in data])
data = data.reshape([num_dates, num_sensors, no_features])
sensor_cars_counts = np.sum(data, axis=0)  # (sensor, buckets)
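# Hypothetical continuation (not in the original, which breaks off here): one
# plausible reading of "the average distribution" is each sensor's bucket
# counts normalized by its total count; the output filename is an assumption.
avg_distribution = sensor_cars_counts / sensor_cars_counts.sum(axis=1, keepdims=True)
np.save(os.path.join(output_dir, 'avg_distribution.npy'), avg_distribution)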
import os
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

import util
# `args`, `device`, `evaluate` and the model classes GWNET / STGCN / LSTM are
# defined elsewhere in the original file.


def main():
    print("*" * 10)
    print(args)
    print("*" * 10)

    dataloader = util.load_dataset(device, args.data_path, args.batch_size,
                                   args.batch_size, args.batch_size)
    scaler = dataloader['scaler']
    print("scaler: ", scaler)

    model_type = "GWaveNet"  # HA / SVR / ARIMA / STGCN / GWaveNet / LSTM

    sensor_ids, sensor_id_to_ind, adj_mx = util.load_adj(args.adj_path, args.adjtype)
    supports = [torch.tensor(i).to(device) for i in adj_mx]
    _, _, A = util.load_pickle(args.adj_path)
    A_wave = util.get_normalized_adj(A)
    A_wave = torch.from_numpy(A_wave).to(device)
    # print("A_wave:", A_wave.shape, type(A_wave))

    best_path = os.path.join(args.save, 'best_model.pth')
    best_mae = 100
    # Initialized here so a non-improving first epoch cannot raise NameError.
    epochs_since_best_mae = 0

    if args.randomadj:
        adjinit = None
    else:
        adjinit = supports[0]
    if args.aptonly:
        supports = None

    if model_type == "GWaveNet":
        print("=========Model:GWaveNet=========")
        print("with scaler")
        model = GWNET(device, args.num_nodes, args.dropout, supports=supports,
                      gcn_bool=args.gcn_bool, addaptadj=args.addaptadj,
                      aptinit=adjinit, in_dim=args.in_dim,
                      out_dim=args.seq_length, residual_channels=args.nhid,
                      dilation_channels=args.nhid, skip_channels=args.nhid * 8,
                      end_channels=args.nhid * 16)
    if model_type == "STGCN":
        print("=========Model:STGCN=========")
        print("with scaler")
        model = STGCN(A_wave.shape[0], 2, num_timesteps_input=12,
                      num_timesteps_output=12)
    if model_type == "LSTM":
        print("=========Model:LSTM=========")
        input_dim = 2
        hidden_dim = 2
        output_dim = 2
        model = LSTM(input_dim, hidden_dim, output_dim)
        best_path = f'{args.save}/{model_type}.pkl'

    record = []
    model.to(device)
    model.zero_grad()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    optimizer.zero_grad()
    loss_MSE = torch.nn.MSELoss()
    loss_gwnet = util.masked_mae
    loss_stgcn = util.masked_mae

    print("============Begin Training============")
    his_loss = []
    val_time = []
    train_time = []
    for epoch in range(args.num_epochs):
        print('-' * 10)
        print('Epoch {}/{}'.format(epoch, args.num_epochs))
        train_loss, train_mape, train_rmse, train_mae = [], [], [], []
        t1 = time.time()
        t = time.time()
        dataloader['train_loader'].shuffle()
        for iter, (x, y) in enumerate(dataloader['train_loader'].get_iterator()):
            trainx = torch.Tensor(x).to(device)  # x: (64, 24, 207, 2)
            trainy = torch.Tensor(y).to(device)  # y: (64, 12, 207, 2)
            if trainx.shape[0] != args.batch_size:
                continue  # skip the incomplete final batch
            if model_type == "GWaveNet":
                trainx = trainx.transpose(1, 3)
                trainy = trainy.transpose(1, 3)
                trainy = trainy[:, 0, :, :]
                trainy = torch.unsqueeze(trainy, dim=1)
                trainx = nn.functional.pad(trainx, (1, 0, 0, 0))
                pred = model.forward(trainx)
                pred = pred.transpose(1, 3)
                pred = scaler.inverse_transform(pred)
                loss_train = loss_gwnet(pred, trainy, 0.0)
            if model_type == "STGCN":
                # (batch_size, num_timesteps, num_nodes, num_features=in_channels)
                # -> (batch_size, num_nodes, num_timesteps, num_features=in_channels)
                trainx = trainx.permute(0, 2, 1, 3)
                trainy = trainy[:, :, :, 0].permute(0, 2, 1)
                pred = model(A_wave, trainx)
                # pred = scaler.inverse_transform(pred)
                # loss_train = loss_MSE(pred, trainy)
                loss_train = loss_stgcn(pred, trainy, 0.0)
            if model_type == "rnn":
                [batch_size, step_size, num_of_vertices, fea_size] = trainx.size()
                trainx = trainx.permute(0, 2, 1, 3)
                trainx = trainx.reshape(-1, step_size, fea_size)
                trainy = trainy.reshape(-1, 1, fea_size)
                trainy = trainy[:, 0, :]
                pred = model.loop(trainx)
                loss_train = loss_MSE(pred, trainy)
            Y_size = trainy.shape
            if iter == 0:
                print("trainy:", trainy.shape)
            optimizer.zero_grad()
            loss_train.backward()
            clip = 5
            if clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            evaluation = evaluate(pred, trainy)
            train_loss.append(loss_train.item())
            train_mape.append(evaluation[0])
            train_rmse.append(evaluation[1])
            train_mae.append(evaluation[2])
            if iter % args.interval == 0:
                log = 'Iter: {:03d}|Train Loss: {:.4f}|Train MAPE: {:.4f}|' \
                      'Train RMSE: {:.4f}|Train MAE: {:.4f}|Time: {:.4f} '
                print(log.format(iter, train_loss[-1], train_mape[-1],
                                 train_rmse[-1], train_mae[-1],
                                 time.time() - t), flush=True)
                t = time.time()
        t2 = time.time()
        train_time.append(t2 - t1)

        # validation
        valid_loss, valid_mape, valid_rmse, valid_mae = [], [], [], []
        s1 = time.time()
        for iter, (x_val, y_val) in enumerate(dataloader['val_loader'].get_iterator()):
            # validation data loader iterator init
            inputs_val = torch.Tensor(x_val).to(device)  # x: (64, 24, 207, 2)
            labels_val = torch.Tensor(y_val).to(device)
            if model_type == "GWaveNet":
                inputs_val = inputs_val.transpose(1, 3)
                labels_val = labels_val.transpose(1, 3)
                labels_val = labels_val[:, 0, :, :]
                labels_val = torch.unsqueeze(labels_val, dim=1)
                inputs_val = nn.functional.pad(inputs_val, (1, 0, 0, 0))
                pred_val = model.forward(inputs_val)
                pred_val = pred_val.transpose(1, 3)
                pred_val = scaler.inverse_transform(pred_val)
                loss_valid = loss_gwnet(pred_val, labels_val, 0.0)
            if model_type == "STGCN":
                inputs_val = inputs_val.permute(0, 2, 1, 3)
                labels_val = labels_val[:, :, :, 0].permute(0, 2, 1)
                pred_val = model(A_wave, inputs_val)
                # pred_val = scaler.inverse_transform(pred_val)
                # loss_valid = loss_MSE(pred_val, labels_val)
                loss_valid = loss_stgcn(pred_val, labels_val, 0.0)
            if model_type == "rnn":
                # NOTE: the original reads shapes from trainx here, likely a
                # copy-paste from the training branch; behavior is preserved.
                [batch_size, step_size, num_of_vertices, fea_size] = trainx.size()
                inputs_val = inputs_val.permute(0, 2, 1, 3)
                inputs_val = inputs_val.reshape(-1, step_size, fea_size)
                labels_val = labels_val.reshape(-1, 1, fea_size)
                labels_val = labels_val[:, 0, :]
                pred_val = model.loop(inputs_val)
                loss_valid = loss_MSE(pred_val, labels_val)
                # pred_val = scaler.inverse_transform(pred_val)
            optimizer.zero_grad()
            # loss_valid.backward()
            evaluation = evaluate(pred_val, labels_val)
            valid_loss.append(loss_valid.item())
            valid_mape.append(evaluation[0])
            valid_rmse.append(evaluation[1])
            valid_mae.append(evaluation[2])
        s2 = time.time()
        log = 'Epoch: {:03d}, Inference Time: {:.4f} secs'
        print(log.format(epoch, (s2 - s1)))
        val_time.append(s2 - s1)

        mtrain_loss = np.mean(train_loss)
        mtrain_mape = np.mean(train_mape)
        mtrain_rmse = np.mean(train_rmse)
        mtrain_mae = np.mean(train_mae)
        mvalid_loss = np.mean(valid_loss)
        mvalid_mape = np.mean(valid_mape)
        mvalid_rmse = np.mean(valid_rmse)
        mvalid_mae = np.mean(valid_mae)
        his_loss.append(mvalid_loss)

        message = dict(train_loss=mtrain_loss, train_mape=mtrain_mape,
                       train_rmse=mtrain_rmse, valid_loss=mvalid_loss,
                       valid_mape=mvalid_mape, valid_rmse=mvalid_rmse)
        message = pd.Series(message)
        record.append(message)

        # save model parameters when the validation loss improves
        if message.valid_loss < best_mae:
            torch.save(model.state_dict(), best_path)
            best_mae = message.valid_loss
            epochs_since_best_mae = 0
            best_epoch = epoch
        else:
            epochs_since_best_mae += 1  # tracked, but never used to stop early

        record_df = pd.DataFrame(record)
        record_df.round(3).to_csv(f'{args.save}/record.csv')
        log = 'Epoch: {:03d}, Training Time: {:.4f}/epoch,\n' \
              'Train Loss: {:.4f}, Train MAPE: {:.4f}, Train RMSE: {:.4f}, Train MAE: {:.4f}, \n' \
              'Valid Loss: {:.4f}, Valid MAPE: {:.4f}, Valid RMSE: {:.4f}, Valid MAE: {:.4f},'
        print(log.format(epoch, (t2 - t1), mtrain_loss, mtrain_mape,
                         mtrain_rmse, mtrain_mae, mvalid_loss, mvalid_mape,
                         mvalid_rmse, mvalid_mae), flush=True)
        print("#" * 20)

    print("=" * 10)
    print("Average Train Time: {:.4f} secs/epoch".format(np.mean(train_time)))
    print("Average Valid Time: {:.4f} secs".format(np.mean(val_time)))
    print("=" * 10)

    # Testing
    bestid = np.argmin(his_loss)
    print("bestid: ", bestid)
    model.load_state_dict(torch.load(best_path))

    outputs = []
    target = torch.Tensor(dataloader['y_test']).to(device)
    if model_type == "GWaveNet":
        target = target.transpose(1, 3)[:, 0, :, :]
    if model_type == "STGCN":
        target = target[:, :, :, 0]
        target = target.transpose(1, 2)
    for iter, (x, y) in enumerate(dataloader['test_loader'].get_iterator()):
        testx = torch.Tensor(x).to(device)  # x: (64, 24, 207, 2)
        testy = torch.Tensor(y).to(device)  # y: (64, 24, 207, 2)
        if model_type == "GWaveNet":
            with torch.no_grad():
                testx = testx.transpose(1, 3)
                pred = model.forward(testx)
                pred = pred.transpose(1, 3)
            outputs.append(pred.squeeze())
        if model_type == "STGCN":
            with torch.no_grad():
                testx = testx.permute(0, 2, 1, 3)
                testy = testy[:, :, :, 0].permute(0, 2, 1)
                pred = model(A_wave, testx)  # (64, 207, 12)
            outputs.append(pred)

    yhat = torch.cat(outputs, dim=0)
    yhat = yhat[:target.size(0), ...]

    amae, amape, armse, test_record = [], [], [], []
    print("=" * 10)
    print("yhat:", yhat.shape)      # yhat: torch.Size([6850, 207, 12])
    print("target:", target.shape)  # target: torch.Size([6850, 207, 12])
    for i in range(Y_size[-1]):
        pred = scaler.inverse_transform(yhat[:, :, i])
        # pred = yhat[:, :, i]
        real_target = target[:, :, i]
        evaluation = evaluate(pred, real_target)
        log = 'Evaluate on test data for horizon {:d}, Test MAPE: {:.4f}, Test RMSE: {:.4f}, Test MAE: {:.4f}'
        print(log.format(i + 1, evaluation[0], evaluation[1], evaluation[2]))
        amape.append(evaluation[0])
        armse.append(evaluation[1])
        amae.append(evaluation[2])
        test_record.append([x for x in evaluation])

    test_record_df = pd.DataFrame(test_record, columns=['mape', 'rmse', 'mae']).rename_axis('t')
    test_record_df.round(3).to_csv(f'{args.save}/test_record.csv')
    log = 'On average over 12 horizons, Test MAE: {:.4f}, Test MAPE: {:.4f}, Test RMSE: {:.4f}'
    print(log.format(np.mean(amae), np.mean(amape), np.mean(armse)))
    print("=" * 10)
def load(self):
    self.featdict = util.load_pickle(self.path + 'feats')
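# Hypothetical counterpart (not in the original): persist the feature dict the
# same way it is loaded, assuming util.save_pickle exists as used elsewhere in
# this collection.
def save(self):
    util.save_pickle(self.featdict, self.path + 'feats')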
# get_prices, get_dict, clean_tweet, load_pickle and save_pickle are defined
# elsewhere in the original file.
def make_main_process_pkl(prices_fname, word_pkl, hashtag_fname, handle_fname,
                          out_fname):
    """ Main processing of the pickles """
    import seaborn as sns  # only needed for the commented-out plotting below

    def get_label(in_dat):
        if in_dat > 0:
            return 1
        return 0

    def get_vol_price_dat(idx):
        # Build a (500, 2) array of the trailing volume/price history.
        if idx < 500:
            return None
        vol_arr = np.array([float(prices_dict[c_idx]['volume'])
                            for c_idx in range(idx - 500, idx)])
        price_arr = np.array([float(prices_dict[c_idx]['price'])
                              for c_idx in range(idx - 500, idx)])
        vol_arr, price_arr = (np.expand_dims(vol_arr, axis=0),
                              np.expand_dims(price_arr, axis=0))
        return np.concatenate((vol_arr, price_arr), axis=0).transpose()

    # Get prices
    prices_dict = get_prices(f_name=prices_fname)

    # Get the dictionaries and the sets
    main_arr, hashtag_dict, handle_dict = (load_pickle(word_pkl)['dat'],
                                           get_dict(hashtag_fname),
                                           get_dict(handle_fname))

    # Sort the tweets by time. The original called sorted() and discarded the
    # result, which leaves main_arr unsorted; sort in place instead.
    main_arr.sort(key=lambda val: val['time'])

    # Main storage, and index for time array
    dat_arr, lab_arr, time_idx, samples, time_arr = [], [], 0, [], []
    # Current slot storage
    curr_dat, curr_lab = [], None
    num = 0
    for ele in main_arr:
        num += 1
        # If current time is higher, jump to the next entry and update the arrays
        if ele['time'] >= prices_dict[time_idx]['time']:
            # Only if volume information is contained
            combined_out = get_vol_price_dat(time_idx - 1)
            if combined_out is not None:
                time_arr.append(prices_dict[time_idx]['time'])
                lab_arr.append(curr_lab)
                curr_dat.append(combined_out)
                dat_arr.append(curr_dat)
            curr_dat, curr_lab = [], None
            time_idx += 1
            if time_idx == len(prices_dict):
                logging.warning('Ran out of the prices.txt file at tweet index: {}, time index: {}'.format(num, time_idx))
                break

        # Include the tweet only if it is at least half an hour before the next price point
        time_diff = prices_dict[time_idx]['time'] - ele['time']
        assert (0 < time_diff < 7200)
        if time_diff < 1800:
            continue

        # Get the data, check if hashtag is in array
        words, hashtag_arr = clean_tweet(tweet=ele['text'])
        hashtag_arr = [hashtag_dict[hashtag] for hashtag in hashtag_arr
                       if hashtag in hashtag_dict]

        # Add number for the handle if present
        handle_num = None
        if ele['handle'] in handle_dict:
            handle_num = handle_dict[ele['handle']]

        curr_dat.append((words, [handle_num, hashtag_arr]))
        curr_lab = get_label(float(prices_dict[time_idx]['change']))

    # Ensure that the lengths of the data, labels and timestamps match
    assert (len(dat_arr) == len(lab_arr) == len(time_arr))
    logging.info('Total Samples: {}'.format(len(dat_arr)))
    logging.info('Printing out stats')

    # # Get stats regarding number of tweets per time step and timestep data
    # timestep_out = np.asarray([time_arr[idx] - time_arr[idx - 1] for idx in range(1, len(time_arr))])
    # number_tweets = np.asarray([len(dat_arr[idx]) for idx in range(1, len(time_arr))])
    #
    # plt.clf()
    # logging.info('Timestep out stats, Mean: {}, Max: {}, Min: {}, Std: {}'.format(
    #     timestep_out.mean(), timestep_out.max(), timestep_out.min(), timestep_out.std()))
    # sns.set(), plt.hist(timestep_out, bins=100, normed=True)
    # plt.xlabel('Time Step'), plt.ylabel('Probability')
    # plt.savefig('data/timestep.png')
    #
    # plt.clf()
    # logging.info('number_tweets out stats, Mean: {}, Max: {}, Min: {}, Std: {}'.format(
    #     number_tweets.mean(), number_tweets.max(), number_tweets.min(), number_tweets.std()))
    # sns.set(), plt.hist(number_tweets, bins=100, normed=True)
    # plt.xlabel('Number tweets per timestep'), plt.ylabel('Probability')
    # plt.savefig('data/tweets.png')
    #
    # plt.clf()
    # density = number_tweets / timestep_out
    # logging.info('density out stats, Mean: {}, Max: {}, Min: {}, Std: {}'.format(
    #     density.mean(), density.max(), density.min(), density.std()))
    # sns.set(), plt.hist(density, bins=100, normed=True)
    # plt.xlabel('Number tweets per timestep'), plt.ylabel('Probability')
    # plt.savefig('data/tweets_density.png')
    #
    # plt.clf()
    # sns.set(), plt.hist(lab_arr, bins=5, normed=True)
    # plt.xlabel('Label'), plt.ylabel('Probability')
    # plt.savefig('data/label_dist.png')

    save_pickle({'data': np.asarray(dat_arr),
                 'labels': np.asarray(lab_arr)}, out_fname)
    logging.info('Saved Pickle To: {}'.format(out_fname))
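# Hypothetical usage (not in the original file); all paths are placeholders.
# make_main_process_pkl(prices_fname='data/prices.txt',
#                       word_pkl='data/words.pkl',
#                       hashtag_fname='data/hashtags.txt',
#                       handle_fname='data/handles.txt',
#                       out_fname='data/main_processed.pkl')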