def loadCachedData(self, name):
    try:
        if self.filename == "remote":
            return None
        return load_pickle(self.filename + "." + name)
    except Exception:
        return None
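All of these examples assume a small `util` helper around Python's pickle module. The exact signature varies by project (some variants take a `typ='json'` flag, a default value, or `name`/`path` keyword arguments), but a minimal sketch might look like this:

import json
import pickle

def load_pickle(path, typ='pickle', default=None):
    # Load a pickled (or JSON) object from disk; return `default` when the
    # file is missing or unreadable, mirroring the defensive calls above.
    try:
        if typ == 'json':
            with open(path) as f:
                return json.load(f)
        with open(path, 'rb') as f:
            return pickle.load(f)
    except (IOError, OSError, EOFError, ValueError, pickle.UnpicklingError):
        return default

def save_pickle(obj, path):
    # Counterpart writer (util.save_pickle / util.store_pickle in the snippets).
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)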
Example #2
def setup_DUC_sentences(task, parser=None, reload=False):

    ## load problems quickly from pickle file
    if (not reload) and os.path.isfile(task.data_pickle):
        sys.stderr.write('Loading [%s] problem data from [%s]\n' %(task.name, task.data_pickle))
        task.problems = util.load_pickle(task.data_pickle)
        return

    ## parse sentences
    text.text_processor.load_splitta_model('/u/dgillick/sbd/splitta/model_nb/')
    for problem in task.problems:
        sys.stderr.write('%s\n' %problem.id)
        problem.load_documents()
        if parser:
            for doc in problem.new_docs:
                doc.parse_sentences(parser)
                problem.parsed = True
                
    if parser:
        parser.run()
        for sentence, parsetree in parser.parsed.items():
            sentence.parsed = parsetree
        
    ## save pickled version for faster loading later
    sys.stderr.write('Saving [%s] problem data in [%s]\n' %(task.name, task.data_pickle))
    util.save_pickle(task.problems, task.data_pickle)
Example #3
def get_repo_frequencies():
    """
    Returns a map of repo id to (frequency, relative_freq) tuples.
    """
    path = os.path.join(config.CALC_DATA_PATH, 'repo_frequencies1.pickle')
    global _repo_freqs
    repo_frequencies = _repo_freqs or util.load_pickle(path)
    if repo_frequencies:
        _repo_freqs = repo_frequencies
        return repo_frequencies

    user_watches = get_user_watches()

    total_watches = sum(len(w) for w in user_watches.values())
    logger.debug("Total watches is {0}".format(total_watches))
    
    repo_frequencies = dict()
    for repos in user_watches.values():
        for watch in repos:
            if watch not in repo_frequencies:
                repo_frequencies[watch] = (1, 1.0 / total_watches)
            else:
                freq = repo_frequencies[watch][0] + 1
                repo_frequencies[watch] = (freq, float(freq) / total_watches)


    util.store_pickle(repo_frequencies, path, debug=True)
    _repo_freqs = repo_frequencies

    return repo_frequencies
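get_repo_frequencies (like get_user_watches in Example #13 below) layers an in-process global cache over the on-disk pickle. The same memoize-then-pickle pattern can be factored into a decorator; a standalone sketch, not part of the original util module:

import functools
import os
import pickle

def pickle_cached(path):
    # Cache a zero-argument loader both in memory and on disk (sketch).
    def decorator(fn):
        cache = {}
        @functools.wraps(fn)
        def wrapper():
            if 'value' not in cache:
                if os.path.isfile(path):
                    with open(path, 'rb') as f:
                        cache['value'] = pickle.load(f)
                else:
                    cache['value'] = fn()
                    with open(path, 'wb') as f:
                        pickle.dump(cache['value'], f)
            return cache['value']
        return wrapper
    return decorator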
Example #4
def __init__(self):
    self._no_punct_pattern = re.compile('[a-zA-Z0-9- ]')
    self._stopwords = set(open(STOPWORDS).read().splitlines())
    self._stopwords.add('said')
    self._porter_stemmer = nltk.stem.porter.PorterStemmer()
    self._sent_tokenizer = util.load_pickle('%s%s' % (STATIC_DATA_ROOT, 'punkt/english.pickle'))
    self._sent_split_ABBR_LIST = set(['Mr.', 'Mrs.', 'Sen.', 'No.', 'Dr.', 'Gen.', 'St.', 'Lt.', 'Col.'])
    self._sent_split_PUNCT_LIST = set(['\" ', '\")', ') ', '\' ', '\"\''])

def read_mapper(self):
    path = os.path.join(self.inverted_mapper_dir, '%s.inv.pickle' % self.name)
    try:
        self.inverted_id_mapper_list = util.load_pickle(path)
    except EOFError as e:
        print e
        print '!!!!! Failed to read inverted id map of %s' % self.name
        self.exit = True
Example #6
def load(self, path, filename):
    self.filename = filename
    self.path = path
    #try:
    dict = ut.load_pickle(self.path + '/' + self.filename)
    #except:
    #    print 'loading of '+self.path+'/'+filename+' failed. WARNING: it will be overwritten on save()!'
    #    return

    self.datasets = dict['datasets']
def __init__(self, goal_path=None):
    self.plan = RawPlan()
    self.goal_list = []
    self.goal_vector = []
    self.stemmer = nltk.PorterStemmer()
    if goal_path is None:
        self.plan.populate_goal_actions_map()
        self.goal_actions_map = self.plan.goal_actions_map
        self.goal_list = self.goal_actions_map.keys()
    else:
        self.goal_actions_map = load_pickle(goal_path)
        self.goal_list = self.goal_actions_map.keys()
def test(self, feature_data=None):
    # test on current scan:
    print ut.getTime(), 'test on:', self.processor.scan_dataset.id

    if feature_data is None:
        filename = self.processor.get_features_filename()
        dict = ut.load_pickle(filename)
    else:
        dict = feature_data

    baseline_labels = self.classify_baseline_code()

    return baseline_labels, self.test_results(dict, baseline_labels)
def run(selected_feature_dir, extra_feature_dir, matrix_dir):
    feature_dir = selected_feature_dir
    util.makedir(matrix_dir)

    for file_name in os.listdir(feature_dir):
        name = file_name.split('.')[0]
        feature_path = os.path.join(feature_dir, file_name)
        feature_dict = util.load_pickle(feature_path, typ='json')
        # word features matrix
        word_matrix_dir = os.path.join(matrix_dir, 'word/')
        to_vector(name, feature_dict, word_matrix_dir)
        
        # extra features matrix
        extra_feature_path = os.path.join(extra_feature_dir, file_name)
        print extra_feature_path
        extra_feature_dict = util.load_pickle(extra_feature_path, typ='json')
        for cls in get_extra_feature_class(extra_feature_dict):
            cls_matrix_dir = os.path.join(matrix_dir, cls)
            cls_feature_dict = {}
            for rank in extra_feature_dict:
                cls_feature_dict[rank] = extra_feature_dict[rank][cls]
            to_vector(name, cls_feature_dict, cls_matrix_dir)
    return None
def run(category_dir, result_dir, config):
    result_file_extension = config["result_file_extension"]

    for file_name in os.listdir(category_dir):
        name = file_name.split('.')[0]
        category_path = os.path.join(category_dir, file_name)
        category = util.load_pickle(category_path)
        ps = PeopleSet(name, category)

        clustering_result_path = os.path.join(result_dir, '%s.%s' % (name, result_file_extension))

        ps.dump_xml(clustering_result_path)
        del ps
    return None
    def test(self, feature_data = None):
        #test on current scan:
        print ut.getTime(), 'test on:', self.processor.scan_dataset.id    
            
        if feature_data is None:
            filename = self.processor.get_features_filename()
            print 'loading', filename
            dict = ut.load_pickle(filename)
        else:
            dict = feature_data
        
        #print ut.getTime(), dict
        current_set_size = dict['set_size']
        feature_vector_length = len(self.processor.features.get_indexvector(self.features))
        print ut.getTime(), feature_vector_length
        labels = np.array(np.zeros(len(self.processor.map_polys)))
        print 'test: length of labels vector:', len(labels)
        test = cv.cvCreateMat(1,feature_vector_length,cv.CV_32FC1)
        
        if current_set_size == 0:
            print ut.getTime(), 'ERROR: test dataset is empty!'
            return labels, 1, 1, 1

        count = 0
        for index in dict['point_indices']:
            fv = (dict['features'][count])[self.processor.features.get_indexvector(self.features)]
            #print ut.getTime(), fv, dict['features'][count]

            for fv_index, fv_value in enumerate(fv):
                test[fv_index] = fv_value
             
            #print 'class',self.cv_classifier
            label = self.cv_classifier.predict(test)
            #print label.value
            labels[index] = label.value
            #print 'tdone'
            if count % 4096 == 0:
                print ut.getTime(), 'testing:', count, 'of', current_set_size, '(',(float(count)/float(current_set_size)*100.0),'%)'
                
            count += 1


        #save for later use for postprocessing:
        self.test_feature_dict = dict
        self.test_labels = labels
        #cv.cvReleaseMat(test)
        return labels, self.test_results(dict, labels)  
Example #12
def load_config(app, config_file):
    """Loads the configuration from the specified file and sets the
    properties of ```app```, ```db``` and ```machine``` application objects

    :param app: the flask application object
    :param config_file: the absolute path to the configuration file
    """
    global db, machine, category_classifier

    config = ConfigParser.SafeConfigParser()

    try:
        config.readfp(open(config_file))
    except IOError as e:
        app.logger.error("An error occurred while reading '%s': %s" %
                         (config_file, e.strerror))

    # Initialize the database
    try:
        database_uri = config.get('database', 'sqlalchemy.url')
        pool_size = config.get('database', 'sqlalchemy.pool_size')

        # SQLAlchemy configuration
        app.config['SQLALCHEMY_DATABASE_URI'] = database_uri
        app.config['SQLALCHEMY_POOL_SIZE'] = int(pool_size)
    except ConfigParser.NoSectionError as e:
        logger.error("The specified section does not exist", e)

    db = SQLAlchemy(app)

    # Initialize the machine
    classifier_file = config.get("classifier", "classifier.file")
    if classifier_file is not None:
        if os.path.exists(classifier_file):
            _dict = util.load_pickle(classifier_file)
            category_classifier = _dict['categoryClassifier']
            if not isinstance(category_classifier, DssgCategoryClassifier):
                app.logger.error("Invalid classifier object type: %s" %
                                 type(category_classifier))
                category_classifier = None
                return
            # Proceed
            machine = Machine(category_classifier)
        else:
            app.logger.info("The classifier file '%s' does not exist" %
                            classifier_file)
Example #13
def get_user_watches():
    """
    Returns an dict of user id keys mapped to a set
    of repo ids being watched by that user
    """
    path = os.path.join(config.CALC_DATA_PATH, 'user_watches.pickle')
    global _user_watches
    user_watches = _user_watches or util.load_pickle(path)
    if user_watches:
        _user_watches = user_watches
        return user_watches
    
    user_watches = collections.defaultdict(set)
    
    for line in open(os.path.join(config.SRC_DATA_PATH, 'data.txt')):
        k,v = line.rstrip().split(':')
        user_watches[int(k)].add(int(v))

    util.store_pickle(user_watches, path, debug=True)
    _user_watches = user_watches

    return user_watches
def worker(proc_num, queue):
    while True:
#        time.sleep(random.random()*10)
        try:
            name = queue.get(block=False)
        except Empty:
            print proc_num, "Finished"
            return
        if name + ".pkl" in os.listdir(POLARITIES):
            continue
        print proc_num, "Running", name
        subredditgen.main(name)
        word_dict = util.load_pickle(DICTS.format(name))
        word_dict.filter_extremes(no_above=0.1, no_below=100)
        to_keep = sorted(word_dict.dfs, key=lambda w : word_dict.dfs[w], reverse=True)[:5000]
        word_dict.filter_tokens(good_ids=to_keep)
        sub_vecs = create_representation("SVD", constants.SUBREDDIT_EMBEDDINGS.format(name))
        pos_seeds, neg_seeds = seeds.twitter_seeds()
        sub_vecs = sub_vecs.get_subembed(set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))
        pols = polarity_induction_methods.bootstrap(sub_vecs, pos_seeds, neg_seeds, return_all=True,
                nn=25, beta=0.9, num_boots=50, n_procs=10)
        util.write_pickle(pols, POLARITIES + name + ".pkl")
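worker drains a shared queue of subreddit names until it sees Empty. A hypothetical driver, following the function's (proc_num, queue) signature, would fan the names out across processes:

from multiprocessing import Process, Queue

def run_all(names, n_procs=8):
    # Fill the work queue up front, then let each worker pull names
    # until the queue is exhausted (hypothetical driver, not source code).
    queue = Queue()
    for name in names:
        queue.put(name)
    procs = [Process(target=worker, args=(i, queue)) for i in range(n_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()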
def extra_extract(self, name, name_body_text):
    version = self.config['version']
    id_mapper_path = os.path.join(util.ROOT, self.id_mapper_pickle_dir, '%s.json' % name)
    id_mapper = util.load_pickle(id_mapper_path, typ='json')
    extra_features = {}
    metadata_path = os.path.join(self.metadata_dir, '%s.xml' % name)
    with open(metadata_path) as f:
        content = f.read()
    corpus = etree.XML(content)
    for doc in corpus:
        rank = doc.get('rank')
        try:
            mapped_rank = id_mapper[rank]
        except KeyError:
            continue
        if version == '2007test':
            # the 2007 description file swaps the snippet and title fields
            title = doc.xpath('./snippet')[0].xpath('string()')
            snippet = doc.get('title')
        elif version == '2008test':
            title = doc.get('title')
            try:
                snippet = doc.xpath('./snippet')[0].xpath('string()')
            # snippet may not exist, e.g. /data/weps-2/data/test/metadata/FRANZ_MASEREEL.xml, rank="26"
            except IndexError:
                snippet = ''
        url = doc.get('url')
        title_freq = self.title_tokenize(title)
        url_freq = self.url_tokenize(url)
        snippet_freq = self.snippet_tokenize(snippet)
        body_text_path = os.path.join(name_body_text, '%s.txt' % mapped_rank)
        with open(body_text_path) as f:
            email_freq = self.email_detect(f.read())
        extra_features[mapped_rank] = {'title': title_freq,
                                       'url': url_freq,
                                       'snippet': snippet_freq,
                                       'emails': email_freq}
    return extra_features
Example #16
def setup_DUC_sentences(task, parser=None, reload=False):

    ## load problems quickly from pickle file
    if (not reload) and os.path.isfile(task.data_pickle):
        sys.stderr.write('Loading [%s] problem data from [%s]\n' %(task.name, task.data_pickle))
        task.problems = util.load_pickle(task.data_pickle)
        return

    ## only parse sentences if needed
    for problem in task.problems:
        print problem.id
        problem.load_documents()
        if parser:
            for doc in problem.new_docs:
                doc.parse_sentences(parser)
                
    if parser:
        parser.run()
        for sentence, parsetree in parser.parsed.items():
            sentence.parsed = parsetree
        
    ## save pickled version for faster loading later
    sys.stderr.write('Saving [%s] problem data in [%s]\n' %(task.name, task.data_pickle))
    util.save_pickle(task.problems, task.data_pickle)
Example #17
tokenized_datasets = [[[token.lower() for token in request]
                       for request in dataset]
                      for dataset in tokenized_datasets_original]
"""
Build the whole vocabulary

Vocab lists:
• special token: "UNK_TOKEN"
• vocab_shared: intersection of word2vec vocab and politeness vocab
• vocab_freq: frequent vocab that is not in word2vec vocab
"""

UNK = "UNK_TOKEN"

if use_existing_vocab:
    vocab_politeness = load_pickle(
        "data/Stanford_politeness_corpus/vocab_politeness.pkl")
else:
    # Load word embedding model
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    model = KeyedVectors.load_word2vec_format(fname=word2vec, binary=True)

    freq_threshold = 2

    all_tokens = [
        token for dataset in tokenized_datasets for request in dataset
        for token in request
    ]

    fdist = FreqDist(all_tokens)
    fdist_lst = fdist.most_common()
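The snippet cuts off after building the frequency distribution. Based on the docstring above, the two vocab lists would plausibly be assembled along these lines (hypothetical reconstruction, not the original code):

    # vocab_shared: tokens word2vec already covers; vocab_freq: frequent
    # tokens outside word2vec. `w in model` is gensim's membership test.
    vocab_shared = [w for (w, freq) in fdist_lst if w in model]
    vocab_freq = [w for (w, freq) in fdist_lst
                  if w not in model and freq >= freq_threshold]
    vocab_politeness = [UNK] + vocab_shared + vocab_freq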
Example #18
def load_trial_data(ind_obj1, ind_obj2, fn):
    fmat1 = util.load_pickle(fn + '/' + str(ind_obj1) + '.pkl')
    fmat2 = util.load_pickle(fn + '/' + str(ind_obj2) + '.pkl')

    return fmat1, fmat2
Example #19
def load_sbd_model(model_path='model_nb/'):
    sys.stderr.write('loading model from [%s]... ' % model_path)
    model = util.load_pickle(model_path + 'model.pkl')
    model.path = model_path
    sys.stderr.write('done!\n')
    return model
Example #20
def make_mpii_yolo():
    joint_info_full = JointInfo(
        'rank,rkne,rhip,lhip,lkne,lank,pelv,thor,neck,head,rwri,relb,rsho,lsho,lelb,lwri',
        'lsho-lelb-lwri,rsho-relb-rwri,lhip-lkne-lank,rhip-rkne-rank,neck-head,pelv-thor'
    )
    joint_info_used = JointInfo(
        'rank,rkne,rhip,lhip,lkne,lank,rwri,relb,lelb,lwri',
        'lelb-lwri,relb-rwri,lhip-lkne-lank,rhip-rkne-rank')
    selected_joints = [
        joint_info_full.ids[name] for name in joint_info_used.names
    ]

    mat_path = f'{paths.DATA_ROOT}/mpii/mpii_human_pose_v1_u12_1.mat'
    s = matlabfile.load(mat_path).RELEASE
    annolist = np.atleast_1d(s.annolist)
    all_boxes = util.load_pickle(
        f'{paths.DATA_ROOT}/mpii/yolov3_detections.pkl')

    examples = []
    with util.BoundedPool(None, 120) as pool:
        for anno_id, (anno, is_train, rect_ids) in enumerate(
                zip(annolist, util.progressbar(s.img_train), s.single_person)):
            if not is_train:
                continue

            image_path = f'{paths.DATA_ROOT}/mpii/images/{anno.image.name}'

            annorect = np.atleast_1d(anno.annorect)
            gt_people = []
            for rect_id, rect in enumerate(annorect):
                if 'annopoints' not in rect or len(rect.annopoints) == 0:
                    continue

                coords = np.full(shape=[joint_info_full.n_joints, 2],
                                 fill_value=np.nan,
                                 dtype=np.float32)
                for joint in np.atleast_1d(rect.annopoints.point):
                    coords[joint.id] = [joint.x, joint.y]

                bbox = boxlib.expand(boxlib.bb_of_points(coords), 1.25)
                coords = coords[selected_joints]
                ex = Pose2DExample(image_path, coords, bbox=bbox)
                gt_people.append(ex)

            if not gt_people:
                continue

            image_relpath = os.path.relpath(f'images/{anno.image.name}')
            boxes = [box for box in all_boxes[image_relpath] if box[-1] > 0.5]
            if not boxes:
                continue

            iou_matrix = np.array(
                [[boxlib.iou(gt_person.bbox, box[:4]) for box in boxes]
                 for gt_person in gt_people])
            gt_indices, box_indices = scipy.optimize.linear_sum_assignment(
                -iou_matrix)

            for i_gt, i_det in zip(gt_indices, box_indices):
                if iou_matrix[i_gt, i_det] > 0.1:
                    ex = gt_people[i_gt]
                    ex.bbox = np.array(boxes[i_det][:4])
                    new_im_path = image_path.replace('mpii',
                                                     'mpii_downscaled_yolo')
                    without_ext, ext = os.path.splitext(new_im_path)
                    new_im_path = f'{without_ext}_{rect_id:02d}{ext}'
                    pool.apply_async(make_efficient_example, (ex, new_im_path),
                                     callback=examples.append)

    examples.sort(key=lambda ex: ex.image_path)

    def n_valid_joints(example):
        return np.count_nonzero(np.all(~np.isnan(example.coords), axis=-1))

    examples = [ex for ex in examples if n_valid_joints(ex) > 6]

    return Pose2DDataset(joint_info_used, examples)
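The detection-to-person matching above relies on scipy's Hungarian solver: negating the IoU matrix turns linear_sum_assignment's cost minimization into total-IoU maximization. The idiom in isolation (a sketch; iou_fn stands in for any pairwise IoU function such as boxlib.iou):

import numpy as np
import scipy.optimize

def match_by_iou(gt_boxes, det_boxes, iou_fn, thresh=0.1):
    # Pair each ground-truth box with at most one detection, maximizing
    # total IoU, then keep only sufficiently overlapping pairs.
    iou = np.array([[iou_fn(g, d) for d in det_boxes] for g in gt_boxes])
    gt_idx, det_idx = scipy.optimize.linear_sum_assignment(-iou)
    return [(g, d) for g, d in zip(gt_idx, det_idx) if iou[g, d] > thresh]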
Example #21
def load_punkt_model(self, path):
    self._sent_tokenizer = util.load_pickle(path)

def setUp(self):
    config_path = util.abs_path('configure/2007test.NN.nltk.json')
    config = util.load_pickle(config_path, typ='json')
    self.flt = FeatureExtractor(config)
import util, concept_mapper
import gflags
import sys

if __name__ == '__main__':
  FLAGS = gflags.FLAGS
  gflags.DEFINE_string('task', None, 'Which task (tac08)', short_name='t')
  gflags.MarkFlagAsRequired('task')
  gflags.DEFINE_string('load', None, 'Path to pickled task data')
  gflags.MarkFlagAsRequired('load')

  try:
    argv = FLAGS(sys.argv)
  except gflags.FlagsError as e:
    sys.stderr.write('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
    sys.exit(1)

  sys.stderr.write('Loading [%s] problem data from [%s]\n' % (task.name, task.data_pickle))
  task.problems = util.load_pickle(FLAGS.load)

  for problem in task.problems:
    sents = problem.get_new_sentences()
    gold_sents = problem.get_training_sentences()
    values = []
    for sent in sents:
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', type=str, required=True)
    # parser.add_argument('--emotion_file', type=str, required=True)
    parser.add_argument('--conceptnet_path', type=str, required=True)
    parser.add_argument('--dataset_vocab_path', type=str, required=True)
    parser.add_argument('--top_k', type=int, default=100)

    args = parser.parse_args()
    input_file = args.input_file
    # emotion_file = args.emotion_file
    conceptnet_path = args.conceptnet_path
    dataset_vocab_path = args.dataset_vocab_path
    top_k = args.top_k

    concept_words = load_pickle(input_file)
    # emotions = read_file(emotion_file)
    CN = load_pickle(conceptnet_path)

    # load vocab
    print("Loading dataset vocab from ", dataset_vocab_path)
    vocab_ckpt = torch.load(dataset_vocab_path)
    word2id = vocab_ckpt["src"].base_field.vocab.stoi
    id2word = vocab_ckpt["src"].base_field.vocab.itos
    print("dataset vocab size: ", len(word2id))

    associated_concepts = defaultdict(list)
    for h,r,t,w in tqdm(CN):
        associated_concepts[h].append((r,t,w))
    
    # to clean concept words and save as np.array to save space
    input_labels = []
    for l in opt.in_labels.split('_'):
        input_labels.append(l)

    # Output labels
    if opt.out_labels is None:
        output_labels = input_labels
    else:
        output_labels = []
        for l in opt.out_labels.split('_'):
            output_labels.append(l)

    # Data
    if opt.preprocessing.find('true')>=0:
        d = extract_data(data_path, input_labels, opt.state_topic)
        ut.save_pickle(d, os.path.join(data_path, opt.saved_filename))
    else:
        d = ut.load_pickle(os.path.join(data_path, opt.saved_filename))

    if opt.plot.find('true')>=0 or opt.debug.find('true')>=0:
        plot_raw_data(d)
        ## cross_eval(d)
    else:
        rospy.init_node('obj_state_classifier_node')
        wse = obj_state_classifier(input_labels, output_labels, opt.state_topic, opt.srv_topic,
                                   debug=opt.debug)
        wse.run(d)


        
    # Predict 'soft' voting with probabilities

    pred1 = np.asarray([clf.predict(csc_matrix(X_list)) for clf in estimators])
    pred2 = np.average(pred1, axis=0, weights=weights)
    pred = np.argmax(pred2, axis=1)

    # Convert integer predictions to original labels:
    return pred
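Note that the fragment's comment mentions probabilities, yet it averages hard predict() outputs. Soft voting in the usual scikit-learn sense averages predict_proba scores; a sketch of that variant (assumes every estimator exposes predict_proba):

import numpy as np

def soft_vote(estimators, X, weights=None):
    # Average per-class probabilities across estimators, then take the
    # most probable class (sketch, not the original code).
    probs = np.asarray([clf.predict_proba(X) for clf in estimators])
    avg = np.average(probs, axis=0, weights=weights)
    return np.argmax(avg, axis=1)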


#x = export_classifiers()
#print('x', size(x))
#export(x)

trained = util.load_pickle(name='fs_1', path='..\\pickles\\feature_sets\\')
print('trained', size(trained))
test = util.load_pickle(name='fs_test_1', path='..\\pickles\\test_features\\')
print('test', size(test))

test_data = test['data_set']
featureset = 'fs_words_bigrams_pos'

X_train, y_train = trained[featureset], trained['labels']
X_test, y_test = test[featureset], test['labels']
feat_size = X_train.shape[1]
x = load_from_file()
svm = x['svm']
xgb = x['xgb']
knn = x['knn']
nb = x['nb']
Example #27
def main(args):
    """create word vector
    :param file_path: path of corpus
    :param window_size: window size
    :param shift: num of samples in w2v skip-gram negative-sampling(sgns)
    :param dim: the size of wordvec WV = [vocab_size, dim]
    """
    logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)
    logging.info(f"[INFO] args: {args}")

    logging.info("[INFO] Loading dictionary...")
    id_to_word, word_to_id = load_pickle(args.pickle_id2word)
    vocab_size = len(id_to_word)
    logging.debug(f"[DEBUG] vocab: {vocab_size} words")

    if args.cooccur_pretrained is not None:
        logging.info("[INFO] Loading pre-trained co-occur matrix...")
        C = load_matrix(args.cooccur_pretrained, len(id_to_word))
    else:
        logging.info("[INFO] Creating co-occur matrix...")
        C = create_co_matrix(args.file_path, word_to_id, vocab_size,
                             args.window_size)

        # threshold by min_count
        if args.threshold:
            C = threshold_cooccur(C, threshold=args.threshold)

        os.makedirs("model", exist_ok=True)
        c_name = "model/C_w-{}".format(args.window_size)
        with open(c_name, "w") as wp:
            for id, cooccur_each in enumerate(C):
                cooccur_nonzero = [
                    f"{id}:{c}" for id, c in enumerate(cooccur_each) if c > 0
                ]
                wp.write(f"{id}\t{' '.join(cooccur_nonzero)}\n")

    if args.sppmi_pretrained is not None:
        logging.info("[INFO] Loading pre-trained sppmi matrix...")
        M = load_matrix(args.sppmi_pretrained, len(id_to_word))
    else:
        logging.info("[INFO] Computing sppmi matrix...")
        # use smoothing or not in computing sppmi
        M = sppmi(C,
                  args.shift,
                  has_abs_dis=args.has_abs_dis,
                  has_cds=args.has_cds)
        m_name = "model/SPPMI_w-{}_s-{}".format(args.window_size, args.shift)
        with open(m_name, "w") as wp:
            for id, sppmi_each in enumerate(M):
                sppmi_nonzero = [
                    f"{id}:{m}" for id, m in enumerate(sppmi_each) if m > 0
                ]
                wp.write(f"{id}\t{' '.join(sppmi_nonzero)}\n")

    logging.info("[INFO] Calculating word vector...")
    try:
        from scipy.sparse.linalg import svds

        U, S, V = svds(coo_matrix(M), k=args.dim)
    except Exception:
        # fall back to a dense SVD if the sparse solver is unavailable or fails
        U, S, V = np.linalg.svd(M, full_matrices=False)

    word_vec = np.dot(U, np.sqrt(np.diag(S)))
    wv_name = "model/WV_d-{}_w-{}_s-{}".format(args.dim, args.window_size,
                                               args.shift)
    np.save(wv_name, word_vec[:, :args.dim])

    return
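Once the word-vector matrix is saved, nearest neighbors can be read back with cosine similarity; a small sketch using the id_to_word/word_to_id mappings loaded earlier (hypothetical helper, not part of the script):

import numpy as np

def most_similar(word_vec, word_to_id, id_to_word, query, k=5):
    # Rank all words by cosine similarity to `query` and return the top k,
    # skipping the query itself.
    v = word_vec[word_to_id[query]]
    norms = np.linalg.norm(word_vec, axis=1) * np.linalg.norm(v)
    sims = word_vec @ v / np.maximum(norms, 1e-12)
    order = np.argsort(-sims)
    return [(id_to_word[i], float(sims[i]))
            for i in order if i != word_to_id[query]][:k]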
    # ent_emb = torch.cat([ent_emb, F.normalize(ent_emb.mean(dim=0, keepdim=True), p=2, dim=-1)], dim=0)
    ent_emb = torch.cat(
        [ent_emb,
         F.normalize(torch.randn((1, ent_emb_dim)), p=2, dim=-1)],
        dim=0)
    print("ent embedding shape: ", ent_emb.shape)

    # load vocab
    print("Loading dataset vocab from ", dataset_vocab_path)
    vocab_ckpt = torch.load(dataset_vocab_path)
    word2id = vocab_ckpt["src"].base_field.vocab.stoi
    id2word = vocab_ckpt["src"].base_field.vocab.itos
    print("dataset vocab size: ", len(word2id))

    # load stopwords
    stopwords = load_pickle("./data/KB/stopwords.pkl")

    # concept_VAD_strength_softmax = torch.ones(len(concept_embedding_dict)+1)/(len(concept_embedding_dict)+1)
    print("Loading concept VAD strength dict from ",
          concept_VAD_strength_dict_path)
    concept_VAD_strength_dict = load_pickle(concept_VAD_strength_dict_path)
    concept_VAD_strength_embedding = torch.zeros(
        len(concept_VAD_strength_dict) + 1)
    for k, v in concept_VAD_strength_dict.items():
        concept_VAD_strength_embedding[ent2id[k]] = v
    concept_VAD_strength_embedding[ent2id["<pad>"]] = 0
    # concept_VAD_strength_softmax = torch.softmax(concept_VAD_strength_embedding, dim=-1)

    smaller_suffix = "-smaller" if smaller else ""
    method_suffix = "-topk"
    value_suffix = "{0}".format(top_k)
Example #29
    estimator.fit(train_data, train_target)
    test_predict = estimator.predict(test_data)
    f1 = f1_score(test_target, test_predict, average='weighted')
    return f1


if __name__ == '__main__':
    print('Loading 20newsgroup dataset for all categories')

    ############################# Load train data
    train = fetch_20newsgroups(subset='train')
    print('Train data:\n')
    print('%d documents' % len(train.filenames))
    print('%d categories' % len(train.target_names))

    train_data = load_pickle('dataset/train-data.pkl')[:100]
    train_target = train.target[:100]
    D_train = len(train_target)

    ############################# Tune LDA
    V = 1000
    kappa = 0.5
    tau0 = 64
    var_i = 100
    num_topics = 20
    sizes = [512, 256]
    alphas = [.1, .05, .01]

    pool = Pool(processes=3)
    works = []
    kf = KFold(n_splits=3)
Example #30
def make_mupots_yolo():
    all_short_names = (
        'thor,spi4,spi2,spin,pelv,neck,head,htop,lcla,lsho,lelb,lwri,lhan,rcla,rsho,relb,rwri,'
        'rhan,lhip,lkne,lank,lfoo,ltoe,rhip,rkne,rank,rfoo,rtoe'.split(','))

    # originally: [7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 4, 3, 6]
    selected_joints = [
        7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 3, 6, 4
    ]
    order_joints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 14]
    joint_names = [all_short_names[j] for j in selected_joints]
    j = p3ds.JointInfo.make_id_map(joint_names)
    edges = [(j.htop, j.head), (j.head, j.neck), (j.neck, j.lsho),
             (j.lsho, j.lelb), (j.lelb, j.lwri), (j.neck, j.rsho),
             (j.rsho, j.relb), (j.relb, j.rwri), (j.neck, j.spin),
             (j.spin, j.pelv), (j.pelv, j.lhip), (j.lhip, j.lkne),
             (j.lkne, j.lank), (j.pelv, j.rhip), (j.rhip, j.rkne),
             (j.rkne, j.rank)]
    joint_info = p3ds.JointInfo(j, edges)

    root = f'{paths.DATA_ROOT}/mupots'
    intrinsic_matrices = util.load_json(f'{root}/camera_intrinsics.json')

    dummy_coords = np.ones((joint_info.n_joints, 3))
    detections_all = util.load_pickle(f'{root}/yolov3_detections.pkl')

    examples_val = []
    examples_test = []
    for i_seq in range(1, 21):
        annotations = matlabfile.load(
            f'{root}/TS{i_seq}/annot.mat')['annotations']
        intrinsic_matrix = intrinsic_matrices[f'TS{i_seq}']
        camera = cameralib.Camera(np.zeros(3),
                                  np.eye(3),
                                  intrinsic_matrix,
                                  distortion_coeffs=None,
                                  world_up=(0, -1, 0))

        n_people = annotations.shape[1]
        n_frames = annotations.shape[0]
        for i_frame in range(n_frames):

            image_relpath = f'TS{i_seq}/img_{i_frame:06d}.jpg'
            detections_frame = detections_all[image_relpath]
            image_path = f'{root}/{image_relpath}'
            for detection in detections_frame:
                if detection[4] > 0.1:
                    ex = p3ds.Pose3DExample(image_path,
                                            dummy_coords,
                                            detection[:4],
                                            camera,
                                            mask=None,
                                            univ_coords=dummy_coords,
                                            scene_name=f'TS{i_seq}')
                    examples_test.append(ex)

            gt_people = []

            for i_person in range(n_people):
                world_coords = np.array(
                    annotations[i_frame, i_person].annot3.T[order_joints],
                    dtype=np.float32)
                univ_world_coords = np.array(
                    annotations[i_frame, i_person].univ_annot3.T[order_joints],
                    dtype=np.float32)
                im_coords = camera.world_to_image(world_coords)
                gt_box = boxlib.expand(boxlib.bb_of_points(im_coords), 1.1)
                ex = p3ds.Pose3DExample(image_path,
                                        world_coords,
                                        gt_box,
                                        camera,
                                        mask=None,
                                        univ_coords=univ_world_coords,
                                        scene_name=f'TS{i_seq}')
                gt_people.append(ex)

            confident_detections = [
                det for det in detections_frame if det[-1] > 0.1
            ]
            if confident_detections:
                iou_matrix = np.array([[
                    boxlib.iou(gt_person.bbox, box[:4])
                    for box in confident_detections
                ] for gt_person in gt_people])
                gt_indices, detection_indices = scipy.optimize.linear_sum_assignment(
                    -iou_matrix)
                for i_gt, i_det in zip(gt_indices, detection_indices):
                    if iou_matrix[i_gt, i_det] > 0.1:
                        ex = gt_people[i_gt]
                        ex.bbox = np.array(confident_detections[i_det][:4])
                        examples_val.append(ex)

    return p3ds.Pose3DDataset(joint_info,
                              valid_examples=examples_val,
                              test_examples=examples_test)
Example #31
def __init__(self):
    self.target_dir = './upload/'
    self.frame_ids_path = './data/state/frameids.pickle'
    self.frame_ids = load_pickle(self.frame_ids_path, [])

def setUp(self):
    config_path = util.abs_path('configure/2007test.NN.nltk.json')
    config = util.load_pickle(config_path, typ='json')
    self.flt = FeatureExtractor(config)
Example #33
def load_svm_sbd_model(model_file_path):
    sys.stderr.write('loading model from [%s]... ' % model_file_path)
    model = util.load_pickle(model_file_path)
    sys.stderr.write('done!\n')
    return model
def load(self):
    self.input_normalizers = load_pickle('cache/models/%s-norm.pickle' %
                                         self.name)
    self.load_weights(self.name)
Example #35
def load_punkt_model(self, path):
    self._sent_tokenizer = util.load_pickle(path)
                        type=str,
                        required=True)
    parser.add_argument('--top_k', type=int, default=100)

    args = parser.parse_args()
    input_file = args.input_file
    emotion_file = args.emotion_file
    emotion_lexicon_file = args.emotion_lexicon_file
    # conceptnet_path = args.conceptnet_path
    dataset_vocab_path = args.dataset_vocab_path
    dataset_vocab_embedding_path = args.dataset_vocab_embedding_path
    top_k = args.top_k

    num_emotional_words = top_k // 4

    concept_words = load_pickle(input_file)  # tuples of np.array
    emotions = read_file(emotion_file)
    # CN = load_pickle(conceptnet_path)
    emotion_lexicon = load_pickle(emotion_lexicon_file)
    # vocab_embedding = torch.load(dataset_vocab_embedding_path) # (vocab, emb_dim)

    # load vocab
    print("Loading dataset vocab from ", dataset_vocab_path)
    vocab_ckpt = torch.load(dataset_vocab_path)
    word2id = vocab_ckpt["src"].base_field.vocab.stoi
    id2word = vocab_ckpt["src"].base_field.vocab.itos
    print("dataset vocab size: ", len(word2id))

    new_word_ids = []
    new_word_scores = []
    new_word_VAD_scores = []
Example #37
    def create_train_datastructures(self):
        #loop through all marked datasets
        self.processor.scan_dataset = self.processor.scans_database.get_dataset(0)
          
        training_set_size = 0
        
        data = []
        #get size of training set in total
        while False != self.processor.scan_dataset:
            if self.processor.scan_dataset.is_training_set:
                
                filename = self.processor.get_features_filename(True)
                print 'loading', filename
                dict = ut.load_pickle(filename)

                # make an equal size of points for each class: use object labels more often:
                difference = np.sum(dict['labels'] == processor.LABEL_SURFACE) - np.sum(dict['labels'] == processor.LABEL_CLUTTER)
                #print ut.getTime(), filename
                #print ut.getTime(), 'surface',np.sum(dict['labels'] == LABEL_SURFACE)
                #print ut.getTime(), 'clutter',np.sum(dict['labels'] == LABEL_CLUTTER)
                #print ut.getTime(), difference, "difference = np.sum(dict['labels'] == LABEL_SURFACE) - np.sum(dict['labels'] == LABEL_CLUTTER)"
                #print ut.getTime(), ''
                if difference > 0:
                    clutter_features = (dict['features'])[np.nonzero(dict['labels'] == processor.LABEL_CLUTTER)]
                    if len(clutter_features) > 0: #if there are none, do nothin'
                        dict['set_size'] += difference
                        dict['features'] = np.vstack((dict['features'], clutter_features[np.random.randint(0,len(clutter_features),size=difference)]))
                        dict['labels'] = np.hstack((dict['labels'], np.ones(difference) * processor.LABEL_CLUTTER))
                elif difference < 0: 
                    surface_features = (dict['features'])[np.nonzero(dict['labels'] == processor.LABEL_SURFACE)]
                    if len(surface_features) > 0: #if there are none, do nothin'
                        difference = -difference
                        dict['set_size'] += difference
                        dict['features'] = np.vstack((dict['features'], surface_features[np.random.randint(0,len(surface_features),size=difference)]))
                        dict['labels'] = np.hstack((dict['labels'], np.ones(difference) * processor.LABEL_SURFACE))
                    
                training_set_size += dict['set_size']
                data.append(dict)
            #get next one
            self.processor.scan_dataset = self.processor.scans_database.get_next_dataset()
            #print ut.getTime(),  self.scan_dataset
        
        #create training set:
        self.processor.scan_dataset = self.processor.scans_database.get_dataset(0)
        current_training_set_index = 0
        
       
        feature_vector_length = len(self.processor.features.get_indexvector(self.features))
        print ut.getTime(), feature_vector_length
        #create dataset matrices:
        print ut.getTime(), '#training set size ', training_set_size 
        
        #deactivate for now:
        max_traning_size = 1800000#2040000
        #if training_set_size < max_traning_size:
        #if True:       
        train_data = cv.cvCreateMat(training_set_size,feature_vector_length,cv.CV_32FC1) #CvMat* cvCreateMat(int rows, int cols, int type)
        train_labels = cv.cvCreateMat(training_set_size,1,cv.CV_32FC1)
        
        for dict in data:        
            for index in range(dict['set_size']):
                #only train on surface and clutter
                if dict['labels'][index] == processor.LABEL_SURFACE or dict['labels'][index]== processor.LABEL_CLUTTER:
                
                    #print ut.getTime(), point3d
                    #print ut.getTime(), 'fvindexv',self.get_features_indexvector(features)
                    #print ut.getTime(), 'len', len(self.get_features_indexvector(features))
                    fv = (dict['features'][index])[self.processor.features.get_indexvector(self.features)]

                    #print ut.getTime(), 'fv',fv
                    #print ut.getTime(), np.shape(fv)
                    for fv_index, fv_value in enumerate(fv):
                        train_data[current_training_set_index][fv_index] = fv_value
                    train_labels[current_training_set_index] = dict['labels'][index]
#                    for fv_index, fv_value in enumerate(fv):
#                        print ut.getTime(), train_data[current_training_set_index][fv_index]
#                    print ut.getTime(), '##',train_labels[current_training_set_index],'##'                    
                    #print ut.getTime(), 'fv ', fv
                    #print ut.getTime(), 'tr ',train_data[index]
                    current_training_set_index = current_training_set_index + 1
        
                    #if current_training_set_index % 4096 == 0:
                    #    print ut.getTime(), 'label', dict['labels'][index], 'fv', fv        
                    if current_training_set_index %  16384 == 0:
                        print ut.getTime(), 'reading features:', current_training_set_index, 'of', training_set_size, '(',(float(current_training_set_index)/float(training_set_size)*100.0),'%)'
##subsample from the features, NOT USED/NOT WORKING?
#        else:
#            print ut.getTime(), 'more than',max_traning_size,'features, sample from them...'
#            #select 2040000 features:
#            all_data = []
#            all_labels = []
#            for dict in data:  
#                for index in range(dict['set_size']):
#                    if dict['labels'][index] == processor.LABEL_SURFACE or dict['labels'][index]== processor.LABEL_CLUTTER:
#                        fv = (dict['features'][index])[self.processor.features.get_indexvector(self.features)]
#                        all_data += [fv]
#                        all_labels += [dict['labels'][index]]
#                        
#                        current_training_set_index = current_training_set_index + 1    
#                        if current_training_set_index %  16384 == 0:
#                            print ut.getTime(), 'reading features:', current_training_set_index, 'of', training_set_size, '(',(float(current_training_set_index)/float(training_set_size)*100.0),'%)'
#            
#            del data
#            indices = np.array(random.sample(xrange(len(all_labels)),max_traning_size))
#            all_data = np.asarray(all_data)
#            all_labels = np.asarray(all_labels)
#            
#            all_data = all_data[indices]
#            all_labels = all_labels[indices]
#            
#            train_data = cv.cvCreateMat(max_traning_size,feature_vector_length,cv.CV_32FC1) #CvMat* cvCreateMat(int rows, int cols, int type)
#            train_labels = cv.cvCreateMat(max_traning_size,1,cv.CV_32FC1)
#                        
#            for index in range(max_traning_size):
#                for fv_index, fv_value in enumerate(all_data[index]):
#                    train_data[index][fv_index] = fv_value
#                    train_labels[index] = all_labels[index]
#                if index % 16384 == 0:
#                    print ut.getTime(), 'setting features:', (float(index)/float(max_traning_size))
#          
          
        print ut.getTime(), 'start training Classifier'

        type_mask = cv.cvCreateMat(1, feature_vector_length+1, cv.CV_8UC1)
        cv.cvSet( type_mask, cv.CV_VAR_NUMERICAL, 0)
        type_mask[feature_vector_length] = cv.CV_VAR_CATEGORICAL
        
        return (train_data, train_labels, type_mask)
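The bulk of create_train_datastructures is the class-balancing step: whichever of LABEL_SURFACE / LABEL_CLUTTER is under-represented gets randomly oversampled until the counts match. The step in isolation, as a plain numpy sketch:

import numpy as np

def balance_two_classes(features, labels, label_a, label_b):
    # Oversample the minority class (with replacement) until both label
    # counts are equal; mirrors the difference-based logic above.
    diff = np.sum(labels == label_a) - np.sum(labels == label_b)
    if diff == 0:
        return features, labels
    minority = label_b if diff > 0 else label_a
    extra = features[labels == minority]
    if len(extra) == 0:
        return features, labels
    idx = np.random.randint(0, len(extra), size=abs(diff))
    features = np.vstack((features, extra[idx]))
    labels = np.hstack((labels, np.full(abs(diff), minority)))
    return features, labels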
Example #38
def make_mpi_inf_3dhp(camera_ids=(0, 1, 2, 4, 5, 6, 7, 8)):
    all_short_names = (
        'spi3,spi4,spi2,spin,pelv,neck,head,htop,lcla,lsho,lelb,lwri,lhan,rcla,rsho,relb,rwri,'
        'rhan,lhip,lkne,lank,lfoo,ltoe,rhip,rkne,rank,rfoo,rtoe'.split(','))

    test_set_selected_joints = [*range(14), 15, 16, 14]
    selected_joints = [
        7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 3, 6, 4
    ]
    joint_names = [all_short_names[j] for j in selected_joints]

    edges = (
        'htop-head-neck-lsho-lelb-lwri,neck-rsho-relb-rwri,neck-spin-pelv-lhip-lkne-lank,'
        'pelv-rhip-rkne-rank')
    joint_info = p3ds.JointInfo(joint_names, edges)

    root_3dhp = f'{paths.DATA_ROOT}/3dhp'
    detections_all = util.load_pickle(
        f'{paths.DATA_ROOT}/3dhp/yolov3_person_detections.pkl')

    #################################
    # TRAINING AND VALIDATION SET
    #################################
    num_frames = np.asarray([[6416, 12430], [6502, 6081], [12488, 12283],
                             [6171, 6675], [12820, 12312], [6188, 6145],
                             [6239, 6320], [6468, 6054]])

    train_subjects = [0, 1, 2, 3, 4, 5, 6]
    valid_subjects = [
        7
    ]  # this is my own arbitrary split for validation (Istvan Sarandi)
    train_examples = []
    valid_examples = []

    pool = util.BoundedPool(None, 120)
    for i_subject, i_seq, i_cam in itertools.product(
            train_subjects + valid_subjects, range(2), camera_ids):
        seqpath = f'{root_3dhp}/S{i_subject + 1}/Seq{i_seq + 1}'
        print(f'Processing {seqpath} camera {i_cam}')

        cam3d_coords = [
            ann.reshape([ann.shape[0], -1, 3])[:, selected_joints]
            for ann in matlabfile.load(f'{seqpath}/annot.mat')['annot3']
        ]
        univ_cam3d_coords = [
            ann.reshape([ann.shape[0], -1, 3])[:, selected_joints]
            for ann in matlabfile.load(f'{seqpath}/annot.mat')['univ_annot3']
        ]
        cameras = load_cameras(f'{seqpath}/camera.calibration')

        examples_container = train_examples if i_subject in train_subjects else valid_examples
        frame_step = 5

        prev_coords = None
        camera = cameras[i_cam]
        n_frames = num_frames[i_subject, i_seq]

        if i_subject == 5 and i_seq == 1 and i_cam == 2:
            # This video is shorter for some reason
            n_frames = 3911

        for i_frame in util.progressbar(range(0, n_frames, frame_step)):
            image_relpath = (f'3dhp/S{i_subject + 1}/Seq{i_seq + 1}/'
                             f'imageSequence/img_{i_cam}_{i_frame:06d}.jpg')

            cam_coords = cam3d_coords[i_cam][i_frame]
            world_coords = cameras[i_cam].camera_to_world(cam_coords)

            univ_camcoords = univ_cam3d_coords[i_cam][i_frame]
            univ_world_coords = cameras[i_cam].camera_to_world(univ_camcoords)

            # Check if the joints are within the image frame bounds
            if not np.all(camera.is_visible(world_coords, [2048, 2048])):
                continue

            im_coords = camera.camera_to_image(cam_coords)
            bbox = get_bbox(im_coords, image_relpath, detections_all)

            # Adaptive temporal sampling
            if (prev_coords is not None and np.all(
                    np.linalg.norm(world_coords - prev_coords, axis=1) < 100)):
                continue
            prev_coords = world_coords

            mask_path = image_relpath.replace('imageSequence', 'FGmasks')
            new_image_relpath = image_relpath.replace('3dhp',
                                                      '3dhp_downscaled')
            ex = p3ds.Pose3DExample(image_relpath,
                                    world_coords,
                                    bbox,
                                    camera,
                                    mask=mask_path,
                                    univ_coords=univ_world_coords)

            pool.apply_async(make_efficient_example,
                             (ex, new_image_relpath, 1, True),
                             callback=examples_container.append)

    print('Waiting for tasks...')
    pool.close()
    pool.join()
    print('Done...')
    #################################
    # TEST SET
    #################################
    test_examples = []
    cam1_4 = get_test_camera_subj1_4()
    cam5_6 = get_test_camera_subj5_6()

    activity_names = [
        'Stand/Walk', 'Exercise', 'Sit on Chair', 'Reach/Crouch', 'On Floor',
        'Sports', 'Misc.'
    ]
    for i_subject in range(1, 7):
        seqpath = f'{root_3dhp}/TS{i_subject}'
        annotation_path = f'{seqpath}/annot_data.mat'

        with h5py.File(annotation_path, 'r') as m:
            cam3d_coords = np.array(m['annot3'])[:, 0,
                                                 test_set_selected_joints]
            univ_cam3d_coords = np.array(
                m['univ_annot3'])[:, 0, test_set_selected_joints]
            valid_frames = np.where(m['valid_frame'][:, 0])[0]
            activity_ids = m['activity_annotation'][:, 0].astype(int) - 1

        camera = cam1_4 if i_subject <= 4 else cam5_6
        scene = ['green-screen', 'no-green-screen',
                 'outdoor'][(i_subject - 1) // 2]

        for i_frame in valid_frames:
            image_relpath = f'3dhp/TS{i_subject}/imageSequence/img_{i_frame + 1:06d}.jpg'
            cam_coords = cam3d_coords[i_frame]
            univ_camcoords = univ_cam3d_coords[i_frame]
            activity = activity_names[activity_ids[i_frame]]
            world_coords = camera.camera_to_world(cam_coords)
            univ_world_coords = camera.camera_to_world(univ_camcoords)
            im_coords = camera.camera_to_image(cam_coords)
            bbox = get_bbox(im_coords, image_relpath, detections_all)

            ex = p3ds.Pose3DExample(image_relpath,
                                    world_coords,
                                    bbox,
                                    camera,
                                    activity_name=activity,
                                    scene_name=scene,
                                    univ_coords=univ_world_coords)
            test_examples.append(ex)

    train_examples.sort(key=lambda x: x.image_path)
    valid_examples.sort(key=lambda x: x.image_path)
    test_examples.sort(key=lambda x: x.image_path)
    return p3ds.Pose3DDataset(joint_info, train_examples, valid_examples,
                              test_examples)
Example #39
def run_demo(best_path, record_save_path, model_type):
    print("============Begin Testing============")
    test_record_path = f'{record_save_path}/test_record.csv'
    dataloader = util.load_dataset(device, args.data_path, args.batch_size,
                                   args.batch_size, args.batch_size)
    g_temp = util.add_nodes_edges(adj_filename=args.adj_path,
                                  num_of_vertices=args.num_nodes)
    scaler = dataloader['scaler']
    run_gconv = 1
    lr_decay_rate = 0.97

    sensor_ids, sensor_id_to_ind, adj_mx = util.load_adj(
        args.adj_path_forbase, args.adjtype)
    supports = [torch.tensor(i).to(device) for i in adj_mx]

    _, _, A = util.load_pickle(args.adj_path_forbase)
    A_wave = util.get_normalized_adj(A)
    A_wave = torch.from_numpy(A_wave).to(device)
    # print("A_wave:", A_wave.shape, type(A_wave))
    best_mae = 100

    if args.randomadj:
        adjinit = None
    else:
        adjinit = supports[0]
    if args.aptonly:
        supports = None

    if model_type == "GWaveNet":
        print("=========Model:GWaveNet=========")
        print("with scaler")
        model = GWNET(device,
                      args.num_nodes,
                      args.dropout,
                      supports=supports,
                      gcn_bool=args.gcn_bool,
                      addaptadj=args.addaptadj,
                      aptinit=adjinit,
                      in_dim=args.in_dim,
                      out_dim=args.seq_length,
                      residual_channels=args.nhid,
                      dilation_channels=args.nhid,
                      skip_channels=args.nhid * 8,
                      end_channels=args.nhid * 16)

    if model_type == "STGCN":
        print("=========Model:STGCN=========")
        print("with scaler")
        model = STGCN(A_wave.shape[0],
                      2,
                      num_timesteps_input=12,
                      num_timesteps_output=12)

    if model_type == "LSTM":
        print("=========Model:LSTM=========")
        input_dim = 2
        hidden_dim = 2
        output_dim = 2
        model = LSTM(input_dim, hidden_dim, output_dim)

    model.to(device)
    model.zero_grad()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    optimizer.zero_grad()
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lr_lambda=lambda epoch: lr_decay_rate**epoch)

    if torch.cuda.is_available():
        model.load_state_dict(torch.load(best_path))
    else:
        model.load_state_dict(torch.load(best_path, map_location='cpu'))

    outputs = []
    target = torch.Tensor(dataloader['y_test']).to(device)
    target = target[:, :, :, 0]
    print("201 y_test:", target.shape)

    for iter, (x, y) in enumerate(dataloader['test_loader'].get_iterator()):
        testx = torch.Tensor(x).to(device).transpose(1, 3)
        testx = nn.functional.pad(testx, (1, 0, 0, 0))
        with torch.no_grad():
            pred = model.forward(testx).squeeze(3)
        print("iter: ", iter)
        print("pred: ", pred.shape)
        outputs.append(pred)

    yhat = torch.cat(outputs, dim=0)
    yhat = yhat[:target.size(0), ...]
    test_record, amape, armse, amae = [], [], [], []

    pred = scaler.inverse_transform(yhat)
    for i in range(12):
        pred_t = pred[:, i, :]
        real_target = target[:, i, :]
        evaluation = evaluate_all(pred_t, real_target)
        log = 'test for horizon {:d}, Test MAPE: {:.4f}, Test RMSE: {:.4f}, Test MAE: {:.4f}'
        print(log.format(i + 1, evaluation[0], evaluation[1], evaluation[2]))
        amape.append(evaluation[0])
        armse.append(evaluation[1])
        amae.append(evaluation[2])
        test_record.append([x for x in evaluation])
    test_record_df = pd.DataFrame(test_record,
                                  columns=['mape', 'rmse',
                                           'mae']).rename_axis('t')
    test_record_df.round(3).to_csv(test_record_path)
    log = 'On average over 12 horizons, Test MAE: {:.4f}, Test MAPE: {:.4f}, Test RMSE: {:.4f}'
    print(log.format(np.mean(amae), np.mean(amape), np.mean(armse)))
    print("=" * 10)
      sent_length = max(bfeat['sent_length']) / avg_sent_len
      sent_title_sim = max(bfeat['sent-title_sim'])
      sent_query_sim = max(bfeat['sent-query_sim'])

      bfeat['sent_pos'] = sent_pos
      bfeat['sent_length'] = sent_length
      bfeat['sent-title_sim'] = sent_title_sim
      bfeat['sent-query_sim'] = sent_query_sim

      bfeat['sent_ratio'] /= num_sents

if __name__ == '__main__':
  import concept_mapper, sys

  #problems = util.load_pickle('dat/tac09_data.pickle')
  #util.save_pickle(problems[0], 'dat/tac09_prob_1.pickle')
  problems = [util.load_pickle('dat/tac09_prob_1.pickle')]
  for problem in problems:
    sys.stderr.write("%s %d %d\n" % (problem.id, len(problem.new_docs), sum([len(doc.sentences) for doc in problem.new_docs])))
    mapper = concept_mapper.HeuristicMapper(problem)
    mapper.map_concepts()
    feature = Feature(mapper.concepts, problem)
    feature.get_bigram_feat()
    for bigram in mapper.concepts:
      print feature.feat_to_string(bigram)
      sys.exit(0)




Example #41
def make_mpi_inf_3dhp(camera_ids=(0, 1, 2, 4, 5, 6, 7, 8)):
    all_short_names = (
        'spi3,spi4,spi2,spin,pelv,neck,head,htop,lcla,lsho,lelb,lwri,lhan,rcla,rsho,relb,rwri,'
        'rhan,lhip,lkne,lank,lfoo,ltoe,rhip,rkne,rank,rfoo,rtoe'.split(','))

    # originally: [7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 4, 3, 6]
    test_set_selected_joints = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 14
    ]
    selected_joints = [
        7, 5, 14, 15, 16, 9, 10, 11, 23, 24, 25, 18, 19, 20, 3, 6, 4
    ]
    joint_names = [all_short_names[j] for j in selected_joints]

    edges = (
        'htop-head-neck-lsho-lelb-lwri,neck-rsho-relb-rwri,neck-spin-pelv-lhip-lkne-lank,'
        'pelv-rhip-rkne-rank')
    joint_info = p3ds.JointInfo(joint_names, edges)

    root_3dhp = f'{paths.DATA_ROOT}/3dhp'
    detections_all = util.load_pickle(
        f'{paths.DATA_ROOT}/3dhp/yolov3_person_detections.pkl')

    #################################
    # TRAINING AND VALIDATION SET
    #################################
    num_frames = np.asarray([[6416, 12430], [6502, 6081], [12488, 12283],
                             [6171, 6675], [12820, 12312], [6188, 6145],
                             [6239, 6320], [6468, 6054]])

    train_subjects = [0, 1, 2, 3, 4, 5, 6]
    valid_subjects = [
        7
    ]  # this is my own arbitrary split for validation (Istvan Sarandi)
    train_examples = []
    valid_examples = []

    pool = util.BoundedPool(None, 120)
    for i_subject, i_seq, i_cam in itertools.product(
            train_subjects + valid_subjects, range(2), camera_ids):

        seqpath = f'{root_3dhp}/S{i_subject + 1}/Seq{i_seq + 1}'
        print(seqpath, i_cam)
        cam3d_coords = [
            ann.reshape([ann.shape[0], -1, 3])[:, selected_joints]
            for ann in matlabfile.load(f'{seqpath}/annot.mat')['annot3']
        ]
        univ_cam3d_coords = [
            ann.reshape([ann.shape[0], -1, 3])[:, selected_joints]
            for ann in matlabfile.load(f'{seqpath}/annot.mat')['univ_annot3']
        ]
        cameras = load_cameras(f'{seqpath}/camera.calibration')

        examples_container = train_examples if i_subject in train_subjects else valid_examples
        frame_step = 5

        prev_coords = None
        camera = cameras[i_cam]
        n_frames = num_frames[i_subject, i_seq]

        if i_subject == 5 and i_seq == 1 and i_cam == 2:
            # This video is shorter for some reason
            n_frames = 3911

        for i_frame in util.progressbar(range(0, n_frames, frame_step)):
            image_relpath = (
                f'3dhp/S{i_subject + 1}/Seq{i_seq + 1}/Images/video_{i_cam}/' +
                f'frame_{i_frame:06d}.jpg')

            cam_coords = cam3d_coords[i_cam][i_frame]
            world_coords = cameras[i_cam].camera_to_world(cam_coords)

            univ_camcoords = univ_cam3d_coords[i_cam][i_frame]
            univ_world_coords = cameras[i_cam].camera_to_world(univ_camcoords)

            # Check if the joints are within the image frame bounds
            if not np.all(camera.is_visible(world_coords, [2048, 2048])):
                continue

            im_coords = camera.camera_to_image(cam_coords)
            bbox = get_bbox(im_coords, image_relpath, detections_all)

            # Adaptive temporal sampling: skip the frame if every joint moved
            # less than 100 mm since the last kept frame
            if (prev_coords is not None and np.all(
                    np.linalg.norm(world_coords - prev_coords, axis=1) < 100)):
                continue
            prev_coords = world_coords

            ex = p3ds.Pose3DExample(image_relpath,
                                    world_coords,
                                    bbox,
                                    camera,
                                    mask=None,
                                    univ_coords=univ_world_coords)

            pool.apply_async(make_efficient_example, (ex, ),
                             callback=examples_container.append)

    print('Waiting for tasks...')
    pool.close()
    pool.join()
    print('Done...')
    #################################
    # TEST SET
    #################################
    test_examples = []

    cam1_4 = make_3dhp_test_camera(
        sensor_size=np.array([10, 10]),
        im_size=np.array([2048, 2048]),
        focal_length=7.32506,
        pixel_aspect=1.00044,
        center_offset=np.array([-0.0322884, 0.0929296]),
        distortion=None,
        origin=np.array([3427.28, 1387.86, 309.42]),
        up=np.array([-0.208215, 0.976233, 0.06014]),
        right=np.array([0.000575281, 0.0616098, -0.9981]))

    cam5_6 = make_3dhp_test_camera(
        sensor_size=np.array([10, 5.625]),
        im_size=np.array([1920, 1080]),
        focal_length=8.770747185,
        pixel_aspect=0.993236423,
        center_offset=np.array([-0.104908645, 0.104899704]),
        distortion=np.array([
            -0.276859611, 0.131125256, -0.000360494, -0.001149441, -0.049318332
        ]),
        origin=np.array([-2104.3074, 1038.6707, -4596.6367]),
        up=np.array([0.025272345, 0.995038509, 0.096227370]),
        right=np.array([-0.939647257, -0.009210289, 0.342020929]))

    activity_names = [
        'Stand/Walk', 'Exercise', 'Sit on Chair', 'Reach/Crouch', 'On Floor',
        'Sports', 'Misc.'
    ]
    for i_subject in range(1, 7):
        seqpath = f'{root_3dhp}/TS{i_subject}'
        annotation_path = f'{seqpath}/annot_data.mat'

        with h5py.File(annotation_path, 'r') as m:
            cam3d_coords = np.array(m['annot3'])[:, 0,
                                                 test_set_selected_joints]
            univ_cam3d_coords = np.array(
                m['univ_annot3'])[:, 0, test_set_selected_joints]
            valid_frames = np.where(m['valid_frame'][:, 0])[0]
            activity_ids = m['activity_annotation'][:, 0].astype(int) - 1

        camera = cam1_4 if i_subject <= 4 else cam5_6
        scene = ['green-screen', 'no-green-screen',
                 'outdoor'][(i_subject - 1) // 2]

        for i_frame in valid_frames:
            image_relpath = f'3dhp/TS{i_subject}/imageSequence/img_{i_frame + 1:06d}.jpg'
            cam_coords = cam3d_coords[i_frame]
            univ_camcoords = univ_cam3d_coords[i_frame]

            activity = activity_names[activity_ids[i_frame]]
            world_coords = camera.camera_to_world(cam_coords)
            univ_world_coords = camera.camera_to_world(univ_camcoords)
            im_coords = camera.camera_to_image(cam_coords)
            bbox = get_bbox(im_coords, image_relpath, detections_all)

            ex = p3ds.Pose3DExample(image_relpath,
                                    world_coords,
                                    bbox,
                                    camera,
                                    activity_name=activity,
                                    scene_name=scene,
                                    univ_coords=univ_world_coords)
            test_examples.append(ex)

    train_examples.sort(key=lambda x: x.image_path)
    valid_examples.sort(key=lambda x: x.image_path)
    test_examples.sort(key=lambda x: x.image_path)
    return p3ds.Pose3DDataset(joint_info, train_examples, valid_examples,
                              test_examples)
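
# The adaptive temporal sampling above keeps a frame only when at least one
# joint has moved 100 mm or more since the last kept frame. A minimal
# standalone sketch of the same rule (the function name is hypothetical; the
# threshold comes from the loop above):
import numpy as np

def thin_sequence(coords_per_frame, min_movement_mm=100):
    kept, prev = [], None
    for i, coords in enumerate(coords_per_frame):
        # coords: (num_joints, 3) array in millimeters
        if prev is None or np.any(
                np.linalg.norm(coords - prev, axis=1) >= min_movement_mm):
            kept.append(i)
            prev = coords
    return kept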
Example #42
def __load_index__(self):
    self._bcode_off_map = util.load_pickle(self.index_path)
Example #43
    def prepare(self, features_k_nearest_neighbors, nonzero_indices=None, all_save_load=False, regenerate_neightborhood_indices=False):
        #print np.shape(self.processor.pts3d_bound), 'shape pts3d_bound'

        imgTmp = cv.cvCloneImage(self.processor.img)
        self.imNP = ut.cv2np(imgTmp,format='BGR')
        ###self.processor.map2d = np.asarray(self.processor.camPts_bound) #copied from laser to image mapping
        
        if features_k_nearest_neighbors is None or features_k_nearest_neighbors is False:  # use range
            self.kdtree2d = kdtree.KDTree(self.processor.pts3d_bound.T)
            
            #print len(nonzero_indices)
            #print np.shape(np.asarray((self.processor.pts3d_bound.T)[nonzero_indices]))
            
            if nonzero_indices is not None:
                print ut.getTime(), 'query ball tree for ', len(nonzero_indices), 'points'
                kdtree_query = kdtree.KDTree((self.processor.pts3d_bound.T)[nonzero_indices])
            else:
                print ut.getTime(), 'query ball tree'
                kdtree_query = kdtree.KDTree(self.processor.pts3d_bound.T)
            
            filename = self.processor.config.path+'/data/'+self.processor.scan_dataset.id+'_sphere_neighborhood_indices_'+str(self.processor.feature_radius)+'.pkl'
            if all_save_load and os.path.exists(filename) and not regenerate_neightborhood_indices:
                #if its already there, load it:
                print ut.getTime(), 'loading',filename
                self.kdtree_queried_indices = ut.load_pickle(filename)    
            else:
                self.kdtree_queried_indices = kdtree_query.query_ball_tree(self.kdtree2d, self.processor.feature_radius, 2.0, 0.2) #approximate
                print ut.getTime(), 'queried kdtree: ',len(self.kdtree_queried_indices),'points, radius:',self.processor.feature_radius
                if all_save_load:
                    ut.save_pickle(self.kdtree_queried_indices, filename)
                    
            #make dict out of list for faster operations? (doesn't seem to change speed significantly):
            #self.kdtree_queried_indices = dict(zip(xrange(len(self.kdtree_queried_indices)), self.kdtree_queried_indices))
        
        else:  # experimental: use the k nearest neighbors (features_k_nearest_neighbors) instead of a radius
            #TODO: exclude invalid values in get_featurevector (uncomment code there)
           
            self.kdtree2d = kdtree.KDTree(self.processor.pts3d_bound.T)
            self.kdtree_queried_indices = []
            print ut.getTime(), 'kdtree single queries for kNN start, k=', features_k_nearest_neighbors
            count = 0
            for point in ((self.processor.pts3d_bound.T)[nonzero_indices]):
                count = count + 1
                result = self.kdtree2d.query(point, features_k_nearest_neighbors, 0.2, 2, self.processor.feature_radius)
                #existing = result[0][0] != np.Inf
                #print existing
                #print result[1]
                self.kdtree_queried_indices += [result[1]] #[existing]
                if count % 4096 == 0:
                    print ut.getTime(),count
            print ut.getTime(), 'kdtree single queries end'
            
            #convert to numpy array -> faster access
            self.kdtree_queried_indices = np.asarray(self.kdtree_queried_indices)
        
        #print self.kdtree_queried_indices
        #takes long to compute:
        #avg_len = 0
        #minlen = 999999
        #maxlen = 0
        #for x in self.kdtree_queried_indices:
        #    avg_len += len(x)
        #    minlen = min(minlen, len(x))
        #    maxlen = max(maxlen, len(x))
        #avg_len = avg_len / len(self.kdtree_queried_indices)
        #print ut.getTime(), "range neighbors: avg_len", avg_len, 'minlen', minlen, 'maxlen', maxlen
        
        
        #create HSV numpy images:
        # compute the hsv version of the image 
        image_size = cv.cvGetSize(self.processor.img)
        img_h = cv.cvCreateImage (image_size, 8, 1)
        img_s = cv.cvCreateImage (image_size, 8, 1)
        img_v = cv.cvCreateImage (image_size, 8, 1)
        img_hsv = cv.cvCreateImage (image_size, 8, 3)
        
        cv.cvCvtColor (self.processor.img, img_hsv, cv.CV_BGR2HSV)
        
        cv.cvSplit (img_hsv, img_h, img_s, img_v, None)
        self.imNP_h = ut.cv2np(img_h)
        self.imNP_s = ut.cv2np(img_s)
        self.imNP_v = ut.cv2np(img_v)
        
        textures = texture_features.eigen_texture(self.processor.img)
        self.imNP_tex1 = textures[:,:,0]
        self.imNP_tex2 = textures[:,:,1]
        
        self.debug_before_first_featurevector = True
        
        self.generate_voi_histogram(self.processor.point_of_interest,self.processor.voi_width)
Example #44
def load_sbd_model(model_path = 'model_nb/'):
    sys.stderr.write('loading model from [%s]... ' %model_path)
    model = util.load_pickle(model_path + 'model.pkl')
    model.path = model_path
    sys.stderr.write('done!\n')
    return model
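
# A minimal usage sketch (the cache dict and wrapper name are hypothetical):
# unpickling dominates the cost, so load each model once per path and reuse it.
_sbd_models = {}

def get_sbd_model(model_path='model_nb/'):
    if model_path not in _sbd_models:
        _sbd_models[model_path] = load_sbd_model(model_path)
    return _sbd_models[model_path]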
Example #45
  if ilp_path is None:
    ilp_path = 'dat/%s/ilps' % (task_name)
  sent_path = args.sentpath
  if sent_path is None:
    sent_path = 'dat/%s/sents' % (task_name)
  maxrouge_path = args.maxrougepath
  if maxrouge_path is None:
    maxrouge_path = 'dat/%s/solutions/maxrouge' % (task_name)
  bigram_path = 'dat/%s/features' % (task_name)
  task = Task(task_name, topic_file, doc_path, man_path)

  # Get documents, split into sentences, tokenize and stem
  if args.load is not None:
    start_time = time.time()
    sys.stderr.write('Loading [%s] problem data from [%s]\n' % (task.name, args.load))
    task.problems = util.load_pickle(args.load)
    sys.stderr.write('Done [%.2f s]\n' % (time.time() - start_time))
  else:
    text.text_processor.load_splitta_model('lib/splitta/model_nb/')
    # Skip update data
    if task_name[:3] == 'tac':
      framework.setup_TAC08(task, True)
    elif task_name[:3] == 'duc':
      framework.setup_DUC_basic(task, True)
    elif task_name[:3] == 'new':
      framework.setup_news(task)
    else:
      raise Exception('Unknown task %s' % task)
    if task_name[:3] != 'new':
      for problem in task.problems:
        problem.load_documents()
Example #46
def load_svm_sbd_model(model_file_path):
    sys.stderr.write('loading model from [%s]... ' % model_file_path)
    model = util.load_pickle(model_file_path)
    sys.stderr.write('done!\n')
    return model
Example #47
import json
import logging

import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import (learning_curve, validation_curve,
                                     cross_val_score, GridSearchCV)
from sklearn.pipeline import Pipeline

from util import load_pickle  # assumed local helper, as in the other examples

############################# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

print('Loading 20newsgroup dataset for all categories')

############################# Load train data
train = fetch_20newsgroups(subset='train')
# train_data = [preprocessor(doc) for doc in train.data]
# save_pickle(train_data, 'dataset/train-data.pkl')
train_data = load_pickle('dataset/train-data.pkl')
train_target = train.target

print('Train data:\n')
print('%d documents' % len(train.filenames))
print('%d categories' % len(train.target_names))
# print(train.target_names[0])
# print(np.where(train.target == 0))
# print(train_target)
# print(train.filenames)

############################# Preprocess
preprocess = Pipeline([('count',
                        CountVectorizer(stop_words='english',
                                        max_df=.75,
                                        ngram_range=(1, 1),
Example #48
def make_efficient_example(ex):
    image_relpath = ex.image_path
    max_rotate = np.pi / 6
    padding_factor = 1 / 0.85
    scale_up_factor = 1 / 0.85
    scale_down_factor = 1 / 0.85
    shift_factor = 1.2
    base_dst_side = 256

    box_center = boxlib.center(ex.bbox)
    s, c = np.sin(max_rotate), np.cos(max_rotate)
    w, h = ex.bbox[2:]
    rot_bbox_side = max(c * w + s * h, c * h + s * w)
    rot_bbox = boxlib.box_around(box_center, rot_bbox_side)

    scale_factor = min(base_dst_side / np.max(ex.bbox[2:]) * scale_up_factor,
                       1)
    expansion_factor = padding_factor * shift_factor * scale_down_factor
    expanded_bbox = boxlib.expand(rot_bbox, expansion_factor)
    expanded_bbox = boxlib.intersect(expanded_bbox,
                                     np.array([0, 0, 2048, 2048]))

    new_camera = ex.camera.copy()
    new_camera.intrinsic_matrix[:2, 2] -= expanded_bbox[:2]
    new_camera.scale_output(scale_factor)
    new_camera.undistort()
    dst_shape = improc.rounded_int_tuple(scale_factor * expanded_bbox[[3, 2]])

    new_im_relpath = ex.image_path.replace('3dhp', '3dhp_downscaled')
    new_im_path = os.path.join(paths.DATA_ROOT, new_im_relpath)
    if not (util.is_file_newer(new_im_path, "2019-11-14T23:32:07")
            and improc.is_image_readable(new_im_path)):
        im = improc.imread_jpeg(f'{paths.DATA_ROOT}/{image_relpath}')
        new_im = cameralib.reproject_image(im, ex.camera, new_camera,
                                           dst_shape)
        util.ensure_path_exists(new_im_path)
        imageio.imwrite(new_im_path, new_im)

    new_bbox_topleft = cameralib.reproject_image_points(
        ex.bbox[:2], ex.camera, new_camera)
    new_bbox = np.concatenate([new_bbox_topleft, ex.bbox[2:] * scale_factor])

    mask_rle_relpath = new_im_path.replace('Images', 'FGmaskImages').replace(
        '.jpg', '.pkl')
    mask_rle_path = os.path.join(paths.DATA_ROOT, mask_rle_relpath)
    if util.is_file_newer(mask_rle_path, "2020-03-11T20:46:46"):
        mask_runlength = util.load_pickle(mask_rle_path)
    else:
        mask_relpath = ex.image_path.replace('Images', 'FGmaskImages').replace(
            '.jpg', '.png')
        mask = imageio.imread(os.path.join(paths.DATA_ROOT, mask_relpath))
        mask_reproj = cameralib.reproject_image(mask, ex.camera, new_camera,
                                                dst_shape)
        mask_runlength = get_mask_with_highest_iou(mask_reproj, new_bbox)
        util.dump_pickle(mask_runlength, mask_rle_path)

    return p3ds.Pose3DExample(new_im_relpath,
                              ex.world_coords,
                              new_bbox,
                              new_camera,
                              mask=mask_runlength,
                              univ_coords=ex.univ_coords)
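
# The timestamp-guarded caching pattern above (util.is_file_newer plus
# load_pickle/dump_pickle) generalizes to a small helper; a sketch assuming
# only that those util functions behave as they are used above (the helper
# name is hypothetical):
import util

def cached_pickle(path, compute, newer_than):
    # Reuse the cached pickle only if it postdates the given timestamp string.
    if util.is_file_newer(path, newer_than):
        return util.load_pickle(path)
    result = compute()
    util.dump_pickle(result, path)
    return result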
Example #49
    # mat2 = sys.argv[2]
    exp_dir = '../0710_data'
    svm_out = '../0710_out'
    add_slope(exp_dir)

    materials = [
        m[:-4] for m in os.listdir(exp_dir) if m.endswith('.pkl')
    ]
    print materials
    accum = 0
    buff = ''
    for pair in list(itertools.combinations(materials, 2)):
        mat1, mat2 = pair

        # print "Loading Data"
        vec1 = util.load_pickle(exp_dir + '/' + mat1 + '.pkl')
        vec2 = util.load_pickle(exp_dir + '/' + mat2 + '.pkl')

        # vec1 = transform_erfc(vec1, T_amb1)
        # vec2 = transform_erfc(vec2, T_amb2)
        # print len(vec1), len(vec2)

        data_dic = create_binary_dataset(vec1, vec2, mat1, mat2, 2)
        # print "created"
        score = run_crossvalidation_new(data_dic, num_folds)
        if not os.path.exists(svm_out):
            os.makedirs(svm_out)
        buff += '%s, %s, %f\n' % (mat1, mat2, score)
        print '%s, %s, %s' % (mat1, mat2, str(score))
    print accum
    with open(svm_out + "/out.csv", "w") as text_file:
        text_file.write(buff)
Example #50
from DBConnection import db
import numpy as np
from util import load_pickle
import os

output_dir = 'final_data/PEMS-BAY/medium/estimation/'
num_dates = 8404
no_features = 11
num_sensors = 87

sensor_ids = load_pickle('final_data/PEMS-BAY/medium/adj_mx_medium.pkl')[0]
values_sensor_ids = str(['(' + str(x) + ')'
                         for x in sensor_ids]).replace("'", "")[1:-1]

select_query = "SELECT time, sensor_id, bucket_0::float, bucket_1::float, bucket_2::float, bucket_3::float," \
               " bucket_4::float, bucket_5::float, bucket_6::float, bucket_7::float, bucket_8::float, bucket_9::float," \
               " bucket_10::float from {0} where sensor_id = ANY(VALUES {1})" \
               "order by time, sensor_id asc"

# The "order by time, sensor_id" clause guarantees the row order that the
# reshape below relies on: rows are sorted by time, then sensor_id, matching
# the (num_dates, num_sensors, no_features) layout.
data = db.execute_query(
    select_query.format('pems_final_normalized', values_sensor_ids))
data = np.array([x[2:] for x in data])
data = data.reshape([num_dates, num_sensors, no_features])
np.save(os.path.join(output_dir, 'data.npz'), data)

# Compute the average distribution
data = db.execute_query(select_query.format('pems_final', values_sensor_ids))
data = np.array([x[2:] for x in data])
data = data.reshape([num_dates, num_sensors, no_features])

sensor_cars_counts = np.sum(data, axis=0)  # (sensor, buckets)
Example #51
def main():
    print("*" * 10)
    print(args)
    print("*" * 10)
    dataloader = util.load_dataset(device, args.data_path, args.batch_size, args.batch_size, args.batch_size)
    scaler = dataloader['scaler']
    print("scaler: ", scaler)
    model_type = "GWaveNet"    # HA / SVR / ARIMA / STGCN / GWaveNet / LSTM

    sensor_ids, sensor_id_to_ind, adj_mx = util.load_adj(args.adj_path, args.adjtype)
    supports = [torch.tensor(i).to(device) for i in adj_mx]

    _, _, A = util.load_pickle(args.adj_path)
    A_wave = util.get_normalized_adj(A)
    A_wave = torch.from_numpy(A_wave).to(device)
    # print("A_wave:", A_wave.shape, type(A_wave))
    best_path = os.path.join(args.save, 'best_model.pth')
    best_mae = 100
    epochs_since_best_mae = 0  # so the first non-improving epoch can increment it

    if args.randomadj:
        adjinit = None
    else:
        adjinit = supports[0]
    if args.aptonly:
        supports = None

    if model_type == "GWaveNet":
        print("=========Model:GWaveNet=========")
        print("with scaler")
        model = GWNET(device, args.num_nodes, args.dropout, supports=supports, gcn_bool=args.gcn_bool,
                      addaptadj=args.addaptadj, aptinit=adjinit, in_dim=args.in_dim, out_dim=args.seq_length,
                      residual_channels=args.nhid, dilation_channels=args.nhid, skip_channels=args.nhid * 8,
                      end_channels=args.nhid * 16)

    if model_type == "STGCN":
        print("=========Model:STGCN=========")
        print("with scaler")
        model = STGCN(A_wave.shape[0], 2, num_timesteps_input=12, num_timesteps_output=12)

    if model_type == "LSTM":
        print("=========Model:LSTM=========")
        input_dim = 2
        hidden_dim = 2
        output_dim = 2
        model = LSTM(input_dim, hidden_dim, output_dim)

    best_path = f'{args.save}/{model_type}.pkl'
    record = []
    model.to(device)
    model.zero_grad()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    optimizer.zero_grad()
    loss_MSE = torch.nn.MSELoss()
    loss_gwnet = util.masked_mae
    loss_stgcn = util.masked_mae

    print("============Begin Training============")
    his_loss = []
    val_time = []
    train_time = []
    for epoch in range(args.num_epochs):
        print('-' * 10)
        print('Epoch {}/{}'.format(epoch, args.num_epochs))
        train_loss, train_mape, train_rmse, train_mae = [], [], [], []
        t1 = time.time()
        t = time.time()
        dataloader['train_loader'].shuffle()
        for iter, (x, y) in enumerate(dataloader['train_loader'].get_iterator()):
            trainx = torch.Tensor(x).to(device)  # x: (64, 24, 207, 2)
            trainy = torch.Tensor(y).to(device)  # y: (64, 12, 207, 2)
            if trainx.shape[0] != args.batch_size:
                continue

            if model_type == "GWaveNet":
                trainx = trainx.transpose(1, 3)
                trainy = trainy.transpose(1, 3)
                trainy = trainy[:, 0, :, :]
                trainy = torch.unsqueeze(trainy, dim=1)
                trainx = nn.functional.pad(trainx, (1, 0, 0, 0))

                pred = model.forward(trainx)
                pred = pred.transpose(1, 3)
                pred = scaler.inverse_transform(pred)
                loss_train = loss_gwnet(pred, trainy, 0.0)

            if model_type == "STGCN":
                # (batch_size,num_timesteps,num_nodes,num_features=in_channels)
                # ->(batch_size,num_nodes,num_timesteps,num_features=in_channels)
                trainx = trainx.permute(0, 2, 1, 3)
                trainy = trainy[:, :, :, 0].permute(0, 2, 1)
                pred = model(A_wave, trainx)
                # pred = scaler.inverse_transform(pred)
                # loss_train = loss_MSE(pred, trainy)
                loss_train = loss_stgcn(pred, trainy, 0.0)

            if model_type == "rnn":
                [batch_size, step_size, num_of_vertices, fea_size] = trainx.size()
                trainx = trainx.permute(0, 2, 1, 3)
                trainx = trainx.reshape(-1, step_size, fea_size)
                trainy = trainy.reshape(-1, 1, fea_size)
                trainy = trainy[:, 0, :]
                pred = model.loop(trainx)
                loss_train = loss_MSE(pred, trainy)

            Y_size = trainy.shape  # last dim = number of horizons; reused in the test loop

            if iter == 0:
                print("trainy:", trainy.shape)

            optimizer.zero_grad()
            loss_train.backward()
            clip = 5
            if clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            evaluation = evaluate(pred, trainy)
            train_loss.append(loss_train.item())
            train_mape.append(evaluation[0])
            train_rmse.append(evaluation[1])
            train_mae.append(evaluation[2])

            if iter % args.interval == 0:
                log = 'Iter: {:03d}|Train Loss: {:.4f}|Train MAPE: {:.4f}|Train RMSE: {:.4f}|Train MAE: {:.4f}|Time: ' \
                      '{:.4f} '
                print(log.format(iter, train_loss[-1], train_mape[-1], train_rmse[-1], train_mae[-1], time.time() - t),
                      flush=True)
                t = time.time()
        t2 = time.time()
        train_time.append(t2 - t1)
        # validation
        valid_loss, valid_mape, valid_rmse, valid_mae = [], [], [], []
        s1 = time.time()
        for iter, (x_val, y_val) in enumerate(dataloader['val_loader'].get_iterator()):
            # validation data loader iterator init
            inputs_val = torch.Tensor(x_val).to(device)  # x: (64, 24, 207, 2)
            labels_val = torch.Tensor(y_val).to(device)

            if model_type == "GWaveNet":
                inputs_val = inputs_val.transpose(1, 3)
                labels_val = labels_val.transpose(1, 3)
                labels_val = labels_val[:, 0, :, :]
                labels_val = torch.unsqueeze(labels_val, dim=1)

                inputs_val = nn.functional.pad(inputs_val, (1, 0, 0, 0))
                pred_val = model.forward(inputs_val)
                pred_val = pred_val.transpose(1, 3)
                pred_val = scaler.inverse_transform(pred_val)
                loss_valid = loss_gwnet(pred_val, labels_val, 0.0)

            if model_type == "STGCN":
                inputs_val = inputs_val.permute(0, 2, 1, 3)
                labels_val = labels_val[:, :, :, 0].permute(0, 2, 1)
                pred_val = model(A_wave, inputs_val)
                # pred_val = scaler.inverse_transform(pred_val)
                # loss_valid = loss_MSE(pred_val, labels_val)
                loss_valid = loss_stgcn(pred_val, labels_val, 0.0)

            if model_type == "rnn":
                [batch_size, step_size, num_of_vertices, fea_size] = trainx.size()
                inputs_val = inputs_val.permute(0, 2, 1, 3)
                inputs_val = inputs_val.reshape(-1, step_size, fea_size)
                labels_val = labels_val.reshape(-1, 1, fea_size)
                labels_val = labels_val[:, 0, :]
                pred_val = model.loop(inputs_val)
                loss_valid = loss_MSE(pred_val, labels_val)

            # pred_val = scaler.inverse_transform(pred_val)
            optimizer.zero_grad()
            # loss_valid.backward()
            evaluation = evaluate(pred_val, labels_val)

            valid_loss.append(loss_valid.item())
            valid_mape.append(evaluation[0])
            valid_rmse.append(evaluation[1])
            valid_mae.append(evaluation[2])
        s2 = time.time()
        log = 'Epoch: {:03d}, Inference Time: {:.4f} secs'
        print(log.format(epoch, (s2 - s1)))
        val_time.append(s2 - s1)
        mtrain_loss = np.mean(train_loss)
        mtrain_mape = np.mean(train_mape)
        mtrain_rmse = np.mean(train_rmse)
        mtrain_mae = np.mean(train_mae)

        mvalid_loss = np.mean(valid_loss)
        mvalid_mape = np.mean(valid_mape)
        mvalid_rmse = np.mean(valid_rmse)
        mvalid_mae = np.mean(valid_mae)
        his_loss.append(mvalid_loss)

        message = dict(train_loss=mtrain_loss, train_mape=mtrain_mape, train_rmse=mtrain_rmse,
                       valid_loss=mvalid_loss, valid_mape=mvalid_mape, valid_rmse=mvalid_rmse)
        message = pd.Series(message)
        record.append(message)
        # save model parameters
        if message.valid_loss < best_mae:
            torch.save(model.state_dict(), best_path)
            best_mae = message.valid_loss
            epochs_since_best_mae = 0
            best_epoch = epoch
        else:
            epochs_since_best_mae += 1

        record_df = pd.DataFrame(record)
        record_df.round(3).to_csv(f'{args.save}/record.csv')

        log = 'Epoch: {:03d}, Training Time: {:.4f}/epoch,\n' \
              'Train Loss: {:.4f}, Train MAPE: {:.4f}, Train RMSE: {:.4f}, Train MAE: {:.4f}, \n' \
              'Valid Loss: {:.4f}, Valid MAPE: {:.4f}, Valid RMSE: {:.4f}, Valid MAE: {:.4f},'
        print(log.format(epoch, (t2 - t1),
                         mtrain_loss, mtrain_mape, mtrain_rmse, mtrain_mae,
                         mvalid_loss, mvalid_mape, mvalid_rmse, mvalid_mae), flush=True)
        print("#" * 20)

    print("=" * 10)
    print("Average Train Time: {:.4f} secs/epoch".format(np.mean(train_time)))
    print("Average Valid Time: {:.4f} secs".format(np.mean(val_time)))
    print("=" * 10)

    # Testing
    bestid = np.argmin(his_loss)
    print("bestid: ", bestid)
    model.load_state_dict(torch.load(best_path))

    outputs = []
    target = torch.Tensor(dataloader['y_test']).to(device)
    if model_type == "GWaveNet":
        target = target.transpose(1, 3)[:, 0, :, :]
    if model_type == "STGCN":
        target = target[:, :, :, 0]
        target = target.transpose(1, 2)

    for iter, (x, y) in enumerate(dataloader['test_loader'].get_iterator()):
        testx = torch.Tensor(x).to(device)  # x: (64, 24, 207, 2)
        testy = torch.Tensor(y).to(device)  # y: (64, 12, 207, 2)

        if model_type == "GWaveNet":
            with torch.no_grad():
                testx = testx.transpose(1, 3)
                pred = model.forward(testx)
                pred = pred.transpose(1, 3)
            outputs.append(pred.squeeze())

        if model_type == "STGCN":
            with torch.no_grad():
                testx = testx.permute(0, 2, 1, 3)
                testy = testy[:, :, :, 0].permute(0, 2, 1)
                pred = model(A_wave, testx)     # (64, 207, 12)
            outputs.append(pred)

    yhat = torch.cat(outputs, dim=0)
    yhat = yhat[:target.size(0), ...]
    amae, amape, armse, test_record = [], [], [], []
    print("=" * 10)
    print("yhat:", yhat.shape)      # yhat: torch.Size([6850, 207, 12])
    print("target:", target.shape)  # target: torch.Size([6850, 207, 12])
    for i in range(Y_size[-1]):
        pred = scaler.inverse_transform(yhat[:, :, i])
        # pred = yhat[:, :, i]
        real_target = target[:, :, i]
        evaluation = evaluate(pred, real_target)
        log = 'Evaluate on test data for horizon {:d}, Test MAPE: {:.4f}, Test RMSE: {:.4f}, Test MAE: {:.4f}'
        print(log.format(i + 1, evaluation[0], evaluation[1], evaluation[2]))
        amape.append(evaluation[0])
        armse.append(evaluation[1])
        amae.append(evaluation[2])
        test_record.append([x for x in evaluation])

    test_record_df = pd.DataFrame(test_record, columns=['mape', 'rmse', 'mae']).rename_axis('t')
    test_record_df.round(3).to_csv(f'{args.save}/test_record.csv')

    log = 'On average over 12 horizons, Test MAE: {:.4f}, Test MAPE: {:.4f}, Test RMSE: {:.4f}'
    print(log.format(np.mean(amae), np.mean(amape), np.mean(armse)))
    print("=" * 10)
Example #52
def load(self):
    self.featdict = util.load_pickle(self.path + 'feats')
Example #53
def make_main_process_pkl(prices_fname, word_pkl, hashtag_fname, handle_fname,
                          out_fname):
    """
    Main processing of the pickles
    """
    import seaborn as sns

    def get_label(in_dat):
        if in_dat > 0:
            return 1
        return 0

    def get_vol_price_dat(idx):
        if idx < 500:
            return None
        vol_arr = np.array([
            float(prices_dict[c_idx]['volume'])
            for c_idx in range(idx - 500, idx)
        ])
        price_arr = np.array([
            float(prices_dict[c_idx]['price'])
            for c_idx in range(idx - 500, idx)
        ])
        vol_arr = np.expand_dims(vol_arr, axis=0)
        price_arr = np.expand_dims(price_arr, axis=0)
        return np.concatenate((vol_arr, price_arr), axis=0).transpose()

    # Get prices
    prices_dict = get_prices(f_name=prices_fname)
    # Get the dictionaries and the sets
    main_arr, hashtag_dict, handle_dict = load_pickle(
        word_pkl)['dat'], get_dict(hashtag_fname), get_dict(handle_fname)
    # Sort tweets chronologically in place (a bare sorted() call discards its result)
    main_arr.sort(key=lambda val: val['time'])
    # Main Storage, and index for time array
    dat_arr, lab_arr, time_idx, samples, time_arr = [], [], 0, [], []

    # Current slot storage
    curr_dat, curr_lab = [], None
    num = 0
    for ele in main_arr:
        num += 1

        # If current time is higher then jump to next entry, update the arrays
        if ele['time'] >= prices_dict[time_idx]['time']:
            # Only if volume information is contained
            combined_out = get_vol_price_dat(time_idx - 1)
            if combined_out is not None:
                time_arr.append(prices_dict[time_idx]['time'])
                lab_arr.append(curr_lab)
                curr_dat.append(combined_out)
                dat_arr.append(curr_dat)
            curr_dat, curr_lab = [], None
            time_idx += 1
            if time_idx == len(prices_dict):
                logging.warning(
                    'Ran out of the prices.txt file at tweet index: {}, time index: {}'
                    .format(num, time_idx))
                break

        # Include the tweet only if it is at least half an hour before the price timestamp
        time_diff = prices_dict[time_idx]['time'] - ele['time']
        assert (0 < time_diff < 7200)
        if time_diff < 1800:
            continue

        # Get the data, check if hashtag is in array
        words, hashtag_arr = clean_tweet(tweet=ele['text'])
        hashtag_arr = [
            hashtag_dict[hashtag] for hashtag in hashtag_arr
            if hashtag in hashtag_dict
        ]

        # Add number for the handle if present
        handle_num = None
        if ele['handle'] in handle_dict:
            handle_num = handle_dict[ele['handle']]
        curr_dat.append((words, [handle_num, hashtag_arr]))
        curr_lab = get_label(float(prices_dict[time_idx]['change']))

    # Ensure that the length of the data and the number of labels are same
    assert (len(dat_arr) == len(lab_arr) == len(time_arr))
    logging.info('Total Samples: {}'.format(len(dat_arr)))
    logging.info('Printing out stats')
    # # Get stats regarding number of tweets per time step and timestep data
    # timestep_out = np.asarray([time_arr[idx] - time_arr[idx - 1] for idx in range(1, len(time_arr))])
    # number_tweets = np.asarray([len(dat_arr[idx]) for idx in range(1, len(time_arr))])
    #
    # plt.clf()
    # logging.info('Timestep out stats, Mean: {}, Max: {}, Min: {}, Std: {}'.format(
    #     timestep_out.mean(), timestep_out.max(), timestep_out.min(), timestep_out.std()))
    # sns.set(), plt.hist(timestep_out, bins=100, normed=True)
    #     plt.xlabel('Time Step'), plt.ylabel('Probability')
    # plt.savefig('data/timestep.png')
    #
    # plt.clf()
    # logging.info('number_tweets out stats, Mean: {}, Max: {}, Min: {}, Std: {}'.format(
    #     number_tweets.mean(), number_tweets.max(), number_tweets.min(), number_tweets.std()))
    # sns.set(), plt.hist(number_tweets, bins=100, normed=True)
    #     plt.xlabel('Number tweets per timestep'), plt.ylabel('Probability')
    # plt.savefig('data/tweets.png')
    #
    # plt.clf()
    # density = number_tweets / timestep_out
    # logging.info('density out stats, Mean: {}, Max: {}, Min: {}, Std: {}'.format(
    #     density.mean(), density.max(), density.min(), density.std()))
    # sns.set(), plt.hist(density, bins=100, normed=True)
    #     plt.xlabel('Number tweets per timestep'), plt.ylabel('Probability')
    # plt.savefig('data/tweets_density.png')
    #
    # plt.clf()
    # sns.set(), plt.hist(lab_arr, bins=5, normed=True)
    #     plt.xlabel('Number tweets per timestep'), plt.ylabel('Probability')
    # plt.savefig('data/label_dist.png')
    #
    save_pickle({
        'data': np.asarray(dat_arr),
        'labels': np.asarray(lab_arr)
    }, out_fname)
    logging.info('Saved Pickle To: {}'.format(out_fname))
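
# A hypothetical invocation of the function above; every file name below is a
# placeholder, not a path from the original project.
make_main_process_pkl(prices_fname='data/prices.txt',
                      word_pkl='data/words.pkl',
                      hashtag_fname='data/hashtags.txt',
                      handle_fname='data/handles.txt',
                      out_fname='data/processed.pkl')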