def test_count_multiple_pos(self):
    self.assertTrue(True)
    poses = defaultdict(set)
    counts = defaultdict(int)
    # People's Daily corpus
    # base_dir = 'D:\\commons\\中文语料\\people2014\\2014'
    # pattern = r'([\u4e00-\u9fa5]+)/([a-zA-Z]+)'
    # CTB (Chinese Treebank) corpus
    base_dir = 'D:\\commons\\中文语料\\树库\\LDC\\LDC2013T21\\ctb8.0\\data\\postagged'
    pattern = r'([\u4e00-\u9fa5]+)_([a-zA-Z]+)'

    def iter_corpus(poses_, counts_, base_dir_):
        for f in os.listdir(base_dir_):
            f = os.path.join(base_dir_, f)
            if os.path.isdir(f):
                # fixed: pass counts_ through the recursive call as well
                iter_corpus(poses_, counts_, f)
            else:
                for line in iter_file(f):
                    for w, p in re.findall(pattern, line):
                        poses_[w].add(p)
                        counts_[w] += 1

    iter_corpus(poses, counts, base_dir)
    save_obj(poses, 'word.pos')
    save_obj(counts, 'word.count')
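# NOTE: the snippets in this section rely on small I/O helpers (iter_file,
# save_obj, read_obj) that are not shown. Below is a minimal sketch of what
# they are assumed to look like (plain pickle plus line iteration); the real
# project helpers may differ, and argument order even varies between snippets
# (object-first here vs. path-first in save_blacklist/save_dicts below).
import pickle


def iter_file(path, encoding='utf-8'):
    # yield the lines of a text file one at a time, without trailing newlines
    with open(path, encoding=encoding) as fin:
        for line in fin:
            yield line.rstrip('\n')


def save_obj(obj, path):
    # pickle an arbitrary Python object to disk
    with open(path, 'wb') as fout:
        pickle.dump(obj, fout)


def read_obj(path):
    # load a previously pickled object
    with open(path, 'rb') as fin:
        return pickle.load(fin)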
def save(self):
    # save the model_object first
    save_obj(self.model_object, self.model_object_path)
    # delete model_object from the Model object
    del self.model_object
    # save the Model object itself
    save_obj(self, self.model_path)
def run(self, pinglun_file, O_seeds):
    """
    Extract feature words / opinion words.
    :param pinglun_file: review text file
    :param O_seeds: seed opinion words
    :return:
    """
    logger.info('pipeline run...')
    if not os.path.exists(self._clean_file):
        logger.info('cleaning text')
        clean.clean_file(pinglun_file, self._clean_file)
    if not os.path.exists(self._relation_file):
        logger.info('dependency parsing')
        relation_parse.parse(self._clean_file, self._relation_file)

    logger.info('extracting feature/opinion words with the double propagation algorithm')
    S = self._iter_sentences_relations(self._relation_file)
    F, O, fcounter, ocounter, rcount = double_propagation.extract(O_seeds, S)
    utils.write_file(self._dp_f_file, F)
    utils.write_file(self._dp_o_file, O)
    utils.save_obj(fcounter, self._dp_f_counter)
    utils.save_obj(ocounter, self._dp_o_counter)

    logger.info('pruning feature/opinion words')
    F, O = prune.prune(F, O, fcounter, ocounter, rcount, self._threshold)
    utils.write_file(self._prune_f_file, F)
    utils.write_file(self._prune_o_file, O)

    if not os.path.exists(self._word2vec_file):
        logger.info('training word2vec model')
        T = self._iter_sentences_tokens(self._relation_file)
        w2c.train(T, self._word2vec_file)
    model = w2c.get(self._word2vec_file)

    logger.info('clustering feature words')
    cf = cluster.create(F, model, preference=-30)
    features = ['%s %s' % (cls, ' '.join(cf[cls])) for cls in cf]
    utils.write_file(self._feature_file, features)

    logger.info('clustering opinion words')
    O = utils.read_file(self._prune_o_file)
    of = cluster.create(O, model, preference=None)
    opinions = ['%s %s' % (cls, ' '.join(of[cls])) for cls in of]
    utils.write_file(self._opinion_file, opinions)

    logger.info('pipeline over.')
    return cf, of, F, O
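# A hedged usage sketch for the run() pipeline above. The enclosing class name
# (Pipeline), its constructor arguments (omitted here), the review file path,
# and the seed words are illustrative assumptions; only the run() signature
# comes from the code above.
if __name__ == '__main__':
    pipeline = Pipeline()                 # hypothetical enclosing class
    seeds = {'好', '差', '不错'}           # example seed opinion words
    cf, of, F, O = pipeline.run('reviews.txt', seeds)
    print('feature clusters:', len(cf), 'opinion clusters:', len(of))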
def save(self):
    # save the dataframe first
    save_obj(self.df, self.dataframe_path)
    # delete the dataframes from the Dataset object
    del self.df
    del self.train_df
    del self.test_df
    # save the Dataset object itself
    save_obj(self, self.dataset_path)
    self.new_op("save")
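# Hedged counterpart to the save() methods above: a possible load() that
# undoes the "detach heavy attribute, pickle the rest" pattern. The read_obj
# helper and this method are assumptions for illustration; the real project
# may restore the object differently (e.g. rebuilding train_df/test_df from df).
@staticmethod
def load(dataset_path, dataframe_path):
    dataset = read_obj(dataset_path)       # Dataset object without its dataframes
    dataset.df = read_obj(dataframe_path)  # re-attach the pickled dataframe
    return dataset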
def test_prune_xx(self):
    self.assertTrue(True)
    F = read_obj(os.path.join(RESOURCE_DIR, 'dp', 'dp.F'))
    O = read_obj(os.path.join(RESOURCE_DIR, 'dp', 'dp.O'))
    fcounter = read_obj(os.path.join(RESOURCE_DIR, 'dp', 'dp.fcounter'))
    ocounter = read_obj(os.path.join(RESOURCE_DIR, 'dp', 'dp.ocounter'))
    print('len1: ', len(F))
    F, O = double_propagation.prune_by_threshold(F, O, fcounter, ocounter)
    print('len2: ', len(F))
    F = double_propagation.prune_order_features(F, fcounter)
    print('len3: ', len(F))
    save_obj(F, os.path.join(RESOURCE_DIR, 'dp', 'dp.F.pruned'))
parser.add_argument('--in_coord_sys', '-in_coord_sys', type=str, required=True)
parser.add_argument('--out_coord_sys', '-out_coord_sys', type=str, required=True)
parser.add_argument('--num_iters', '-iter', default=100000, type=int)
parser.add_argument('--num_epoches', '-ne', default=100000, type=int)
parser.add_argument('--batch_size', '-bs', default=4, type=int)
parser.add_argument('--model_save_interval', '-mt', default=5000, type=int)
parser.add_argument('--model_eval_interval', '-et', default=3000, type=int)
parser.add_argument('--learning_rate', '-lr', default=0.001, type=float)
parser.add_argument('--n_GPUs', '-ngpu', default=1, type=int)
parser.add_argument('--num_loader_workers', '-nlw', type=int, default=2)
parser.add_argument('--pretrained_model', '-pm', default=None)
parser.add_argument('--pretrained_optimizer', '-po', default=None)
parser.add_argument('--scratch', '-scratch', action='store_true', default=False)
parser.add_argument('--data_aug', '-data_aug', action='store_true', default=False)
parser.add_argument('--exp_name', '-exp_name', default='debug')
args = parser.parse_args()

# override --n_GPUs with the number of GPUs actually visible to torch
setattr(args, 'n_GPUs', torch.cuda.device_count())
args_dict = vars(args)

config.JOBS_MODEL_DIR = "./exp/%s/models" % args.exp_name
config.JOBS_LOG_DIR = "./exp/%s/log" % args.exp_name
config.JOBS_DIR = './exp/%s' % args.exp_name
folder = makedir_if_not_exist(config.JOBS_DIR)
save_obj(args_dict, os.path.join(config.JOBS_DIR, 'args.pkl'))

train(**args_dict)
print("End of train.py")
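# Example invocation of this training script, using the flags defined above.
# The script name (train.py, per the final print) and the flag values are
# illustrative placeholders; note that --n_GPUs is overridden at runtime by
# torch.cuda.device_count() regardless of what is passed here.
#
#   python train.py --in_coord_sys <IN_SYS> --out_coord_sys <OUT_SYS> \
#       -bs 8 -lr 0.0005 -exp_name my_experiment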
def save(self):
    save_obj(self, self._model_file)
    self._model.save(self._keras_model_file)
def test_count_syntax(self):
    self.assertTrue(True)
    sentiments = load_sentiment_words(os.path.join(RESOURCE_DIR, 'mobile', '1正面评价词_a+.txt'))
    sentiments |= load_sentiment_words(os.path.join(RESOURCE_DIR, 'mobile', '1负面评价词_a-.txt'))
    features = load_feature_word(os.path.join(RESOURCE_DIR, 'mobile', 'mobile.ontology'))
    corpus_file = os.path.join(RESOURCE_DIR, 'mobile', 'std.txt')

    ff_counter = Counter()
    oo_counter = Counter()
    fo_counter = Counter()
    ff_samples = defaultdict(set)
    oo_samples = defaultdict(set)
    fo_samples = defaultdict(set)

    i = 0
    for line in utils.iter_file(corpus_file):
        i += 1
        if i % 100 == 0:
            print(i)
        if i > 200000:
            break
        for sent in parser.parse2sents(line):
            for relation in sent.relations:
                token1 = relation.token1.word
                token2 = relation.token2.word
                if token1 in features and token2 in features:
                    ff_counter.update([relation.format])
                    ff_samples[relation.format].add(str(relation))
                if token1 in sentiments and token2 in sentiments:
                    oo_counter.update([relation.format])
                    oo_samples[relation.format].add(str(relation))
                if token1 in sentiments and token2 in features:
                    fo_counter.update([relation.format])
                    fo_samples[relation.format].add(str(relation))
                if token1 in features and token2 in sentiments:
                    fo_counter.update([relation.format])
                    fo_samples[relation.format].add(str(relation))

    utils.save_obj(ff_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'ff.counter'))
    utils.save_obj(oo_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'oo.counter'))
    utils.save_obj(fo_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'fo.counter'))
    utils.save_obj(ff_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'ff.dict'))
    utils.save_obj(oo_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'oo.dict'))
    utils.save_obj(fo_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'fo.dict'))
def save(self):
    save_obj(self, HomoModel.model_file)
def save_blacklist(self, blacklist):
    result = self.load_blacklist()
    for s in blacklist:
        result.add(s.split(u':')[0].strip())
    filename = self.blacklist_filename()
    utils.save_obj(filename, result)
def save_dicts(self, variant, cache):
    filename = self.cache_filename(variant)
    utils.save_obj(filename, cache)
def save_model_db(self, db):
    save_obj(db, self.db_path)
def save_dataset_db(self, db):
    save_obj(db, self.db_path)
        output[:, 1, :, :] *= coord_change[1]
        output[:, 2, :, :] *= coord_change[2]
        # Model does inference on a 4D tensor, but the eval function takes 3D.
        # Has to be HxWx3
        output = np.transpose(output[0, :, :, :], (1, 2, 0))
    elif mode == 'pre-estimated':
        try:
            pred_path = os.path.join(args['model_file'], anno_id + '.npy')
            print("pre-estimated:", pred_path)
            pred_normal = np.load(pred_path)
            output = cv2.resize(pred_normal, (orig_width, orig_height))  # HxWx3
        except Exception as e:
            print("Error:", str(e))
            save_obj({"run_log": "Error processing %s.npy" % anno_id}, args['output_file'])
            import sys
            sys.exit()

    # For confirming the ground truth leads to AP 1.0 :)
    # normal_name = '/n/fs/pvl/datasets/3SIW/normal/{}.pkl'.format(anno_id)
    # dict = pickle.load(open(normal_name, 'rb'))
    # normal = np.zeros((orig_height, orig_width, 3))
    # normal[dict['min_x']:dict['max_x']+1, dict['min_y']:dict['max_y']+1, :] = dict['normal']
    # output = normal

    if args['front_facing']:
        output[:, :, 0] = 0
        output[:, :, 1] = 0
        output[:, :, 2] = 1
parser.add_argument('--pred_folder', '-p', default=None)
parser.add_argument('--num_iters', '-iter', default=100000, type=int)
parser.add_argument('--output_file', '-o', default=None)
parser.add_argument('--vis_normal', '-vis', action='store_true', default=False)
args = parser.parse_args()

collate_fn = default_collate
DataSet = OASISNormalDatasetVal
test_dataset = DataSet(csv_filename=args.test_file)
test_data_loader = data.DataLoader(test_dataset, batch_size=1, num_workers=1,
                                   shuffle=False, collate_fn=collate_fn)

print("Testing on %s" % args.test_file)
normal_result = valid_normals(data_loader=test_data_loader,
                              max_iter=args.num_iters,
                              verbal=True,
                              pred_folder=args.pred_folder,
                              b_vis_normal=args.vis_normal)
print(normal_result)

if args.output_file is not None:
    os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
    save_obj(normal_result, args.output_file)
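# Example invocation of this evaluation script. The script name and the
# --test_file flag are assumptions (args.test_file is referenced above but
# its add_argument call lies outside this excerpt); the remaining flags and
# the output paths are illustrative and match the parser above.
#
#   python eval_normal.py --test_file val_normal.csv \
#       -p ./exp/my_experiment/preds \
#       -o ./exp/my_experiment/normal_result.pkl -vis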