Example No. 1
    def test_count_multiple_pos(self):
        self.assertTrue(True)

        poses = defaultdict(set)
        counts = defaultdict(int)

        # People's Daily corpus
        # base_dir = 'D:\\commons\\中文语料\\people2014\\2014'
        # pattern = r'([\u4e00-\u9fa5]+)/([a-zA-Z]+)'

        # CTB corpus (Chinese Treebank)
        base_dir = 'D:\\commons\\中文语料\\树库\\LDC\\LDC2013T21\\ctb8.0\\data\\postagged'
        pattern = r'([\u4e00-\u9fa5]+)_([a-zA-Z]+)'

        def iter_corpus(poses_, counts_, base_dir_):
            for f in os.listdir(base_dir_):
                f = os.path.join(base_dir_, f)

                if os.path.isdir(f):
                    iter_corpus(poses_, counts_, f)
                else:
                    for line in iter_file(f):
                        for w, p in re.findall(pattern, line):
                            poses_[w].add(p)
                            counts_[w] += 1

        iter_corpus(poses, counts, base_dir)

        save_obj(poses, 'word.pos')
        save_obj(counts, 'word.count')
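The helper functions these examples rely on (save_obj, read_obj, iter_file) are not shown on this page. A minimal sketch, assuming they are thin wrappers around pickle and plain text-file iteration (hypothetical implementations, not the originals; note that Examples 10 and 11 pass the filename as the first argument, i.e. the reverse order):

import pickle

def save_obj(obj, path):
    # Serialize an arbitrary Python object to disk with pickle.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def read_obj(path):
    # Load a previously pickled object back into memory.
    with open(path, 'rb') as f:
        return pickle.load(f)

def iter_file(path, encoding='utf-8'):
    # Yield the lines of a text file one by one, without trailing newlines.
    with open(path, encoding=encoding) as f:
        for line in f:
            yield line.rstrip('\n')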
Example No. 2
    def save(self):
        #save the model_object first
        save_obj(self.model_object, self.model_object_path)

        #delete model_object from Model object
        del self.model_object

        #save Model object
        save_obj(self, self.model_path)
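Because save() removes model_object before pickling the Model wrapper, loading presumably happens in two steps: unpickle the wrapper, then re-attach the separately stored object. A minimal sketch of that counterpart, assuming the hypothetical read_obj helper above (the function name and signature are assumptions, not part of the original code):

def load_model(model_path, model_object_path):
    # Read the pickled Model wrapper, then restore its model_object
    # from the second pickle file.
    model = read_obj(model_path)
    model.model_object = read_obj(model_object_path)
    return model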
Example No. 3
    def run(self, pinglun_file, O_seeds):
        """
        Extract feature words / opinion words.
        :param pinglun_file: file of review text
        :param O_seeds: seed opinion words
        :return:
        """
        logger.info('pipeline run...')

        if not os.path.exists(self._clean_file):
            logger.info('cleaning text')
            clean.clean_file(pinglun_file, self._clean_file)

        if not os.path.exists(self._relation_file):
            logger.info('dependency parsing')
            relation_parse.parse(self._clean_file, self._relation_file)

        logger.info('extracting feature/opinion words with the double propagation algorithm')
        S = self._iter_sentences_relations(self._relation_file)
        F, O, fcounter, ocounter, rcount = double_propagation.extract(
            O_seeds, S)

        utils.write_file(self._dp_f_file, F)
        utils.write_file(self._dp_o_file, O)
        utils.save_obj(fcounter, self._dp_f_counter)
        utils.save_obj(ocounter, self._dp_o_counter)

        logger.info('pruning feature/opinion words')
        F, O = prune.prune(F, O, fcounter, ocounter, rcount, self._threshold)

        utils.write_file(self._prune_f_file, F)
        utils.write_file(self._prune_o_file, O)

        if not os.path.exists(self._word2vec_file):
            logger.info('training word2vec model')
            T = self._iter_sentences_tokens(self._relation_file)
            w2c.train(T, self._word2vec_file)

        model = w2c.get(self._word2vec_file)

        logger.info('clustering feature words')
        cf = cluster.create(F, model, preference=-30)
        features = ['%s %s' % (cls, ' '.join(cf[cls])) for cls in cf]
        utils.write_file(self._feature_file, features)

        logger.info('clustering opinion words')
        O = utils.read_file(self._prune_o_file)
        of = cluster.create(O, model, preference=None)
        opinions = ['%s %s' % (cls, ' '.join(of[cls])) for cls in of]
        utils.write_file(self._opinion_file, opinions)

        logger.info('pipeline over.')

        return cf, of, F, O
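A hypothetical invocation of this pipeline (the class name, constructor arguments, and seed words below are assumptions for illustration, not shown on this page):

pipeline = Pipeline(work_dir='out', threshold=3)  # hypothetical constructor
seeds = {'好', '不错', '差'}  # hand-picked seed opinion words
cf, of, F, O = pipeline.run('reviews.txt', O_seeds=seeds)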
Example No. 4
    def save(self):
        #save the dataframe first
        save_obj(self.df, self.dataframe_path)

        #delete dataframe from Dataset object
        del self.df
        del self.train_df
        del self.test_df
        
        #save Dataset object
        save_obj(self, self.dataset_path)
        self.new_op("save")
Example No. 5
    def test_prune_xx(self):
        self.assertTrue(True)

        F = read_obj(os.path.join(RESOURCE_DIR, 'dp', 'dp.F'))
        O = read_obj(os.path.join(RESOURCE_DIR, 'dp', 'dp.O'))
        fcounter = read_obj(os.path.join(RESOURCE_DIR, 'dp', 'dp.fcounter'))
        ocounter = read_obj(os.path.join(RESOURCE_DIR, 'dp', 'dp.ocounter'))

        print('len1: ', len(F))

        F, O = double_propagation.prune_by_threshold(F, O, fcounter, ocounter)

        print('len2: ', len(F))

        F = double_propagation.prune_order_features(F, fcounter)

        print('len3: ', len(F))

        save_obj(F, os.path.join(RESOURCE_DIR, 'dp', 'dp.F.pruned'))
Example No. 6
  parser.add_argument('--in_coord_sys', '-in_coord_sys', type=str, required=True)
  parser.add_argument('--out_coord_sys', '-out_coord_sys', type=str, required=True)
  parser.add_argument('--num_iters', '-iter', default=100000, type=int)
  parser.add_argument('--num_epoches', '-ne', default=100000, type=int)
  parser.add_argument('--batch_size', '-bs', default=4, type=int)
  parser.add_argument('--model_save_interval', '-mt', default=5000, type=int)
  parser.add_argument('--model_eval_interval', '-et', default=3000, type=int)
  parser.add_argument('--learning_rate', '-lr', default=0.001, type=float)
  parser.add_argument('--n_GPUs', '-ngpu', default=1, type=int)
  parser.add_argument('--num_loader_workers', '-nlw', type=int, default=2)
  parser.add_argument('--pretrained_model', '-pm', default=None)
  parser.add_argument('--pretrained_optimizer', '-po', default=None)
  parser.add_argument('--scratch', '-scratch', action='store_true', default=False)
  parser.add_argument('--data_aug', '-data_aug', action='store_true', default=False)
  parser.add_argument('--exp_name', '-exp_name', default='debug')

  args = parser.parse_args()
  setattr(args, 'n_GPUs', torch.cuda.device_count())
  args_dict = vars(args)

  config.JOBS_MODEL_DIR = "./exp/%s/models" % args.exp_name
  config.JOBS_LOG_DIR = "./exp/%s/log" % args.exp_name
  config.JOBS_DIR = './exp/%s' % args.exp_name

  folder = makedir_if_not_exist(config.JOBS_DIR)
  save_obj(args_dict, os.path.join(config.JOBS_DIR, 'args.pkl'))

  train(**args_dict)

  print("End of train.py")
Example No. 7
 def save(self):
     save_obj(self, self._model_file)
     self._model.save(self._keras_model_file)
Example No. 8
    def test_count_syntax(self):
        self.assertTrue(True)

        sentiments = load_sentiment_words(os.path.join(RESOURCE_DIR, 'mobile', '1正面评价词_a+.txt'))
        sentiments |= load_sentiment_words(os.path.join(RESOURCE_DIR, 'mobile', '1负面评价词_a-.txt'))
        features = load_feature_word(os.path.join(RESOURCE_DIR, 'mobile', 'mobile.ontology'))

        corpus_file = os.path.join(RESOURCE_DIR, 'mobile', 'std.txt')

        ff_counter = Counter()
        oo_counter = Counter()
        fo_counter = Counter()

        ff_samples = defaultdict(set)
        oo_samples = defaultdict(set)
        fo_samples = defaultdict(set)

        i = 0
        for line in utils.iter_file(corpus_file):
            i += 1

            if i % 100 == 0:
                print(i)

            if i > 200000:
                break

            for sent in parser.parse2sents(line):
                for relation in sent.relations:
                    token1 = relation.token1.word
                    token2 = relation.token2.word

                    if token1 in features and token2 in features:
                        ff_counter.update([relation.format])
                        ff_samples[relation.format].add(str(relation))

                    if token1 in sentiments and token2 in sentiments:
                        oo_counter.update([relation.format])
                        oo_samples[relation.format].add(str(relation))

                    if token1 in sentiments and token2 in features:
                        fo_counter.update([relation.format])
                        fo_samples[relation.format].add(str(relation))

                    if token1 in features and token2 in sentiments:
                        fo_counter.update([relation.format])
                        fo_samples[relation.format].add(str(relation))

        utils.save_obj(ff_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'ff.counter'))
        utils.save_obj(oo_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'oo.counter'))
        utils.save_obj(fo_counter, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'fo.counter'))

        utils.save_obj(ff_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'ff.dict'))
        utils.save_obj(oo_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'oo.dict'))
        utils.save_obj(fo_samples, os.path.join(RESOURCE_DIR, 'mobile', 'count', 'fo.dict'))
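The pickled counters can later be reloaded and inspected; collections.Counter.most_common makes it easy to list the dominant relation patterns. A short sketch, assuming the read_obj helper above:

ff_counter = read_obj(os.path.join(RESOURCE_DIR, 'mobile', 'count', 'ff.counter'))
for rel_format, freq in ff_counter.most_common(10):
    # Ten most frequent feature-feature dependency patterns.
    print(rel_format, freq)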
Example No. 9
 def save(self):
     save_obj(self, HomoModel.model_file)
Example No. 10
 def save_blacklist(self, blacklist):
     result = self.load_blacklist()
     for s in blacklist:
         result.add(s.split(u':')[0].strip())
     filename = self.blacklist_filename()
     utils.save_obj(filename, result)
Example No. 11
 def save_dicts(self, variant, cache):
     filename = self.cache_filename(variant)
     utils.save_obj(filename, cache)
Example No. 12
 def save_model_db(self, db):
     save_obj(db, self.db_path)
Example No. 13
 def save_dataset_db(self, db):
     save_obj(db, self.db_path)
Example No. 14
            output[:, 1, :, :] *= coord_change[1]
            output[:, 2, :, :] *= coord_change[2]
            # Model does inference on 4D tensor, but eval function takes 3D
            # Has to be HxWx3
            output = np.transpose(output[0, :, :, :], (1, 2, 0))

        elif mode == 'pre-estimated':
            try:
                pred_path = os.path.join(args['model_file'], anno_id + '.npy')
                print("pre-estimated:", pred_path)
                pred_normal = np.load(pred_path)
                output = cv2.resize(pred_normal,
                                    (orig_width, orig_height))  # HxWx3
            except Exception as e:
                print("Error:", str(e))
                save_obj({"run_log": "Error processing %s.npy" % anno_id},
                         args['output_file'])
                import sys
                sys.exit()

        # For confirming the ground truth leads to AP 1.0 :)
        #normal_name = '/n/fs/pvl/datasets/3SIW/normal/{}.pkl'.format(anno_id)
        #dict = pickle.load(open(normal_name, 'rb'))
        #normal = np.zeros((orig_height, orig_width, 3))
        #normal[dict['min_x']:dict['max_x']+1, dict['min_y']:dict['max_y']+1, :] = dict['normal']
        #output = normal

        if args['front_facing']:
            output[:, :, 0] = 0
            output[:, :, 1] = 0
            output[:, :, 2] = 1
Example No. 15
    parser.add_argument('--pred_folder', '-p', default=None)
    parser.add_argument('--num_iters', '-iter', default=100000, type=int)
    parser.add_argument('--output_file', '-o', default=None)
    parser.add_argument('--vis_normal',
                        '-vis',
                        action='store_true',
                        default=False)

    args = parser.parse_args()

    collate_fn = default_collate
    DataSet = OASISNormalDatasetVal
    test_dataset = DataSet(csv_filename=args.test_file)
    test_data_loader = data.DataLoader(test_dataset,
                                       batch_size=1,
                                       num_workers=1,
                                       shuffle=False,
                                       collate_fn=collate_fn)
    print("Testing on %s" % args.test_file)

    normal_result = valid_normals(data_loader=test_data_loader,
                                  max_iter=args.num_iters,
                                  verbal=True,
                                  pred_folder=args.pred_folder,
                                  b_vis_normal=args.vis_normal)
    print(normal_result)

    if args.output_file is not None:
        os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
        save_obj(normal_result, args.output_file)