Example #1
def corpus_to_bunch(bunch_path, seg_path):
    '''
    :param bunch_path: path where the pickled Bunch will be stored
    :param seg_path: path of the segmented (word-tokenized) corpus
    '''
    seg_class_list = listdir_nohidden(seg_path)
    bunch = base.Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(seg_class_list)

    for seg_class_dir in bunch.target_name:

        seg_class_path = seg_path + "/" + seg_class_dir + "/"
        seg_file_list = listdir_nohidden(seg_class_path)

        for seg_file in seg_file_list:
            seg_full_path = seg_class_path + seg_file
            bunch.label.append(seg_class_dir)
            bunch.filenames.append(seg_file)
            bunch.contents.append(read_file(seg_full_path))

    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)

    print("===================*****====================")
    print("corpus_to_bunch end")
    print("===================*****====================")
Example #2
def corpus_segment(corpus_path, seg_path):
    '''
    :param corpus_path: path of the raw (unsegmented) corpus
    :param seg_path: path where the segmented corpus is stored
    '''

    class_list = listdir_nohidden(corpus_path)

    for class_dir in class_list:
        class_path = corpus_path + "/" + class_dir + "/"
        seg_class_path = seg_path + "/" + class_dir + "/"

        if not os.path.exists(seg_class_path):
            os.makedirs(seg_class_path)

        file_list = listdir_nohidden(class_path)

        for file in file_list:
            full_path = class_path + file
            content = read_file(full_path)
            content_seg = jieba.cut(content)
            # Keyword extraction / indexing
            # print(file, jieba.analyse.extract_tags(content, topK=5, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')))
            save_file(seg_class_path + file, bytes(" ".join(content_seg), encoding="utf8"))  # save the segmented file to the segmented-corpus directory

    print("===================*****====================")
    print("corpus_segment end")
    print("===================*****====================")
Example #3
 def __init__(self,
              model=None,
              network_name=None,
              classification=False,
              root_dir=None,
              part="train"):
     self.data_dir = root_dir
     self.root = os.path.join(self.data_dir, "scale")
     self.part = part
     self.pt_train_max = 6
     self.pt_test_max = 4
     self.model = model
     self.classification = classification
     self.CLASS_THRESHOLD = 0.05
     self.network_name = network_name
     print("##########", self.root)
     self.data = []
     for class_name in listdir_nohidden(self.root):
         type_index = type_to_index_map[class_name]
         type_root = os.path.join(self.root, class_name)
         # for filename in os.listdir(type_root):
         #     if filename.endswith('.npz'):
         #         self.data.append((os.path.join(type_root, filename), type_index))
         for object_nb, filename in list_features_shapenet_classes(
                 type_root, epoch=410):
             if filename.endswith('.npz'):
                 if self.part == "train":
                     self.data.extend(self.pt_train_max *
                                      [(filename, type_index, object_nb)])
                 else:
                     self.data.extend(self.pt_test_max *
                                      [(filename, type_index, object_nb)])
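Example #3 references two names that are not defined in the snippet: type_to_index_map and list_features_shapenet_classes. Judging only from how they are used (the map is indexed by the class-folder name, and the helper yields (object_nb, filename) pairs that the caller filters for .npz feature files), a rough sketch could be the following; the class names, the file listing, and the role of the epoch argument are all assumptions, not the original code:

import os

# Hypothetical class-to-label mapping; the real class names and indices
# depend on the dataset layout under self.root and are not shown here.
type_to_index_map = {"airplane": 0, "chair": 1, "table": 2}


def list_features_shapenet_classes(type_root, epoch=410):
    # Assumed helper: yield (object_nb, filename) pairs for the files of one
    # class directory. The epoch argument presumably selects which saved
    # feature files are listed; that logic is not visible in the snippet.
    for object_nb, name in enumerate(sorted(os.listdir(type_root))):
        yield object_nb, os.path.join(type_root, name)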
Example #4
def experiment_corpus_segment(corpus_path, seg_path):
    '''
    :param corpus_path: path of the raw (unsegmented) corpus
    :param seg_path: path where the segmented corpus is stored
    '''

    file_list = listdir_nohidden(corpus_path)

    seg_path0 = seg_path + "/"
    if not os.path.exists(seg_path0):
        os.makedirs(seg_path0)

    for file in file_list:
        full_path = corpus_path + "/" + file
        content = read_file(full_path)
        content_seg = jieba.cut(content)
        save_file(seg_path0 + file, bytes(" ".join(content_seg), encoding="utf8"))  # save the segmented file to the segmented-corpus directory

    print("===================*****====================")
    print("experiment_corpus_segment end")
    print("===================*****====================")
Example #5
    use_cuda = torch.cuda.is_available()
    assert use_cuda, 'Works only with CUDA'
    device = torch.device('cuda') if use_cuda else torch.device('cpu')
    cfg.CUDA = use_cuda
    np.random.seed(cfg.RNG_SEED)

    # Load the model.
    fasterRCNN = resnet(N_CLASSES, 101, pretrained=False)
    fasterRCNN.create_architecture()
    fasterRCNN.load_state_dict(torch.load(args.model_file))
    fasterRCNN.to(device)
    fasterRCNN.eval()
    print('Model is loaded.')

    # Load images.
    imglist = list(listdir_nohidden(args.image_dir))
    num_images = len(imglist)
    print('Number of images: {}.'.format(num_images))

    # Extract features.
    for im_file in tqdm(imglist):
        im = cv2.imread(os.path.join(args.image_dir, im_file))
        blobs, im_scales = get_image_blob(im)
        assert len(im_scales) == 1, 'Only single-image batch is implemented'

        im_data = torch.from_numpy(blobs).permute(0, 3, 1, 2).to(device)
        im_info = torch.tensor([[blobs.shape[1], blobs.shape[2],
                                 im_scales[0]]]).to(device)
        gt_boxes = torch.zeros(1, 1, 5).to(device)
        num_boxes = torch.zeros(1).to(device)
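Example #5 stops before the actual forward pass and relies on get_image_blob, which is not shown. From the way its outputs are used (a single (1, H, W, 3) float array that is permuted to NCHW, plus a one-element list of scales), a typical Faster R-CNN style implementation looks roughly like this; the pixel means and the 600/1000 target sizes are common defaults and are assumptions here, not taken from the snippet:

import cv2
import numpy as np


def get_image_blob(im):
    # Assumed helper in the py-faster-rcnn style: subtract BGR channel means,
    # rescale so the shorter side is about 600 px (long side capped at 1000 px),
    # and return a (1, H, W, 3) float32 blob plus the list of scales applied.
    pixel_means = np.array([[[102.9801, 115.9465, 122.7717]]])  # assumed BGR means
    im_orig = im.astype(np.float32, copy=True) - pixel_means

    im_size_min = np.min(im_orig.shape[0:2])
    im_size_max = np.max(im_orig.shape[0:2])
    im_scale = 600.0 / float(im_size_min)
    if np.round(im_scale * im_size_max) > 1000:
        im_scale = 1000.0 / float(im_size_max)

    im_resized = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
                            interpolation=cv2.INTER_LINEAR)
    blob = im_resized[np.newaxis, :, :, :].astype(np.float32)
    return blob, [im_scale]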