def corpus_to_bunch(bunch_path, seg_path):
    """Gather a segmented corpus into one Bunch object and pickle it.

    Every entry returned by ``listdir_nohidden(seg_path)`` is treated as a
    class directory. Each entry inside such a directory is read with
    ``read_file`` and stored together with its class label and file name.

    :param bunch_path: path of the file the pickled Bunch is written to
    :param seg_path: root directory of the segmented corpus
    """
    class_names = listdir_nohidden(seg_path)
    bunch = base.Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(class_names)

    # Walk the list stored on the bunch, not ``class_names``: the helper's
    # return value may be a one-shot iterable that ``extend`` has consumed.
    for class_name in bunch.target_name:
        class_dir = "/".join((seg_path, class_name, ""))
        for file_name in listdir_nohidden(class_dir):
            file_path = class_dir + file_name
            bunch.label.append(class_name)
            bunch.filenames.append(file_name)
            bunch.contents.append(read_file(file_path))

    with open(bunch_path, "wb") as out:
        pickle.dump(bunch, out)

    banner = "===================*****===================="
    print(banner)
    print("corpus_to_bunch end")
    print(banner)
def corpus_segment(corpus_path, seg_path):
    """Segment every document of a class-structured corpus with jieba.

    Each entry of ``corpus_path`` is treated as a class directory. Every
    file inside it is read with ``read_file``, tokenised with ``jieba.cut``
    and written (tokens joined by single spaces, UTF-8 encoded) to the file
    of the same name below ``seg_path/<class>/``.

    :param corpus_path: root directory of the unsegmented corpus
    :param seg_path: root directory the segmented corpus is written to
    """
    for class_dir in listdir_nohidden(corpus_path):
        class_path = os.path.join(corpus_path, class_dir)
        seg_class_path = os.path.join(seg_path, class_dir)
        # exist_ok=True replaces the former exists()-then-makedirs() pair,
        # which could raise if the directory appeared between the two calls.
        os.makedirs(seg_class_path, exist_ok=True)

        for file in listdir_nohidden(class_path):
            content = read_file(os.path.join(class_path, file))
            segmented = " ".join(jieba.cut(content))
            # Store the processed document in the segmented-corpus directory.
            save_file(os.path.join(seg_class_path, file),
                      segmented.encode("utf-8"))

    print("===================*****====================")
    print("corpus_segment end")
    print("===================*****====================")
def __init__(self, model=None, network_name=None, classification=False,
             root_dir=None, part="train"):
    """Index the ``.npz`` feature files found below ``<root_dir>/scale``.

    Each entry of the ``scale`` directory is looked up in
    ``type_to_index_map`` to obtain its class index. The files reported by
    ``list_features_shapenet_classes(..., epoch=410)`` that end in ``.npz``
    are appended to ``self.data`` as ``(filename, type_index, object_nb)``
    tuples, repeated ``pt_train_max`` times when ``part == "train"`` and
    ``pt_test_max`` times otherwise.

    :param model: stored unchanged on ``self.model``
    :param network_name: stored unchanged on ``self.network_name``
    :param classification: stored unchanged on ``self.classification``
    :param root_dir: directory that contains the ``scale`` sub-directory
    :param part: ``"train"`` selects the training repeat count; any other
        value selects the test repeat count
    """
    self.data_dir = root_dir
    self.root = os.path.join(self.data_dir, "scale")
    self.part = part
    self.model = model
    self.network_name = network_name
    self.classification = classification
    self.pt_train_max = 6
    self.pt_test_max = 4
    # Attribute name kept exactly as spelled (sic); other code may read it.
    self.CALSS_THRESHOLD = 0.05
    print("##########", self.root)

    # The repeat count depends only on ``part``, so choose it once up front.
    repeats = self.pt_train_max if self.part == "train" else self.pt_test_max

    self.data = []
    for class_name in listdir_nohidden(self.root):
        type_index = type_to_index_map[class_name]
        type_root = os.path.join(self.root, class_name)
        entries = list_features_shapenet_classes(type_root, epoch=410)
        for object_nb, filename in entries:
            if not filename.endswith('.npz'):
                continue
            self.data.extend([(filename, type_index, object_nb)] * repeats)
def experiment_corpus_segment(corpus_path, seg_path):
    """Segment every document of a flat (class-less) corpus with jieba.

    Every entry of ``corpus_path`` is read with ``read_file``, tokenised
    with ``jieba.cut`` and written (tokens joined by single spaces, UTF-8
    encoded) to the file of the same name inside ``seg_path``.

    :param corpus_path: directory holding the unsegmented documents
    :param seg_path: directory the segmented documents are written to
    """
    file_list = listdir_nohidden(corpus_path)
    # exist_ok=True replaces the former exists()-then-makedirs() pair,
    # which could raise if the directory appeared between the two calls.
    os.makedirs(seg_path, exist_ok=True)

    for file in file_list:
        content = read_file(os.path.join(corpus_path, file))
        segmented = " ".join(jieba.cut(content))
        # Store the processed document in the segmented-corpus directory.
        save_file(os.path.join(seg_path, file), segmented.encode("utf-8"))

    print("===================*****====================")
    print("experiment_corpus_segment end")
    print("===================*****====================")
# Feature-extraction script fragment: set up CUDA, load the detector weights,
# list the input images and start preparing per-image model inputs.
# NOTE(review): the loop at the bottom appears to continue beyond this chunk.

# CUDA is mandatory. NOTE(review): because of the assert, the CPU branch of
# the device selection is only reachable when asserts are disabled
# (e.g. ``python -O``).
use_cuda = torch.cuda.is_available()
assert use_cuda, 'Works only with CUDA'
device = torch.device('cuda') if use_cuda else torch.device('cpu')
cfg.CUDA = use_cuda
# Seed NumPy's global RNG from the configuration.
np.random.seed(cfg.RNG_SEED)

# Load the model.
fasterRCNN = resnet(N_CLASSES, 101, pretrained=False)
fasterRCNN.create_architecture()
# Weights come from the file named by ``args.model_file``.
fasterRCNN.load_state_dict(torch.load(args.model_file))
fasterRCNN.to(device)
# Switch the module to evaluation mode.
fasterRCNN.eval()
print('Model is loaded.')

# Load images.
# Materialised with list() so that len() can be taken and tqdm knows the total.
imglist = list(listdir_nohidden(args.image_dir))
num_images = len(imglist)
print('Number of images: {}.'.format(num_images))

# Extract features.
for im_file in tqdm(imglist):
    im = cv2.imread(os.path.join(args.image_dir, im_file))
    blobs, im_scales = get_image_blob(im)
    assert len(im_scales) == 1, 'Only single-image batch is implemented'
    # Axis reorder (0, 3, 1, 2): presumably NHWC -> NCHW; confirm against
    # get_image_blob's output layout.
    im_data = torch.from_numpy(blobs).permute(0, 3, 1, 2).to(device)
    # One row holding blobs.shape[1], blobs.shape[2] and the single scale.
    im_info = torch.tensor([[blobs.shape[1], blobs.shape[2], im_scales[0]]]).to(device)
    # All-zero tensors; presumably placeholders because no ground truth is
    # available here. TODO confirm against the model's forward signature.
    gt_boxes = torch.zeros(1, 1, 5).to(device)
    num_boxes = torch.zeros(1).to(device)