def search(self, sentence):
    if not sentence or not isinstance(sentence, str):
        return None
    res = list()
    c_bucket = list()
    seg_sen = list(self.seg.cut(sentence))
    seg_sen = list(filter(lambda x: x not in self.stop_words, seg_sen))
    # collect candidate buckets that share at least one word with the query
    for w in seg_sen:
        if w in self.p_bucket:
            c_bucket += self.p_bucket[w]
    c_bucket = list(set(c_bucket))
    cmp, score = list(), list()
    for bucket in c_bucket:
        bucket_path = os.path.join(self.path, bucket)
        check_file(bucket_path)
        with open(bucket_path, 'r', encoding="utf-8") as infile:
            for inline in infile:
                inline = inline.rstrip()
                line = inline.split(':::')[0]
                seg_list = list(self.seg.cut(line))
                seg_list = list(filter(lambda x: x not in self.stop_words, seg_list))
                sc = jaccard(seg_sen, seg_list)
                if sc < self.args.threshold:
                    continue
                cmp.append(inline)
                score.append(sc)
    # rank candidates by similarity and keep the top_k results
    zipped = sorted(zip(cmp, score), key=lambda x: x[1], reverse=True)
    right = None if self.args.top_k <= 0 else self.args.top_k
    for cp, sc in zipped[:right]:
        res.append(cp)
    return res
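# search() scores candidates with a jaccard helper that is not defined in these
# snippets. A minimal sketch, assuming it is plain Jaccard similarity over two
# token lists:
def jaccard(a, b):
    """Jaccard similarity of two token lists: |A & B| / |A | B|."""
    set_a, set_b = set(a), set(b)
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)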
def builder():
    args = _get_parser()
    check_file(args.infile)
    ensure_dir(args.output)
    A = ahocorasick.Automaton()
    origin, annotation = list(), list()
    # each input line is expected to look like "phrase:::meaning"
    with open(args.infile, 'r', encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                continue
            phrase, means = line.split(':::')
            if not phrase or not means:
                continue
            origin.append(phrase)
            annotation.append(means)
    assert len(origin) == len(annotation)
    for idx, phrase in enumerate(origin):
        A.add_word(phrase, (idx, phrase))
    A.make_automaton()
    ac_name = os.path.join(args.output, args.ac_name)
    means = os.path.join(args.output, args.mean_name)
    with open(ac_name, 'wb') as outfile:
        pickle.dump(A, outfile, protocol=pickle.HIGHEST_PROTOCOL)
    with open(means, 'wb') as outfile:
        pickle.dump(annotation, outfile, protocol=pickle.HIGHEST_PROTOCOL)
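# builder() expects one "phrase:::meaning" pair per input line, e.g.:
#   机器学习:::machine learning
#   深度学习:::deep learning
# A small, self-contained sketch of how the pickled automaton can then be
# queried (the phrases here are made-up examples, not from the original data):
import ahocorasick

A = ahocorasick.Automaton()
for idx, phrase in enumerate(['机器学习', '深度学习']):
    A.add_word(phrase, (idx, phrase))
A.make_automaton()

# Automaton.iter() yields (end_index, value) for every phrase found in the text
for end_idx, (idx, phrase) in A.iter('我在学机器学习'):
    print(end_idx, idx, phrase)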
def __init__(self, args):
    check_file(args.ac_path)
    check_file(args.mean_path)
    with open(args.ac_path, 'rb') as infile:
        ac = pickle.load(infile)
        if isinstance(ac, ahocorasick.Automaton):
            self.ac = ac
        else:
            raise TypeError("{} must be ahocorasick.Automaton".format(args.ac_path))
    with open(args.mean_path, 'rb') as infile:
        mean = pickle.load(infile)
        if isinstance(mean, list) and all(isinstance(elem, str) for elem in mean):
            self.mean = mean
        else:
            raise TypeError("{} must be list of str".format(args.mean_path))
def get_model(path, num_classes):
    model = VireoNet(num_classes=num_classes)
    if path:
        path = check_file(path)
        # map_location belongs to torch.load, not to load_state_dict
        ckpt = torch.load(path, map_location='cpu')
        model.load_state_dict(ckpt['network'], strict=False)
    return model
def get_model(path, num_classes, model_name):
    model_cls = {
        # 'v3dm': V3Dm,
        # 'v3dmv2': V3DmV2,
        # 'vireodev': VireoDev,
        # 'vireonew': VireoNew,
        # 'vireou': VireoU,
        # 'vireopro': VireoPro,
        # 'vireomax': VireoMax,
        # 'densevireo': DenseVireo,
        # 'densevireov2': DenseVireoV2,
        # 'densevireov3': DenseVireoV3,
        # 'densevireov4': DenseVireoV4,
        'vireonet': VireoNet,
        'x3dm': X3Dm,
        'x3dv': X3DV,
        'x3dv2': X3DV2,
        'x3dv3': X3DV3,
        'x3dv4': X3DV4,
        'x3dc': X3Dc,
        'x3dcv2': X3DcV2,
        'light3d': Light3D,
        'light3dv2': Light3DV2,
        'light3dv3': Light3DV3,
        'light3dv4': Light3DV4,
    }[model_name]
    print('using {}'.format(model_name))
    model = model_cls(num_classes=num_classes)
    if path:
        path = check_file(path)
        # map_location belongs to torch.load, not to load_state_dict
        ckpt = torch.load(path, map_location='cpu')
        model.load_state_dict(ckpt['network'], strict=False)
    return model
def train(path, img_size, cfg='yolov5s.yaml', bs=2, one_batch_training=False):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    dls, dsets = create_dataloaders(path, img_size, bs, device, one_batch_training)
    n_classes = len(dls.vocab)
    model = Model(cfg=check_file(cfg), nc=n_classes)
    if device == 'cuda':
        model.cuda()
    hyp['cls'] *= n_classes / 80.  # scale COCO-tuned hyp['cls'] to the current dataset
    model.nc = n_classes  # attach number of classes to model
    model.hyp = hyp
    model.gr = 1.0
    learner = Learner(dls, model,
                      loss_func=partial(compute_loss, model=model),
                      cbs=[EvaluatorCallback()])
    with learner.no_bar():
        learner.fit_one_cycle(args.epochs, lr_max=3e-3)
    learner.save('/content/model_temp')
    learner.export(fname='/content/learner_05_02_2021.pkl')
    return learner
def main():
    global connection, cursor
    cpu = multiprocessing.cpu_count()
    print("CPU {}".format(cpu))
    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)
    all_lines = 0
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    all_bucked = defaultdict(list)
    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load tokenizer
    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    i = 0
    all_data = infile.readlines()
    n = 10000  # chunk size: split the big list into smaller lists of n lines each
    lstgs = [all_data[i:i + n] for i in range(0, len(all_data), n)]
    print(len(lstgs))

    # segment each chunk in a worker process and cache the results
    tr = []
    pool = multiprocessing.Pool(processes=4)
    for xyz in lstgs:
        tr.append(pool.apply_async(fenci, (xyz, )))
    pool.close()
    pool.join()
    for res in tr:
        tmp = res.get()
        for z in tmp:
            if z not in jieba_cache.keys():
                jieba_cache[z] = tmp[z]
            else:
                print(z)
    for st in stop_words:
        stop_words_cache[st] = 1

    all_lines = len(jieba_cache)
    print("Start processing, {} lines in total".format(all_lines))
    print("Cached {} jieba segmentations".format(len(jieba_cache)))
    print("Cached {} stop words".format(len(stop_words_cache)))

    all_data = jieba_cache.keys()
    for inline in all_data:
        i = i + 1
        print("Processing line {} of {}".format(i, all_lines))
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = jieba_cache[line]
        # remove stop words from the current sentence
        if stop_words:
            seg_list = [w for w in seg_list if w not in stop_words_cache]
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                array = all_bucked[bucket]
                selected = sample_dict(array, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: jieba_cache[x], selected))
                # remove stop words from the sampled sentences
                if stop_words:
                    selected = [[w for w in sen if w not in stop_words_cache]
                                for sen in selected]
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold
                       for cmp_list in selected):
                    is_match = True
                    all_bucked[bucket].append(line)
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        # print("{} jaccard took {}".format(inline, endtime - starttime))
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            all_bucked[bucket_name] = [line]
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1
    infile.close()

    # write each bucket to MySQL, one batch per bucket
    batch_size = 0
    for zzzz in all_bucked:
        batch_size = batch_size + 1
        connection = pymysql.connect(host='47.99.87.74', user='******',
                                     password='******', db='august', port=33306)
        cursor = connection.cursor()
        all_bucked_data = []
        for zx in all_bucked[zzzz]:
            all_bucked_data.append([all_bucked[zzzz][0], zx, today])
        print("Current batch {} of {}".format(batch_size, len(all_bucked)))
        cursor.executemany(
            "insert into 凤巢长尾词分组(group_id,keyword,created_date) values(%s,%s,%s)",
            all_bucked_data)
        connection.commit()
        cursor.close()
        connection.close()
    print('All is well')
def run(self, questions):
    args = self._get_parser()
    # preliminary work
    ensure_dir(args.output)
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'
    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()
    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = questions
    for inline in tqdm(infile):
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold
                       for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    # sort and rename cluster files by size
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)
    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    name_map = dict()
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_name = id_name.format(idx)
        new_path = os.path.join(args.output, new_name)
        os.rename(origin_path, new_path)
        name_map[file_name] = new_name
    for k, v in p_bucket.items():
        p_bucket[k] = list(map(lambda x: name_map[x], v))

    # merge the cluster files into one file
    output_file = os.path.join(args.output, 'all_cluster.txt')
    try:
        if os.path.isfile(output_file):
            os.unlink(output_file)
    except Exception as e:
        print(e)
    file_list = os.listdir(args.output)
    fw = open(output_file, 'w+')
    for file in file_list:
        with open(os.path.join(args.output, file)) as f:
            for line in f.readlines():
                fw.write(str(int(file)) + ',' + line)
    fw.close()
    df = pd.read_csv(output_file, names=['id', 'text'])
    df.columns = ['cluster_id', 'ques']
    print('All is well')
    # json.dumps(dict(ques=ques))
    # reshape the dataframe: map each cluster_id to one representative question,
    # e.g. df_dict = {0: 'aa', 1: 'bb'}
    df_dict = df.set_index('cluster_id').T.to_dict('records')[0]
    print(df_dict)
    result_dict = {}
    for cluster_id, ques in df_dict.items():
        li = df[df['cluster_id'] == cluster_id].ques.values.tolist()
        # if(ques in li): li.remove(ques)
        result_dict[ques] = li
    my_list = [result_dict]
    my_df = pd.DataFrame(my_list).T
    my_df = my_df.reset_index()
    my_df.columns = ['ques', 'info']
    print(my_df)
    return my_df.to_json(orient="records", force_ascii=False)
import argparse
import os
import random

import cv2
import numpy as np
import torch
import yaml

import utils
from preprocess.data_preprocess import TrainAugmentation
from utils import visual_utils
# DatasetReader comes from the project's dataset module

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch-size', type=int, default=8)
    parser.add_argument('--data', type=str, default='./datasets/configs/kitti.yaml',
                        help='*.yaml path')
    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640],
                        help='train,test sizes')
    parser.add_argument('--is-mosaic', action='store_true',
                        help='load image by applying mosaic')
    parser.add_argument('--is-rect', action='store_true',
                        help='resize image in rect mode rather than square mode')
    opt = parser.parse_args()
    opt.data = utils.check_file(opt.data)  # check file
    cfg = {}
    cfg.update(opt.__dict__)

    # dataset
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data config dict
    cfg.update(data_dict)
    dataset_path = data_dict['dataset_path']
    brg_mean = data_dict['brg_mean']
    dr = DatasetReader('./datasets/data/kitti', cfg,
                       TrainAugmentation(cfg['img_size'][0], mean=brg_mean))

    batch_size = min(1, len(dr))
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    # dataloader = torch.utils.data.DataLoader(dr,
    #                                          batch_size=batch_size,
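    # The snippet above stops inside the commented-out DataLoader construction; a
    # minimal sketch of how it would typically be completed with the values
    # computed above (shuffle/pin_memory are assumptions, collate_fn is
    # project-specific and omitted):
    dataloader = torch.utils.data.DataLoader(dr,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=nw,
                                             pin_memory=True)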
def main():
    args = _get_parser()
    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'
    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()
    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold
                       for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1
    infile.close()

    # sort and rename cluster files by size
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)
    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)
    print('All is well')
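# check_file, ensure_dir and line_counter are small file helpers used throughout
# these scripts but not shown here; a minimal sketch of what they plausibly do:
import os


def check_file(path):
    """Raise if path is not an existing file; otherwise return it unchanged."""
    if not os.path.isfile(path):
        raise FileNotFoundError('{} does not exist'.format(path))
    return path


def ensure_dir(path):
    """Create the directory (including parents) if it does not exist yet."""
    os.makedirs(path, exist_ok=True)


def line_counter(path):
    """Count the number of lines in a text file."""
    with open(path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)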
                    type=str,
                    default='./runs',
                    help='do not train the params in exclude_scopes')
parser.add_argument('--is-mosaic', action='store_true',
                    help='load image by applying mosaic')
parser.add_argument('--is-rect', action='store_true',
                    help='resize image in rect mode rather than square mode')
parser.add_argument('--only-3d', action='store_true', help='only train 3d')
parser.add_argument('--only-2d', action='store_true',
                    help='only train 2d, that is, excluding 3d')
opt = parser.parse_args()
# opt.weights = last if opt.resume else opt.weights
opt.cfg = utils.check_file(opt.cfg)  # check file
opt.data = utils.check_file(opt.data)  # check file
print(opt)

cfg = None
with open(config_path) as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)
cfg.update(opt.__dict__)
with open(opt.cfg) as f:
    model_cfg = yaml.load(f, Loader=yaml.FullLoader)  # model config
cfg.update(model_cfg)

# dataset
with open(cfg['data']) as f:
    data_cfg = yaml.load(f, Loader=yaml.FullLoader)  # data config
cfg.update(data_cfg)
def get_scheduler(optim, epoch, warm_up=10):
    def sche_with_warmup(x):
        if x < warm_up:
            # linear ramp during the warm-up phase
            lr = 0.9 * (x / epoch) + 0.1
        else:
            # cosine decay toward params.lrf afterwards
            lr = ((1 + math.cos(x * math.pi / epoch)) / 2) * (1 - params.lrf) + params.lrf
        return lr
    return LambdaLR(optim, sche_with_warmup)


if __name__ == "__main__":
    params_file = 'params.yml'
    params_file = check_file(params_file)
    params = Params(params_file)
    debug = True if params.mode == 'debug' else False
    params.save_dir = os.path.join(os.getcwd(), params.save_dir)
    os.makedirs(params.save_dir, exist_ok=True)
    device = select_device(params.device, batch_size=params.batch_size)
    init_seeds(10086)
    loaders = get_loaders(params.input_dir, params.batch_size,
                          params.num_workers, params.frames,
                          params.img_size, debug=debug)
    net = get_model(params.weights, params.num_classes)
    # net = nn.DataParallel(net).to(device, non_blocking=True)
    net = net.to(device, non_blocking=True)
    loss = get_loss()
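# Stand-alone illustration of the multiplier produced by sche_with_warmup at a
# few epochs, under assumed values epoch=50, warm_up=10 and params.lrf=0.1:
import math

epoch, warm_up, lrf = 50, 10, 0.1
for x in (0, 5, 10, 25, 49):
    if x < warm_up:
        lr = 0.9 * (x / epoch) + 0.1  # linear ramp during warm-up
    else:
        lr = ((1 + math.cos(x * math.pi / epoch)) / 2) * (1 - lrf) + lrf  # cosine decay
    print(x, round(lr, 3))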
import argparse

import torch
import yaml
import onnx
# from onnxsim import simplify

import utils
# Model comes from the project's model definition module

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg', type=str, default='./models/configs/yolo3d_5m.yaml',
                        help='model.yaml')
    parser.add_argument('--device', default='0',
                        help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    opt = parser.parse_args()
    opt.cfg = utils.check_file(opt.cfg)  # check file
    device = 'cpu'

    with open(opt.cfg) as f:
        cfg = yaml.load(f, Loader=yaml.FullLoader)
    cfg.update(opt.__dict__)
    print(cfg)

    # Create model
    model = Model(cfg)
    ckpt = torch.load(
        './weights/yolov5m.pt',
        map_location=torch.device(device))['model'].float().state_dict()
    state = model.state_dict()
    # model = UpSample(3, None, 2)
    model.eval()
    model.to(device)
    for k, v in model.named_parameters():
                    help='initial weights path')
parser.add_argument('--name', default='',
                    help='renames results.txt to results_name.txt if supplied')
parser.add_argument('--device', default='cuda:0',
                    help='device id (i.e. 0 or 0,1 or cpu)')
parser.add_argument('--single-cls', action='store_true',
                    help='train as single-class dataset')
parser.add_argument('--freeze-layers', type=bool, default=True,
                    help='Freeze non-output layers')
opt = parser.parse_args()

# check that the config files exist
opt.cfg = check_file(opt.cfg)
opt.data = check_file(opt.data)
opt.hyp = check_file(opt.hyp)
with open(opt.hyp) as f:
    hyp = yaml.load(f, Loader=yaml.FullLoader)

print('Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/')
tb_writer = SummaryWriter(comment=opt.name)
train(hyp)
        layers.append(m_)
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg', type=str, default='yolov5s.yaml', help='model.yaml')
    parser.add_argument('--device', default='',
                        help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    opt = parser.parse_args()
    opt.cfg = check_file(opt.cfg)  # check file
    set_logging()
    device = select_device(opt.device)

    # Create model
    model = Model(opt.cfg).to(device)
    model.train()

    # Profile
    # img = torch.rand(8 if torch.cuda.is_available() else 1, 3, 640, 640).to(device)
    # y = model(img, profile=True)

    # ONNX export
    # model.model[-1].export = True
    # torch.onnx.export(model, img, opt.cfg.replace('.yaml', '.onnx'), verbose=True, opset_version=11)