Example #1
    def search(self, sentence):
        if not sentence or not isinstance(sentence, str):
            return None
        res = list()
        c_bucket = list()
        seg_sen = list(self.seg.cut(sentence))
        seg_sen = list(filter(lambda x: x not in self.stop_words, seg_sen))
        for w in seg_sen:
            if w in self.p_bucket:
                c_bucket += self.p_bucket[w]
        c_bucket = list(set(c_bucket))
        cmp, score = list(), list()
        for bucket in c_bucket:
            bucket_path = os.path.join(self.path, bucket)
            check_file(bucket_path)
            infile = open(bucket_path, 'r', encoding="utf-8")
            for inline in infile:
                inline = inline.rstrip()
                line = inline.split(':::')[0]
                seg_list = list(self.seg.cut(line))
                seg_list = list(
                    filter(lambda x: x not in self.stop_words, seg_list))
                sc = jaccard(seg_sen, seg_list)
                if sc < self.args.threshold:
                    continue
                cmp.append(inline)
                score.append(sc)
            infile.close()

        zipped = zip(cmp, score)
        zipped = sorted(zipped, key=lambda x: x[1], reverse=True)
        right = None if self.args.top_k <= 0 else self.args.top_k
        for (cp, sc) in zipped[:right]:
            res.append(cp)
        return res
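
search() above filters both the query and each candidate line against stop words and then scores them with a jaccard helper that is not shown. A minimal sketch under the usual definition (intersection over union of the two token sets) could look like this:

def jaccard(a, b):
    # Assumed definition: Jaccard similarity of two token lists, |A & B| / |A | B|.
    sa, sb = set(a), set(b)
    if not sa and not sb:
        return 0.0
    return len(sa & sb) / len(sa | sb)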
Example #2
def builder():
    args = _get_parser()
    check_file(args.infile)
    ensure_dir(args.output)

    A = ahocorasick.Automaton()
    origin, annotation = list(), list()

    with open(args.infile, 'r', encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip()
            if not line:
                continue
            # split only on the first ':::' so the meaning may itself contain ':::'
            phrase, means = line.split(':::', 1)
            if not phrase or not means:
                continue
            origin.append(phrase)
            annotation.append(means)

    assert len(origin) == len(annotation)

    for idx, phrase in enumerate(origin):
        A.add_word(phrase, (idx, phrase))

    A.make_automaton()

    ac_name = os.path.join(args.output, args.ac_name)
    means = os.path.join(args.output, args.mean_name)
    with open(ac_name, 'wb') as outfile:
        pickle.dump(A, outfile, protocol=pickle.HIGHEST_PROTOCOL)
    with open(means, 'wb') as outfile:
        pickle.dump(annotation, outfile, protocol=pickle.HIGHEST_PROTOCOL)
Example #3
    def __init__(self, args):
        check_file(args.ac_path)
        check_file(args.mean_path)
        with open(args.ac_path, 'rb') as infile:
            ac = pickle.load(infile)
        if isinstance(ac, ahocorasick.Automaton):
            self.ac = ac
        else:
            raise TypeError("{} must be ahocorasick.Automaton".format(
                args.ac_path))
        with open(args.mean_path, 'rb') as infile:
            mean = pickle.load(infile)
        if isinstance(mean, list) and all(
                isinstance(elem, str) for elem in mean):
            self.mean = mean
        else:
            raise TypeError("{} must be list of str".format(args.mean_path))
Example #4
def get_model(path, num_classes):
    if path:
        path = check_file(path)
        # map_location belongs to torch.load, not load_state_dict
        ckpt = torch.load(path, map_location='cpu')
        model = VireoNet(num_classes=num_classes)
        model.load_state_dict(ckpt['network'], strict=False)
    else:
        model = VireoNet(num_classes=num_classes)

    return model
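
Every example on this page assumes a check_file helper. Its implementation is not shown, but call sites such as path = check_file(path) suggest something like this sketch (the exact behavior is an assumption):

import os

def check_file(path):
    # Assumed behavior: fail fast if the file is missing, otherwise return the path unchanged.
    if not os.path.isfile(path):
        raise FileNotFoundError('file not found: {}'.format(path))
    return path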
Example #5
def get_model(path, num_classes, model_name):

    model_cls = {
        # 'v3dm': V3Dm,
        # 'v3dmv2': V3DmV2,
        # 'vireodev': VireoDev,
        # 'vireonew': VireoNew,
        # 'vireou': VireoU,
        # 'vireopro': VireoPro,
        # 'vireomax': VireoMax,
        # 'densevireo': DenseVireo,
        # 'densevireov2': DenseVireoV2,
        # 'densevireov3': DenseVireoV3,
        # 'densevireov4': DenseVireoV4,
        'vireonet': VireoNet,
        'x3dm': X3Dm,
        'x3dv': X3DV,
        'x3dv2': X3DV2,
        'x3dv3': X3DV3,
        'x3dv4': X3DV4,
        'x3dc': X3Dc,
        'x3dcv2': X3DcV2,
        'light3d': Light3D,
        'light3dv2': Light3DV2,
        'light3dv3': Light3DV3,
        'light3dv4': Light3DV4,
    }[model_name]

    print('using {}'.format(model_name))
    if path:
        path = check_file(path)
        # map_location belongs to torch.load, not load_state_dict
        ckpt = torch.load(path, map_location='cpu')
        model = model_cls(num_classes=num_classes)
        model.load_state_dict(ckpt['network'], strict=False)
    else:
        model = model_cls(num_classes=num_classes)

    return model
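
For reference, a minimal call of the factory above, assuming the listed model classes are importable (the class count here is made up):

model = get_model(path=None, num_classes=400, model_name='x3dm')  # path=None skips checkpoint loading
model.eval()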
Example #6
def train(path, img_size, cfg='yolov5s.yaml', bs=2, one_batch_training=False):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    dls, dsets = create_dataloaders(path, img_size, bs, device,
                                    one_batch_training)
    n_classes = len(dls.vocab)

    model = Model(cfg=check_file(cfg), nc=n_classes)
    if device == 'cuda':
        model.cuda()

    hyp['cls'] *= n_classes / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = n_classes  # attach number of classes to model
    model.hyp = hyp
    model.gr = 1.0
    learner = Learner(dls,
                      model,
                      loss_func=partial(compute_loss, model=model),
                      cbs=[EvaluatorCallback()])
    with learner.no_bar():
        # `args` (like `hyp` above) is expected to be defined at module level in the original script
        learner.fit_one_cycle(args.epochs, lr_max=3e-3)
    learner.save('/content/model_temp')
    learner.export(fname='/content/learner_05_02_2021.pkl')

    return learner
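
Because the learner is persisted with learner.export, it can typically be reloaded for inference with fastai's load_learner; a minimal sketch using the path from the example above (the sample image path is hypothetical):

from fastai.learner import load_learner

learn = load_learner('/content/learner_05_02_2021.pkl')
# predict on a single item; the expected input type depends on the original dataloaders
pred = learn.predict('/content/sample.jpg')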
Example #7
def main():
    global connection, cursor
    cpu = multiprocessing.cpu_count()
    print("CPU {}".format(cpu))
    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)
    all_lines = 0
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1

    clean_dir(args.output, args.name_len)
    # end preliminary work

    all_bucked = defaultdict(list)
    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load tokenizer

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    i = 0
    all_data = infile.readlines()
    n = 10000  # chunk size: number of lines from the big list handed to each worker
    lstgs = [all_data[i:i + n] for i in range(0, len(all_data), n)]
    print(len(lstgs))
    r = []
    tr = []
    pool = multiprocessing.Pool(processes=4)
    for xyz in lstgs:
        tr.append(pool.apply_async(fenci, (xyz, )))
    pool.close()
    pool.join()

    for res in tr:
        tmp = res.get()
        for z in tmp:
            if z not in jieba_cache.keys():
                jieba_cache[z] = tmp[z]
            else:
                print(z)
    for st in stop_words:
        stop_words_cache[st] = 1

    r.clear()
    r = None

    all_lines = len(jieba_cache)
    print("开始执行 总 {} 行".format(all_lines))
    print("缓存成功jieba {}".format(len(jieba_cache)))
    print("缓存成功停用词 {}".format(len(stop_words_cache)))
    all_data = jieba_cache.keys()
    for inline in all_data:
        if inline == '太原去贵阳怎么走':  # leftover debugging hook for one specific query
            print("")
        i = i + 1
        print("Processing line {} of {}".format(i, all_lines))
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = jieba_cache[line]
        llll = []
        if stop_words:
            for mmmm in seg_list:
                if mmmm not in stop_words_cache.keys():
                    llll.append(mmmm)
            seg_list = llll
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                array = all_bucked[bucket]
                selected = sample_dict(array, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: jieba_cache[x], selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        llll = []
                        for mmmm in sen:
                            if mmmm not in stop_words_cache.keys():
                                llll.append(mmmm)
                        filt_selected.append(llll)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(
                        jaccard(seg_list, cmp_list) > args.threshold
                        for cmp_list in selected):
                    is_match = True
                    all_bucked[bucket].append(line)
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
                # print("{} jaccard耗时 {}".format( inline, endtime - starttime))
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_array = [line]
            all_bucked[bucket_name] = bucket_array
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    batch_size = 0
    for zzzz in all_bucked:
        batch_size = batch_size + 1
        connection = pymysql.connect(host='47.99.87.74',
                                     user='******',
                                     password='******',
                                     db='august',
                                     port=33306)
        cursor = connection.cursor()

        all_bucked_data = []
        for zx in all_bucked[zzzz]:
            all_bucked_data.append([all_bucked[zzzz][0], zx, today])
        print("当前批次  {} 共 {}".format(batch_size, len(all_bucked)))
        cursor.executemany(
            "insert into 凤巢长尾词分组(group_id,keyword,created_date) values(%s,%s,%s)",
            (all_bucked_data))
        connection.commit()
        cursor.close()
        connection.close()

    print('All is well')
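
main() above draws a comparison sample from each bucket through a sample_dict helper that is not shown; judging from its use on a list of bucket entries, a sketch could be:

import random

def sample_dict(array, n):
    # Assumed behavior: return up to n randomly chosen entries from the list.
    if len(array) <= n:
        return list(array)
    return random.sample(array, n)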
Example #8
    def run(self, questions):
        args = self._get_parser()

        # preliminary work
        ensure_dir(args.output)

        if args.name_len_update:
            line_cnt = line_counter(args.infile)
            args.name_len = len(str(line_cnt)) + 1

        clean_dir(args.output, args.name_len)
        # end preliminary work

        p_bucket = defaultdict(list)
        save_idx = 0
        id_name = '{0:0' + str(args.name_len) + 'd}'
        # load stop words
        stop_words = get_stop_words(args.stop_words) if os.path.exists(
            args.stop_words) else list()
        # load tokenizer
        seg = Segmentor(args)

        print('Splitting sentence into different clusters ...')
        infile = questions
        for inline in tqdm(infile):
            inline = inline.rstrip()
            line = inline.split(':::')[0]
            is_match = False
            seg_list = list(seg.cut(line))
            if stop_words:
                seg_list = list(filter(lambda x: x not in stop_words,
                                       seg_list))
            for wd in seg_list:
                if is_match:
                    break
                w_bucket = p_bucket[wd]
                for bucket in w_bucket:
                    bucket_path = os.path.join(args.output, bucket)
                    check_file(bucket_path)
                    selected = sample_file(bucket_path, args.sample_number)
                    selected = list(map(lambda x: x.split(':::')[0], selected))
                    selected = list(map(lambda x: list(seg.cut(x)), selected))
                    # remove stop words
                    if stop_words:
                        filt_selected = list()
                        for sen in selected:
                            sen = list(
                                filter(lambda x: x not in stop_words, sen))
                            filt_selected.append(sen)
                        selected = filt_selected
                    # calculate similarity with each bucket
                    if all(
                            jaccard(seg_list, cmp_list) > args.threshold
                            for cmp_list in selected):
                        is_match = True
                        with open(bucket_path, 'a',
                                  encoding='utf-8') as outfile:
                            outfile.write(line + '\n')
                        for w in seg_list:
                            if bucket not in p_bucket[w]:
                                p_bucket[w].append(bucket)
                        break
            if not is_match:
                bucket_name = ('tmp' + id_name).format(save_idx)
                bucket_path = os.path.join(args.output, bucket_name)
                with open(bucket_path, 'a', encoding='utf-8') as outfile:
                    outfile.write(line + '\n')
                for w in seg_list:
                    p_bucket[w].append(bucket_name)
                save_idx += 1

        # sort and rename file
        file_list = os.listdir(args.output)
        file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
        cnt = dict()
        for file in file_list:
            file_path = os.path.join(args.output, file)
            cnt[file] = line_counter(file_path)

        sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
        name_map = dict()
        for idx, (file_name, times) in enumerate(sorted_cnt):
            origin_path = os.path.join(args.output, file_name)
            new_name = id_name.format(idx)
            new_path = os.path.join(args.output, new_name)
            os.rename(origin_path, new_path)
            name_map[file_name] = new_name

        for k, v in p_bucket.items():
            p_bucket[k] = list(map(lambda x: name_map[x], v))

        # merge all bucket files into one
        output_file = os.path.join(args.output, 'all_cluster.txt')
        try:
            if os.path.isfile(output_file):
                os.unlink(output_file)
        except Exception as e:
            print(e)
        file_list = os.listdir(args.output)
        fw = open(output_file, 'w+', encoding='utf-8')
        for file in file_list:
            with open(os.path.join(args.output, file), encoding='utf-8') as f:
                for line in f.readlines():
                    fw.write(str(int(file)) + ',' + line)
        fw.close()
        df = pd.read_csv(output_file, names=['id', 'text'])
        df.columns = ['cluster_id', 'ques']
        print('All is well')
        # json.dumps(dict(ques=ques))
        df_dict = df.set_index('cluster_id').T.to_dict('records')[0]

        # reshape the dataframe into a dict
        # df: 0 aa
        #     0 aaa                   => aa  [aaa]
        #     1 bb                       bb  []
        # df_dict = {0: aa, 1: bb}
        print(df_dict)
        result_dict = {}
        for cluster_id, ques in df_dict.items():
            li = df[df['cluster_id'] == cluster_id].ques.values.tolist()
            # if(ques in li): li.remove(ques)
            result_dict[ques] = li

        my_list = [result_dict]
        my_df = pd.DataFrame(my_list).T
        my_df = my_df.reset_index()
        my_df.columns = ['ques', 'info']
        print(my_df)
        return my_df.to_json(orient="records", force_ascii=False)
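
run() samples candidate sentences from each bucket file through sample_file; a hedged sketch of such a helper:

import random

def sample_file(path, n):
    # Assumed behavior: read the bucket file and return up to n of its lines at random.
    with open(path, 'r', encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]
    return lines if len(lines) <= n else random.sample(lines, n)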
Example #9
import argparse
import os
import random

import cv2
import numpy as np
import torch
import yaml

import utils
from preprocess.data_preprocess import TrainAugmentation
from utils import visual_utils
# DatasetReader (used below) is a project-local class; its import is not shown in the original snippet.

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch-size', type=int, default=8)
    parser.add_argument('--data', type=str, default='./datasets/configs/kitti.yaml', help='*.yaml path')
    parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='train,test sizes')
    parser.add_argument('--is-mosaic', action='store_true', help='load image by applying mosaic')
    parser.add_argument('--is-rect', action='store_true', help='resize image apply rect mode not square mode')
    opt = parser.parse_args()
    opt.data = utils.check_file(opt.data)  # check file
    cfg = {}
    cfg.update(opt.__dict__)
    # dataset
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
        cfg.update(data_dict)
    dataset_path = data_dict['dataset_path']

    brg_mean = data_dict['brg_mean']
    dr = DatasetReader('./datasets/data/kitti',  cfg, TrainAugmentation(cfg['img_size'][0], mean=brg_mean))

    batch_size = min(1, len(dr))  # note: this caps the batch size at 1 regardless of --batch-size
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    # dataloader = torch.utils.data.DataLoader(dr,
    #                                          batch_size=batch_size,
Example #10
def main():
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1

    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'
    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()
    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(
                        jaccard(seg_list, cmp_list) > args.threshold
                        for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')
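
Both clustering examples size the zero-padded bucket ids with a line_counter helper; a minimal sketch:

def line_counter(path):
    # Assumed behavior: count the number of lines in a text file.
    with open(path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)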
Example #11
                        type=str,
                        default='./runs',
                        help='do not train the params in exclude_scopes')
    parser.add_argument('--is-mosaic',
                        action='store_true',
                        help='load image by applying mosaic')
    parser.add_argument('--is-rect',
                        action='store_true',
                        help='resize image apply rect mode not square mode')
    parser.add_argument('--only-3d', action='store_true', help='only train 3d')
    parser.add_argument('--only-2d',
                        action='store_true',
                        help='only train 2d, that is, excluding 3d')
    opt = parser.parse_args()
    # opt.weights = last if opt.resume else opt.weights
    opt.cfg = utils.check_file(opt.cfg)  # check file
    opt.data = utils.check_file(opt.data)  # check file
    print(opt)
    cfg = None
    with open(config_path) as f:  # config_path is defined earlier in the original script (not shown here)
        cfg = yaml.load(f, Loader=yaml.FullLoader)
        cfg.update(opt.__dict__)

    with open(opt.cfg) as f:
        model_cfg = yaml.load(f, Loader=yaml.FullLoader)  # model config
        cfg.update(model_cfg)

    # dataset
    with open(cfg['data']) as f:
        data_cfg = yaml.load(f, Loader=yaml.FullLoader)  # data config
        cfg.update(data_cfg)
Example #12
def get_scheduler(optim, epoch, warm_up=10):

    def sche_with_warmup(x):
        if x < warm_up:
            lr = 0.9 * (x / epoch) + 0.1
        else:
            lr = ((1 + math.cos(x * math.pi / epoch)) / 2) * (1 - params.lrf) + params.lrf
        return lr

    return LambdaLR(optim, sche_with_warmup)



if __name__ == "__main__":
    params_file = 'params.yml'
    params_file = check_file(params_file)
    params = Params(params_file)  # use the validated path instead of re-hardcoding the filename

    debug = params.mode == 'debug'

    params.save_dir = os.path.join(os.getcwd(), params.save_dir)
    os.makedirs(params.save_dir, exist_ok=True)

    device = select_device(params.device, batch_size=params.batch_size)
    init_seeds(10086)

    loaders = get_loaders(params.input_dir, params.batch_size, params.num_workers, params.frames, params.img_size, debug=debug)
    net = get_model(params.weights, params.num_classes)
    # net = nn.DataParallel(net).to(device, non_blocking=True)
    net = net.to(device, non_blocking=True)
    loss = get_loss()
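
A minimal usage sketch of the scheduler above: LambdaLR multiplies the optimizer's base learning rate by sche_with_warmup(current_epoch) on every step. The SimpleNamespace stands in for the module-level params object (with its lrf attribute) that the closure reads, and the toy model only supplies parameters to the optimizer:

import torch
from types import SimpleNamespace

params = SimpleNamespace(lrf=0.1)  # stand-in for the real Params object (assumption)

model = torch.nn.Linear(10, 2)
optim = torch.optim.SGD(model.parameters(), lr=0.01)
scheduler = get_scheduler(optim, epoch=100, warm_up=10)

for _ in range(100):
    optim.step()       # the actual training step(s) would go here
    scheduler.step()   # advances the epoch index used by sche_with_warmup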
Example #13

import argparse

import onnx
import torch
import yaml

import utils
#from onnxsim import simplify
# Model (used below) is a project-local class; its import is not shown in the original snippet.

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg',
                        type=str,
                        default='./models/configs/yolo3d_5m.yaml',
                        help='model.yaml')
    parser.add_argument('--device',
                        default='0',
                        help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    opt = parser.parse_args()
    opt.cfg = utils.check_file(opt.cfg)  # check file
    device = 'cpu'
    with open(opt.cfg) as f:
        cfg = yaml.load(f, Loader=yaml.FullLoader)
        cfg.update(opt.__dict__)
        print(cfg)
    # Create model
    model = Model(cfg)
    ckpt = torch.load(
        './weights/yolov5m.pt',
        map_location=torch.device(device))['model'].float().state_dict()
    state = model.state_dict()
    # model = UpSample(3, None, 2)
    model.eval()
    model.to(device)
    for k, v in model.named_parameters():
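
The snippet is cut off before the export itself; a hedged sketch of the usual torch.onnx flow it appears to build toward (the input shape and output filename are assumptions):

dummy = torch.zeros(1, 3, 640, 640).to(device)
torch.onnx.export(model, dummy, 'yolo3d_5m.onnx', opset_version=11)
onnx_model = onnx.load('yolo3d_5m.onnx')
onnx.checker.check_model(onnx_model)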
Example #14
                        help='initial weights path')
    parser.add_argument(
        '--name',
        default='',
        help='renames results.txt to results_name.txt if supplied')
    parser.add_argument('--device',
                        default='cuda:0',
                        help='device id (i.e. 0 or 0,1 or cpu)')
    parser.add_argument('--single-cls',
                        action='store_true',
                        help='train as single-class dataset')
    parser.add_argument('--freeze-layers',
                        type=bool,
                        default=True,
                        help='Freeze non-output layers')
    opt = parser.parse_args()

    # check that the config files exist
    opt.cfg = check_file(opt.cfg)
    opt.data = check_file(opt.data)
    opt.hyp = check_file(opt.hyp)

    with open(opt.hyp) as f:
        hyp = yaml.load(f, Loader=yaml.FullLoader)

    print(
        'Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/'
    )
    tb_writer = SummaryWriter(comment=opt.name)
    train(hyp)
Example #15
        layers.append(m_)
        ch.append(c2)
    return nn.Sequential(*layers), sorted(save)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg',
                        type=str,
                        default='yolov5s.yaml',
                        help='model.yaml')
    parser.add_argument('--device',
                        default='',
                        help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    opt = parser.parse_args()
    opt.cfg = check_file(opt.cfg)  # check file
    set_logging()
    device = select_device(opt.device)

    # Create model
    model = Model(opt.cfg).to(device)
    model.train()

    # Profile
    # img = torch.rand(8 if torch.cuda.is_available() else 1, 3, 640, 640).to(device)
    # y = model(img, profile=True)

    # ONNX export
    # model.model[-1].export = True
    # torch.onnx.export(model, img, opt.cfg.replace('.yaml', '.onnx'), verbose=True, opset_version=11)