def search(self, sentence):
    """Return stored sentences most similar to `sentence`, or None for invalid input."""
    if not sentence or not isinstance(sentence, str):
        return None

    res = list()
    c_bucket = list()

    # segment the query and drop stop words
    seg_sen = list(self.seg.cut(sentence))
    seg_sen = list(filter(lambda x: x not in self.stop_words, seg_sen))

    # collect every bucket that shares at least one token with the query
    for w in seg_sen:
        if w in self.p_bucket:
            c_bucket += self.p_bucket[w]
    c_bucket = list(set(c_bucket))

    # score every candidate line in those buckets with Jaccard similarity
    cmp, score = list(), list()
    for bucket in c_bucket:
        bucket_path = os.path.join(self.path, bucket)
        check_file(bucket_path)
        with open(bucket_path, 'r', encoding='utf-8') as infile:
            for inline in infile:
                inline = inline.rstrip()
                line = inline.split(':::')[0]
                seg_list = list(self.seg.cut(line))
                seg_list = list(filter(lambda x: x not in self.stop_words, seg_list))
                sc = jaccard(seg_sen, seg_list)
                if sc < self.args.threshold:
                    continue
                cmp.append(inline)
                score.append(sc)

    # keep the highest-scoring matches; top_k <= 0 means "return all"
    zipped = sorted(zip(cmp, score), key=lambda x: x[1], reverse=True)
    right = None if self.args.top_k <= 0 else self.args.top_k
    for cp, sc in zipped[:right]:
        res.append(cp)
    return res
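# `jaccard` is called above but not defined in this excerpt. A minimal sketch of
# what it is assumed to compute: set-based Jaccard similarity of two token lists,
# |A ∩ B| / |A ∪ B|, returning 1.0 when both lists are empty. The real helper may
# differ in its edge-case handling.
def jaccard(tokens_a, tokens_b):
    """Hypothetical helper: Jaccard similarity of two token lists."""
    set_a, set_b = set(tokens_a), set(tokens_b)
    if not set_a and not set_b:
        return 1.0
    return len(set_a & set_b) / len(set_a | set_b)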
def main():
    global connection, cursor
    cpu = multiprocessing.cpu_count()
    print("CPU {}".format(cpu))

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    all_lines = 0
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    all_bucked = defaultdict(list)
    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load tokenizer
    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    i = 0
    all_data = infile.readlines()

    # split the big list into chunks of n lines and segment them in parallel
    n = 10000
    lstgs = [all_data[i:i + n] for i in range(0, len(all_data), n)]
    print(len(lstgs))

    tr = []
    pool = multiprocessing.Pool(processes=4)
    for xyz in lstgs:
        tr.append(pool.apply_async(fenci, (xyz, )))
    pool.close()
    pool.join()

    # merge the per-chunk segmentation results into a single cache
    for res in tr:
        tmp = res.get()
        for z in tmp:
            if z not in jieba_cache:
                jieba_cache[z] = tmp[z]
            else:
                print(z)

    for st in stop_words:
        stop_words_cache[st] = 1

    all_lines = len(jieba_cache)
    print("Starting: {} lines in total".format(all_lines))
    print("jieba cache built: {} entries".format(len(jieba_cache)))
    print("stop-word cache built: {} entries".format(len(stop_words_cache)))

    all_data = jieba_cache.keys()
    for inline in all_data:
        if inline == '太原去贵阳怎么走':  # leftover debug hook for one specific sentence
            print("")
        i = i + 1
        print("Processing line {} of {}".format(i, all_lines))
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = jieba_cache[line]

        # remove stop words from the current sentence
        if stop_words:
            seg_list = [w for w in seg_list if w not in stop_words_cache]

        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                array = all_bucked[bucket]
                selected = sample_dict(array, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: jieba_cache[x], selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        filt_selected.append([w for w in sen if w not in stop_words_cache])
                    selected = filt_selected
                # calculate similarity with each sampled sentence of the bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold
                       for cmp_list in selected):
                    is_match = True
                    all_bucked[bucket].append(line)
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        # print("{} jaccard took {}".format(inline, endtime - starttime))

        if not is_match:
            # no existing bucket matched: open a new one seeded with this line
            bucket_name = ('tmp' + id_name).format(save_idx)
            all_bucked[bucket_name] = [line]
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    infile.close()

    # write each bucket to MySQL, one batch per bucket;
    # group_id is the first sentence of the bucket
    batch_size = 0
    for zzzz in all_bucked:
        batch_size = batch_size + 1
        connection = pymysql.connect(host='47.99.87.74', user='******',
                                     password='******', db='august', port=33306)
        cursor = connection.cursor()
        all_bucked_data = []
        for zx in all_bucked[zzzz]:
            all_bucked_data.append([all_bucked[zzzz][0], zx, today])
        print("Current batch {} of {}".format(batch_size, len(all_bucked)))
        cursor.executemany(
            "insert into 凤巢长尾词分组(group_id,keyword,created_date) values(%s,%s,%s)",
            all_bucked_data)
        connection.commit()
        cursor.close()
        connection.close()

    print('All is well')
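# `fenci` ("word segmentation") is the worker handed to the multiprocessing pool
# above, but it is not shown in this excerpt. A minimal sketch, assuming it maps
# each stripped line of a chunk to its jieba token list so the parent process can
# merge the per-chunk dicts into `jieba_cache`; the real worker may key or filter
# the lines differently.
import jieba

def fenci(lines):
    """Hypothetical worker: segment a chunk of lines with jieba."""
    result = {}
    for raw in lines:
        raw = raw.rstrip()
        result[raw] = list(jieba.cut(raw))
    return result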
def run(self, questions):
    args = self._get_parser()

    # preliminary work
    ensure_dir(args.output)
    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()

    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = questions
    for inline in tqdm(infile):
        inline = inline.rstrip()
        line = inline.split(':::')[0]
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: x.split(':::')[0], selected))
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold
                       for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1

    # sort and rename files so the largest cluster gets the smallest id
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    name_map = dict()
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_name = id_name.format(idx)
        new_path = os.path.join(args.output, new_name)
        os.rename(origin_path, new_path)
        name_map[file_name] = new_name

    for k, v in p_bucket.items():
        p_bucket[k] = list(map(lambda x: name_map[x], v))

    # merge all cluster files into one CSV: "<cluster_id>,<question>"
    output_file = os.path.join(args.output, 'all_cluster.txt')
    try:
        if os.path.isfile(output_file):
            os.unlink(output_file)
    except Exception as e:
        print(e)

    file_list = os.listdir(args.output)
    fw = open(output_file, 'w+', encoding='utf-8')
    for file in file_list:
        with open(os.path.join(args.output, file), encoding='utf-8') as f:
            for line in f.readlines():
                fw.write(str(int(file)) + ',' + line)
    fw.close()

    df = pd.read_csv(output_file, names=['id', 'text'])
    df.columns = ['cluster_id', 'ques']
    print('All is well')
    # json.dumps(dict(ques=ques))

    # dataframe format conversion: map each cluster_id to its first question, e.g.
    #   cluster_id  ques
    #   0           aa        =>  aa [aaa]
    #   0           aaa           bb []
    #   1           bb
    # df_dict = {0: aa, 1: bb}
    df_dict = df.set_index('cluster_id').T.to_dict('records')[0]
    print(df_dict)

    # for every cluster, key on its first question and collect the rest as its group
    result_dict = {}
    for cluster_id, ques in df_dict.items():
        li = df[df['cluster_id'] == cluster_id].ques.values.tolist()
        # if (ques in li): li.remove(ques)
        result_dict[ques] = li

    my_list = [result_dict]
    my_df = pd.DataFrame(my_list).T
    my_df = my_df.reset_index()
    my_df.columns = ['ques', 'info']
    print(my_df)
    return my_df.to_json(orient="records", force_ascii=False)
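# A minimal usage sketch for `run`, assuming the enclosing class is named
# QuestionCluster and can be constructed without arguments (both are assumptions;
# only the run(questions) signature is visible in this excerpt). It feeds a few
# questions in and prints each group keyed by its representative question.
if __name__ == '__main__':
    import json

    questions = ['怎么开发票', '如何开具发票', '怎么修改绑定手机号']
    clusterer = QuestionCluster()          # hypothetical constructor
    payload = clusterer.run(questions)     # JSON string: [{"ques": ..., "info": [...]}, ...]
    for group in json.loads(payload):
        print(group['ques'], group['info'])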
def main():
    args = _get_parser()

    # preliminary work
    check_file(args.infile)
    ensure_dir(args.output)

    if args.name_len_update:
        line_cnt = line_counter(args.infile)
        args.name_len = len(str(line_cnt)) + 1
    clean_dir(args.output, args.name_len)
    # end preliminary work

    p_bucket = defaultdict(list)
    save_idx = 0
    id_name = '{0:0' + str(args.name_len) + 'd}'

    # load stop words
    stop_words = get_stop_words(args.stop_words) if os.path.exists(
        args.stop_words) else list()

    # load tokenizer
    seg = Segmentor(args)

    print('Splitting sentence into different clusters ...')
    infile = open(args.infile, 'r', encoding="utf-8")
    for line in tqdm(infile):
        line = line.rstrip()
        is_match = False
        seg_list = list(seg.cut(line))
        if stop_words:
            seg_list = list(filter(lambda x: x not in stop_words, seg_list))
        for wd in seg_list:
            if is_match:
                break
            w_bucket = p_bucket[wd]
            for bucket in w_bucket:
                bucket_path = os.path.join(args.output, bucket)
                check_file(bucket_path)
                selected = sample_file(bucket_path, args.sample_number)
                selected = list(map(lambda x: list(seg.cut(x)), selected))
                # remove stop words
                if stop_words:
                    filt_selected = list()
                    for sen in selected:
                        sen = list(filter(lambda x: x not in stop_words, sen))
                        filt_selected.append(sen)
                    selected = filt_selected
                # calculate similarity with each bucket
                if all(jaccard(seg_list, cmp_list) > args.threshold
                       for cmp_list in selected):
                    is_match = True
                    with open(bucket_path, 'a', encoding='utf-8') as outfile:
                        outfile.write(line + '\n')
                    for w in seg_list:
                        if bucket not in p_bucket[w]:
                            p_bucket[w].append(bucket)
                    break
        if not is_match:
            bucket_name = ('tmp' + id_name).format(save_idx)
            bucket_path = os.path.join(args.output, bucket_name)
            with open(bucket_path, 'a', encoding='utf-8') as outfile:
                outfile.write(line + '\n')
            for w in seg_list:
                p_bucket[w].append(bucket_name)
            save_idx += 1
    infile.close()

    # sort and rename file
    file_list = os.listdir(args.output)
    file_list = list(filter(lambda x: x.startswith('tmp'), file_list))
    cnt = dict()
    for file in file_list:
        file_path = os.path.join(args.output, file)
        cnt[file] = line_counter(file_path)

    sorted_cnt = sorted(cnt.items(), key=lambda kv: kv[1], reverse=True)
    for idx, (file_name, times) in enumerate(sorted_cnt):
        origin_path = os.path.join(args.output, file_name)
        new_path = os.path.join(args.output, id_name.format(idx))
        os.rename(origin_path, new_path)

    print('All is well')
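# `sample_file` is used above to draw a bounded number of reference sentences from a
# bucket file, but it is not defined in this excerpt. A minimal sketch, assuming it
# returns at most `number` stripped lines chosen at random; the real helper may
# sample or order the lines differently.
import random

def sample_file(path, number):
    """Hypothetical helper: randomly sample up to `number` non-empty lines from a file."""
    with open(path, 'r', encoding='utf-8') as f:
        lines = [ln.rstrip() for ln in f if ln.strip()]
    if number <= 0 or number >= len(lines):
        return lines
    return random.sample(lines, number)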