def process(self, corpus, record_name, workers=8, max_queue_size=2000):
        """
        处理函数(主要调用的就是这个函数)
        处理输入语料(corpus),最终转为tfrecord格式(record_name),生成对应的文件
        自带多进程支持,如果cpu核心数多,请加入workers和max_queue_size.
        """
        # 创建一个TFRecordWriter对象,这个对象(writer)就负责写记录到指定的文件中去了.
        # TFRecordWriter把记录写入到TFRecords文件的类.
        writer = tf.io.TFRecordWriter(record_name)

        globals()['count'] = 0

        def write_to_tfrecord(serialized_instances):
            globals()['count'] += len(serialized_instances)
            for serialized_instance in serialized_instances:
                writer.write(serialized_instance)  # write the record to the file

        def paragraph_process(texts):
            instances = self.paragraph_process(texts)  # paragraph-level processing
            serialized_instances = self.tfrecord_serialize(
                instances)  # serialize the instances into TFRecord strings
            return serialized_instances

        # multi-process / multi-thread processing
        parallel_apply(
            func=paragraph_process,
            iterable=corpus,
            workers=workers,
            max_queue_size=max_queue_size,
            callback=write_to_tfrecord,
        )
        writer.close()  # close the writer.
        print('write %s examples into %s' % (globals()['count'], record_name))
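
# Usage sketch for the process() method above. The corpus argument only
# needs to be an iterable that yields lists of raw texts. The subclass name
# MyTrainingDataset, the corpus path and the batch size below are
# hypothetical placeholders, not part of the original code.
def some_texts(batch_size=256):
    texts = []
    with open('corpus.txt', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                texts.append(line)
            if len(texts) == batch_size:
                yield texts
                texts = []
    if texts:
        yield texts

dataset = MyTrainingDataset()  # hypothetical subclass implementing
                               # paragraph_process() and tfrecord_serialize()
dataset.process(
    corpus=some_texts(),
    record_name='corpus.tfrecord',
    workers=8,
    max_queue_size=2000,
)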
Example No. 2
    def process(self, corpus, record_name, workers=8, max_queue_size=2000):
        """处理输入语料(corpus),最终转为tfrecord格式(record_name)
        自带多进程支持,如果cpu核心数多,请加大workers和max_queue_size。
        """
        writer = tf.io.TFRecordWriter(record_name)
        globals()['count'] = 0

        def write_to_tfrecord(results):
            globals()['count'] += len(results)
            for tf_serialized in results:
                writer.write(tf_serialized)

        def paragraph_process(texts):
            results = self.paragraph_process(texts)
            results = self.tfrecord_serialize(results)
            return results

        parallel_apply(
            func=paragraph_process,
            iterable=corpus,
            workers=workers,
            max_queue_size=max_queue_size,
            callback=write_to_tfrecord,
        )

        writer.close()
        print('write %s examples into %s' % (count, record_name))
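
# Reading the generated file back with tf.data, as a rough sanity check.
# The feature spec below is hypothetical: the real keys and dtypes depend on
# what tfrecord_serialize() put into each tf.train.Example.
import tensorflow as tf

feature_spec = {
    'token_ids': tf.io.VarLenFeature(tf.int64),  # assumed feature name
}

def parse_example(serialized):
    return tf.io.parse_single_example(serialized, feature_spec)

dataset = tf.data.TFRecordDataset('corpus.tfrecord')
for example in dataset.map(parse_example).take(1):
    print(example)  # inspect the first parsed record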
Example No. 3
def convert(data):
    """分句,并转换为抽取式摘要
    """
    D = parallel_apply(func=extract_flow,
                       iterable=tqdm(data, desc=u'converting data'),
                       workers=100,
                       max_queue_size=200)
    total_metric = sum([d[3] for d in D])
    D = [d[:3] for d in D]
    print(u'average metric of the extractive results: %s' % (total_metric / len(D)))
    return D
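
# convert() only assumes that extract_flow() returns a 4-element result whose
# last element is a numeric metric and whose first three elements form the
# converted sample. The stub below is a hypothetical illustration of that
# contract, not the actual implementation.
def extract_flow(item):
    source, target = item              # assumed input layout: (document, reference summary)
    sentences = source.split(u'。')    # naive sentence split, for illustration only
    summary = sentences[0]             # pretend the first sentence is the extractive summary
    metric = 0.0                       # e.g. a ROUGE-style score of summary vs. target
    return source, target, summary, metric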
Example No. 4
        for text in texts:
            for token in _tokenizer.tokenize(text):
                _tokens[token] = _tokens.get(token, 0) + 1
        return _tokens

    tokens = {}

    def _total_count(result):
        for k, v in result.items():
            tokens[k] = tokens.get(k, 0) + v

    # token-frequency counting with 10 worker processes
    parallel_apply(
        func=_tokenize_and_count,
        iterable=tqdm(_batch_texts(), desc=u'building vocabulary'),
        workers=10,
        max_queue_size=100,
        callback=_total_count,
        # dummy=True,  # set dummy=True when running on Windows
    )

    tokens = [(i, j) for i, j in tokens.items() if j >= min_count]
    tokens = sorted(tokens, key=lambda t: -t[1])
    tokens = [t[0] for t in tokens]
    json.dump(tokens,
              open(seq2seq_config, 'w', encoding='utf-8'),
              indent=4,
              ensure_ascii=False)

token_dict, keep_words = {}, []  # keep_words holds the ids of tokens kept from the original BERT vocabulary

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
Example No. 5
    for text in texts:
        for token in _tokenizer.tokenize(text):
            _tokens[token] = _tokens.get(token, 0) + 1
    return _tokens

tokens = {}

def _total_count(result):
    for k, v in result.items():
        tokens[k] = tokens.get(k, 0) + v

# token-frequency counting
parallel_apply(
    func=_tokenize_and_count,
    iterable=tqdm(_batch_texts(), desc=u'building vocabulary'),
    workers=10,
    max_queue_size=500,
    callback=_total_count,
)

tokens = [(i, j) for i, j in tokens.items() if j >= min_count]
tokens = sorted(tokens, key=lambda t: -t[1])
tokens = [t[0] for t in tokens]

token_dict, keep_words = {}, []  # keep_words holds the ids of tokens kept from the original BERT vocabulary

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])

for t in tokens:
Example No. 6
File: shuffle.py Project: wshzd/NLP
                    batch = []
                    k += 1
    if batch:
        yield batch, k


def local_shuf(batch_k):
    batch, k = batch_k
    np.random.shuffle(batch)
    with open('corpus_local_shuf/%05d.json' % k, 'w') as f:
        for text in batch:
            f.write(text)
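
# Hypothetical reconstruction of generator(), whose tail is shown truncated at
# the top of this example. local_shuf() only relies on it yielding (batch, k)
# pairs, i.e. a list of raw lines plus a running chunk index. The file name
# and chunk size below are placeholders.
def generator(chunk_size=100000):
    batch, k = [], 0
    with open('corpus.json') as f:
        for line in f:
            batch.append(line)
            if len(batch) == chunk_size:
                yield batch, k
                batch = []
                k += 1
    if batch:
        yield batch, k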


parallel_apply(
    func=local_shuf, iterable=generator(), workers=5, max_queue_size=10
)

#
# =========== global shuffle ===========
#

jsons = glob.glob('corpus_local_shuf/*.json')
opens = [open(j) for j in jsons]

n, k = 0, 0
F = open('corpus_shuf/%05d.json' % k, 'w')
for _ in tqdm(range(batch_size), ncols=0, desc='Global Shuffling'):
    orders = np.random.permutation(len(opens))
    for i in orders:
        text = opens[i].readline()
Example No. 7
    if texts:
        yield texts


def count(texts):
    tokens = {}
    for text in texts:
        for t in sp_model.encode_as_pieces(text):
            tokens[t] = tokens.get(t, 0) + 1
    return tokens


def callback(tokens):
    for k, v in tokens.items():
        global_tokens[k] = global_tokens.get(k, 0) + v


parallel_apply(
    func=count,
    iterable=tqdm(corpus()),
    workers=20,
    max_queue_size=1000,
    callback=callback,
)

import pandas as pd

dic = pd.Series(global_tokens).sort_values(ascending=False)
dic.to_csv('result.csv', header=None, encoding='utf-8', sep='\t')
json.dump(global_tokens, open('result.json', 'w'))
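
# Hypothetical setup assumed by the snippet above; the model path, corpus file
# and batch size are placeholders, not part of the original code.
# parallel_apply is assumed to be bert4keras.snippets.parallel_apply.
import json
import sentencepiece as spm
from tqdm import tqdm
from bert4keras.snippets import parallel_apply

sp_model = spm.SentencePieceProcessor()
sp_model.load('spiece.model')   # placeholder model path

global_tokens = {}              # shared dict filled by callback()

def corpus(batch_size=1000):
    """Yield batches of raw lines from a placeholder corpus file."""
    texts = []
    with open('corpus.txt', encoding='utf-8') as f:
        for line in f:
            texts.append(line.strip())
            if len(texts) == batch_size:
                yield texts
                texts = []
    if texts:
        yield texts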