Example #1
def makeDataUnderMaxTokenLen():
    # tokenizer
    sentencepieceTokenizer = sentencePieceTokenizer()

    # Files for reading and writing
    file = open('../data/bm_novel_1/prerpcessed_bm_novel_utf8_3.txt',
                'r',
                encoding='utf-8')
    untokenized_file = open('../data/bm_novel_1/untokenized_bm_data.txt',
                            'w',
                            encoding='utf-8')
    tokenized_file = open('../data/bm_novel_1/tokenized_bm_data.txt',
                          'w',
                          encoding='utf-8')

    # Data buffers that will be used for training
    untokenized = ""
    tokenized = ""
    data_length = 0

    # Preprocess data
    while True:
        line = file.readline()

        if not line:
            untokenized_file.write(untokenized)
            tokenized_file.write(tokenized)
            break

        tokenized_line = sentencepieceTokenizer(line)

        # Each written chunk has to stay under 1022 tokens:
        # the model input can take 1024 tokens,
        # but we need room for the BOS and EOS tokens
        if data_length + len(
                tokenized_line) + 2 >= 1022:  # +2 to account for the BOS and EOS tokens
            untokenized_file.write(untokenized + '\n')
            tokenized_file.write(tokenized + '\n')

            untokenized = ""
            tokenized = ""
            data_length = 0

        untokenized = untokenized + "<s>" + line[:-1] + "</s>"
        tokenized = tokenized + "<s>" + toString(tokenized_line) + "</s>"

        data_length = data_length + len(
            tokenized_line) + 2  # +2 to account for the BOS and EOS tokens

    file.close()
    untokenized_file.close()
    tokenized_file.close()
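All of these examples rely on two helpers that are not shown: sentencePieceTokenizer(), which returns a callable mapping raw text to subword pieces, and toString(), which turns a piece list back into a string. A minimal sketch of what they might look like, assuming the sentencepiece library and a placeholder model path (the path is not taken from the examples):

import sentencepiece as spm

def sentencePieceTokenizer():
    # Load a trained SentencePiece model; the path below is a placeholder,
    # not one used in the examples above.
    sp = spm.SentencePieceProcessor()
    sp.Load('../data/sentencepiece/tokenizer.model')
    # Return a callable that maps raw text to a list of subword pieces
    return sp.encode_as_pieces

def toString(tokenized_line):
    # Join subword pieces back into a single space-separated string
    return ' '.join(tokenized_line)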
Example #2
def checkDataLineLen():
  # tokenizer
  sentencepieceTokenizer = sentencePieceTokenizer()

  # File for reading
  untokenized_file = open('../data/backmyo_novel_1/untokenized_bm_data.txt', 'r', encoding='utf-8')

  # Check each line's token length
  while True:
    line = untokenized_file.readline()
    if not line:
      break
    tokenized_line = sentencepieceTokenizer(line)
    if len(tokenized_line) >= 1024:
      print('exceeds 1024')

  untokenized_file.close()
Example #3
def checkLineTokenLen(path):
    # tokenizer
    sentencepieceTokenizer = sentencePieceTokenizer()

    # File for reading
    untokenized_file = open(path, 'r', encoding='utf-8')

    # Check each line's token length
    while True:
        line = untokenized_file.readline()
        if not line:
            break
        tokenized_line = sentencepieceTokenizer(line)
        # print('line: ', line)
        if len(tokenized_line) > 1000:
            print('over 1000 tokens: ', len(tokenized_line))
            print('over 1000 tokens: ', tokenized_line)

    untokenized_file.close()
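For reference, this check could be pointed at the file produced in Example #1 (the path is the one used there; whether it exists depends on the project layout):

checkLineTokenLen('../data/bm_novel_1/untokenized_bm_data.txt')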
Example #4
def checkDataLineLen():
  # tokenizer
  sentencepieceTokenizer = sentencePieceTokenizer()

  # File for reading
  untokenized_file = open('../data/backmyo_novel_1/untokenized_bm_data.txt', 'r', encoding='utf-8')

  # Check each line's token length
  while True:
    line = untokenized_file.readline()
    if not line:
      break
    tokenized_line = sentencepieceTokenizer(line)
    if len(tokenized_line) >= 1024:
      print('exceeds 1024')

  untokenized_file.close()

if __name__ == "__main__":
    # execute only if run as a script
    # makeDataUnderMaxTokenLen()
    print('1')

    sentencepieceTokenizer = sentencePieceTokenizer()
    print('2')

    # File for reading
    untokenized_file = open('../data/backmyo_novel_1/untokenized_bm_data.txt', 'r', encoding='utf-8')

    # Check each line's token length
    while True:
      line = untokenized_file.readline()
      if not line:
        break
      tokenized_line = sentencepieceTokenizer(line)
      if len(tokenized_line) >= 1024:
        print('exceeds 1024')

    untokenized_file.close()
Example #5
def makeDataUnderMaxTokenLen():
    # tokenizer
    sentencepieceTokenizer = sentencePieceTokenizer()

    # Files for read and write
    file = open(
        '/Users/a60058238/Desktop/dev/workspace/nlp/Data/fairy_tale_utf-8.txt',
        'r',
        encoding='utf-8')
    untokenized_file = open(
        '/Users/a60058238/Desktop/dev/workspace/nlp/Data/train_fairy_tale_data_utf8.txt',
        'w',
        encoding='utf-8')

    # Data buffers that will be used for training
    untokenized = ""
    tokenized = ""
    data_length = 0
    print("tmp_line: ", sentencepieceTokenizer('\n'))

    # Preprocess data
    while True:
        line = file.readline()
        if "#####" in line:
            untokenized_file.write(untokenized + '\n')
            untokenized = ""
            data_length = 0
            continue

        if not line:
            untokenized_file.write(untokenized)
            break

        tmp_line = line[:-1]
        tokenized_line = sentencepieceTokenizer(tmp_line)
        tokenized_line_len = len(tokenized_line)
        # print("tmp_line: ",tmp_line)
        # print("tokenized_line_len: ",tokenized_line_len)

        # Each written chunk has to stay under the model's 1024-token input limit,
        # and we also need room for the BOS and EOS tokens,
        # so the cutoff below is set at 1000 tokens
        pre_data_len = data_length
        data_length = data_length + tokenized_line_len

        if data_length >= 1000:
            if pre_data_len != len(sentencepieceTokenizer(untokenized)):
                print('pre_data_len: ', pre_data_len)
                print('len(sentencepieceTokenizer(untokenized)): ',
                      len(sentencepieceTokenizer(untokenized)))
            untokenized_file.write(untokenized + '\n')
            untokenized = ""
            data_length = tokenized_line_len
        if untokenized == "":
            untokenized = tmp_line
        else:
            untokenized = untokenized + " " + tmp_line

        # data_length =  # +2 to account for the BOS and EOS tokens
    file.close()
    untokenized_file.close()
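Each line written by makeDataUnderMaxTokenLen() is intended to become one training sample of at most 1024 tokens once the BOS and EOS markers are added, per the comments in Example #1. A minimal sketch of that downstream check, reusing the assumed sentencePieceTokenizer() helper from above (the verifyTrainingSamples name and its path argument are illustrative, not from the original code):

def verifyTrainingSamples(path):
    # tokenizer
    sentencepieceTokenizer = sentencePieceTokenizer()

    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            chunk = line.rstrip('\n')
            # chunk tokens + BOS + EOS must fit the 1024-token input
            token_count = len(sentencepieceTokenizer(chunk)) + 2
            if token_count > 1024:
                print('sample too long: ', token_count)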