Example #1
def makeVocabulary(filename, size, char=False):
    vocab = dict.Dict(
        [dict.PAD_WORD, dict.UNK_WORD, dict.BOS_WORD, dict.EOS_WORD])
    if char:
        vocab.addSpecial(dict.SPA_WORD)

    with open(filename, encoding='utf-8') as f:
        for sent in f:
            for word in sent.strip().split():
                if word != 'END':
                    vocab.add(word + " ")  # why add " " here?

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))
    return vocab
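
All of these snippets import a local module named dict (shadowing the builtin) that provides the Dict vocabulary class and the special-token constants. Below is a minimal sketch of the interface they assume, modeled on OpenNMT-style vocabulary code; the names match the calls above, but the bodies and token strings are illustrative assumptions, not the original implementation.

PAD_WORD = '<blank>'
UNK_WORD = '<unk>'
BOS_WORD = '<s>'
EOS_WORD = '</s>'
SPA_WORD = '<spa>'  # extra special token used in char mode (assumed value)


class Dict(object):
    def __init__(self, data=None, lower=False):
        self.idxToLabel = {}
        self.labelToIdx = {}
        self.frequencies = {}
        self.lower = lower
        if data is not None:
            if isinstance(data, str):
                self.loadFile(data)    # a path: load an existing vocabulary
            else:
                for label in data:     # a list: register special symbols
                    self.addSpecial(label)

    def size(self):
        return len(self.idxToLabel)

    def loadFile(self, filename):
        # One "label idx" pair per line, as described in Example #2.
        with open(filename, encoding='utf-8') as f:
            for line in f:
                label, idx = line.split()
                self.add(label, int(idx))

    def add(self, label, idx=None):
        label = label.lower() if self.lower else label
        if idx is None:
            idx = self.labelToIdx.get(label, len(self.idxToLabel))
        self.idxToLabel[idx] = label
        self.labelToIdx[label] = idx
        if self.frequencies.get(idx, 0) is not None:  # specials carry freq None
            self.frequencies[idx] = self.frequencies.get(idx, 0) + 1
        return idx

    def addSpecial(self, label):
        idx = self.add(label)
        self.frequencies[idx] = None   # None marks tokens exempt from pruning

    def prune(self, size):
        # Return a new Dict keeping the specials plus the `size` most frequent labels.
        if size >= self.size():
            return self
        counts = sorted(((f, i) for i, f in self.frequencies.items()
                         if f is not None), reverse=True)
        pruned = Dict(lower=self.lower)
        for i, f in self.frequencies.items():
            if f is None:
                pruned.addSpecial(self.idxToLabel[i])
        for f, i in counts[:size]:
            pruned.add(self.idxToLabel[i])
        return pruned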
Example #2
def initVocabulary(name, dataFile, vocabFile, vocabSize, char=False):
    '''
    Inputs:
        name: str, name of the vocabulary being built.
        dataFile: str, path to the source text file.
        vocabFile: str, path to an existing vocabulary file; each line holds
            a label in the first column and an idx in the second.
        vocabSize: int, size of the vocabulary to build (number of words it holds).
        char: bool, whether to use characters as the unit.
    Return:
        A Dict object, exposing various methods plus the three dictionaries
        idxToLabel, labelToIdx and frequencies as attributes.
    '''

    vocab = None
    if vocabFile is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = dict.Dict()
        vocab.loadFile(vocabFile)  # each line: label in column 1, idx in column 2
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building ' + name + ' vocabulary...')
        vocab = makeVocabulary(dataFile, vocabSize, char=char)  # vocabSize is the dictionary size

    print()
    return vocab
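
For orientation, a hypothetical call of initVocabulary; the names, paths and sizes here are placeholders, not from the original project.

# No vocab file given: builds a 50k-word vocabulary by scanning the corpus.
src_vocab = initVocabulary('source', 'data/train.src', None, 50000)
# Vocab file given: loads it instead of scanning the corpus.
tgt_vocab = initVocabulary('target', 'data/train.tgt', 'data/tgt.vocab', 50000)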
Example #3
def makeVocabulary(filename, size, is_target, char=False):
    if is_target:
        vocab = dict.Dict([], lower=opt.lower)
    else:
        vocab = dict.Dict(
            [dict.PAD_WORD, dict.UNK_WORD, dict.BOS_WORD, dict.EOS_WORD],
            lower=opt.lower)
    if char:
        vocab.addSpecial(dict.SPA_WORD)

    lengths = []

    if not isinstance(filename, list):
        filename = [filename]

    for _filename in filename:
        with open(_filename) as f:
            for sent in f:
                for word in sent.strip().split():
                    lengths.append(len(word))
                    if char:
                        for ch in word:
                            vocab.add(ch)
                    else:
                        vocab.add(word + " ")

    print('max: %d, min: %d, avg: %.2f' %
          (max(lengths), min(lengths), sum(lengths) / len(lengths)))

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))

    return vocab
Example #4
def makeVocabulary(filename, size, char=False):
    '''
    Inputs:
        filename: str, path to the source text file.
        size: int, size of the vocabulary to build (number of words it holds).
        char: bool, whether to use characters as the unit.
    Return:
        A Dict object, exposing various methods plus the three dictionaries
        idxToLabel, labelToIdx and frequencies as attributes.
    '''

    # PAD, UNK, BOS, EOS are special symbols, mapped to 0, 1, 2, 3 respectively
    vocab = dict.Dict(
        [dict.PAD_WORD, dict.UNK_WORD, dict.BOS_WORD, dict.EOS_WORD],
        lower=opt.lower)
    if char:
        vocab.addSpecial(dict.SPA_WORD)

    lengths = []

    if not isinstance(filename, list):
        filename = [filename]

    for _filename in filename:
        with open(_filename) as f:
            for sent in f:
                for word in sent.strip().split():
                    lengths.append(len(word))
                    if char:
                        for ch in word:
                            vocab.add(ch)
                    else:
                        vocab.add(word + " ")  # why append a space here?

    # max, min and average word length
    print('max: %d, min: %d, avg: %.2f' %
          (max(lengths), min(lengths), sum(lengths) / len(lengths)))

    originalSize = vocab.size()
    vocab = vocab.prune(size)  # returns a dictionary of the `size` most frequent words
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))

    return vocab
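
The effect of the char flag comes down to how a sentence is cut into units. Examples #3 and #4 iterate the characters of each whitespace-split word (losing the spaces, which is presumably why SPA_WORD is registered as a special), while Example #7 below keeps spaces by listing the raw sentence. A quick comparison of the two tokenizations:

sent = "neural nets"
print(sent.strip().split())  # word units: ['neural', 'nets']
print(list(sent.strip()))    # char units: ['n', 'e', 'u', 'r', 'a', 'l', ' ', 'n', 'e', 't', 's']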
Example #5
def initVocabulary(name, dataFile, vocabFile, vocabSize, char=False):
    vocab = None
    if vocabFile is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = dict.Dict()
        vocab.loadFile(vocabFile)
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building ' + name + ' vocabulary...')
        vocab = makeVocabulary(dataFile, vocabSize, char=char)

    return vocab
Example #6
def makeVocabulary(filename, size):
    vocab = dict.Dict(
        [dict.PAD_WORD, dict.UNK_WORD, dict.BOS_WORD, dict.EOS_WORD],
        lower=opt.lower)

    with open(filename) as f:
        for sent in f.readlines():
            for word in sent.split():
                vocab.add(word)

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))

    return vocab
Example #7
def makeVocabulary(filename, size):
    vocab = dict.Dict(
        [dict.PAD_WORD, dict.UNK_WORD, dict.BOS_WORD, dict.EOS_WORD],
        lower=opt.lower)
    max_tokens = opt.trun_src
    max_lengths = opt.src_length
    char = opt.src_char
    if isinstance(filename, str):
        filename = [filename]

    for _filename in filename:
        if opt.src_suf in _filename:
            max_tokens = opt.trun_src
            max_lengths = opt.src_length
            char = opt.src_char
            print(_filename, ' max tokens: ', max_tokens)
            print(_filename, ' max lengths: ', max_lengths)
        elif opt.tgt_suf in _filename:
            max_tokens = opt.trun_tgt
            max_lengths = opt.tgt_length
            char = opt.tgt_char
            print(_filename, ' max tokens: ', max_tokens)
            print(_filename, ' max lengths: ', max_lengths)
        with open(_filename, encoding='utf8') as f:
            for sent in f.readlines():
                if char:
                    tokens = list(sent.strip())
                else:
                    tokens = sent.strip().split()
                if max_lengths > 0 and len(tokens) > max_lengths:
                    continue
                if max_tokens > 0:
                    tokens = tokens[:max_tokens]
                for word in tokens:
                    vocab.add(word + " ")

    originalSize = vocab.size()
    if size == 0:
        size = originalSize
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))

    return vocab
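
Examples #6 and #7 read their settings from a module-level opt object. Below is a hypothetical argparse setup that would supply every field referenced above; the option names mirror the attributes used in the code, but the defaults and help strings are assumptions.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-lower', action='store_true', help='lowercase all tokens')
parser.add_argument('-src_suf', default='.src', help='suffix identifying source files')
parser.add_argument('-tgt_suf', default='.tgt', help='suffix identifying target files')
parser.add_argument('-trun_src', type=int, default=0, help='truncate source to this many tokens (0 = off)')
parser.add_argument('-trun_tgt', type=int, default=0, help='truncate target to this many tokens (0 = off)')
parser.add_argument('-src_length', type=int, default=0, help='skip source sentences longer than this (0 = off)')
parser.add_argument('-tgt_length', type=int, default=0, help='skip target sentences longer than this (0 = off)')
parser.add_argument('-src_char', action='store_true', help='treat source text as characters')
parser.add_argument('-tgt_char', action='store_true', help='treat target text as characters')
opt = parser.parse_args()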
Example #8
def makeVocabulary(filename, size, sep=' ', char=False):

    vocab = dict.Dict([dict.PAD_WORD, dict.UNK_WORD], lower=True)
    if char:
        vocab.addSpecial(dict.SPA_WORD)

    lengths = []

    if not isinstance(filename, list):
        filename = [filename]

    for _filename in filename:
        with codecs.open(_filename, 'r', 'utf-8') as f:
            data = js.load(f)
            for sent in data:
                for word in sent.strip().split(sep):
                    lengths.append(len(word))
                    if char:
                        for ch in word.strip():
                            vocab.add(ch)
                    else:
                        vocab.add(word.strip() + " ")

    print('max: %d, min: %d, avg: %.2f' %
          (max(lengths), min(lengths), sum(lengths) / len(lengths)))

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))

    return vocab
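
Unlike the earlier examples, Example #8 parses its corpus with js.load (json imported as js), so each input file is expected to be a JSON array of sentence strings rather than plain text. A hypothetical round trip; the file name and sentences are made up:

import codecs
import json as js

# Write a toy corpus in the expected format: a JSON array of sentences.
with codecs.open('corpus.json', 'w', 'utf-8') as f:
    js.dump(["the first sentence", "another short sentence"], f)

vocab = makeVocabulary('corpus.json', 1000)  # default sep=' ' splits sentences into words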
Example #9
def main():
    text_dict = dict.Dict('./data/data/text_dict')
    authors_dict = dict.Dict('./data/data/authors_dict')
    with codecs.open('./data/data/label_dict.json', 'r', 'utf-8') as f:
        label_dict = js.load(f)

    intro = {
        'text_file': './data/data/intro_train',
        'text_dict': text_dict,
        'doc_len': 100,
        'text_len': 25
    }
    related = {
        'text_file': './data/data/related_train',
        'text_dict': text_dict,
        'doc_len': 500,
        'text_len': 25
    }
    methods = {
        'text_file': './data/data/methods_train',
        'text_dict': text_dict,
        'doc_len': 600,
        'text_len': 25
    }
    conclu = {
        'text_file': './data/data/conclusion_train',
        'text_dict': text_dict,
        'doc_len': 150,
        'text_len': 25
    }
    abstract = {
        'text_file': './data/data/abstract_train',
        'text_dict': text_dict,
        'doc_len': 10,
        'text_len': 25
    }
    title = {
        'text_file': './data/data/title_train',
        'text_dict': text_dict,
        'text_len': 20
    }
    authors = {
        'text_file': './data/data/authors_train',
        'text_dict': authors_dict,
        'text_len': 7
    }
    label = {'label_file': './data/data/label_train', 'label_dict': label_dict}
    text = [intro, related, methods, conclu, abstract]

    intro_val = {
        'text_file': './data/data/intro_val',
        'text_dict': text_dict,
        'doc_len': 100,
        'text_len': 25
    }
    related_val = {
        'text_file': './data/data/related_val',
        'text_dict': text_dict,
        'doc_len': 500,
        'text_len': 25
    }
    methods_val = {
        'text_file': './data/data/methods_val',
        'text_dict': text_dict,
        'doc_len': 600,
        'text_len': 25
    }
    conclu_val = {
        'text_file': './data/data/conclusion_val',
        'text_dict': text_dict,
        'doc_len': 150,
        'text_len': 25
    }
    abstract_val = {
        'text_file': './data/data/abstract_val',
        'text_dict': text_dict,
        'doc_len': 10,
        'text_len': 25
    }
    title_val = {
        'text_file': './data/data/title_val',
        'text_dict': text_dict,
        'text_len': 20
    }
    authors_val = {
        'text_file': './data/data/authors_val',
        'text_dict': authors_dict,
        'text_len': 7
    }
    label_val = {
        'label_file': './data/data/label_val',
        'label_dict': label_dict
    }
    text_val = [intro_val, related_val, methods_val, conclu_val, abstract_val]

    intro_test = {
        'text_file': './data/data/intro_test',
        'text_dict': text_dict,
        'doc_len': 100,
        'text_len': 25
    }
    related_test = {
        'text_file': './data/data/related_test',
        'text_dict': text_dict,
        'doc_len': 500,
        'text_len': 25
    }
    methods_test = {
        'text_file': './data/data/methods_test',
        'text_dict': text_dict,
        'doc_len': 600,
        'text_len': 25
    }
    conclu_test = {
        'text_file': './data/data/conclusion_test',
        'text_dict': text_dict,
        'doc_len': 150,
        'text_len': 25
    }
    abstract_test = {
        'text_file': './data/data/abstract_test',
        'text_dict': text_dict,
        'doc_len': 10,
        'text_len': 25
    }
    title_test = {
        'text_file': './data/data/title_test',
        'text_dict': text_dict,
        'text_len': 20
    }
    authors_test = {
        'text_file': './data/data/authors_test',
        'text_dict': authors_dict,
        'text_len': 7
    }
    label_test = {
        'label_file': './data/data/label_test',
        'label_dict': label_dict
    }
    text_test = [
        intro_test, related_test, methods_test, conclu_test, abstract_test
    ]

    train = make_data(text, title, authors, label)
    val = make_data(text_val, title_val, authors_val, label_val)
    test = make_data(text_test, title_test, authors_test, label_test)
    save_data = {'train': train, 'val': val, 'test': test}
    torch.save(save_data, './data/data/save_data')
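
The bundle written by torch.save at the end of main() can be restored later in training code with torch.load:

import torch

save_data = torch.load('./data/data/save_data')
train, val, test = save_data['train'], save_data['val'], save_data['test']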