Example #1
import gzip
import os
import pickle

# Helpers such as arg (the argument manager), check_json, retrieve_text and
# the VocabInfo classes are assumed to come from the surrounding package.
def generate_vocabulary(corpus, methods, vocab_filename):
    if type(corpus) is not str:
        raise TypeError('Invalid non-string type for parameter \'corpus\'')
    if corpus == '':
        raise AttributeError('Invalid empty string for parameter \'corpus\'')
    if type(methods) is not list:
        raise TypeError('Invalid non-list type for parameter \'methods\'')
    if type(vocab_filename) is not str:
        raise TypeError(
            'Invalid non-string type for parameter \'vocab_filename\'')
    if vocab_filename == '':
        raise AttributeError(
            'Invalid empty string for parameter \'vocab_filename\'')

    corpus_filename = arg.retrieve_corpus_file(corpus, methods)

    if not os.path.isfile(corpus_filename):
        raise ValueError(
            f'{corpus_filename} does not exist, must generate before building vocabulary'
        )

    is_json = check_json(corpus_filename)
    text_tag = arg.corpus_text_dict[corpus]

    if perform_hashing(methods):
        v = HashVocabInfo(hashing_value(methods))
    # A BPE-backed vocabulary appears planned but is disabled here:
    # elif perform_bpe(methods):
    #     bpe_set = create_bpe_set(corpus, methods, is_json, text_tag,
    #                              bpe_value(methods))
    #     v = BPEVocabInfo(bpe_set)
    else:
        v = VocabInfo()

    # Stream the compressed corpus once, counting frequencies per document.
    with gzip.open(corpus_filename, 'rb') as f:
        for line in f:
            words = retrieve_text(line.decode('utf-8'), is_json,
                                  tag=text_tag).split()
            # Document frequency counts each word at most once per document.
            for word in set(words):
                v.increment_doc_frequency(word)

            for word in words:
                v.increment_term_frequency(word)

    with open(vocab_filename, 'wb') as f:
        pickle.dump(v, f, protocol=4)

    return v
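
A minimal usage sketch of Example #1; the corpus name, treatment list, and output filename below are illustrative placeholders, not values from the source:

import pickle

# generate_vocabulary pickles the VocabInfo to disk and also returns it.
v = generate_vocabulary('newsgroups', ['lowercase'], 'newsgroups_vocab.pkl')
with open('newsgroups_vocab.pkl', 'rb') as f:
    assert type(pickle.load(f)) is type(v)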
Example #2
def preprocess_corpus(corpus, treatments, offset=None, run_number=None):
    if corpus is None:
        raise AttributeError('Invalid corpus NoneType')
    if corpus == '':
        raise AttributeError('Invalid empty corpus name')
    if offset is not None and offset <= 0:
        raise AttributeError('Invalid offset value; must be positive')
    if run_number is not None and run_number < 0:
        raise AttributeError('Invalid run_number value; must be nonnegative')

    import gzip
    import os

    from preprocess import argmanager

    corpus_file = argmanager.retrieve_corpus_file(corpus, treatments)
    if os.path.isfile(corpus_file):
        return gzip.open(corpus_file, 'rb')

    if offset is None or run_number is None:
        raise AttributeError('Insufficient data to retrieve corpus')

    new_corpus_file = argmanager.retrieve_corpus_file(corpus, treatments,
                                                      run_number)
    base_corpus_file = argmanager.retrieve_corpus_file(corpus, [])
    text_dict = argmanager.corpus_text_dict

    # `methods` (the treatments module) and `retrieve_extractor` are assumed
    # to be imported elsewhere in this file.
    preprocessor = methods.create_preprocessor(treatments)
    extractor = retrieve_extractor(new_corpus_file)(preprocessor,
                                                    tag=text_dict[corpus])

    with gzip.open(new_corpus_file, 'wb') as f, \
            gzip.open(base_corpus_file, 'rb') as g:
        # This run covers lines [start_line, end_line) of the base corpus.
        start_line = run_number * offset
        end_line = start_line + offset

        for i, line in enumerate(g):
            if i < start_line:
                continue
            if i >= end_line:
                break
            line = line.decode('utf-8')
            f.write(f'{extractor(line)}\n'.encode('utf-8'))
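
A sketch of the two calling modes of preprocess_corpus; the arguments are illustrative placeholders:

# If the treated corpus file already exists, an open gzip handle is returned.
handle = preprocess_corpus('newsgroups', ['lowercase'])
if handle is not None:
    print(next(handle).decode('utf-8'))
    handle.close()

# Otherwise offset and run_number select a shard: lines
# [run_number * offset, (run_number + 1) * offset) of the base corpus are
# extracted into the per-run corpus file, and the function returns None.
preprocess_corpus('newsgroups', ['lowercase'], offset=1000, run_number=0)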
Example #3
import gzip

# `arg` is assumed to be the project's argmanager module, imported as `arg`.
def retrieve_preprocessed_corpus(corpus, methods):
    if type(corpus) is not str:
        raise TypeError('Non-string value invalid for parameter \'corpus\'')
    if not corpus:
        raise AttributeError('Parameter \'corpus\' must be non-empty')
    if type(methods) is not list:
        raise TypeError('Non-list value invalid for parameter \'methods\'')
    documents = []

    corpus_file = arg.retrieve_corpus_file(corpus, methods)
    with gzip.open(corpus_file, 'rb') as f:
        for line in f:
            documents.append(line.decode('utf-8').strip('\n'))

    return documents
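
Usage is a one-liner; the arguments are again illustrative:

docs = retrieve_preprocessed_corpus('newsgroups', ['lowercase'])
print(len(docs), docs[0][:80])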
Example #4
import os
import pickle

def retrieve_bpe_set(corpus, methods):
    bpe_set_filename = create_bpe_set_filename(corpus, methods)

    # Reuse the pickled BPE set when present; create_bpe_set (Example #5)
    # writes this file, so later calls hit the cache.
    if os.path.isfile(bpe_set_filename):
        with open(bpe_set_filename, 'rb') as f:
            bpe_set = pickle.load(f)
    else:
        corpus_filename = arg.retrieve_corpus_file(corpus, methods)
        is_json = check_json(corpus_filename)
        text_tag = arg.corpus_text_dict[corpus]

        bpe_set = create_bpe_set(corpus, methods, is_json, text_tag,
                                 bpe_value(methods))

    return bpe_set
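
Since create_bpe_set persists its result (see Example #5), only the first call pays the merge cost; a sketch with illustrative arguments:

bpe_set = retrieve_bpe_set('newsgroups', ['bpe-1000'])  # built and pickled
bpe_set = retrieve_bpe_set('newsgroups', ['bpe-1000'])  # loaded from the pickle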
Example #5
import gzip
import pickle
import re
from collections import Counter, defaultdict

def create_bpe_set(corpus,
                   methods,
                   is_json,
                   tag,
                   vocab_size,
                   reduce_vocab=True):
    if type(corpus) is not str:
        raise TypeError('Invalid non-string type for parameter \'corpus\'')
    if corpus == '':
        raise AttributeError('Invalid empty string for parameter \'corpus\'')
    if type(methods) is not list:
        raise TypeError('Invalid non-list type for parameter \'methods\'')
    if type(is_json) is not bool:
        raise TypeError('Invalid non-bool value for parameter is_json')
    if type(tag) is not str:
        raise TypeError('Invalid non-string value for parameter tag')
    if type(vocab_size) is not int:
        raise TypeError('Invalid non-int value for parameter vocab_size')

    # Either keep the most frequent words covering 95% of tokens, or all words.
    partial_vocab = 0.95
    full_vocab = 1.0

    # Map each word, spelled as space-separated characters, to its frequency.
    vocab = Counter()

    with gzip.open(arg.retrieve_corpus_file(corpus, methods), 'rb') as f:
        for line in f:
            words = retrieve_text(line.decode('utf-8'), is_json,
                                  tag=tag).split()
            for word in words:
                vocab[' '.join(word)] += 1

    percentage_break = partial_vocab if reduce_vocab else full_vocab

    reduced_vocab = dict()
    total_tokens = sum(vocab.values())
    tokens_so_far = 0

    for (term, frequency) in vocab.most_common():
        tokens_so_far += frequency
        percent = tokens_so_far / total_tokens
        reduced_vocab[term] = frequency

        if percent > percentage_break:
            break

    def get_stats(vocab):
        # Count adjacent symbol pairs, weighted by word frequency.
        pairs = defaultdict(int)

        for word, freq in vocab.items():
            symbols = word.split()

            for i in range(len(symbols) - 1):
                pairs[symbols[i], symbols[i + 1]] += freq

        return pairs

    def merge_vocab(pair, v_in):
        # Fuse every free-standing occurrence of the pair into one symbol.
        v_out = {}
        bigram = re.escape(' '.join(pair))
        p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

        for word in v_in:
            w_out = p.sub(''.join(pair), word)
            v_out[w_out] = v_in[word]
        return v_out

    # Every distinct character is already a token; the rest of the budget
    # buys merge operations.
    char_set = set(''.join(' '.join(reduced_vocab.keys()).split()))
    num_merges = vocab_size - len(char_set)

    for _ in range(num_merges):
        pairs = get_stats(reduced_vocab)
        if not pairs:  # everything already merged into single symbols
            break
        best = max(pairs, key=pairs.get)
        reduced_vocab = merge_vocab(best, reduced_vocab)

    bpe_set = set(' '.join(reduced_vocab.keys()).split())

    bpe_set_filename = create_bpe_set_filename(corpus, methods)

    with open(bpe_set_filename, 'wb') as f:
        pickle.dump(bpe_set, f, protocol=4)

    return bpe_set
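
The merge loop above is the standard byte-pair encoding procedure. A self-contained toy run, with no project dependencies, shows how characters fuse into subword units:

import re
from collections import defaultdict

def get_stats(vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i + 1]] += freq
    return pairs

def merge_vocab(pair, v_in):
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    return {p.sub(''.join(pair), w): f for w, f in v_in.items()}

# Toy frequency table: each word is spelled as space-separated characters.
toy = {'l o w': 5, 'l o w e r': 2, 'n e w e s t': 6, 'w i d e s t': 3}

for _ in range(5):
    pairs = get_stats(toy)
    best = max(pairs, key=pairs.get)
    toy = merge_vocab(best, toy)
    print(best, '->', ''.join(best))
# First merge is ('e', 's') -> 'es', which occurs 6 + 3 = 9 times.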