Example #1
def common_crawl_unsupervised(path, k=None, one_sent=False):
    """
    Prepares file for unsupervised learning based on the Common Crawl WARC file.
    :param path: path to the WARC file
    :param k: keep only the first <k> training samples
    :param one_sent: if True, split each entry so the output has one sentence per line
    """
    data = read_warc(path, clean_html=True)
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    tmp = os.path.split(path)
    opath = os.path.join(tmp[0], os.path.splitext(tmp[1])[0] + '.txt')

    samples = 0
    out = []
    for el in tqdm(data, mininterval=1.0):
        if k is not None and samples >= k:
            break
        if one_sent:
            content = []
            for entry in el[1]:
                content += tokenizer.tokenize(entry)
        else:
            content = el[1]
        content = tokenize_sentences(content)
        samples += len(content)
        out += content
    for i in range(len(out)):
        out[i] += '\n'

    with open(opath, 'w+') as f:
        f.writelines(out)
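
A minimal usage sketch (the WARC path below is a hypothetical placeholder); the prepared text file is written next to the input WARC with a .txt extension:

# keep roughly the first 100k output lines, one sentence per line
common_crawl_unsupervised('data/commoncrawl/crawl-sample.warc', k=100000, one_sent=True)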
Example #2
def alquist_unsupervised(path, preprocess=False):
    """
    Prepares file for unsupervised learning based on the Alquist csv file.
    :param path: path to the Alquist train file
    :param preprocess: lowercase and tokenize the sentences with Tweet tokenizer
    """
    data = load_alquist(path)
    sentences = data['X']

    tmp = os.path.split(path)
    if preprocess:
        sentences = tokenize_sentences(sentences)
        opath = os.path.join(tmp[0], 'unsupervised_training', os.path.splitext(tmp[1])[0] + '-prep.txt')
    else:
        opath = os.path.join(tmp[0], 'unsupervised_training', os.path.splitext(tmp[1])[0] + '.txt')

    endings = '.!?'
    for i, s in enumerate(sentences):
        s = ' '.join(s.split())
        if s[-1] not in endings:
            end = ' .' if preprocess else '.'
            s = s + end
        sentences[i] = s + '\n'

    with open(opath, 'w+') as f:
        f.writelines(sentences)
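
A usage sketch with a hypothetical input path; the output is written into an 'unsupervised_training' subfolder next to the input file, which is assumed to exist:

# lowercased, Tweet-tokenized output goes to .../unsupervised_training/train-prep.txt
alquist_unsupervised('data/alquist/train.csv', preprocess=True)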
Example #3
def classify_sentences(self, sentences):
    """
    Classify sentences into the classes the FastText model was trained on.
    :param sentences: list of sentences to classify
    :return: list of predicted class labels
    """
    labels = self.model.predict(tokenize_sentences(sentences))[0]
    return [w[0].replace('__label__', '') for w in labels]
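
A hedged usage sketch; the wrapper object clf and how it is constructed are assumptions, only the classify_sentences call itself comes from the example above:

# clf wraps a trained FastText classifier (construction not shown in this example)
predicted = clf.classify_sentences(['play some jazz', 'what is the weather today'])
print(predicted)  # class names with the '__label__' prefix stripped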
Example #4
def transform_sentences(self, sentences):
    """
    Transform a list of sentences into vector representation.
    :param sentences: list of sentences
    :return: list of numpy vectors
    """
    return [
        self.model.embed_sentence(s) for s in tokenize_sentences(sentences)
    ]
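
A usage sketch along the same lines; the embedder object is an assumption, the call mirrors the method above:

# embedder wraps a sentence-embedding model exposing embed_sentence()
vectors = embedder.transform_sentences(['hello there', 'good morning'])
print(len(vectors))  # one vector per input sentence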
Example #5
def sts_starspace(path, mode='train'):
    """
    Prepares STS Benchmark files in a format needed by StarSpace trainMode 3.
    :param path: path to the folder where STS files are located
    :param mode: train / dev / test
    """

    assert mode in ['train', 'dev', 'test']
    data = load_sts(os.path.join(path, f'sts-{mode}.csv'), lower=True)
    x1 = tokenize_sentences(data['X1'])
    x2 = tokenize_sentences(data['X2'])

    out = []
    for s1, s2, y in zip(x1, x2, data['y']):
        if mode not in ['train', 'dev'] or y > 4:
            out.append(convert_numbers(s1) + '\t' + convert_numbers(s2) + '\n')

    with open(os.path.join(path, f'starspace/sts-{mode}.txt'), 'w+') as f:
        f.writelines(out)
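
A usage sketch assuming the standard STS Benchmark layout (sts-train.csv, sts-dev.csv, sts-test.csv) under a hypothetical folder; a 'starspace' subfolder is expected to exist for the output:

# writes tab-separated sentence pairs with gold similarity > 4 to starspace/sts-train.txt
sts_starspace('data/stsbenchmark', mode='train')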
Example #6
def alquist_starspace(path):
    """
    Prepares Alquist files in a format needed by StarSpace.
    :param path: path to Alquist train file
    """
    data = load_alquist(path)
    sentences = data['X']
    intents = data['y']
    sentences = tokenize_sentences(sentences)

    out = []
    for s, i in zip(sentences, intents):
        out.append(s + '\t' + '__label__' + i + '\n')

    tmp = os.path.split(path)
    opath = os.path.join(tmp[0], 'StarSpace_preprocessed', os.path.splitext(tmp[1])[0] + '.txt')
    with open(opath, 'w+') as f:
        f.writelines(out)
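
A usage sketch with a hypothetical path; the output lands in a 'StarSpace_preprocessed' subfolder next to the input file:

# each output line: "<tokenized sentence>\t__label__<intent>"
alquist_starspace('data/alquist/train.csv')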
Example #7
def sts_unsupervised(path, preprocess=False):
    """
    Prepares file for unsupervised learning based on the STS csv file.
    :param path: path to the STS train file
    :param preprocess: lowercase and tokenize the sentences with Tweet tokenizer
    """
    data = load_sts(path)
    sentences = data['X1'] + data['X2']

    tmp = os.path.split(path)
    if preprocess:
        sentences = tokenize_sentences(sentences)
        opath = os.path.join(tmp[0], 'unsupervised_training', os.path.splitext(tmp[1])[0] + '-prep.txt')
    else:
        opath = os.path.join(tmp[0], 'unsupervised_training', os.path.splitext(tmp[1])[0] + '.txt')

    for i in range(len(sentences)):
        sentences[i] += '\n'

    with open(opath, 'w+') as f:
        f.writelines(sentences)
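
A usage sketch with a hypothetical STS path, analogous to the Alquist variant above:

# without preprocess the sentences are written as-is, one per line
sts_unsupervised('data/stsbenchmark/sts-train.csv', preprocess=False)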
Example #8
    # annotate each bar with its integer height
    for rects in [rects1, rects2]:
        for rect in rects:
            height = rect.get_height()
            ax.text(rect.get_x() + rect.get_width()/2 - 0.1, height, '%d' % int(height),
                    ha='left', va='bottom', rotation=70)


DATA_PATH = DATA_DIR + 'alquist/dm-data-snapshot-uniq.csv'
STOPWORDS = stopwords.words('english')
PUNCTUATION = ['.', '?', '!']
# BLACKLIST = ['no', 'yes', 'yeah', 'okay', 'sure', 'right']
BLACKLIST = []

if __name__ == '__main__':
    sents, intents = load_file_raw(DATA_PATH)
    sents = tokenize_sentences(sents)
    int_uniq = sorted(set(intents))

    print('\n===SENTENCES (full / unique)===')
    cnt = Counter(intents)
    cnt_uniq = count_unique_sentences(sents, intents)
    print('Total:', len(sents), '/', cnt_uniq['total'])
    c1, c2 = [], []
    for intent in int_uniq:
        c1.append(cnt[intent])
        c2.append(cnt_uniq[intent])
        print(intent+':', c1[-1], '/', c2[-1])
    plot_bar(c1, c2, int_uniq, f"Sentence counts (total: {len(sents)} / {cnt_uniq['total']})", ('full', 'unique'))

    print('\n===VOCABULARY (full / without stopwords)===')
    voc = build_vocabulary(sents, remove=PUNCTUATION)
Example #9
def compress(emb_path,
             emb_dim=300,
             prune_freq=None,
             prune_norm=None,
             trn_path=None,
             reduce_dim=None,
             quantize=False,
             normalize=False,
             distinct=False,
             d_sv=5,
             d_cb=256,
             qnt_trn=10000,
             out_name='compressed',
             pickle_output=False,
             precision=5):
    """
    Main model compression function.
    :param emb_path: path to the embedding model
    :param emb_dim: input embedding dimension
    :param prune_freq: number of words to keep after pruning by vector frequency
    :param prune_norm: number of words to keep after pruning by vector norm
    :param trn_path: path to a training file - keep words present in this file
    :param reduce_dim: embedding dimension after dimensionality reduction
    :param quantize: use vector quantization
    :param normalize: normalize the vectors to unit length before quantization
    :param distinct: create a distinct codebook for each sub-vector position
    :param d_sv: size of sub-vectors the embeddings are split into
    :param d_cb: codebook size
    :param qnt_trn: maximum number of randomly picked vectors for computing the codebook
    :param out_name: name of the output model (without extension)
    :param pickle_output: also create a pickled version of the quantized model
    :param precision: maximum number of decimals used in the output model
    """

    if not quantize:
        normalize, distinct = False, False
    if reduce_dim is not None and reduce_dim >= emb_dim:
        reduce_dim = None

    out = f'{out_name}.txt'
    out_cb = f'{out_name}.cb.txt'

    trn_words = None
    if trn_path:
        trn_words = []
        with open(trn_path) as f:
            for line in f:
                trn_words += line.strip().split()
        trn_words = set(trn_words)

    print('Loading data (+ pruning vocabulary by frequency)...')
    if emb_path.endswith('.bin'):
        vocab, vecs, sizes = load_model_ft_bin(emb_path,
                                               k=prune_freq,
                                               normalize=normalize,
                                               keep=trn_words)
    else:
        vocab, vecs, sizes = load_model_txt(emb_path,
                                            k=prune_freq,
                                            normalize=normalize,
                                            dim=emb_dim,
                                            header=True,
                                            keep=trn_words)

    if prune_norm:
        # TODO: Possibility to prune by any training set, not just STS.
        print('Pruning vocabulary by norm...')
        sts = load_sts(DATA_DIR + 'stsbenchmark/sts-train.csv')
        sts = tokenize_sentences(sts['X1'] + sts['X2'], to_lower=True)
        vocab, vecs, sizes = prune_by_norm(vocab,
                                           vecs,
                                           sizes,
                                           trn=sts,
                                           keep=prune_norm)
        # vocab, vecs, sizes = prune_by_trn(vocab, vecs, sizes, trn=sts)
        print('- pruned vocabulary size:', len(vocab))

    if reduce_dim:
        print('Reducing dimension...')
        emb_dim = reduce_dim
        # pca = PCA(n_components=reduce_dim, copy=False)
        # vecs = pca.fit_transform(vecs)
        vecs = vecs[:, :reduce_dim]

    if quantize:
        print('Computing codebook...')
        cb_out = []
        lbg_data = split_vecs(vecs, n=d_sv, limit=qnt_trn, distinct=distinct)
        if distinct:
            cb = dict()
            for pos in lbg_data:
                print('--- position:', pos, '---')
                cb[pos] = generate_codebook(lbg_data[pos], cb_size=d_cb)[0]
            for pos in cb:
                codebook_to_strings(cb[pos].round(precision), cb_out)
        else:
            cb = generate_codebook(lbg_data, cb_size=d_cb)[0]
            codebook_to_strings(cb.round(precision), cb_out)

        print('Writing codebook...')
        with open(out_cb, 'w', encoding='utf-8') as file:
            header = f'{d_cb} {d_sv}\n'
            file.write(header)
            file.writelines(cb_out)

        print('Quantizing vectors...')
        convert_func = convert_vec_distinct if distinct else convert_vec
        vecs = np.asarray([convert_func(vec, d_sv, cb) for vec in vecs])

    print('Preparing compressed model...')
    emb_out = []
    if not quantize:
        vecs = vecs.round(precision)
    for idx, word in enumerate(vocab):
        s = word
        for num in vecs[idx]:
            s += f' {num}'
        if normalize:
            s += f' {round(sizes[idx], precision)}'
        emb_out.append(s + '\n')

    print('Writing compressed model...')
    dim = int(emb_dim / d_sv) if quantize else emb_dim
    with open(out, 'w', encoding='utf-8') as file:
        header = f'{len(emb_out)} {dim}'
        if normalize:
            header += ' NORM'
        if distinct:
            header += ' DIST'
        header += '\n'
        file.write(header)
        file.writelines(emb_out)

    if pickle_output and quantize:
        print('Pickling...')
        pickle_compressed_model(out, out_cb, f'{out_name}.pickle')
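
A usage sketch with hypothetical paths, combining frequency pruning, dimensionality reduction and quantization; with these arguments each vector is stored as indices into a 256-entry codebook over 5-dimensional sub-vectors, with the original norms appended:

compress('embeddings/cc.en.300.vec',
         emb_dim=300,
         prune_freq=100000,        # keep the 100k most frequent words
         reduce_dim=150,           # truncate vectors to 150 dimensions
         quantize=True,
         normalize=True,           # store unit vectors plus their norms
         d_sv=5,
         d_cb=256,
         out_name='cc.en.compressed',
         pickle_output=True)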
Example #10
    else:
        trn_words = None

    print('Loading data (+ pruning vocabulary by frequency)...')
    if params.emb_path.endswith('.bin'):
        vocab, vecs, sizes = load_model_ft_bin(params.emb_path, k=params.prune_freq, normalize=params.normalize,
                                               keep=trn_words)
    else:
        vocab, vecs, sizes = load_model_txt(params.emb_path, k=params.prune_freq, normalize=params.normalize,
                                            dim=params.emb_dim, header=True, keep=trn_words)

    if params.prune_norm:
        # TODO: Possibility to prune by any training set, not just STS.
        print('Pruning vocabulary by norm...')
        sts = load_sts('data/stsbenchmark/sts-train.csv')
        sts = tokenize_sentences(sts['X1'] + sts['X2'], to_lower=True)
        vocab, vecs, sizes = prune_by_norm(vocab, vecs, sizes, trn=sts, keep=params.prune_norm)
        # vocab, vecs, sizes = prune_by_trn(vocab, vecs, sizes, trn=sts)
        print('- pruned vocabulary size:', len(vocab))

    if params.reduce_dim:
        print('Reducing dimension...')
        params.emb_dim = params.dim
        # pca = PCA(n_components=params.dim, copy=False)
        # vecs = pca.fit_transform(vecs)
        vecs = vecs[:, :params.dim]

    if params.quantize:
        # TODO: Quantize also the vector sizes after normalization?
        print('Computing codebook...')
        cb_out = []