Example #1
0
def add_label_to_dict(src_path, dl_pair_path, dictionary, r_dictionary):
    """Extend the word dictionaries with every label token and emit one
    pickled [word_onehot, label_onehot] training pair per label.

    Returns the updated (dictionary, r_dictionary) pair.
    """
    label_path = '/home/ubuntu/workspace/text_summary_data/task_w2v/data_label_pairs/toytmp_label_dict.pickle'

    # Load the cached label dictionary, building and caching it on a miss.
    if not os.path.isfile(label_path):
        label_dict_ori = create_label_dict(src_path)
        statics.savetopickle(label_path, label_dict_ori)
    else:
        label_dict_ori = statics.loadfrompickle(label_path)

    # Element 3 of the tuple is the combined label -> index mapping.
    label_dict = label_dict_ori[3]

    # First pass: append each label to the forward and reverse dictionaries.
    for token in label_dict:
        new_idx = len(dictionary)
        dictionary[token] = new_idx
        r_dictionary[new_idx] = token

    # Second pass: write one (word one-hot, label one-hot) pair per label.
    for token in label_dict:
        word_vec = np.zeros(len(dictionary))
        word_vec[dictionary[token]] = 1

        label_vec = np.zeros(len(label_dict))
        label_vec[label_dict[token]] = 1

        # New pickles are named after the current file count + 1.
        next_id = len(os.listdir(dl_pair_path)) + 1
        statics.savetopickle(
            os.path.join(dl_pair_path, str(next_id) + '.pickle'),
            [word_vec, label_vec])

    return dictionary, r_dictionary
def pad_words(wdict, rawdata_path):
    """One-hot encode each word of a pickled word list.

    Returns a list of int64 vectors of length len(wdict), one per word,
    with a 1 at the word's dictionary index.
    """
    encoded = []
    for word in statics.loadfrompickle(rawdata_path):
        onehot = np.zeros(len(wdict), np.int64)
        onehot[wdict[word]] = 1
        encoded.append(onehot)
    return encoded
Example #3
0
def create_label_dict(path):
    """Collect the model / OS / category label values of every case folder.

    Scans each sub-folder of `path` for a 'label.pickle' file and gathers
    the three label fields (with internal spaces stripped).

    Returns (unique_models, unique_OS, unique_categories, labels_dict),
    where labels_dict maps each distinct label string (across all three
    groups) to a consecutive integer index.
    """
    files = os.listdir(path)
    total_file = len(files)

    models = []
    OS = []
    category = []

    for process_file, f in enumerate(files, start=1):
        subfold = os.path.join(path, f)
        print("create_label_dict: {}/{}".format(process_file, total_file))

        for sf in os.listdir(subfold):
            if sf == 'label.pickle':
                l = statics.loadfrompickle(os.path.join(subfold, sf))
                # Labels are stored as 1-element lists; strip spaces so
                # variants like "Windows 10" / "Windows10" collapse.
                models.append(l['model'][0].replace(" ", ""))
                OS.append(l['OS'][0].replace(" ", ""))
                category.append(l['category'][0].replace(" ", ""))

    # Deduplicate each group once (the original recomputed list(set(...))
    # both here and again in the return statement).
    uniq_models = list(set(models))
    uniq_OS = list(set(OS))
    uniq_category = list(set(category))

    labels_dict = {}
    for label in uniq_models + uniq_OS + uniq_category:
        # Indices stay consecutive: only new labels get an entry.
        if label not in labels_dict:
            labels_dict[label] = len(labels_dict)

    return uniq_models, uniq_OS, uniq_category, labels_dict
Example #4
0
def random_batch(dl_pair_path, index, shufflelist):
    """Read one mini-batch of pickled (data, label) pairs.

    When `shufflelist` is an empty list, the directory listing of
    `dl_pair_path` is shuffled to start a fresh epoch; the (possibly new)
    list is returned so the caller can pass it back in.

    Returns (batch_data, batch_labels, shufflelist) with the first two
    stacked along axis 0.
    """
    if shufflelist == []:
        shufflelist = os.listdir(dl_pair_path)
        shuffle(shufflelist)
        print('Shuffle List')

    data = []
    labels = []

    # `batch_size` is a module-level configuration value.
    for fname in shufflelist[index:index + batch_size]:
        pair = statics.loadfrompickle(os.path.join(dl_pair_path, fname))
        data.append(pair[0])
        labels.append(pair[1])

    return np.stack(data, axis=0), np.stack(labels, axis=0), shufflelist
Example #5
0
def collect_all_words(path):
    """Flatten the word lists of every non-label pickle under `path`.

    Each sub-folder of `path` is a case; every file in it except
    'label.pickle' is a pickled list of words. Returns one flat list of
    all words in scan order.
    """
    all_words = []

    files = os.listdir(path)
    total_file = len(files)

    for process_file, f in enumerate(files, start=1):
        subfold = os.path.join(path, f)
        print("create_vocab_dict: {}/{}".format(process_file, total_file))

        for sf in os.listdir(subfold):
            # 'label.pickle' holds labels, not document words.
            if sf != 'label.pickle':
                words = statics.loadfrompickle(os.path.join(subfold, sf))
                # extend() appends in place; the original rebuilt the whole
                # accumulator with `all_words + words` each time, which is
                # quadratic overall.
                all_words.extend(words)

    return all_words
import numpy as np
import os
import statics
import collections
import random



# Cached dictionaries produced by earlier preprocessing runs.
final_ldict_path = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2/final_ldict.pickle'
final_wdict_path = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2/final_wdict20k.pickle'
process_data_root = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2'
word_pool_path = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/w2vec/words_pool.pickle'


# Word and label dictionaries (pickled dicts keyed by token).
wdict = statics.loadfrompickle(final_wdict_path)
ldict = statics.loadfrompickle(final_ldict_path)


#-----------------Create all words-------------------------------

# One sub-folder per processed case.
src_list = os.listdir(process_data_root)

# presumably filled by the loop below -- the loop body is not visible here.
dl_pair = []


# Resume from the cached word pool when it exists; start empty otherwise.
if os.path.isfile(word_pool_path): words = statics.loadfrompickle(word_pool_path)
else: words = []

for i in range(6000, len(src_list)):

    
Example #7
0
def create_train_pair_by_dict(src_path, all_words, dl_pair_path, dictionary):
    """For every dictionary word that occurs in a case's documents, write two
    pickled training pairs to dl_pair_path: [word_onehot, case_label] and
    [unk_onehot, case_label].

    Pairs are named by the current file count + 1, so the function appends
    to whatever is already in dl_pair_path.

    NOTE(review): the `all_words` parameter is never used in this body.
    """
    # Label dictionary is cached on disk; rebuild it from src_path on a miss.
    label_path = '/home/ubuntu/workspace/text_summary_data/task_w2v/data_label_pairs/tmp_label_dict.pickle'
    if os.path.isfile(label_path):
        label_dict_ori = statics.loadfrompickle(label_path)
    else:
        label_dict_ori = create_label_dict(src_path)
        statics.savetopickle(label_path, label_dict_ori)

    # Element 3 of the tuple is the combined label -> index mapping.
    label_dict = label_dict_ori[3]
    files = os.listdir(src_path)
    total_file = len(files)

    # One-hot for the UNK token, which sits at dictionary index 0.
    unk_onehot = np.zeros(len(dictionary))
    unk_onehot[0] = 1

    count = 0
    for w in dictionary:

        count = count + 1
        print("create_train_pair_by_dict:{}/{}".format(count, len(dictionary)))

        word_onehot = np.zeros(len(dictionary))
        word_onehot[dictionary[w]] = 1

        for idx in range(total_file):

            f = files[idx]
            subfold = os.path.join(src_path, f)
            subfiles = os.listdir(subfold)

            # Build this case's multi-hot label vector from its three label
            # fields (model / OS / category), skipping unknown values.
            l = statics.loadfrompickle(os.path.join(subfold, 'label.pickle'))
            label = np.zeros(len(label_dict))

            if l['model'][0].replace(" ", "") in label_dict:
                label[label_dict[l['model'][0].replace(" ", "")]] = 1
            if l['OS'][0].replace(" ", "") in label_dict:
                label[label_dict[l['OS'][0].replace(" ", "")]] = 1
            if l['category'][0].replace(" ", "") in label_dict:
                label[label_dict[l['category'][0].replace(" ", "")]] = 1

            for sf in subfiles:

                if sf != 'label.pickle':

                    file = os.path.join(subfold, sf)
                    words = statics.loadfrompickle(file)

                    if w in words:

                        # Emit the (word, label) pair; the file name is the
                        # current directory count + 1.
                        index = len(os.listdir(dl_pair_path))
                        dl_pair = [word_onehot, label]
                        statics.savetopickle(
                            os.path.join(dl_pair_path,
                                         str(index + 1) + '.pickle'), dl_pair)

                        # Also emit a matching (UNK, label) pair.
                        index = len(os.listdir(dl_pair_path))
                        dl_pair = [unk_onehot, label]
                        statics.savetopickle(
                            os.path.join(dl_pair_path,
                                         str(index + 1) + '.pickle'), dl_pair)

                        # One hit per case is enough; move to the next case.
                        break
Example #8
0
import math
import tensorflow as tf
import os
from random import shuffle

from scipy import spatial
import word2vec_utility as w2v
import statics

# Inputs for the task-word2vec similarity evaluation below.
words_dict_path = '/home/dashmoment/workspace/text_summary_data/data_label_pair/toy_words_dict_for_taskw2v.pickle'
label_dict_path = '/home/dashmoment/workspace/text_summary_data/data_label_pair/toy_label_dict_for_taskw2v.pickle'
word_label_pair_path = '/home/dashmoment/workspace/text_summary_data/data_label_pair/toy_dl_pair_for_taskw2v.pickle'
dl_pair_path = '/home/dashmoment/workspace/text_summary_data/data_label_pair/task_w2v_dl'
# NOTE(review): relative path -- resolved against the current working dir.
w2v_dict_path = 'w2v_dict_2000_2.pickle'

words = statics.loadfrompickle(words_dict_path)
word_label_pair = statics.loadfrompickle(word_label_pair_path)
label_dict = statics.loadfrompickle(label_dict_path)
w2v_dict = statics.loadfrompickle(w2v_dict_path)

# Per-word similarity results, filled by the loop below.
sim_list = {}

for w in w2v_dict:

    if w in word_label_pair:
        #if w == 'ioLogik':
        avg_sim = 0
        total_N = 0

        embed = w2v_dict[w][1]
    for w in words:
        blank_words = np.zeros(len(wdict), np.int64)
        blank_words[wdict[w]] = 1

        encode_w.append(blank_words)

    return encode_w


# Cached dictionaries and data roots from earlier preprocessing runs.
final_ldict_path = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2/final_ldict.pickle'
final_wdict_path = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2/final_wdict20k.pickle'
process_data_root = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/processed_v2'
dlpair_root = '/media/ubuntu/65db2e03-ffde-4f3d-8f33-55d73836211a/dataset/ts_cases_dataset/dl_pair_for_training'

wdict = statics.loadfrompickle(final_wdict_path)
ldict = statics.loadfrompickle(final_ldict_path)

src_list = os.listdir(process_data_root)

# Inspect the second case folder's files (listdir order is arbitrary).
dir_path = os.path.join(process_data_root, src_list[1])
file_path = os.listdir(dir_path)

FILE_LENGTH = 500
contents = []

# Accumulators for the loop below (its body is not visible here).
raw_w = []
coded_l = {}

for i in file_path:
Example #10
0
def create_data_label_path(dataset_path_list, dl_pair_path):
    """Build (or load from cache) a dict mapping case-folder name to the
    paths of its context files and its labels.txt.

    Folders with no context files and an empty labels.txt are deleted.
    Folders with no context files but a usable 'description' in their
    labels get a synthesized context.txt written from that description.

    Returns the path dict; it is also pickled to dl_pair_path.
    """
    # Cached result short-circuits the whole scan.
    if os.path.isfile(dl_pair_path):
        return statics.loadfrompickle(dl_pair_path)

    data_dict = {}
    count = 0

    for d in dataset_path_list:

        lsdir = os.listdir(d)

        for folder in lsdir:

            count = count + 1
            # NOTE: count accumulates across all dataset dirs, so it can
            # exceed len(lsdir) when dataset_path_list has several entries.
            sys.stdout.write("Create path dict:{}/{}\n".format(
                count, len(lsdir)))
            sys.stdout.flush()

            folderpath = os.path.join(d, folder)
            filelist = os.listdir(folderpath)
            data_dict[folder] = {}

            if len(filelist) > 1:
                # Everything except labels.txt is a context file.
                filelist.remove('labels.txt')
                filepath = [{x: os.path.join(folderpath, x)} for x in filelist]

                if type(filepath) is not list: filepath = [filepath]

                data_dict[folder]['context'] = filepath
            else:
                data_dict[folder]['context'] = {}

            label_path = os.path.join(folderpath, 'labels.txt')

            # Read via a context manager; the original's open(...).read()
            # leaked the file handle.
            with open(label_path, "r") as label_file:
                text = label_file.read()

            if data_dict[folder]['context'] == {}:

                if len(text) == 0:
                    # No context and an empty label: drop the folder.
                    shutil.rmtree(folderpath)
                else:
                    sl = slipt_label(label_path)

                    if len(sl['description']) != 0:

                        # Promote the description text to a context.txt so
                        # the case still has content to train on.
                        context_path = os.path.join(folderpath, 'context.txt')

                        with open(context_path, 'w') as f:
                            for line in sl['description']:
                                f.write(line + " ")
                        # (The original had a bare `f.close` here -- a no-op
                        # attribute access; the with-block already closes.)

                        data_dict[folder]['label'] = label_path
                        data_dict[folder]['context'] = context_path

                    else:
                        del data_dict[folder]
            else:
                data_dict[folder]['label'] = label_path

    # Bug fix: the original assigned savetopickle()'s return value to
    # data_dict and returned that; return the dict we just built instead,
    # matching the cached-load path above.
    statics.savetopickle(dl_pair_path, data_dict)
    return data_dict
Example #11
0
def create_wdict_ldict_general(Nword, wdict_path, ldict_path, final_wdict_path,
                               final_ldict_path):
    """Build the final word and label dictionaries.

    Takes the Nword most frequent non-Chinese words, prepends UNK entries,
    then folds all label tokens and the question words into the word
    dictionary so labels can appear as inputs.

    Returns (pure_dict, pure_ldict, r_wdict): word->index, label->index,
    and index->word mappings. Only pure_ldict is persisted here; the word
    dict saves are commented out.
    """
    # Question-template words that must exist in the final word dict.
    # NOTE(review): " model" carries a leading space -- confirm intentional.
    qwords = ["what", "is", " model", "OS", "category"]

    wdict = statics.loadfrompickle(wdict_path)
    ldict = statics.loadfrompickle(ldict_path)
    # Ascending by frequency value; most frequent words are at the end.
    sorted_wdict = sorted(wdict.items(), key=operator.itemgetter(1))
    print(len(sorted_wdict))

    r_wdict = {}

    #===============================================================

    pure_dict = {}
    label_type = ['OS', 'category', 'model']

    r_ldict = {}
    pure_ldict = {}

    count = 1
    for lt in label_type:

        # Each label group gets its own UNK entry.
        pure_ldict['UNK_' + lt] = len(pure_ldict)
        r_ldict[len(r_ldict)] = 'UNK_' + lt

        # NOTE(review): `count` is not advanced for the UNK entries above,
        # so from the second group on, r_ldict[count] can overwrite the
        # UNK_* entry just written and r_ldict/pure_ldict indices diverge.
        # Verify against downstream consumers before relying on r_ldict.
        for l in ldict[lt]:

            if len(l) > 1 and l not in pure_ldict:
                r_ldict[count] = l
                pure_ldict[l] = len(pure_ldict)
                count = count + 1

            elif len(l) > 1:

                # Duplicate across groups: disambiguate by prefixing the
                # label-type name.
                l = lt + l
                r_ldict[count] = l
                pure_ldict[l] = len(pure_ldict)
                count = count + 1
                print(l)

#============================================================
    # Word dict: index 0 is reserved for UNK.
    r_wdict[0] = 'UNK'
    pure_dict['UNK'] = 0

    # Walk sorted_wdict from the most frequent end, keeping only words with
    # no Chinese characters, until Nword words are collected.
    idx = 1
    count = 1
    while idx < Nword + 1:

        #       print("Create_dict:{}/{}".format(count, len(sorted_wdict)))

        if len(getChinese(sorted_wdict[-count][0])) == 0:
            r_wdict[idx] = sorted_wdict[-count][0]
            pure_dict[r_wdict[idx]] = idx
            idx = idx + 1

        count = count + 1

    # Append label tokens not already present as words.
    for i in range(1, len(r_ldict)):

        if r_ldict[i] not in pure_dict:
            pure_dict[r_ldict[i]] = len(r_wdict)
            r_wdict[len(r_wdict)] = r_ldict[i]

    # Append the question words not already present.
    for i in range(len(qwords)):

        if qwords[i] not in pure_dict:
            pure_dict[qwords[i]] = len(r_wdict)
            r_wdict[len(r_wdict)] = qwords[i]


#    statics.savetopickle(final_wdict_path, pure_dict)
    statics.savetopickle(final_ldict_path, pure_ldict)
    #    statics.savetopickle(final_rwdict_path, r_wdict)

    return pure_dict, pure_ldict, r_wdict
Example #12
0
def process_data_to_pickle(process_root, path_dict, wdict_path, ldict_path):
    """Tokenize each case's context files and labels into pickles under
    process_root, accumulating word (wdicts) and label (ldicts) counts.

    Already-pickled files are skipped, so an interrupted run can resume.
    """
    # Resume the dictionaries from disk when available.
    if os.path.isfile(wdict_path):
        wdicts = statics.loadfrompickle(wdict_path)
    else:
        wdicts = {}

    if os.path.isfile(ldict_path):
        ldicts = statics.loadfrompickle(ldict_path)
    else:
        ldicts = {'model': {}, 'OS': {}, 'category': {}}

    count = 0

    for d in path_dict:

        count = count + 1
        sys.stdout.write("Data to pickle:{}/{}\n".format(
            count, len(path_dict)))
        sys.stdout.flush()

        casefolder = os.path.join(process_root, d)

        if not os.path.isdir(casefolder):
            os.mkdir(casefolder)

        file_idx = 0

        # Normalize 'context' to a list of {filename: path} dicts.
        if type(path_dict[d]['context']) is not list:
            path_dict[d]['context'] = [path_dict[d]['context']]

        for clist in path_dict[d]['context']:

            for c in clist:

                savepath = os.path.join(casefolder, str(file_idx) + '.pickle')

                # Resumable: skip files pickled by a previous run. (The
                # original re-tested `not os.path.isfile(savepath)` right
                # after this continue -- always true, so it is dropped.)
                if os.path.isfile(savepath): continue

                stripe = slipt_doc_by_space(clist[c])
                wdicts = collect_dict(stripe, wdict_path, wdicts)

                with open(savepath, 'wb') as f:
                    pickle.dump(stripe,
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)

            # One output pickle per clist entry, not per file inside it.
            file_idx = file_idx + 1

        lsavepath = os.path.join(casefolder, 'label.pickle')
        # Same redundant double-check removed here.
        if os.path.isfile(lsavepath): continue

        with open(lsavepath, 'wb') as lf:

            labels = slipt_label(path_dict[d]['label'])
            pickle.dump(labels, lf, protocol=pickle.HIGHEST_PROTOCOL)
            ldicts = collect_dict(labels, ldict_path, ldicts)
Example #13
0
import statics


# Source word embeddings and TF-IDF scores; output path for the product.
w2vfile = '/home/ubuntu/workspace/text_summary_data/w2v.pickle'
tfidf_path = '/home/ubuntu/workspace/text_summary_data/tfidf_score/tfidfscore.pickle'
save_path = '/home/ubuntu/workspace/text_summary_data/tfidf_score/w2v_tfidf.pickle'

w2v = statics.loadfrompickle(w2vfile)
tfidf = statics.loadfrompickle(tfidf_path)

# Scale each word's vector by its TF-IDF score (stored at tfidf[1][w][2]).
tfidf_w2v = {w: w2v[w] * tfidf[1][w][2] for w in w2v}

statics.savetopickle(save_path, tfidf_w2v)