Esempio n. 1
0
def add_label_to_dict(src_path, dl_pair_path, dictionary, r_dictionary):
    """Add label tokens to the word dictionary and emit one (one-hot word,
    one-hot label) training pair per label.

    Args:
        src_path: root folder scanned by ``create_label_dict`` when the
            cached label dictionary is absent.
        dl_pair_path: directory where ``<index>.pickle`` pair files are
            appended (1-based, numbered after the current file count).
        dictionary: word -> index mapping, mutated in place.
        r_dictionary: index -> word reverse mapping, mutated in place.

    Returns:
        The ``(dictionary, r_dictionary)`` pair (same objects, mutated).
    """
    # Cached label dictionary; rebuilt from src_path on a cache miss.
    label_path = '/home/ubuntu/workspace/text_summary_data/task_w2v/data_label_pairs/toytmp_label_dict.pickle'
    if os.path.isfile(label_path):
        label_dict_ori = statics.loadfrompickle(label_path)
    else:
        label_dict_ori = create_label_dict(src_path)
        statics.savetopickle(label_path, label_dict_ori)

    # Element 3 of the cached tuple is the label -> index dict.
    label_dict = label_dict_ori[3]

    # Register each label as a vocabulary entry.
    # BUG FIX: the original unconditionally re-assigned labels already in
    # `dictionary` a fresh index, leaving a stale r_dictionary entry and a
    # gap in the index space; only add labels not yet present.
    for l in label_dict:
        if l not in dictionary:
            labelidx = len(dictionary)
            dictionary[l] = labelidx
            r_dictionary[labelidx] = l

    for l in label_dict:
        word_onehot = np.zeros(len(dictionary))
        word_onehot[dictionary[l]] = 1
        label = np.zeros(len(label_dict))
        label[label_dict[l]] = 1

        # Next free 1-based file index = current file count + 1.
        index = len(os.listdir(dl_pair_path))
        dl_pair = [word_onehot, label]
        statics.savetopickle(
            os.path.join(dl_pair_path,
                         str(index + 1) + '.pickle'), dl_pair)

    return dictionary, r_dictionary
Esempio n. 2
0
def collect_dict(data, dict_path, wdicts=None):
    """Accumulate dictionary statistics and persist them to ``dict_path``.

    Two modes, selected by the runtime type of ``data``:
      * dict -- treated as a label record; the first element of its
        'model', 'OS' and 'category' entries is assigned the next free
        index in the corresponding sub-dict of ``wdicts``.
      * any other iterable -- treated as a word sequence; ``wdicts`` is
        used as a flat word -> count frequency table.

    Args:
        data: label record (dict) or iterable of words.
        dict_path: pickle path the updated ``wdicts`` is written to.
        wdicts: running statistics; defaults to fresh empty sub-dicts.

    Returns:
        The updated ``wdicts``.
    """
    # BUG FIX: the original declared a mutable dict literal as the default
    # argument, so state silently leaked between calls sharing the default.
    # A None sentinel gives every call its own fresh dict.
    if wdicts is None:
        wdicts = {'model': {}, 'OS': {}, 'category': {}}

    if type(data) is dict:
        lmodel = data['model'][0]
        los = data['OS'][0]
        lcat = data['category'][0]

        # Assign each unseen label the next sequential index.
        if lmodel not in wdicts['model']:
            wdicts['model'][lmodel] = len(wdicts['model'])
        if los not in wdicts['OS']:
            wdicts['OS'][los] = len(wdicts['OS'])
        if lcat not in wdicts['category']:
            wdicts['category'][lcat] = len(wdicts['category'])
    else:
        # Word-frequency mode: count occurrences.
        for w in data:
            wdicts[w] = wdicts.get(w, 0) + 1

    statics.savetopickle(dict_path, wdicts)

    return wdicts
    
    print("Process: {}/{}".format(i, len(src_list)))
    
    dir_path = os.path.join(process_data_root, src_list[i])
    
    if os.path.isdir(dir_path) : file_path = os.listdir(dir_path)
    else: continue
      
    
    for f in file_path:
            
        if f != 'label.pickle':
            path = os.path.join(dir_path, f)
            words = words + statics.loadfrompickle(path)
            
    if i%1000 == 0: statics.savetopickle(word_pool_path, words)  


#-----------------Encode words by dict-------------------------------      
#code_words = []
#for i in words:
#    code_words.append(wdict[i])
    
#data = code_words
#data_index = 0
#
#def generate_batch(batch_size, num_skips, skip_window):
#  global data_index
#  assert batch_size % num_skips == 0
#  assert num_skips <= 2 * skip_window
#  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
Esempio n. 4
0
def create_train_pair_by_dict(src_path, all_words, dl_pair_path, dictionary):
    """For every vocabulary word, scan each document folder; on the first
    document file containing the word, append two (one-hot word, one-hot
    label) pickle pairs: one for the word itself and one for the UNK token
    (index 0), then move on to the next folder.

    Note: ``all_words`` is accepted for interface compatibility but unused.
    """
    # Cached label dictionary; rebuilt from src_path on a cache miss.
    label_path = '/home/ubuntu/workspace/text_summary_data/task_w2v/data_label_pairs/tmp_label_dict.pickle'
    if os.path.isfile(label_path):
        label_dict_ori = statics.loadfrompickle(label_path)
    else:
        label_dict_ori = create_label_dict(src_path)
        statics.savetopickle(label_path, label_dict_ori)

    label_dict = label_dict_ori[3]
    folders = os.listdir(src_path)

    # One-hot vector for the UNK token (vocabulary slot 0).
    unk_onehot = np.zeros(len(dictionary))
    unk_onehot[0] = 1

    def _append_pair(onehot, label_vec):
        # Next free 1-based file index is the current file count + 1.
        next_idx = len(os.listdir(dl_pair_path)) + 1
        statics.savetopickle(
            os.path.join(dl_pair_path, str(next_idx) + '.pickle'),
            [onehot, label_vec])

    for count, w in enumerate(dictionary, start=1):
        print("create_train_pair_by_dict:{}/{}".format(count, len(dictionary)))

        word_onehot = np.zeros(len(dictionary))
        word_onehot[dictionary[w]] = 1

        for fold in folders:
            subfold = os.path.join(src_path, fold)
            subfiles = os.listdir(subfold)

            # Build the multi-hot label vector from this folder's labels.
            l = statics.loadfrompickle(os.path.join(subfold, 'label.pickle'))
            label = np.zeros(len(label_dict))
            for key in ('model', 'OS', 'category'):
                tag = l[key][0].replace(" ", "")
                if tag in label_dict:
                    label[label_dict[tag]] = 1

            for sf in subfiles:
                if sf == 'label.pickle':
                    continue

                words = statics.loadfrompickle(os.path.join(subfold, sf))
                if w in words:
                    _append_pair(word_onehot, label)
                    _append_pair(unk_onehot, label)
                    break
Esempio n. 5
0
def create_data_label_path(dataset_path_list, dl_pair_path):
    """Build (and cache at ``dl_pair_path``) a dict mapping folder name to
    its context file path(s) and label file path.

    Folders with an empty labels.txt and no other files are deleted from
    disk; folders whose labels contain a 'description' get a synthesized
    context.txt; folders with neither are dropped from the result.

    Args:
        dataset_path_list: list of dataset root directories to scan.
        dl_pair_path: pickle cache path; returned directly if it exists.

    Returns:
        The ``{folder: {'context': ..., 'label': ...}}`` dict.
    """
    if os.path.isfile(dl_pair_path):
        return statics.loadfrompickle(dl_pair_path)

    data_dict = {}
    count = 0

    for d in dataset_path_list:

        lsdir = os.listdir(d)

        for folder in lsdir:

            count = count + 1
            # NOTE(review): count accumulates across all datasets while the
            # denominator is only the current dataset's size, so the ratio
            # can exceed 1 when dataset_path_list has multiple entries.
            sys.stdout.write("Create path dict:{}/{}\n".format(
                count, len(lsdir)))
            sys.stdout.flush()

            folderpath = os.path.join(d, folder)
            filelist = os.listdir(folderpath)
            data_dict[folder] = {}

            if len(filelist) > 1:
                filelist.remove('labels.txt')
                filepath = [{x: os.path.join(folderpath, x)} for x in filelist]
                # (The original re-wrapped filepath in a list if it was not
                # one -- dead code after a list comprehension; removed.)
                data_dict[folder]['context'] = filepath
            else:
                data_dict[folder]['context'] = {}

            label_path = os.path.join(folderpath, 'labels.txt')

            # BUG FIX: the original leaked this file handle; close it
            # promptly with a context manager.
            with open(label_path, "r") as lf:
                text = lf.read()

            if data_dict[folder]['context'] == {}:

                if len(text) == 0:
                    # No context files and empty labels: remove the folder.
                    shutil.rmtree(folderpath)
                else:
                    sl = slipt_label(label_path)

                    if len(sl['description']) != 0:

                        context_path = os.path.join(folderpath, 'context.txt')

                        # BUG FIX: the original's bare `f.close` (missing
                        # parentheses) never ran; `with` closes the file.
                        with open(context_path, 'w') as f:
                            for line in sl['description']:
                                f.write(line + " ")

                        data_dict[folder]['label'] = label_path
                        data_dict[folder]['context'] = context_path

                    else:
                        # Labels exist but carry no description: unusable.
                        del data_dict[folder]
            else:
                data_dict[folder]['label'] = label_path

    # BUG FIX: the original assigned savetopickle's return value (None) to
    # data_dict and returned that; save, then return the real dict.
    statics.savetopickle(dl_pair_path, data_dict)
    return data_dict
Esempio n. 6
0
def create_wdict_ldict_general(Nword, wdict_path, ldict_path, final_wdict_path,
                               final_ldict_path):
    """Build the final word dictionary (top-``Nword`` non-Chinese words plus
    UNK, labels and question words) and the final label dictionary, saving
    the label dictionary to ``final_ldict_path``.

    Args:
        Nword: number of top-frequency words to keep in the word dict.
        wdict_path: pickle of the raw word -> frequency dict.
        ldict_path: pickle of the raw {label_type: labels} dict.
        final_wdict_path: unused -- the save call is commented out below.
        final_ldict_path: pickle path the final label dict is written to.

    Returns:
        (pure_dict, pure_ldict, r_wdict): word -> index, label -> index,
        and index -> word mappings.
    """
    # Question-template words guaranteed a slot in the word dictionary.
    # NOTE(review): " model" carries a leading space -- looks accidental;
    # confirm against the tokenizer before changing it.
    qwords = ["what", "is", " model", "OS", "category"]

    wdict = statics.loadfrompickle(wdict_path)
    ldict = statics.loadfrompickle(ldict_path)
    # Sort words by ascending frequency; highest-frequency words are taken
    # from the tail via sorted_wdict[-count] below.
    sorted_wdict = sorted(wdict.items(), key=operator.itemgetter(1))
    print(len(sorted_wdict))

    r_wdict = {}

    #--------------- label dictionary construction -------------------------

    pure_dict = {}
    label_type = ['OS', 'category', 'model']

    r_ldict = {}
    pure_ldict = {}

    count = 1
    for lt in label_type:

        # Each label type gets its own UNK entry, keyed in r_ldict by the
        # current dict size.
        pure_ldict['UNK_' + lt] = len(pure_ldict)
        r_ldict[len(r_ldict)] = 'UNK_' + lt

        for l in ldict[lt]:

            # Skip single-character labels entirely (len(l) > 1 guard).
            if len(l) > 1 and l not in pure_ldict:
                r_ldict[count] = l
                pure_ldict[l] = len(pure_ldict)
                count = count + 1

            elif len(l) > 1:

                # Duplicate label name across types: disambiguate by
                # prefixing the label type.
                l = lt + l
                r_ldict[count] = l
                pure_ldict[l] = len(pure_ldict)
                count = count + 1
                print(l)

# NOTE(review): r_ldict mixes two key schemes -- UNK entries use
# len(r_ldict) while labels use the running `count` -- so after the first
# label type the next label can overwrite an UNK entry, and r_ldict keys
# can desynchronize from pure_ldict values; verify before relying on r_ldict.
#--------------- word dictionary construction --------------------------
    r_wdict[0] = 'UNK'
    pure_dict['UNK'] = 0

    # Walk the frequency-sorted list from the most frequent end, keeping
    # only words with no Chinese characters, until Nword are collected.
    idx = 1
    count = 1
    while idx < Nword + 1:

        #       print("Create_dict:{}/{}".format(count, len(sorted_wdict)))

        if len(getChinese(sorted_wdict[-count][0])) == 0:
            r_wdict[idx] = sorted_wdict[-count][0]
            pure_dict[r_wdict[idx]] = idx
            idx = idx + 1

        count = count + 1

    # Append every label (index 1..len-1 of r_ldict; index 0 is UNK_OS)
    # to the word dictionary so labels are themselves valid word tokens.
    for i in range(1, len(r_ldict)):

        if r_ldict[i] not in pure_dict:
            pure_dict[r_ldict[i]] = len(r_wdict)
            r_wdict[len(r_wdict)] = r_ldict[i]

    # Append the question-template words as well.
    for i in range(len(qwords)):

        if qwords[i] not in pure_dict:
            pure_dict[qwords[i]] = len(r_wdict)
            r_wdict[len(r_wdict)] = qwords[i]


# Only the label dictionary is persisted; the word-dict saves are disabled.
#    statics.savetopickle(final_wdict_path, pure_dict)
    statics.savetopickle(final_ldict_path, pure_ldict)
    #    statics.savetopickle(final_rwdict_path, r_wdict)

    return pure_dict, pure_ldict, r_wdict
Esempio n. 7
0
import statics


# Scale every word's w2v embedding by its tf-idf weight and cache the result.
w2vfile = '/home/ubuntu/workspace/text_summary_data/w2v.pickle'
tfidf_path = '/home/ubuntu/workspace/text_summary_data/tfidf_score/tfidfscore.pickle'
save_path = '/home/ubuntu/workspace/text_summary_data/tfidf_score/w2v_tfidf.pickle'

w2v = statics.loadfrompickle(w2vfile)
tiidf = statics.loadfrompickle(tfidf_path)

# The tf-idf weight for word w is read from tiidf[1][w][2].
tfidf_w2v = {w: w2v[w] * tiidf[1][w][2] for w in w2v}

statics.savetopickle(save_path, tfidf_w2v)