Example #1
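    # NOTE: assumes `import os, pickle` and `import numpy as np` at module level,
    # plus the project's DataSet and GreedyBucket helpers and the _PAD_ constant.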
    def _generate_inter_files(self):
        print("loading question_df...")
        question_df = DataSet.load_all_questions()
        corpus = question_df[self.data_space]
        print("generating item2idx...")
        sen_list = corpus.values.tolist()
        self.item2idx = {_PAD_:0}
        for sen in sen_list:
            for word in sen.split():
                if word not in self.item2idx:
                    self.item2idx[word] = len(self.item2idx)
        print("generating idx2item...")
        self.idx2item = {v:k for k,v in self.item2idx.items()}

        print("load %s data..."%(self.train_test))
        if self.train_test=="train":
            self.data_set = DataSet.load_train()
        else:
            self.data_set = DataSet.load_test()

        if self.data_space == "words":
            q1 = self.data_set["word_len_q1"]
            q2 = self.data_set["word_len_q2"]
        else:
            q1 = self.data_set["char_len_q1"]
            q2 = self.data_set["char_len_q2"]

        print("bucketing...")
        q_pair = list(zip(q1,q2))
        bucket = GreedyBucket()
        fit_res = bucket.fit(q_pair)
        self.buckets,self.bounds = bucket.get_split_results(fit_res,self.bucket_num)
        #print("len of self.bounds",len(self.bounds))
        print("generating id vectors...")
        data_set_id_vectors = []
        for ind in range(self.data_set.shape[0]):
            cur_row = self.data_set.iloc[ind]
            cur_q1 = cur_row["q1"]
            cur_q1_items = question_df.loc[cur_q1][self.data_space].split()
            cur_q1_inds = [self.item2idx[x] for x in cur_q1_items]

            cur_q2 = cur_row["q2"]
            cur_q2_items = question_df.loc[cur_q2][self.data_space].split()
            cur_q2_inds = [self.item2idx[x] for x in cur_q2_items]

            cur_bound = self.bounds[ind]
            q1_pad_len = cur_bound - len(cur_q1_inds)
            q2_pad_len = cur_bound - len(cur_q2_inds)

            if self.pad_prefix:
                cur_q1_padded = [0]*q1_pad_len+cur_q1_inds
                cur_q2_padded = [0]*q2_pad_len+cur_q2_inds
            else:
                cur_q1_padded = cur_q1_inds+[0]*q1_pad_len
                cur_q2_padded = cur_q2_inds+[0]*q2_pad_len
            cur_pair_padded = cur_q1_padded + cur_q2_padded
            data_set_id_vectors.append(cur_pair_padded)
        # rows differ in length across buckets, so keep them as an object array
        data_set_id_vectors = np.array(data_set_id_vectors, dtype=object)

        print("generating bucket_idx_vectors...")
        self.bucket_idx_vectors = {}
        for b,id_list in self.buckets.items():
            tmp = {}
            if self.train_test == "train":
                tmplabels = self.data_set["label"].iloc[id_list].values
                tmp["label"] = tmplabels
            tmpdata = np.array(data_set_id_vectors[id_list].tolist())
            tmp["data"] = tmpdata
            self.bucket_idx_vectors[b] = tmp

        print("finish generating inter files.")
        print("begin caching..")
        all_cached = {}
        all_cached["item2idx"] =  self.item2idx
        all_cached["idx2item"] =  self.idx2item
        all_cached["buckets"]  =  self.buckets
        all_cached["bounds"]   =  self.bounds
        all_cached["bucket_idx_vectors"] = self.bucket_idx_vectors
        os.makedirs("./temp", exist_ok=True)
        with open(self._temp_file, "wb") as f:
            pickle.dump(all_cached, f)
        print("finish caching")
Example #2
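 # NOTE: assumes `import os, pickle`, `import numpy as np`, `import pandas as pd`,
 # plus the project's DataSet helper and the _PAD_ constant at module level.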
 def __init__(self,data_df,space,bucket_num,batch_size,is_prefix_pad,is_shuffle,is_test):
     assert space in ["words","chars"]
     self.data_df = data_df
     self.space = space
     self.bucket_num = bucket_num
     self.batch_size = batch_size
     self.is_prefix_pad = is_prefix_pad
     self.is_shuffle = is_shuffle
     self.is_test = is_test
     if os.path.exists(self._temp_file):
         print("detect cached intermediate files...loading...")
         if DataGenerator.item2idx is None:
             all_cached = pickle.load(open(self._temp_file,"rb"))
             DataGenerator.item2idx = all_cached["item2idx"]
             DataGenerator.idx2item = all_cached["idx2item"]
             DataGenerator.item_embed = all_cached["item_embed"]
             DataGenerator.q2idvec = all_cached["q2idvec"]
         print("finish")
     else:
         print("Generating intermediate files...")
         DataGenerator.item2idx = {}
         DataGenerator.idx2item = {}
         DataGenerator.item_embed = {}
         DataGenerator.q2idvec = {}
         spaces = ["words","chars"]
         question_df = DataSet.load_all_questions()
         all_qids = DataSet.load_all_unique_ids_train_test()
         for space in spaces:
             print("for",space)
             corpus = question_df[space]
             w2i,i2w = self._get_item2id_id2item(corpus)
             DataGenerator.item2idx[space] = w2i
             DataGenerator.idx2item[space] = i2w
             ## finished building the item <-> index mapping tables
             term_embed = DataSet.load_term_embed(space)
             embed_size = term_embed.shape[1]
             # prepend an all-zero row so index 0 stays reserved for _PAD_
             pad_embed = np.zeros((1, embed_size))
             all_embedding = np.vstack([pad_embed, term_embed])
             all_index = [_PAD_] + term_embed.index.values.tolist()
             all_embedding_df = pd.DataFrame(data=all_embedding, index=all_index)
             # reorder the embedding rows to follow the idx2item order
             sort_word = [i2w[i] for i in range(len(i2w))]
             DataGenerator.item_embed[space] = all_embedding_df.loc[sort_word].values
             ## finished building the item embedding matrix
             tmp_q2idvec = {}
             for qid in all_qids:
                 items = question_df.loc[qid][space].split()
                 idvec = np.array([w2i[w] for w in items])
                 tmp_q2idvec[qid] = idvec
             # assign once, after the map for this space is complete
             DataGenerator.q2idvec[space] = tmp_q2idvec
             ## finished building the question-id -> id-vector map
         print("finish generating inter files.")
         print("begin caching..")
         all_cached = {}
         all_cached["item2idx"] =  DataGenerator.item2idx
         all_cached["idx2item"] =  DataGenerator.idx2item
         all_cached["item_embed"] = DataGenerator.item_embed
         all_cached["q2idvec"] = DataGenerator.q2idvec
         os.makedirs("./temp", exist_ok=True)
         with open(self._temp_file, "wb") as f:
             pickle.dump(all_cached, f)
         print("finish caching")