Beispiel #1
0
 def get_item_embeddings(self):
     """Return the embedding matrix with rows ordered by item id.

     Loads the term embeddings for ``self.data_space``, places an all-zero
     row labelled ``_PAD_`` in front of them, and returns the rows picked in
     the order ``self.idx2item[0]``, ``self.idx2item[1]``, ... as an array.
     """
     vectors = DataSet.load_term_embed(self.data_space)
     width = vectors.shape[1]

     # One zero vector for the padding item, stacked ahead of the real terms.
     zero_row = np.array([0] * width).reshape(1, -1)
     labels = [_PAD_]
     labels.extend(vectors.index.values.tolist())
     table = pd.DataFrame(data=np.vstack([zero_row, vectors]), index=labels)

     # Row i of the result belongs to the item whose id is i.
     ordered_items = []
     for item_id in range(len(self.idx2item)):
         ordered_items.append(self.idx2item[item_id])
     return table.loc[ordered_items].values
Beispiel #2
0
 def __init__(self, data_df, space, bucket_num, batch_size, is_prefix_pad, is_shuffle, is_test):
     """Store the generator settings and make sure the shared lookup tables exist.

     The lookup tables ``item2idx``, ``idx2item``, ``item_embed`` and
     ``q2idvec`` are kept as attributes on ``DataGenerator``.  When the cache
     file ``self._temp_file`` exists they are read from it (only if
     ``DataGenerator.item2idx`` is still ``None``); otherwise they are rebuilt
     for both ``"words"`` and ``"chars"`` from ``DataSet`` and pickled to
     ``self._temp_file``.

     Args:
         data_df: Stored on ``self.data_df``; not otherwise used here.
         space: Must be ``"words"`` or ``"chars"``; stored on ``self.space``.
         bucket_num: Stored on ``self.bucket_num``; not otherwise used here.
         batch_size: Stored on ``self.batch_size``; not otherwise used here.
         is_prefix_pad: Stored on ``self.is_prefix_pad``; not otherwise used here.
         is_shuffle: Stored on ``self.is_shuffle``; not otherwise used here.
         is_test: Stored on ``self.is_test``; not otherwise used here.

     Raises:
         AssertionError: If ``space`` is not one of the two supported values.
     """
     # NOTE(review): assert is stripped under ``python -O``; kept so callers
     # still see AssertionError for an unsupported space.
     assert space in ["words", "chars"]
     self.data_df = data_df
     self.space = space
     self.bucket_num = bucket_num
     self.batch_size = batch_size
     self.is_prefix_pad = is_prefix_pad
     self.is_shuffle = is_shuffle
     self.is_test = is_test

     if os.path.exists(self._temp_file):
         print("detect cached intermediate files...loading...")
         # The tables live on the class, so the file is read only while they are unset.
         if DataGenerator.item2idx is None:
             # NOTE(review): unpickling can execute arbitrary code; the cache
             # file must come from a trusted source.
             with open(self._temp_file, "rb") as cache_file:
                 all_cached = pickle.load(cache_file)
             DataGenerator.item2idx = all_cached["item2idx"]
             DataGenerator.idx2item = all_cached["idx2item"]
             DataGenerator.item_embed = all_cached["item_embed"]
             DataGenerator.q2idvec = all_cached["q2idvec"]
         print("finish")
     else:
         print("Generating intermediate files...")
         DataGenerator.item2idx = {}
         DataGenerator.idx2item = {}
         DataGenerator.item_embed = {}
         DataGenerator.q2idvec = {}
         question_df = DataSet.load_all_questions()
         all_qids = DataSet.load_all_unique_ids_train_test()
         # A dedicated loop name keeps the ``space`` argument from being shadowed.
         for cur_space in ["words", "chars"]:
             print("for", cur_space)
             corpus = question_df[cur_space]
             w2i, i2w = self._get_item2id_id2item(corpus)
             DataGenerator.item2idx[cur_space] = w2i
             DataGenerator.idx2item[cur_space] = i2w

             # Item embedding: a zero row labelled _PAD_ goes in front of the
             # loaded term embeddings, then rows are picked in item-id order.
             term_embed = DataSet.load_term_embed(cur_space)
             embed_size = term_embed.shape[1]
             pad_embed = np.array([0] * embed_size).reshape(1, -1)
             all_embeding = np.vstack([pad_embed, term_embed])
             all_index = [_PAD_] + term_embed.index.values.tolist()
             all_embeding_df = pd.DataFrame(data=all_embeding, index=all_index)
             sort_word = [i2w[i] for i in range(len(i2w))]
             DataGenerator.item_embed[cur_space] = all_embeding_df.loc[sort_word].values

             # Question id -> array of item ids.  The lookup goes through the
             # already extracted column instead of building a full row per
             # question, and the table is stored once per space so the key
             # exists even when there are no question ids.
             DataGenerator.q2idvec[cur_space] = {
                 qid: np.array([w2i[w] for w in corpus.loc[qid].split()])
                 for qid in all_qids
             }
         print("finish generating inter files.")
         print("begin caching..")
         all_cached = {
             "item2idx": DataGenerator.item2idx,
             "idx2item": DataGenerator.idx2item,
             "item_embed": DataGenerator.item_embed,
             "q2idvec": DataGenerator.q2idvec,
         }
         try:
             os.makedirs("./temp")
         except OSError:
             # Best effort, typically "directory already exists".  Presumably
             # self._temp_file lives under ./temp (TODO confirm), in which case
             # a real failure shows up when the file is opened below.
             pass
         with open(self._temp_file, "wb") as cache_file:
             pickle.dump(all_cached, cache_file)
         print("finish caching")