def matrix_term_document(args):
    def calculate_tf_weights(lst_contents, words):
        rows = len(words)
        columns = len(lst_contents)
        TF_matrix = np.zeros((rows, columns), dtype=np.float32)
        for i, word in enumerate(words):
            for j, content in enumerate(lst_contents):
                # term frequency: word count normalized by document length
                TF_matrix[i, j] = content.count(word) / len(content)
        return TF_matrix

    def calculate_idf_weights(TF):
        # inverse document frequency: 1 + log(N / df), where df is the number
        # of documents that contain the word
        IDF = 1 + np.log(TF.shape[1] / np.sum(TF != 0, axis=1))
        return np.array([IDF]).T

    # Step 1: Load data from directory
    lst_contents, file_paths = load_data_from_directory(args['data_path'])
    # Step 2: Build dictionary
    vocal = build_dictionary(lst_contents)
    # Step 3: Calculate the TF weights for each document
    TF_matrix = calculate_tf_weights(lst_contents, vocal)
    # Step 4: Calculate the IDF weights
    IDF = calculate_idf_weights(TF_matrix)
    # Step 5: Calculate the TF-IDF
    TF_IDF = TF_matrix * IDF
    # Step 6: Build the query vector
    qwords = args['query'].split()
    qTF = calculate_tf_weights([qwords], vocal)
    qTF_IDF = qTF * IDF
    # Step 7: Calculate the distance between qTF_IDF and each document column of TF_IDF
    dists = np.linalg.norm(TF_IDF - qTF_IDF, axis=0)
    # Step 8: Rank and display the results
    ranked_result = np.argsort(dists)
    print("Ranking results matching query {}".format(args['query']))
    N = int(args['numbers'])
    for index in ranked_result[0:N]:
        print(file_paths[index])
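# ---------------------------------------------------------------------------
# A minimal standalone sketch (not part of the original script) that works the
# same TF/IDF formulas as matrix_term_document on a hypothetical two-document
# toy corpus: tf(w, d) = count(w, d) / len(d) and idf(w) = 1 + log(N / df(w)).
# The toy documents, vocabulary, and variable names below are illustrative
# assumptions, not data from the project.
import numpy as np

docs = [["good", "tea", "good"], ["bad", "tea"]]  # tokenized toy documents
vocab = ["good", "tea", "bad"]

tf = np.array([[doc.count(w) / len(doc) for doc in docs] for w in vocab])
# tf == [[2/3, 0], [1/3, 1/2], [0, 1/2]]
idf = 1 + np.log(len(docs) / np.sum(tf != 0, axis=1))  # df("good")=1, df("tea")=2, df("bad")=1
tf_idf = tf * idf[:, None]                             # broadcast IDF down each document column
print(tf_idf)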
import senteval
from utils import build_dictionary
import torch
from ADNet import ADNet
import numpy as np

PATH_TO_DATA = "../../SentEVal-master/data"
file = "amazon_food_review/train.csv"
MODEL_PATH = "Modals/ADnets.dms"

TOKEN2ID = build_dictionary(file)
MODEL = ADNet(input_size=256, hidden_size=256, sentiment_size=256, max_len=35,
              vocab_size=len(TOKEN2ID) + 9, output_size=1)
MODEL.load_state_dict(torch.load(MODEL_PATH, map_location="cpu"))


def prepare(params, samples):
    # SentEval hook: expose the token-to-id mapping through the evaluation params
    params.word2id = TOKEN2ID


def batcher(params, batch):
    # SentEval hook: encode a batch of tokenized sentences
    batch_size = len(batch)
    batch = [sent if sent != [] else ['<unk>'] for sent in batch]
    # clip sentence lengths to the model's max_len of 35
    lengths = torch.LongTensor(
        [len(sent) if len(sent) <= 35 else 35 for sent in batch])
    padded_sentences = np.zeros((batch_size, 35))
    padded_t = np.zeros((batch_size, 35))
def reconstruction(self, input_sentence, max_len):
    sentence = clean_sentence(input_sentence)
    # encode the cleaned sentence into token ids
    sentence = convert_sentence2id(sentence, self.token2id, max_len)
    target = np.zeros((1, max_len))
    l = len(word_tokenize(input_sentence))
    target[0, :l] = sentence[0, :l]
    target = torch.LongTensor(target)
    # target = convert_sentence2id(input_sentence[:-1], self.token2id, max_len)
    # print(" ".join([self.id2token[id.item()] for id in target[0]]))
    print(sentence)
    print(target)
    output = self.model(inputs=sentence, targets=target,
                        lengths=torch.LongTensor([len(sentence)]))
    prediction = output["predictions"][0]
    return " ".join(self.id2token[id.item()] for id in prediction)


if __name__ == "__main__":
    # model_path = "Test_Models/checkpoint.dms"
    model_path = "Test_Models/ElmoSentenceEmbdedding_model-3.dms"
    training_data = "amazon_food_review/small_train.csv"
    token2id = build_dictionary(training_data)
    # model = ADNet(input_size=512, hidden_size=512, sentiment_size=512, max_len=35,
    #               vocab_size=len(token2id) + 8, output_size=1)
    model = ElmoSentenceEmbeddingNets(input_size=512, hidden_size=512, max_len=35,
                                      vocab_size=len(token2id) + 2, sentiment_size=512)
    re = ReconstructionSent(model=model, model_path=model_path, token2id=token2id)
    print(re.reconstruction("This is good iced tea. It is hard to find locally in the Fall and Winter.", 35))
    # print(re.reconstruction("My children love these rice milk boxes and they are just the right size for their lunches.", 35))
    # print(re.reconstruction("Some may say this buffet is pricey but I think you get what you pay for and this place you are getting quite a lot!", 35))
import utils
import help_content
import config as cfg

# query = "how do I schedule an event?"
query = "What is the purpose of 25Live Event Wizard?"

# ### Content ###
DATABASE = utils.get_file_path(cfg.DATABASE_FILE)
content = help_content.HelpContent(DATABASE)
# print(help(corpora.dictionary))

should_rebuild = False

# ### Dictionary ###
dict_file = utils.get_file_path(cfg.DICT_BACKUP)
# dictionary = corpora.dictionary.Dictionary.load(dict_file)
dictionary = utils.build_dictionary(content, should_rebuild, cfg.DICT_BACKUP)

# ### Corpus ###
corpus_file = utils.get_file_path(cfg.CORPUS_BACKUP)
# utils.pickle_save(corpus_file, corpus)
# corpus = corpora.MmCorpus(corpus_file)
corpus = utils.build_corpus(dictionary, content, should_rebuild, cfg.CORPUS_BACKUP)
# corpus = pickle.load(open(corpus_file, "rb"))
# print(cfg.MODEL_NAME)

# ### LDA Model ###
# convert the cleaned query into a bag-of-words vector over the dictionary
bow = dictionary.doc2bow(utils.get_cleaned_text(query.lower()).split())
# bag_of_words = [word for word in bow]
model = utils.build_model(dictionary, corpus, should_rebuild)
sentiment.append(y.item())
data = {
    "sentiment_hidden": sentiment_hidden,
    "other_hidden": other_hidden,
    "sentiment": sentiment
}
with open("Sentiment_other.pk", "wb") as f:
    pickle.dump(data, f)
print("Data has been saved successfully!")


if __name__ == "__main__":
    training_data = "amazon_food_review/small_train.csv"
    token2id = build_dictionary(training_data)
    # print(len(token2id))
    id2token = dict(zip(token2id.values(), token2id.keys()))
    adnet = ADNet(input_size=512, hidden_size=512, sentiment_size=512, max_len=35,
                  vocab_size=len(token2id) + 2, output_size=1)
    test(adnet, "./Test_Models/ADnetsS.dms", token2id, 1, "amazon_food_review/valid_data.csv")
    # inputs = convert_sentence2id("It'll be a regular stop on my trips to Phoenix!", token2id, 35)
    # target = convert_sentence2id("It'll be a regular stop on my trips to Phoenix!", token2id, 35)
    # o = test_model(adnet, "./Modals/ADnets.dms", inputs, target, torch.LongTensor([len(inputs)]))
    # print("sentiment_hidden", o["sentiment_hidden"].detach().numpy())
print("*" * 40) running_loss = 0 for i, data in enumerate(dataloader): x, x_len, y, t = data predict = model(x, x_len) loss = criterion(predict.squeeze(1), y) model_optimizer.zero_grad() loss.backward() model_optimizer.step() running_loss += loss.item() if i % 10 == 0 and i != 0: print("Average batch loss: {}".format(running_loss / 10)) running_loss = 0 torch.save(model.state_dict(), "./basic_model") print("Model has been saved successfully !!") if __name__ == "__main__": training_data = "sentiment_data/training_data_shuffle.csv" dictionary = build_dictionary(training_data) model = SentimentModel(input_size=64, hidden_size=64, max_len=85, vocab_size=len(dictionary), output_size=1) train(dictionary, training_data, 0.01, 32, 20, model)
def inverted_index(args):
    def calculate_tf_weights(lst_contents, words):
        rows = len(words)
        columns = len(lst_contents)
        TF_word_dict = dict()
        for word in words:
            docs = list()
            for j, content in enumerate(lst_contents):
                count = content.count(word)
                if count != 0:
                    # store only the documents that contain the word: (doc index, normalized tf)
                    docs.append((j, count / len(content)))
            TF_word_dict[word] = docs
        return TF_word_dict

    def calculate_idf_weights(TF_word_dict, file_paths):
        number_of_docs = len(file_paths)
        IDF = np.zeros(len(TF_word_dict))
        # IDF = 1 + np.log(len(file_paths) / np.sum(TF_word_dict != 0, axis=1))
        i = 0
        for word, docs in TF_word_dict.items():
            idf = 1 + np.log(number_of_docs / len(docs))
            IDF[i] = idf
            i += 1
        return np.array([IDF]).T

    def calculate_TF_IDF(TF_word_dict, IDF, file_paths):
        TF_matrix = np.zeros((len(TF_word_dict), len(file_paths)), dtype=np.float32)
        for i, word in enumerate(TF_word_dict):
            # expand each posting list back into the dense row for this word
            for doc_index, tf in TF_word_dict[word]:
                TF_matrix[i, doc_index] = tf
        return TF_matrix * IDF

    # Step 1: Load data from directory
    lst_contents, file_paths = load_data_from_directory(args['data_path'])
    # Step 2: Build dictionary
    vocal = build_dictionary(lst_contents)
    # Step 3: Calculate the TF weights for each document
    TF_word_dict = calculate_tf_weights(lst_contents, vocal)
    # Step 4: Calculate the IDF weights
    IDF = calculate_idf_weights(TF_word_dict, file_paths)
    # Step 5: Calculate the TF-IDF
    TF_IDF = calculate_TF_IDF(TF_word_dict, IDF, file_paths)
    # Step 6: Build the query vector
    qwords = args['query'].split()
    qTF = calculate_tf_weights([qwords], vocal)
    # keep only the first column (the query corpus has a single document)
    # so it broadcasts against every document column of TF_IDF
    qTF_IDF = calculate_TF_IDF(qTF, IDF, file_paths)[:, :1]
    # Step 7: Calculate the distance between qTF_IDF and each document column of TF_IDF
    dists = np.linalg.norm(TF_IDF - qTF_IDF, axis=0)
    # Step 8: Rank and display the results
    ranked_result = np.argsort(dists)
    print("Ranking results matching query {}".format(args['query']))
    N = int(args['numbers'])
    for index in ranked_result[0:N]:
        print(file_paths[index])
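# ---------------------------------------------------------------------------
# A minimal standalone sketch (not part of the original script) of the
# inverted-index layout built above: each vocabulary word maps to a posting
# list of (doc_id, tf) pairs for the documents that actually contain it, so
# zero counts are never stored. The toy documents and names below are
# illustrative assumptions.
import numpy as np

docs = [["good", "tea", "good"], ["bad", "tea"]]
vocab = ["good", "tea", "bad"]

inverted = {
    w: [(j, doc.count(w) / len(doc)) for j, doc in enumerate(docs) if w in doc]
    for w in vocab
}
# inverted == {"good": [(0, 2/3)], "tea": [(0, 1/3), (1, 1/2)], "bad": [(1, 1/2)]}

# Expanding the posting lists back into a dense matrix and weighting by IDF
# reproduces the same TF-IDF matrix as the term-document version.
idf = np.array([1 + np.log(len(docs) / len(postings)) for postings in inverted.values()])
tf_idf = np.zeros((len(vocab), len(docs)))
for i, postings in enumerate(inverted.values()):
    for doc_id, tf in postings:
        tf_idf[i, doc_id] = tf
tf_idf *= idf[:, None]
print(tf_idf)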
def main():
    # import data
    train_raw = pd.read_csv('/home/bsong/Python_Stuff/Data/Kaggle_Mercari/train.tsv', delimiter='\t')

    # log-transform and standardize the target price
    normalized_price = np.log1p(train_raw['price'].values)
    mean_price_norm = np.mean(normalized_price)
    std_price_norm = np.std(normalized_price)
    train_raw['price'] = (normalized_price - mean_price_norm) / std_price_norm

    # split the categories into three new columns
    train_raw['cat1'], train_raw['cat2'], train_raw['cat3'] = zip(
        *train_raw['category_name'].apply(lambda x: utils.split_cat(x)))
    # remove the column that isn't needed anymore
    train_raw.drop('category_name', axis=1, inplace=True)

    # replaces NaN with a string placeholder 'missing'
    # note: this is mildly hardcoded, so it has to come after splitting categories into three
    handle_missing_inplace(train_raw)

    # make one dictionary for both name and item_description
    # (similar words appear in both, so the word lists are combined)
    all_name_desc = np.hstack((train_raw['name'], train_raw['item_description']))
    # collect all words into a single flat list
    all_name_desc = utils.clean_and_tokenize(all_name_desc)
    all_name_desc = [item for sublist in all_name_desc for item in sublist]
    train_raw['name'] = utils.clean_and_tokenize(train_raw['name'])
    train_raw['item_description'] = utils.clean_and_tokenize(train_raw['item_description'])

    # Build dictionaries here
    vocabulary_size = 100000  # keep 100000 words; 0.28% of total words fall into "UNK", so 99.72% of "common" words are kept
    word2vec_dict, reverse_dict = utils.build_dictionary(all_name_desc, vocabulary_size)
    dict_brand_len = 3000  # 0.16% of the words fall into "UNK"
    dict_cat1_len = 12     # there are apparently fewer than 12 categories in cat1
    dict_cat2_len = 100    # 0.114% of the words fall into "UNK"
    dict_cat3_len = 700    # 0.04% of the words fall into "UNK"
    brand_name_dict, brand_name_dict_rev = utils.build_dictionary(train_raw['brand_name'], dict_brand_len)
    train_raw['brand_name_inds'], count_unk_brand = utils.convert_word_to_ind(
        train_raw['brand_name'].values.reshape((-1, 1)), brand_name_dict)
    cat1_dict, cat1_rev_dict = utils.build_dictionary(train_raw['cat1'], dict_cat1_len)
    train_raw['cat1_inds'], count_unk_cat1 = utils.convert_word_to_ind(
        train_raw['cat1'].values.reshape((-1, 1)), cat1_dict)
    cat2_dict, cat2_rev_dict = utils.build_dictionary(train_raw['cat2'], dict_cat2_len)
    train_raw['cat2_inds'], count_unk_cat2 = utils.convert_word_to_ind(
        train_raw['cat2'].values.reshape((-1, 1)), cat2_dict)
    cat3_dict, cat3_rev_dict = utils.build_dictionary(train_raw['cat3'], dict_cat3_len)
    train_raw['cat3_inds'], count_unk_cat3 = utils.convert_word_to_ind(
        train_raw['cat3'].values.reshape((-1, 1)), cat3_dict)

    # make some padded vectors and keep them as np.array instead of storing them back in the DataFrame
    name_pad_size = 9       # max length of name
    itemdesc_pad_size = 75  # 95th percentile of item description length
    name_padded, _ = utils.convert_word_to_padded(train_raw.name, word2vec_dict, name_pad_size)  # without the _, a tuple is returned
    itemdesc_padded, _ = utils.convert_word_to_padded(train_raw.item_description, word2vec_dict, itemdesc_pad_size)

    # Define some embedding lengths here
    name_emb_size = 15
    itemdesc_emb_size = 15
    brand_emb_size = 10
    cat1_emb_size = 10
    cat2_emb_size = 10
    cat3_emb_size = 10
    itemcond_emb_size = 10
    shipping_emb_size = 10

    # lengths needed here and a bit later
    itemcond_len = np.max(train_raw.item_condition_id.values)

    name_itemdesc_emb = embed([i for i in range(vocabulary_size)], vocabulary_size, name_emb_size, name='name_itemdesc_emb')
    brand_emb = embed(train_raw.brand_name_inds, dict_brand_len, brand_emb_size, name='brand_emb')
    cat1_emb = embed(train_raw.cat1_inds, dict_cat1_len, cat1_emb_size, name='cat1_emb')
    cat2_emb = embed(train_raw.cat2_inds, dict_cat2_len, cat2_emb_size, name='cat2_emb')
    cat3_emb = embed(train_raw.cat3_inds, dict_cat3_len, cat3_emb_size, name='cat3_emb')
    itemcond_emb = embed(train_raw.item_condition_id, itemcond_len, itemcond_emb_size, name='itemcond_emb')
    shipping_emb = embed(train_raw.shipping, 2, shipping_emb_size, name='shipping_emb')

    # Set up feeding here: state which variables will be used and reshape them
    # (not strictly necessary in hindsight, but the time cost is minimal)
    input_name = name_padded
    input_itemdesc = itemdesc_padded
    input_price = train_raw['price'].values.reshape((-1, 1))
    input_brand = train_raw.brand_name_inds.values.reshape((-1, 1))
    input_cat1 = train_raw.cat1_inds.values.reshape((-1, 1))
    input_cat2 = train_raw.cat2_inds.values.reshape((-1, 1))
    input_cat3 = train_raw.cat3_inds.values.reshape((-1, 1))
    input_itemcond = train_raw.item_condition_id.values.reshape((-1, 1))
    input_ship = train_raw.shipping.values.reshape((-1, 1))

    # define some lengths for partitioning data after feeding
    input_name_len = input_name.shape[1]
    input_itemdesc_len = input_itemdesc.shape[1]

    # concatenate data to make into tensor slices
    temp_set = np.concatenate((input_name, input_itemdesc, input_cat1, input_cat2, input_cat3,
                               input_brand, input_itemcond, input_ship), axis=1)  # name_and_desc, input_itemcond, input_shipping
    shape_set = temp_set.shape[1]

    batch_len = 10000
    num_epoch = 25
    tot_iter = train_raw.shape[0] * num_epoch // batch_len + 1

    print('splitting labels and features...')
    features_input = temp_set.astype(np.int32)
    label_input = input_price.astype(np.float32)

    # make some placeholders to avoid GraphDef exceeding 2GB
    feat_placeholder = tf.placeholder(features_input.dtype, features_input.shape)
    label_placeholder = tf.placeholder(label_input.dtype, label_input.shape)

    print('making tensor slices...')
    dataset = tf.data.Dataset.from_tensor_slices((feat_placeholder, label_placeholder))
    print('shuffling...')
    # np.random.shuffle(temp_set)  # shuffle the data
    dataset = dataset.shuffle(buffer_size=10000)
    print('making epochs...')
    dataset = dataset.repeat(num_epoch)  # epoch
    print('making batches...')
    dataset = dataset.batch(batch_len)
    iterator = dataset.make_initializable_iterator()
    next_batch = iterator.get_next()

    # TensorFlow model setup
    input_x = tf.placeholder(tf.int32, [None, shape_set], name="input_x")  # pad_length = 25 or something defined earlier
    input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")  # train against this

    # slice the concatenated feature tensor back into its per-field columns
    input_x_name = input_x[:, :input_name_len]
    input_x_itemdesc = input_x[:, input_name_len:(input_name_len + input_itemdesc_len)]
    input_x_cat1 = input_x[:, (input_name_len + input_itemdesc_len)]
    input_x_cat2 = input_x[:, (input_name_len + input_itemdesc_len) + 1]
    input_x_cat3 = input_x[:, (input_name_len + input_itemdesc_len) + 2]
    input_x_brand = input_x[:, (input_name_len + input_itemdesc_len) + 3]
    input_x_itemcond = input_x[:, (input_name_len + input_itemdesc_len) + 4]
    input_x_shipping = input_x[:, (input_name_len + input_itemdesc_len) + 5]

    name_emb_lookup = tf.nn.embedding_lookup(name_itemdesc_emb, input_x_name)
    itemdesc_emb_lookup = tf.nn.embedding_lookup(name_itemdesc_emb, input_x_itemdesc)
    brand_emb_lookup = tf.nn.embedding_lookup(brand_emb, input_x_brand)
    cat1_emb_lookup = tf.nn.embedding_lookup(cat1_emb, input_x_cat1)
    cat2_emb_lookup = tf.nn.embedding_lookup(cat2_emb, input_x_cat2)
    cat3_emb_lookup = tf.nn.embedding_lookup(cat3_emb, input_x_cat3)
    itemcond_emb_lookup = tf.nn.embedding_lookup(itemcond_emb, input_x_itemcond)
    shipping_emb_lookup = tf.nn.embedding_lookup(shipping_emb, input_x_shipping)

    # expand name and item_desc because conv2d wants 4-D input
    name_emb_lookup_expand = tf.expand_dims(name_emb_lookup, -1)
    itemdesc_emb_lookup_expand = tf.expand_dims(itemdesc_emb_lookup, -1)

    # set some lazy parameters here
    out_nodes = 15
    dropout_keep_prob = tf.placeholder(tf.float32)
    W_shape_name = [1, name_emb_size, 1, out_nodes]  # figure this out if it works
    b_shape_name = out_nodes  # same as last dimension in W
    W_shape_itemdesc = [1, itemdesc_emb_size, 1, out_nodes]
    b_shape_itemdesc = out_nodes

    # layers_namedesc = test_cnn(input_x_namedesc, W_shape_namedesc, b_shape_namedesc, dropout_keep_prob)
    layers_name = CNN(name_emb_lookup_expand, W_shape_name, b_shape_name, dropout_keep_prob, name_pad_size)
    layers_itemdesc = CNN(itemdesc_emb_lookup_expand, W_shape_itemdesc, b_shape_itemdesc, dropout_keep_prob, itemdesc_pad_size)
    layers_brand = RegNN(brand_emb_lookup, dropout_keep_prob, dict_brand_len, brand_emb_size, batch_len, out_nodes)
    layers_cat1 = RegNN(cat1_emb_lookup, dropout_keep_prob, dict_cat1_len, cat1_emb_size, batch_len, out_nodes)
    layers_cat2 = RegNN(cat2_emb_lookup, dropout_keep_prob, dict_cat2_len, cat2_emb_size, batch_len, out_nodes)
    layers_cat3 = RegNN(cat3_emb_lookup, dropout_keep_prob, dict_cat3_len, cat3_emb_size, batch_len, out_nodes)
    layers_itemcond = RegNN(itemcond_emb_lookup, dropout_keep_prob, itemcond_len, itemcond_emb_size, batch_len, out_nodes)
    layers_shipping = RegNN(shipping_emb_lookup, dropout_keep_prob, 2, shipping_emb_size, batch_len, out_nodes)

    comb_layers = tf.concat([layers_name, layers_itemdesc, layers_brand, layers_cat1, layers_cat2,
                             layers_cat3, layers_itemcond, layers_shipping], axis=1)  # , input_x_name, input_x_shipping

    # dense
    dense1 = dense_NN(comb_layers, 64, batch_len)
    dense2 = dense_NN(dense1, 128, batch_len)
    predictions = dense_NN(dense2, 1, batch_len)

    loss = 2
    loss, train_step = train_the_NN(predictions, input_y, loss)

    # as is, normalized predictions cause NaN in the RMSLE calculation; adding .00001 just in case
    unwind_true = tf.log(tf.expm1((input_y * std_price_norm) + mean_price_norm) + .00001)
    unwind_pred = tf.log(tf.expm1((predictions * std_price_norm) + mean_price_norm) + .00001)
    rmsle_ = tf.sqrt(tf.reduce_mean(tf.square(unwind_true - unwind_pred)))

    # Training model starts here
    with tf.Session() as sess:
    arguments['up_sample_input_dim'] = arguments['hidden_size'] + arguments['noise_dim']
    arguments['num_channels'] = 3
    # Downsampling parameters
    arguments['image_feature_size'] = 512
    # Text decoder parameters
    arguments['hidden_size'] = 512
    arguments['num_layers'] = 1
    arguments["sentence_embedding_size"] = arguments['hidden_size']
    # Split into training and validation sets
    split_train_validation_set(arguments['sentence_path'],
                               arguments['train_sentence_path'],
                               arguments['val_sentence_path'], 200)
    # Build the dictionary
    sentences = read_sentences(arguments['sentence_path'])
    word2idx, idx2word, lengths = build_dictionary(sentences)
    arguments['word2idx'] = word2idx
    arguments['idx2word'] = idx2word
    arguments["lengths"] = lengths
    arguments['word_number'] = len(word2idx)
    arguments['max_seq_length'] = len(word2idx)
    arguments['sentence_max_length'] = np.max(lengths) + 1
    arguments['use_sentence_generator'] = False

    trainer = Trainer(arguments)
    trainer.train()