def write_performance(self, model_type, metrics, train_time):
    def get_header_string():
        # 'regu' and 'drop' columns added so the header lines up with the
        # content row built below (the original header had two fewer columns)
        header = ['acc', 'mr_f', 'mrr_f', 'mr_p', 'mrr_p', 'pre', 'rec', 'f1',
                  'S', 'D', 'B', 'E', 'l_rate', 'regu', 'drop',
                  'data_set', 't_sec', 'model_type']
        return format_list_to_string(header, '\t')

    def get_perf_string(metrics, train_time):
        content = []
        content.extend(metrics[:-1])  # the last element is the per-instance error indicator
        content.extend([self.n_sense, self.dim, self.batch_size, self.n_epoch,
                        self.learning_rate, self.regu_strength, self.dropout,
                        self.dataset, train_time, model_type])
        return format_list_to_string(content, '\t')

    # quantitative analysis
    header_string = get_header_string()
    perf_string = get_perf_string(metrics, train_time)
    print header_string + '\n' + perf_string
    file_header = read_first_line(self.perf_file)
    with open(self.perf_file, 'a') as fout:
        if file_header != header_string:
            fout.write(header_string + '\n')
        fout.write(perf_string + '\n')
    # error analysis
    error_indicator = metrics[-1]
    error_indicator_file = self.instance_analysis_path + str(model_type) + '.txt'
    ensure_directory_exist(error_indicator_file)
    with open(error_indicator_file, 'w') as fout:
        for element in error_indicator:
            fout.write(str(element) + '\n')
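# The helpers used throughout this file (format_list_to_string, read_first_line,
# ensure_directory_exist) are imported from elsewhere in the repo. A minimal
# sketch of what they are assumed to do, for reference -- not the repo's actual
# implementations:
import os

def format_list_to_string(l, sep):
    # join heterogeneous elements (ints, floats, strings, nested lists) with sep
    return sep.join(format_list_to_string(e, sep) if isinstance(e, list) else str(e)
                    for e in l)

def read_first_line(file_name):
    # return the first line without the trailing newline, or None if the file
    # does not exist yet (so the header comparison above fails and the header
    # gets written on first use)
    if not os.path.isfile(file_name):
        return None
    with open(file_name, 'r') as fin:
        return fin.readline().rstrip('\n')

def ensure_directory_exist(file_name):
    # create the parent directory of file_name if it is missing
    directory = os.path.dirname(file_name)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)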
def write_to_file(instances, x_vocab, y_vocab, output_file):
    ensure_directory_exist(output_file)
    with open(output_file, 'w') as fout:
        for instance in instances:
            instance_string = format_instance_to_string(instance, x_vocab, y_vocab)
            fout.write(instance_string + '\n')
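# format_instance_to_string is defined elsewhere; a minimal sketch, assuming an
# instance is an (x_ids, y_id) pair and both vocabs expose get_description()
# (the pair layout is an assumption, not taken from the repo):
def format_instance_to_string(instance, x_vocab, y_vocab):
    x_ids, y_id = instance
    x_words = [x_vocab.get_description(i) for i in x_ids]
    y_word = y_vocab.get_description(y_id)
    return format_list_to_string(x_words, ' ') + '\t' + y_word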
def write_one_case(self, idx, neighbor_ids, scores):
    neighbor_info = [idx]
    description = self.vocab.get_description(idx)
    neighbor_info.append(description)
    for (neighbor_idx, score) in zip(neighbor_ids, scores):
        description = self.vocab.get_description(neighbor_idx)
        neighbor_info.append([description, score])
    neighbor_info = format_list_to_string(neighbor_info, ' ')
    # print neighbor_info
    ensure_directory_exist(self.case_output_file)
    with open(self.case_output_file, 'a') as fp:
        fp.write(neighbor_info + '\n')
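# vocab.get_description(idx) maps an embedding row index back to a
# human-readable token. A minimal sketch of such a vocab, assuming ids follow
# file order (the storage scheme here is an assumption):
class Vocab(object):
    def __init__(self, vocab_file):
        with open(vocab_file, 'r') as fin:
            self.descriptions = [line.rstrip('\n') for line in fin]

    def get_description(self, idx):
        return self.descriptions[idx]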
def write_performance(pd, model_type, metrics, train_time):
    header = ['time', 'acc', 'mr_f', 'mrr_f', 'mr_p', 'mrr_p',
              'n_sense', 'dim', 'n_epoch', 'data_dir', 'model_type']
    content = [train_time, metrics[0], metrics[1], metrics[2], metrics[3], metrics[4],
               pd['n_sense'], pd['embedding_dim'], pd['n_epoch'], pd['data_dir'], model_type]
    header_string = format_list_to_string(header, '\t')
    content_string = format_list_to_string(content, '\t')
    print header_string + '\n' + content_string
    # write to file
    perf_file = pd['performance_file']
    ensure_directory_exist(perf_file)
    file_header = read_first_line(perf_file)
    with open(perf_file, 'a') as fout:
        if file_header != header_string:
            fout.write(header_string + '\n')
        fout.write(content_string + '\n')
def __init__(self, opt):
    self.opt = opt
    self.n_sense = opt['n_sense']
    self.dim = opt['embedding_dim']
    self.n_epoch = opt['n_epoch']
    # data_dir is expected to end with '/', e.g. './data/nyt/', so the dataset
    # name is the last directory component
    self.dataset = opt['data_dir'].split('/')[-2]
    self.batch_size = opt['batch_size']
    self.learning_rate = opt['learning_rate']
    self.regu_strength = opt['regu_strength']
    self.dropout = opt['dropout']
    self.perf_file = opt['data_dir'] + 'output/performance.txt'
    self.instance_analysis_path = opt['data_dir'] + 'instance/'
    ensure_directory_exist(self.perf_file)
    self.test_data_file = opt['data_dir'] + 'input/test.txt'
    self.x_vocab_file = opt['data_dir'] + 'input/words.txt'
    self.criterion = nn.NLLLoss()
def find_duplicate_sense_words(self):
    if self.n_sense != 2:
        print 'Similar sense detection is only supported for n_sense = 2.'
        return
    print 'Start detecting duplicate senses for %d words.' % self.n_words
    words = []
    for i in xrange(self.n_words):
        # the two sense vectors of word i sit at rows 2i+1 and 2i+2
        # (row 0 is skipped)
        idx = i * self.n_sense + 1
        vec_a = self.emb_matrix[idx]
        vec_b = self.emb_matrix[idx + 1]
        if self.is_similar(vec_a, vec_b, mode='norm'):
            word = self.vocab.get_description(idx)
            words.append(word)
    ensure_directory_exist(self.dup_sense_file)
    with open(self.dup_sense_file, 'w') as fout:
        for w in words:
            fout.write(w + '\n')
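# is_similar is defined elsewhere in the class; a minimal sketch, assuming
# mode='norm' means "the L2 norm of the difference is below a threshold"
# (both the mode semantics and the threshold value are assumptions):
import numpy as np

def is_similar(self, vec_a, vec_b, mode='norm', threshold=0.1):
    if mode == 'norm':
        return np.linalg.norm(vec_a - vec_b) < threshold
    elif mode == 'cos':
        cos = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
        return cos > 1.0 - threshold
    return False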
def write_cluster_members(self, clus, cluster_file, parent_dir):
    n_cluster = clus.n_cluster
    clusters = clus.clusters  # a dict: cluster id -> keywords
    with open(cluster_file, 'w') as fout:
        for clus_id in range(n_cluster):
            members = clusters[clus_id]
            for keyword_id in members:
                keyword = self.keywords[keyword_id]
                fout.write(str(clus_id) + '\t' + keyword + '\n')
    # write the cluster members for each sub-folder
    clus_centers = clus.center_ids
    for clus_id, center_keyword_id in clus_centers:
        center_keyword = self.keywords[center_keyword_id]
        output_file = parent_dir + center_keyword + '/seed_keywords.txt'
        ensure_directory_exist(output_file)
        members = clusters[clus_id]
        with open(output_file, 'w') as fout:
            for keyword_id in members:
                keyword = self.keywords[keyword_id]
                fout.write(keyword + '\n')
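# write_cluster_members and write_document_membership below both consume a
# clustering-result object. A minimal sketch of that interface, inferred from
# the attribute accesses above (the layout is an assumption, not the repo's
# clustering code):
class ClusteringResult(object):
    def __init__(self, n_cluster, clusters, membership, center_ids):
        self.n_cluster = n_cluster    # number of clusters
        self.clusters = clusters      # dict: cluster id -> list of keyword ids
        self.membership = membership  # membership[keyword_id] = cluster id
        self.center_ids = center_ids  # list of (cluster id, center keyword id) pairs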
def write_document_membership(self, clus, output_file, parent_dir):
    n_cluster = clus.n_cluster
    keyword_membership = clus.membership  # an array containing the membership of the keywords
    cluster_document_map = defaultdict(list)  # key: cluster id, value: document list
    with open(output_file, 'w') as fout:
        for idx, doc in zip(self.original_doc_ids, self.documents):
            doc_membership = self.get_doc_membership(n_cluster, doc, keyword_membership)
            cluster_id = self.assign_document(doc_membership)
            cluster_document_map[cluster_id].append(idx)
            fout.write(str(idx) + '\t' + str(cluster_id) + '\n')
    # write the document ids for each sub-folder
    clus_centers = clus.center_ids
    for clus_id, center_keyword_id in clus_centers:
        center_keyword = self.keywords[center_keyword_id]
        output_file = parent_dir + center_keyword + '/doc_ids.txt'
        ensure_directory_exist(output_file)
        doc_ids = cluster_document_map[clus_id]
        with open(output_file, 'w') as fout:
            for doc_id in doc_ids:
                fout.write(str(doc_id) + '\n')
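# get_doc_membership and assign_document are defined elsewhere in the class.
# A minimal sketch of the intended logic -- count how many of the document's
# keywords fall into each cluster, then assign the document to the majority
# cluster (the voting scheme is an assumption):
def get_doc_membership(self, n_cluster, doc, keyword_membership):
    doc_membership = [0] * n_cluster
    for keyword_id in doc:  # assumes a doc is represented as a list of keyword ids
        doc_membership[keyword_membership[keyword_id]] += 1
    return doc_membership

def assign_document(self, doc_membership):
    # index of the cluster with the largest vote count
    return doc_membership.index(max(doc_membership))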
def train_neg(train_data, model, criterion, optimizer, model_type, pd):
    forward_time, backward_time = 0, 0
    n_epoch = pd['n_epoch']
    train_log_file = pd['train_log_file']
    ensure_directory_exist(train_log_file)
    with open(train_log_file, 'a') as fout:
        # train
        for epoch in xrange(n_epoch):
            running_loss = 0.0
            for i in xrange(len(train_data)):
                # get the input
                inputs, labels = train_data[i]
                inputs = Variable(torch.LongTensor(inputs))
                # append 5 noise labels after the true label for negative sampling
                noise_labels = train_data.sample_negatives(5, labels[0])
                labels.extend(noise_labels)
                labels = Variable(torch.LongTensor(labels))
                f_start_time = time.time()
                output = model(inputs, labels)
                f_end_time = time.time()
                forward_time += f_end_time - f_start_time
                loss = criterion(output)
                # zero the parameter gradients
                optimizer.zero_grad()
                # backward + optimize
                b_start_time = time.time()
                loss.backward()
                optimizer.step()
                b_end_time = time.time()
                backward_time += (b_end_time - b_start_time)
                # print statistics
                running_loss += loss.data[0]
                if (i + 1) % 2000 == 0:
                    print('%20s [%d, %5d] training loss: %.3f' %
                          (model_type, epoch + 1, i + 1, running_loss / 2000))
                    # divide by 2000 here as well, so the log matches the console
                    fout.write('%20s [%d, %5d] training loss: %.3f\n' %
                               (model_type, epoch + 1, i + 1, running_loss / 2000))
                    running_loss = 0.0
    print 'forward time:', forward_time
    print 'backward time:', backward_time
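# train_data.sample_negatives(k, positive) draws k noise labels for negative
# sampling. A minimal sketch, assuming uniform sampling over the label
# vocabulary while skipping the positive label (word2vec-style code would
# typically sample from a unigram^0.75 table instead; n_labels is assumed):
import random

def sample_negatives(self, k, positive_label):
    negatives = []
    while len(negatives) < k:
        candidate = random.randint(0, self.n_labels - 1)
        if candidate != positive_label:
            negatives.append(candidate)
    return negatives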
def save_model(self, model, model_type):
    model_name = self.get_model_name(model_type)
    file_name = self.model_path + model_name
    ensure_directory_exist(file_name)
    torch.save(model.state_dict(), file_name)
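# get_model_name is defined elsewhere in the class; a minimal sketch, assuming
# the file name simply encodes the model type plus the key hyperparameters
# (the exact naming scheme is an assumption):
def get_model_name(self, model_type):
    return '%s_s%d_d%d_e%d.pt' % (model_type, self.n_sense, self.dim, self.n_epoch)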