def train_svm(train_processed_path):
    # Load abbr index
    abbr_idx_mapper = pickle_reader(train_processed_path + '/abbr_idx_mapper.pkl')
    abbr_cui2idx_inventory = {}
    os.makedirs(train_processed_path + '/svm_models/', exist_ok=True)
    # Generate training data & train one SVM per abbreviation
    for abbr, abbr_idx in tqdm.tqdm(abbr_idx_mapper['abbr2idx'].items()):
        content_vector = pickle_reader(train_processed_path + '/content_vectors/%d_vector.pkl' % abbr_idx)
        label2idx = {}
        label_idx = 0
        x = []
        y = []
        for global_instance_idx, doc_id, pos, content_pos, content_vec, content, label in content_vector:
            if label not in label2idx:
                label2idx[label] = label_idx
                label_idx += 1
            x.append(content_vec)
            y.append(label2idx[label])
        abbr_cui2idx_inventory[abbr] = label2idx
        # No need to train if the abbr maps to only one CUI
        if len(label2idx) > 1:
            x_train, y_train = train_sample(x, y, 2000)
            # Train SVM model
            model = SVC(kernel='rbf', gamma=0.01, C=100).fit(x_train, y_train)
            pickle_writer(model, train_processed_path + '/svm_models/%d_svm.pkl' % abbr_idx)
    pickle_writer(abbr_cui2idx_inventory, train_processed_path + '/abbr_cui_idx_inventory.pkl')

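
# train_sample is referenced above but not defined in this section. A minimal
# sketch of what it might do, assuming it simply caps the training set at
# sample_size instances by random sampling; the real helper may also balance
# classes, so treat this as an illustrative assumption:
import random

def train_sample(x, y, sample_size):
    """Randomly downsample (x, y) to at most sample_size instances."""
    if len(x) <= sample_size:
        return x, y
    sampled_idx = random.sample(range(len(x)), sample_size)
    return [x[i] for i in sampled_idx], [y[i] for i in sampled_idx]
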
def generate_instance_collection(self, save_collection_path=None):
    """
    Collect a list of instances (index, abbr, sense, long_form).

    :param save_collection_path: optional path; if given, pickle the collection there
    :return: list of Instance records
    """
    instance_collection = []
    global_instance_idx = 0
    for line in self.corpus:
        for token in line.split(" "):
            items = process_abbr_token(token)
            if items is not None:
                abbr, sense, long_form = items
                instance_collection.append(Instance(
                    index=global_instance_idx,
                    abbr=abbr,
                    sense=sense,
                    long_form=long_form))
                global_instance_idx += 1
    # Save instance collection
    if save_collection_path is not None:
        pickle_writer(instance_collection, save_collection_path)
    return instance_collection

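
# Instance is used above as a plain record type. A minimal sketch, assuming it
# is a namedtuple with exactly the four fields passed above (the actual class
# may differ):
from collections import namedtuple

Instance = namedtuple('Instance', ['index', 'abbr', 'sense', 'long_form'])
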
def abbr_job(abbr, abbr_index, abbr_idx_mapper, docs, model, content_dir):
    corpus = AbbrCorpus(abbr, abbr_index, docs)
    corpus_content = corpus.content_generator()

    abbr_content_vec = []
    for global_instance_idx, doc_id, pos, content_pos, content, label in corpus_content:
        content_vec = compute_content_word2vec(content, model)
        content.insert(content_pos, abbr)
        content = " ".join(content)
        abbr_content_vec.append((global_instance_idx, doc_id, pos, content_pos, content_vec, content, label))

    # Save vectors to a pickle file
    index = abbr_idx_mapper['abbr2idx'][abbr]
    pickle_writer(abbr_content_vec, content_dir + '%d_vector.pkl' % index)

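
# compute_content_word2vec is not shown in this section. A plausible sketch,
# assuming it averages the word2vec vectors of the context words and falls
# back to a zero vector when every token is out of vocabulary (both choices
# are assumptions; the real implementation may weight tokens differently):
import numpy as np

def compute_content_word2vec(content, model):
    """Average the word vectors of the context words."""
    vectors = [model.wv[word] for word in content if word in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
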
def train_fasttext_classifier_multi_model(train_processed_path, use_pretrain=False, use_softmax=False):
    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    if use_softmax:
        loss = 'softmax'
    else:
        loss = 'hs'
    os.makedirs(model_path, exist_ok=True)
    # Load abbr index
    abbr_idx_mapper = pickle_reader(train_path + '/abbr_idx_mapper.pkl')
    abbr_index = AbbrIndex(train_processed_path + '/abbr_index_data.pkl')
    abbr_label_set = {}
    # Load training data & train one model per abbreviation
    for abbr, abbr_idx in tqdm(abbr_idx_mapper['abbr2idx'].items()):
        input_file = train_path + '/dataset/%d.txt' % abbr_idx
        model_file = model_path + '/%d.bin' % abbr_idx
        # Collect the label set for this abbr
        label_set = set()
        for doc_id, pos_list in abbr_index[abbr].items():
            for global_instance_idx, pos, label in pos_list:
                label_set.add(label)
        abbr_label_set[abbr] = label_set
        # No need to train if the abbr maps to only one CUI
        if len(label_set) > 1:
            model_config = {
                'input': input_file,
                'epoch': 50,
                'lr': 0.1,
                'lrUpdateRate': 100,
                'dim': 100,
                'ws': 5,
                'wordNgrams': 2,
                'loss': loss,
                'thread': 60,
            }
            if use_pretrain:
                model_config['pretrainedVectors'] = train_processed_path + '/fasttext.vec'
            model = train_supervised(**model_config)
            model.save_model(model_file)
    pickle_writer(abbr_label_set, train_path + '/abbr_label_set.pkl')

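
# A minimal usage sketch for the per-abbreviation models trained above,
# assuming the standard fastText Python API and that inference text uses the
# same tokenization as the training files; the helper itself is hypothetical,
# only the file layout mirrors the training code:
import fasttext

def predict_sense(model_path, abbr_idx, context_text):
    """Load the fastText model for one abbr and predict its most likely CUI."""
    model = fasttext.load_model(model_path + '/%d.bin' % abbr_idx)
    labels, probs = model.predict(context_text, k=1)
    return labels[0], probs[0]
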
def generate_counter(self, save_collection_path=None):
    """
    Generate a Counter for every abbr-CUI mapping.

    :param save_collection_path: optional path; if given, pickle the counters there
    :return: dict mapping each abbr to a Counter over its senses
    """
    dataset_counter = defaultdict(Counter)
    for line in self.corpus:
        for token in line.split(" "):
            items = process_abbr_token(token)
            if items is not None:
                abbr, sense, _ = items
                dataset_counter[abbr].update([sense])
    # Save dataset counter
    if save_collection_path is not None:
        pickle_writer(dataset_counter, save_collection_path)
    return dataset_counter

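
# The counters built above are enough for a simple majority-sense baseline:
# for each abbreviation, always predict its most frequent CUI in the training
# data. A minimal sketch (the helper name is ours, not from the original code):
def majority_sense_baseline(dataset_counter):
    """Map each abbr to its most common sense in the training corpus."""
    return {abbr: counter.most_common(1)[0][0]
            for abbr, counter in dataset_counter.items()}
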
def generate_test_content(test_processed_path, train_processed_path):
    # Load word2vec vectors
    model = gensim.models.Word2Vec.load(train_processed_path + '/train.model')
    # Load abbr index and test documents
    abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    test_docs = Doc(txt_reader(test_processed_path + "/test_no_mark.txt"))
    # Build index for abbrs (for saving pickle files)
    abbr_idx_mapper = build_index_of_abbrs(abbr_index)
    pickle_writer(abbr_idx_mapper, test_processed_path + '/abbr_idx_mapper.pkl')
    # Save all content vectors to pickle files
    content_dir = test_processed_path + '/content_vectors/'
    os.makedirs(content_dir, exist_ok=True)
    print("Saving content vectors...")
    print(len(abbr_index))
    for abbr in tqdm.tqdm(abbr_index):
        abbr_job(abbr, abbr_index, abbr_idx_mapper, test_docs, model, content_dir)

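
# build_index_of_abbrs is referenced here and below but not shown. A minimal
# sketch, assuming it only assigns a stable integer index to each abbreviation
# so per-abbr pickle files can be named by index (the 'idx2abbr' direction is
# an assumption):
def build_index_of_abbrs(abbr_index):
    """Assign each abbr an integer index and return both directions of the map."""
    abbr2idx = {abbr: idx for idx, abbr in enumerate(abbr_index)}
    idx2abbr = {idx: abbr for abbr, idx in abbr2idx.items()}
    return {'abbr2idx': abbr2idx, 'idx2abbr': idx2abbr}
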
def generate_test_data(test_processed_path, window_size=5):
    # Load abbr index
    abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    test_docs = Doc(txt_reader(test_processed_path + "/test_no_mark.txt"))
    data_processed_path = test_processed_path + '/fasttext'
    os.makedirs(data_processed_path, exist_ok=True)
    # Build index for abbrs (for saving pickle files)
    abbr_idx_mapper = build_index_of_abbrs(abbr_index)
    pickle_writer(abbr_idx_mapper, data_processed_path + '/abbr_idx_mapper.pkl')
    content_dir = data_processed_path + '/dataset/'
    os.makedirs(content_dir, exist_ok=True)
    print("Building dataset for fastText...")
    print(len(abbr_index))
    for abbr in tqdm(abbr_index):
        abbr_job(abbr, abbr_index, abbr_idx_mapper, test_docs, content_dir, window_size)

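
# The abbr_job called here has a different signature from the word2vec one
# above: it takes a window size and no embedding model, and writes a fastText
# dataset file per abbreviation. A plausible sketch, assuming the usual
# "__label__<CUI> <context>" line format and that AbbrCorpus accepts a
# window_size argument (both assumptions):
def abbr_job(abbr, abbr_index, abbr_idx_mapper, docs, content_dir, window_size):
    corpus = AbbrCorpus(abbr, abbr_index, docs, window_size=window_size)
    lines = []
    for global_instance_idx, doc_id, pos, content_pos, content, label in corpus.content_generator():
        lines.append('__label__%s %s' % (label, " ".join(content)))
    index = abbr_idx_mapper['abbr2idx'][abbr]
    with open(content_dir + '%d.txt' % index, 'w') as f:
        f.write("\n".join(lines))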