Example #1
import os
import tqdm
from sklearn.svm import SVC

# pickle_reader, pickle_writer and train_sample are project-local helpers
# (a sketch of train_sample follows this example).
def train_svm(train_processed_path):
    # Load abbr index
    abbr_idx_mapper = pickle_reader(train_processed_path +
                                    '/abbr_idx_mapper.pkl')
    abbr_cui2idx_inventory = {}
    os.makedirs(train_processed_path + '/svm_models/', exist_ok=True)
    # generate training data & train model
    for abbr, abbr_idx in tqdm.tqdm(abbr_idx_mapper['abbr2idx'].items()):
        content_vector = pickle_reader(train_processed_path +
                                       '/content_vectors/%d_vector.pkl' %
                                       abbr_idx)
        label2idx = {}
        label_idx = 0
        x = []
        y = []
        for global_instance_idx, doc_id, pos, content_pos, content_vec, content, label in content_vector:
            if label not in label2idx:
                label2idx[label] = label_idx
                label_idx += 1
            x.append(content_vec)
            y.append(label2idx[label])

        abbr_cui2idx_inventory[abbr] = label2idx
        # no training needed if the abbreviation maps to a single CUI
        if len(label2idx) > 1:
            x_train, y_train = train_sample(x, y, 2000)
            # train svm model
            model = SVC(kernel='rbf', gamma=0.01, C=100).fit(x_train, y_train)
            pickle_writer(
                model,
                train_processed_path + '/svm_models/%d_svm.pkl' % abbr_idx)
    pickle_writer(abbr_cui2idx_inventory,
                  train_processed_path + '/abbr_cui_idx_inventory.pkl')
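train_sample is a project-local helper not shown in this excerpt. A minimal sketch of what it plausibly does, assuming it randomly downsamples the training set to at most n instances (name from the excerpt, body assumed):

import random

def train_sample(x, y, n):
    # Hypothetical reconstruction: cap the training set at n instances by
    # sampling without replacement; smaller sets pass through unchanged.
    if len(x) <= n:
        return x, y
    idx = random.sample(range(len(x)), n)
    return [x[i] for i in idx], [y[i] for i in idx]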
Example #2
    def generate_instance_collection(self, save_collection_path=None):
        """
        Collect a list of instances (index, abbr, sense, long_form).

        :param save_collection_path: optional path; if given, the collection
            is pickled there via pickle_writer.
        :return: list of Instance objects.
        """
        instance_collection = []
        global_instance_idx = 0
        for line in self.corpus:
            for token in line.split(" "):
                items = process_abbr_token(token)
                if items is not None:
                    abbr, sense, long_form = items
                    instance_collection.append(Instance(
                        index=global_instance_idx,
                        abbr=abbr,
                        sense=sense,
                        long_form=long_form))
                    global_instance_idx += 1

        # save instance collection
        if save_collection_path is not None:
            pickle_writer(instance_collection, save_collection_path)
        return instance_collection
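Instance is constructed with keyword arguments but never defined in the excerpt. A plausible stand-in, assuming a plain record type:

from collections import namedtuple

# Hypothetical definition matching the keyword arguments used above.
Instance = namedtuple('Instance', ['index', 'abbr', 'sense', 'long_form'])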
Example #3
def abbr_job(abbr, abbr_index, abbr_idx_mapper, docs, model, content_dir):
    corpus = AbbrCorpus(abbr, abbr_index, docs)
    corpus_content = corpus.content_generator()

    abbr_content_vec = []
    for global_instance_idx, doc_id, pos, content_pos, content, label in corpus_content:
        content_vec = compute_content_word2vec(content, model)
        content.insert(content_pos, abbr)
        content = " ".join(content)
        abbr_content_vec.append((global_instance_idx, doc_id, pos, content_pos, content_vec, content, label))

    # save vector to pickle file
    index = abbr_idx_mapper['abbr2idx'][abbr]
    pickle_writer(abbr_content_vec, content_dir + '%d_vector.pkl' % index)
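compute_content_word2vec is project-local and not shown. A minimal sketch under the common assumption that the context window is embedded by mean-pooling word2vec vectors (function name from the excerpt, body assumed):

import numpy as np

def compute_content_word2vec(content, model):
    # Hypothetical implementation: average the vectors of context words
    # present in the gensim Word2Vec vocabulary; zero vector if none match.
    vecs = [model.wv[w] for w in content if w in model.wv]
    if not vecs:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)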
Example #4
import os
from tqdm import tqdm
# Assuming the official fastText Python bindings for train_supervised.
from fasttext import train_supervised

def train_fasttext_classifier_multi_model(train_processed_path,
                                          use_pretrain=False,
                                          use_softmax=False):
    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    if use_softmax:
        loss = 'softmax'
    else:
        loss = 'hs'
    os.makedirs(model_path, exist_ok=True)
    # Load abbr index
    abbr_idx_mapper = pickle_reader(train_path + '/abbr_idx_mapper.pkl')
    abbr_index = AbbrIndex(train_processed_path + '/abbr_index_data.pkl')
    abbr_label_set = {}
    # Load training data & train model
    for abbr, abbr_idx in tqdm(abbr_idx_mapper['abbr2idx'].items()):
        input_file = train_path + '/dataset/%d.txt' % abbr_idx
        model_file = model_path + '/%d.bin' % abbr_idx
        # load label list
        label_set = set()
        for doc_id, pos_list in abbr_index[abbr].items():
            for global_instance_idx, pos, label in pos_list:
                label_set.add(label)
        abbr_label_set[abbr] = label_set
        # no training needed if the abbreviation maps to a single CUI
        if len(label_set) > 1:
            model_config = {
                'input': input_file,
                'epoch': 50,
                'lr': 0.1,
                'lrUpdateRate': 100,
                'dim': 100,
                'ws': 5,
                'wordNgrams': 2,
                'loss': loss,
                'thread': 60,
            }
            if use_pretrain:
                model_config['pretrainedVectors'] = (
                    train_processed_path + '/fasttext.vec')
            model = train_supervised(**model_config)
            model.save_model(model_file)
    pickle_writer(abbr_label_set, train_path + '/abbr_label_set.pkl')
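A brief usage sketch for the per-abbreviation models saved above, using the official fastText bindings (the path is hypothetical but follows the '%d.bin' naming scheme from the function):

import fasttext

# Hypothetical path following the naming scheme above.
clf = fasttext.load_model('/data/processed/fasttext/model/42.bin')
labels, probs = clf.predict('context words surrounding the abbreviation')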
Example #5
    def generate_counter(self, save_collection_path=None):
        """
        Generate a Counter of sense (CUI) frequencies for every abbreviation.

        :param save_collection_path: optional path; if given, the counters
            are pickled there via pickle_writer.
        :return: defaultdict mapping each abbreviation to a Counter of senses.
        """
        dataset_counter = defaultdict(Counter)
        for line in self.corpus:
            for token in line.split(" "):
                items = process_abbr_token(token)
                if items is not None:
                    abbr, sense, _ = items
                    dataset_counter[abbr].update([sense])

        # save DataSet Counter
        if save_collection_path is not None:
            pickle_writer(dataset_counter, save_collection_path)
        return dataset_counter
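The resulting counters directly support a majority-sense baseline. A brief usage sketch (the path and abbreviation key are made up):

dataset_counter = pickle_reader('/data/processed/dataset_counter.pkl')
# Most frequent CUI for a given abbreviation, e.g. 'ra'.
majority_sense, freq = dataset_counter['ra'].most_common(1)[0]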
Example #6
import os
import gensim
import tqdm

def generate_test_content(test_processed_path, train_processed_path):
    # Load word2vec vectors
    model = gensim.models.Word2Vec.load(train_processed_path + '/train.model')

    # Load abbr index
    abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    test_docs = Doc(txt_reader(test_processed_path + "/test_no_mark.txt"))

    # Build index for abbrs (for saving pickle files)
    abbr_idx_mapper = build_index_of_abbrs(abbr_index)
    pickle_writer(abbr_idx_mapper, test_processed_path + '/abbr_idx_mapper.pkl')

    # Save all content vectors to pickle files
    content_dir = test_processed_path + '/content_vectors/'
    os.makedirs(content_dir, exist_ok=True)

    print("Saving content vectors...")
    print(len(abbr_index))

    for abbr in tqdm.tqdm(abbr_index):
        abbr_job(abbr, abbr_index, abbr_idx_mapper, train_docs, model, content_dir)
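build_index_of_abbrs is not shown in these excerpts. A minimal reconstruction consistent with the 'abbr2idx' key used in Examples #1 and #4 (the 'idx2abbr' field is an assumption):

def build_index_of_abbrs(abbr_index):
    # Hypothetical reconstruction: assign each abbreviation a stable integer
    # id so per-abbr pickles can be named '%d_vector.pkl', '%d.bin', etc.
    abbr2idx = {abbr: idx for idx, abbr in enumerate(abbr_index)}
    return {'abbr2idx': abbr2idx,
            'idx2abbr': {idx: abbr for abbr, idx in abbr2idx.items()}}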
Example #7
import os
from tqdm import tqdm

def generate_test_data(test_processed_path, window_size=5):
    # Load abbr index
    abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    test_docs = Doc(txt_reader(test_processed_path + "/test_no_mark.txt"))

    data_processed_path = test_processed_path + '/fasttext'
    os.makedirs(data_processed_path, exist_ok=True)

    # Build index for abbrs (for saving pickle files)
    abbr_idx_mapper = build_index_of_abbrs(abbr_index)
    pickle_writer(abbr_idx_mapper,
                  data_processed_path + '/abbr_idx_mapper.pkl')

    content_dir = data_processed_path + '/dataset/'
    os.makedirs(content_dir, exist_ok=True)

    print("Building dataset for fastText...")
    print(len(abbr_index))

    for abbr in tqdm(abbr_index):
        abbr_job(abbr, abbr_index, abbr_idx_mapper, test_docs, content_dir,
                 window_size)
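fastText's supervised mode reads one instance per line, with the label marked by the default '__label__' prefix, so the per-abbreviation dataset files consumed in Example #4 presumably look like this (the CUI and context are invented):

# One plausible line of a per-abbr dataset file: label + context window.
example_line = '__label__C0003873 history of severe ra in both hands'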