Example #1
    def __init__(self,
                 training_epochs=50,
                 sequence_length=20,
                 batch_size=100,
                 learning_rate=0.001,
                 dropout=0.2):
        # Hyper Parameters
        self.sequence_length = sequence_length
        self.embedding_size = 512
        self.hidden_size = 128
        self.num_layers = 1
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.dropout = dropout

        self.data = DataUtil()
        self.data.load_split_data()
        print(self.data.get_dataset(self.data.TRAIN)[:5])
        self.data.build_vocab(
            self.data.get_dataset(self.data.TRAIN) +
            self.data.get_dataset(self.data.TEST))

        self.model = GRURNN(self.embedding_size, self.hidden_size,
                            self.num_layers, self.data.input_lang.n_words,
                            self.data.output_lang.n_words, self.dropout)
        self.training_epochs = training_epochs
        self.epoch_start = 1
        self.use_cuda = torch.cuda.is_available()
    def __init__(self):
        self.config = Config()
        self.du = DataUtil(self.config)
        self.sc = StockScraper(ASingleStockConfig())
        self.config.ACTION_NUM = len(self.config.actions)
        self.memories = []
        self.W1 = tf.get_variable('W1', [self.config.INPUT, self.config.M1])
        self.b1 = tf.get_variable('b1', [self.config.M1])
        self.W2 = tf.get_variable('W2', [self.config.M1, self.config.M2])
        self.b2 = tf.get_variable('b2', [self.config.M2])
        self.W3 = tf.get_variable('W3', [self.config.M2, 1])
        self.b3 = tf.get_variable('b3', [1])

        self.current_data = []
        self.current_state = []

        self.portfolio = {
            'fund': 500000,
            'stock_quantity': 50000,
            'current_stock_price': 0,
            'total': -1,
            'stock_value': 0
        }

        # self.init_op = tf.initialize_all_variables()
        self.init_placeholder()
        scores = self.batch_scoring_op()
        next_step_scores = self.batch_predict_op()
        self.add_loss_n_train_op(scores, next_step_scores)
        self.add_step_predict_op()
        self.saver = tf.train.Saver()

        self.init_op = tf.initialize_all_variables()
Example #3
def printUpgrades(currentGear, allGear):
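    """Print, for each equipped slot, every candidate item whose DPS beats the currently equipped piece."""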
    headerString = 'UPGRADES'
    print('\n\n\n%s\n%s\n' % (headerString, '-' * len(headerString)))

    headers = ['DPS Diff', 'Name', 'ilvl', 'Location', 'Boss']

    for slot in sorted(list(currentGear.keys())):
        piece = currentGear[slot]
        print('\n\n\n%s (%s, %.2f DPS)' % (slot, piece['Name'], piece['DPS']))

        actualSlotString = CalcUtil.removeUnderscore(slot)

        outputItems = []

        for otherName in allGear[actualSlotString]:
            otherPiece = copy.deepcopy(allGear[actualSlotString][otherName])
            otherPiece['DPS Diff'] = otherPiece['DPS'] - piece['DPS']

            if (otherPiece['DPS Diff'] > 0):
                outputItems.append(otherPiece)

        outputItems.sort(key=lambda p: p['DPS'], reverse=True)

        print('')
        print(DataUtil.getTabulated(outputItems, headers))
        print('')
Example #4
def main(num_epochs=100, n_splits=5):
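    # K-fold cross-validation on the spectrogram data: train a fresh model on each fold and report the mean test accuracy.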
    data_util = DataUtil('data', 'spectrogram_data')
    X, y = data_util.get_data()
    kf = KFold(n_splits=n_splits, shuffle=True)
    test_accuracy_sum = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model = Model(data_util.height, data_util.width)

        param_values, threshold = train_and_validate(model, X_train, y_train, num_epochs)
        model.set_param_values(param_values)

        test_accuracy_sum += perform_validation(model, X_test, y_test, threshold)
    print("Cross-validation results:")
    print("  accuracy:\t\t{:.2f} %".format(test_accuracy_sum/n_splits * 100))
Example #5
    def __init__(self, charInfoPath):
        """
        Slots (currently - adding weapons and trinkets later):
        'Back', 'Belt', 'Bracer', 'Chest', 'Feet',
        'Gloves', 'Head', 'Legs', 'Neck', 'Ring', 'Shoulder'
        """

        paths = {'allGear': 'AllGear.json', 'trinkets': 'Trinkets.json'}

        self.charInfo = FileUtil.getJSONContents(charInfoPath)

        allGearList = FileUtil.getJSONContents(paths['allGear'])
        allTrinketsList = FileUtil.getJSONContents(paths['trinkets'])

        # They don't explicitly say that they're trinkets
        for trink in allTrinketsList:
            trink['Slot'] = 'Trinket'

        # Combine all gear into one list
        self.allGear = []
        self.allGear.extend(allGearList)
        self.allGear.extend(allTrinketsList)

        # Then turn that list into a map from name to the piece of gear
        self.allGear = DataUtil.toMap(self.allGear, 'Name')

        # Load the current gear into memory
        self.currentGear = DataUtil.statifyNamedGear(
            self.charInfo['Current Gear'], self.allGear)

        # TODO
        # SEE IF THIS DOESN'T BREAK THINGS LATER ON IN EXECUTION
        # (Might not be kosher if slotified this early)
        self.allGear = CalcUtil.slotifyAllGear(self.allGear)

        # Calculate each piece's DPS
        for name in self.currentGear:
            piece = self.currentGear[name]
            piece['DPS'] = CalcUtil.calcDPS(piece, self.charInfo)

        # Get some basic overall stats about the current gear
        self.totalStats = CalcUtil.getTotalStats(self.currentGear,
                                                 Globals.allStats)
    def __init__(self):

        self.entity_extractor = EntityExtractor()
        self.encoder = Encoder()
        self.action_manipulator = ActionManipulator()
        self.config = Config()
        self.du = DataUtil()
        obs_size = self.config.u_embed_size + self.config.vocab_size + self.config.feature_size
        self.action_templates = self.action_manipulator.get_action_templates()
        self.model = Model()
    def ensureUniqueGearNames():
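        """Sanity check: gear names should be unique, so the name-keyed map must be as large as the source list."""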
        allGearList = FileUtil.getJSONContents('AllGear.json')
        allGear = DataUtil.toMap(allGearList, 'Name')

        print('%d items in list' % len(allGearList))
        print('%d items in map' % len(list(allGear.keys())))

        nRings = 0
        for name in allGear:
            piece = allGear[name]
            if (piece['Slot'] == 'Ring'):
                nRings += 1

        print('%d rings' % nRings)
    def printAllGear(slot, globs):
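        """Compute DPS for every item in the given slot and print them sorted from best to worst."""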
        nameToPiece = globs.allGear[slot]

        charInfo = globs.charInfo

        for name in nameToPiece:
            piece = nameToPiece[name]
            piece['DPS'] = CalcUtil.calcDPS(piece, charInfo)

        items = sorted(nameToPiece.values(),
                       key=lambda p: p['DPS'], reverse=True)
        headers = ['Name', 'ilvl', 'DPS', 'Location', 'Boss']

        print(DataUtil.getTabulated(items, headers))
    def predict(self, data):
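        """Classify a list of articles: train first if no saved weights exist, then attach a predicted label to each item."""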
        logging.info("predict data....")
        # data must be a list
        if type(data) is not list:
            logging.error("data's type is not list.")
            raise Exception("data's type is not list.")

        if len(data) == 0:
            logging.error("num of data is 0!")
            raise Exception("num of data is 0!")

        if not os.path.exists('news_classifier_model.h5'):
            logging.info("news_classifier model does not exist; training a new one.")
            self.train()

        wordVec = Word2Vector()
        embeddings = np.array(wordVec.embeddings)
        model = self.model(embeddings)
        optimizer = Adagrad(lr=0.01, epsilon=1e-06)
        model.compile(loss='binary_crossentropy',
                      metrics=[ut.f_score],
                      optimizer=optimizer)
        model.load_weights('news_classifier_model.h5')
        dataUtil = DataUtil("articles_testN")
        pre_data = dataUtil.filter_data(data, 1)
        pre_data = dataUtil.transfer_form(pre_data)
        pre_data["processed_content"] = sequence.pad_sequences(pre_data['processed_content'],\
               maxlen=self.content_max_len,  padding='post', truncating='post')
        pre_data["processed_title"] = sequence.pad_sequences(pre_data['processed_title'],\
       maxlen=self.title_max_len,  padding='post', truncating='post')
        result = model.predict_classes([pre_data["processed_content"], \
         pre_data["processed_title"]], batch_size=self.batch_size, verbose=1)

        #count = 0
        for i in range(len(data)):
            data[i]["artitle_label"] = result[i][0]
        return data
Example #10
def printAllGear(allGear):
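    """Print every slot's items sorted by DPS, best first."""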
    headerString = 'ALL GEAR'
    print('\n\n\n%s\n%s\n' % (headerString, '-' * len(headerString)))

    for slot in sorted(list(allGear.keys())):
        print('\n\n%s' % slot.upper())

        slotPieces = allGear[slot]

        sortedPieces = sorted(slotPieces.values(),
                              key=lambda p: p['DPS'], reverse=True)
        headers = ['DPS', 'Name', 'ilvl', 'Location', 'Boss']

        print('')
        print(DataUtil.getTabulated(sortedPieces, headers))
        print('')
Example #11
def calculateDiffs(globs):
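    """Print the current gear and per-stat DPS, then return a fresh slot -> name -> piece map with DPS computed for every candidate item."""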
    # Assign stats to the current gear and print it out
    currentGear = copy.deepcopy(globs.currentGear)

    # Print current gear
    items = [currentGear[slot] for slot in sorted(list(currentGear.keys()))]
    headers = ['Slot', 'Name', 'ilvl', 'Location', 'Boss', 'DPS']

    print('\nCurrent gear (%s %s):\n' %
          (globs.charInfo['Spec'], globs.charInfo['Class']))
    print(DataUtil.getTabulated(items, headers))

    # Print stat DPS
    print('\n\n\nStat DPS:\n')
    for stat in globs.charInfo['Stat DPS']:
        value = globs.charInfo['Stat DPS'][stat]
        print('%s:\t%.4f' % (stat, value))
    print('')

    # TODO
    # Factored this out into the Globals constructor.
    # Might want to get rid of this for sure later.
    # # Partition all gear into slots
    # allGear = CalcUtil.slotifyAllGear(globs.allGear)

    # Return a new object
    # Because mutation is wonky
    out = {}

    for slot in list(globs.allGear.keys()):
        out[slot] = {}

        # curPiece = currentGear[slot]

        actualSlotString = CalcUtil.removeUnderscore(slot)
        otherPieces = globs.allGear[actualSlotString]
        for name in otherPieces:
            # Making a deep copy gets rid of issues with having 2 ring slots.
            # The second DPSDiff calculation would clobber the original DPSDiff calculation.
            otherPiece = copy.deepcopy(otherPieces[name])
            # otherPiece['DPSDiff'] = CalcUtil.calcDPSDiff(curPiece, otherPiece, globs.statDPS)
            otherPiece['DPS'] = CalcUtil.calcDPS(otherPiece, globs.charInfo)

            out[slot][name] = otherPiece

    return out
Example #12
from data_util import DataUtil
from lstm import SemiLSTM

if __name__ == '__main__':
    # Per Weibo's limit, text is truncated to a maximum length of 140 characters
    data_util = DataUtil()
    # 1. Build the LSTM network
    lstm = SemiLSTM(lr=1e-4, epochs=20, batch_size=50)
    feature, label = data_util.load_data('data/train.txt', True)
    unlabeled_data, _ = data_util.load_data('data/unlabeled.txt', False)
    test_data, test_label = data_util.load_data('data/test.txt', True)
    lstm.build_lstm([32])
    lstm.train_semi(feature,
                    label,
                    test_data,
                    test_label,
                    unlabeled_data,
                    round=5,
                    saved_model='my-lstm')
    lstm.test(test_data, test_label)
    # 2. Use the trained model to predict whether a text is harmful speech
    saved_lstm = SemiLSTM(lr=1e-4, epochs=20, batch_size=50)
    text = '如何真正为自己的利益发声,而不被境外势力利用?那些势力并不关心你想要的民主,它们只想要中国弱下去'
    feature = data_util.extract_feature(text)
    result = saved_lstm.test_text(feature, saved_model='my-lstm')
    print(result)
    text = '菅义伟在开记者会,两次鞠躬、向国民道歉,“没能解除紧急事态,我非常抱歉”。记者问,“没能解除紧急事态的原因是什么?您自己觉得充分向国民说明了吗?”v光计划 。'
    feature = data_util.extract_feature(text)
    result = saved_lstm.test_text(feature, saved_model='my-lstm')
    print(result)
class Coref_cluster(object):
    def __init__(self, config):
        self.config = config
        self.load_data()
        self.add_placeholder()
        scores = self.add_model()
        self.add_loss_and_train_op(scores)
        self.add_predict_op(scores)
        self.init_op = tf.initialize_all_variables()
        self.saver = tf.train.Saver()

    def load_data(self):
        self.du = DataUtil(self.config)
        self.max_as_count = self.du.max_as_count

    def add_placeholder(self):
        self.inputs = tf.placeholder(tf.float32)
        self.labels = tf.placeholder(tf.int32)
        self.deltas = tf.placeholder(tf.float32)

    def create_feed_dict(self, inputs, deltas=None, labels=None):
        feed = {self.inputs: inputs}
        if labels:
            feed[self.deltas] = deltas
            feed[self.labels] = labels
        return feed

    def add_model(self):
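        # Three fully connected layers (ReLU on the first two); the absolute value of the final unit is used as the score.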
        x = tf.reshape(self.inputs, (-1, self.config.I))
        W1 = tf.get_variable('W1', [self.config.I, self.config.M1])
        b1 = tf.get_variable('b1', [self.config.M1])
        fc1 = tf.matmul(x, W1) + b1
        relu1 = tf.nn.relu(fc1)

        W2 = tf.get_variable('W2', [self.config.M1, self.config.M2])
        b2 = tf.get_variable('b2', [self.config.M2])
        fc2 = tf.matmul(relu1, W2) + b2
        relu2 = tf.nn.relu(fc2)

        W3 = tf.get_variable('W3', [self.config.M2, 1])
        b3 = tf.get_variable('b3', [1])
        fc3 = tf.matmul(relu2, W3) + b3
        scores = tf.abs(fc3)

        return scores

    def add_loss_and_train_op(self, scores):
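        # Margin-based ranking loss: 1 + candidate score - gold score, weighted by deltas, keeping the worst violation per example; optimized with RMSProp.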
        target_scores = tf.gather(scores, self.labels)
        scores = tf.reshape(scores, (-1, self.max_as_count))
        loss = 1 + scores - target_scores
        self.loss = tf.reduce_sum(tf.reduce_max(loss * self.deltas, 1))
        optimizer = tf.train.RMSPropOptimizer(self.config.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

    def add_predict_op(self, scores):
        self.predictions = tf.argmax(
            tf.reshape(scores, (-1, self.max_as_count)), 1)

    def run_epoch(self, session, save=None, load=None):
        if not os.path.exists('./save'):
            os.makedirs('./save')
        if load:
            self.saver.restore(session, load)
        else:
            session.run(self.init_op)
        time0 = time.time()
        for epoch in range(self.config.epochs):
            time1 = time.time()
            shuffled_epoch_Rs, shuffled_epoch_HAs, shuffled_epoch_HTs, shuffled_epoch_deltas, \
                    shuffled_answer_indices = self.du.get_shuffled_data_set()
            assert len(shuffled_epoch_HTs) == len(
                shuffled_answer_indices) == len(shuffled_epoch_deltas)
            start_ind = 0
            len_data_set = len(shuffled_epoch_Rs)
            step = 1
            time2 = time.time()
            best_loss = float('inf')
            loss = 0
            while start_ind < len_data_set:
                time3 = time.time()
                end_ind = start_ind + self.config.batch_size
                if end_ind > len_data_set:
                    end_ind = len_data_set
                    start_ind = end_ind - self.config.batch_size
                batch_Rs = shuffled_epoch_Rs[start_ind:end_ind]
                batch_As = shuffled_epoch_HAs[start_ind:end_ind]
                batch_Ts = shuffled_epoch_HTs[start_ind:end_ind]
                batch_labels = shuffled_answer_indices[start_ind:end_ind]
                batch_deltas = shuffled_epoch_deltas[start_ind:end_ind]
                batch_HAs = self.du.encode_mention_pairs(
                    batch_Rs, batch_Ts, batch_As)
                start_ind = end_ind
                time4 = time.time()
                batch_labels = [
                    batch_labels[i] + self.max_as_count * i
                    for i in range(len(batch_labels))
                ]
                feed = self.create_feed_dict(batch_HAs, batch_deltas,
                                             batch_labels)
                batch_loss, _ = session.run([self.loss, self.train_op],
                                            feed_dict=feed)
                time5 = time.time()
                loss += batch_loss
                if step % self.config.interval == 0:
                    print 'Epoch {}, Step {}, Time {:.2f}, Loss {:.2f}'.format(
                        epoch, step, time5 - time0, batch_loss)
                step += 1

            if best_loss >= loss / step:
                self.evaluation(session)
                if save is not None:
                    self.saver.save(session, save)
                else:
                    self.saver.save(session, './save/weight_{}'.format(epoch))

    def evaluation(self, session, load=None):
        if load:
            self.saver.restore(session, load)

        train_answer_indices, train_h_r_antecedents = \
                self.du.get_test_data(self.config.test_batch_size, 'train')
        feed1 = self.create_feed_dict(inputs=train_h_r_antecedents)
        predictions1 = session.run(self.predictions, feed_dict=feed1)

        test_answer_indices, test_h_r_antecedents = \
                self.du.get_test_data(self.config.test_batch_size, 'test')
        feed2 = self.create_feed_dict(inputs=test_h_r_antecedents)
        predictions2 = session.run(self.predictions, feed_dict=feed2)

        train_acc = metrics.accuracy_score(train_answer_indices, predictions1)
        test_acc = metrics.accuracy_score(test_answer_indices, predictions2)

        print '============================='
        print 'Training Accuracy: {:.4f}'.format(train_acc)
        print 'Testing Accuracy: {:.4f}'.format(test_acc)
        print '============================='
Example #15
    def test_getAverageHR(self):
        datautil = DataUtil('../.config/postgres.config')
        self.assertEqual(datautil.getAverageHR(0, 0, 0), 0)
        self.assertEqual(datautil.getAverageHR(0, -1, 0), 0)
        self.assertEqual(datautil.getAverageHR(45, -1, 45), 30)
        self.assertEqual(datautil.getAverageHR(45, 45, 45), 45)
Example #16
    def __init__(self):
        self.config = Config()
        self.du = DataUtil()
        self.inv_map = {v: k for k, v in self.config.class_dict.iteritems()}
Example #17
    dev_fname = 'dev-data-processed.json'
    test_fname = 'test-data-processed.json'


modes = ['train_lm', 'test_lm', 'train_tri-an']

if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in modes:
        print('add one of the following arguments:', modes)
    else:
        mode = sys.argv[1]
        print('mode is', mode)

        config = Config()
        lm_config = LMConfig()
        data_util = DataUtil(config, lm_config, device)

        # define language model
        lm = LM(data_util.vocab_size, lm_config.embed_dim,
                lm_config.hidden_dim, data_util.embedding, lm_config.dropout,
                device).to(device)

        lm_train_util = LMTrainUtil(data_util.lm_train_iter,
                                    data_util.lm_dev_iter, lm, device,
                                    lm_config, data_util.vocab_size,
                                    data_util.TEXT)

        if mode == 'train_lm':
            # train language model
            lm_train_util.train_model()
        elif mode == 'test_lm':
Example #18
from data_util import DataUtil, get_embd 


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description='cmod')
    parser.add_argument('-c', '--config', help='Config file path', required=True)
    cfg_parser = configparser.ConfigParser()

    args = parser.parse_args()
    cfg_parser.read(args.config)

    cfg = config.Config(cfg_parser)

    D = DataUtil(cfg)

    train_dataset = D.get_data('train')
    test_dataset = D.get_data('test')
    dev_dataset = D.get_data('dev')

    device = torch.device("cuda:0" if cfg.use_cuda() else "cpu")

    if cfg.sparse() and cfg.weight_decay() != 0:
        cfg.logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()

    torch.manual_seed(cfg.random_seed())
    random.seed(cfg.random_seed())

    if cfg.use_cuda():
Example #19
   date:          2018/5/5 0005
-------------------------------------------------
   Change Activity:
                   2018/5/5 0005:
-------------------------------------------------
"""
__author__ = 'Administrator'

import warnings

from unittest import TestCase
from data_helper import DataHelper
from data_util import DataUtil
from feature_integrate import *
d_h = DataHelper()
d_t = DataUtil()
path = '../data/dm/train.csv'
data = pd.read_csv(path)

fi = FeatureIntegrate()


class TestFeature(TestCase):
    def test_train_feature(self):
        """
        Ran 1 test in 3.251s
        """
        train_feature = fi.train_feature_integrate(data)
        print(train_feature.columns)
        assert True
Example #20
num_epochs = 5
vocab_size = 8000
log_window = 30
val_size = 7300  #4000    # validation size
make_model = False  #True
make_test = True
categories = ['politics', 'entertainment', 'sport', 'business']

model_path = 'models/textcnn_17-12-10_20-12-07.pkl'
train_path = 'train_ksj.json'
voca_path = 'voca.json'
val_path = 'val_ksj.json'
test_path = 'test_ksj.json'
savepath = ''

util = DataUtil(seq_length, vocab_size, batch_size, train_path, val_path,
                voca_path)
textcnn = TextCNN(seq_length, num_classes, vocab_size, embed_size,
                  filter_sizes, num_filters, dropout_prob)
textcnn.cuda()
# define loss & optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(textcnn.parameters(), lr=learning_rate)

num_batches = util.num_batch


def val_accuracy(data):  # data = zip(input, label)

    textcnn.eval()
    num_correct = 0
    num_total = 0
Example #21
class Experiment:
    def __init__(self,
                 training_epochs=50,
                 sequence_length=20,
                 batch_size=100,
                 learning_rate=0.001,
                 dropout=0.2):
        # Hyper Parameters
        self.sequence_length = sequence_length
        self.embedding_size = 512
        self.hidden_size = 128
        self.num_layers = 1
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.dropout = dropout

        self.data = DataUtil()
        self.data.load_split_data()
        print(self.data.get_dataset(self.data.TRAIN)[:5])
        self.data.build_vocab(
            self.data.get_dataset(self.data.TRAIN) +
            self.data.get_dataset(self.data.TEST))

        self.model = GRURNN(self.embedding_size, self.hidden_size,
                            self.num_layers, self.data.input_lang.n_words,
                            self.data.output_lang.n_words, self.dropout)
        self.training_epochs = training_epochs
        self.epoch_start = 1
        self.use_cuda = torch.cuda.is_available()

    def as_minutes(self, s):
        m = math.floor(s / 60)
        s -= m * 60
        return '%dm %ds' % (m, s)

    def time_since(self, since, percent):
        now = time.time()
        s = now - since
        es = s / percent
        rs = es - s
        return '%s (- %s)' % (self.as_minutes(s), self.as_minutes(rs))

    def train(self, print_every=20, plot_every=100, learning_rate=0.01):
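        # SGD with momentum over the trainable parameters; shuffle the training data each epoch, log per-batch loss/accuracy, and save the model after every epoch before running the test set.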
        start = time.time()

        plot_losses = []
        print_loss_total = 0
        plot_loss_total = 0

        # optimizer = optim.SGD(self.model.parameters(), lr=learning_rate)
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     self.model.parameters()),
                              lr=learning_rate,
                              momentum=0.9)
        # optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()), lr=learning_rate)

        num_train_data = self.data.get_dataset_size(self.data.TRAIN)
        num_batches = int(np.ceil(num_train_data / float(self.batch_size)))
        log('num_batches: ' + str(num_batches))

        for epoch in range(self.epoch_start, self.training_epochs + 1):
            batch_start = time.time()
            correct = 0
            total = 0

            train_data = self.data.get_dataset(self.data.TRAIN)
            random.shuffle(train_data)

            self.model.train()

            for cnt, i in enumerate(random.sample(range(num_batches),
                                                  num_batches),
                                    start=1):
                inputs, seq_lengths, targets = self.data.construct_batch(
                    self.batch_size * i,
                    self.batch_size * (i + 1),
                    dataset=self.data.TRAIN)

                if self.use_cuda:
                    inputs = inputs.cuda()
                    targets = targets.cuda()

                optimizer.zero_grad()

                outputs = self.model(inputs, seq_lengths)
                _, predicted = torch.max(outputs.data, dim=1)

                total += targets.data.size(0)
                correct += (predicted == targets.data).sum()
                batch_train_acc = 100.0 * (
                    predicted == targets.data).sum() / targets.data.size(0)

                # loss = F.nll_loss(outputs, targets)
                loss = F.cross_entropy(outputs, targets)
                loss.backward()
                optimizer.step()
                log("Epoch %d, batch %d / %d: train loss = %f, train accuracy = %f %%"
                    % (epoch, cnt, num_batches, loss.data[0], batch_train_acc))

                print_loss_total += loss.data[0]
                plot_loss_total += loss.data[0]

                if cnt % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    log('Average batch loss: %s' % str(print_loss_avg))
                    log(self.time_since(batch_start, cnt * 1.0 / num_batches))

                if cnt % plot_every == 0:
                    plot_loss_avg = plot_loss_total / plot_every
                    plot_losses.append(plot_loss_avg)
                    plot_loss_total = 0
            log("epoch %s is done" % str(epoch))
            log('Train Accuracy: %f %%' % (100.0 * correct / total))
            log(self.time_since(start, epoch * 1.0 / self.training_epochs))

            # save intermediate training results
            save_path = "train_saved/epoch%s.pt" % str(epoch)
            torch.save(self.model, save_path)
            log('Model saved in file: %s' % save_path)

            # run test set after one epoch
            self.test()

    def test(self, epoch=-1):
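        # Evaluate on the test set, optionally restoring the model checkpoint saved after the given epoch.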
        if epoch > 0:
            self.model = torch.load("train_saved/epoch%s.pt" % str(epoch))
            log('Model of epoch ' + str(epoch) + ' is restored.')

        self.model.eval()

        start = time.time()

        num_test_data = self.data.get_dataset_size(self.data.TEST)
        num_batches = int(np.ceil(num_test_data / float(self.batch_size)))
        log('num_batches: ' + str(num_batches))

        correct = 0
        total = 0
        loss = 0.0
        labels = []
        predictions = []
        for i in random.sample(range(num_batches), num_batches):
            inputs, seq_lengths, targets = self.data.construct_batch(
                self.batch_size * i,
                self.batch_size * (i + 1),
                dataset=self.data.TEST)

            if self.use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            outputs = self.model(inputs, seq_lengths)
            _, predicted = torch.max(outputs.data, dim=1)

            total += targets.data.size(0)
            correct += (predicted == targets.data).sum()
            labels.extend(targets.data.numpy().tolist())
            predictions.extend(predicted.numpy().tolist())

            loss += F.cross_entropy(outputs, targets).data[0]
        log('Time used: ' + str(time.time() - start))
        log('Test loss: %f' % loss)
        log('Test Accuracy: %f %%' % (100.0 * correct / total))
        log('Test Precision: %f %%' %
            (100.0 * precision_score(labels, predictions, average='micro')))
        log('Test Recall: %f %%' %
            (100.0 * recall_score(labels, predictions, average='micro')))
        log('Test F1 Score: %f %%' %
            (100.0 * f1_score(labels, predictions, average='micro')))
Example #22
        print "ERROR: need model destination filepath!"
        sys.exit(1)

    if len(sys.argv) > 2:
        layer_arg = int(sys.argv[2])
    else:
        layer_arg = 2

    if len(sys.argv) > 3:
        ep_arg = int(sys.argv[3])
    else:
        ep_arg = 20

    # Read the data
    print ">> Initializing data..."
    reader = DataUtil(WORDVEC_FILEPATH, TAGGED_NEWS_FILEPATH)
    X, Y = reader.get_data()
    print X.shape
    print Y.shape

    # Train the model
    print ">> Training model... epochs = {0}, layers = {1}".format(
        ep_arg, layer_arg)
    nermodel = NERModel(reader)
    nermodel.train(epochs=ep_arg, layers=layer_arg)

    # Evaluate the model
    print ">> Evaluating model..."
    nermodel.evaluate()

    # Save the model
    parser.add_argument(
        '--varthresh', help='variance thresh (default 0 means take all)', type=float, default=0)

    args = parser.parse_args()

    rand_state = 1

    n_cv = args.cv
    n_iter_search = args.iter
    sample_rate = args.sample
    sub_sample = (False if sample_rate < 0 else True)
    var_thresh = args.varthresh
    scoring = 'log_loss'
    verbose = 10

    du = DataUtil()
    du.load_data(sub_sample=sub_sample, sample_rate=sample_rate)

    x_train, x_test = du.vectorize_x(
        ['brand_code', 'model_code', 'label_id_bag'], variance_thresh=var_thresh)
    print('train set shape: ', x_train.shape)
    print('test set shape: ', x_test.shape)

    # xgb seems have issue detecting number of columns with sparse matrix
    x_train_xgb = sp.hstack(
        (x_train, sp.csr_matrix(np.ones((x_train.shape[0], 1)))))
    print(
        'patching train data with non-zero column to get around xgb sparse issue')

    y_train = du.get_y_train()
    print('y_train shape: ', y_train.shape)
Example #24
__author__ = 'jdwang'
__date__ = 'create date: 2016-07-05'
__email__ = '*****@*****.**'

from data_util import DataUtil
final_test_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/train_data/NLPCC2016_Stance_Detection_Task_A_Testdata.txt'
final_test_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/data_processing/result/TaskA_all_testdata_15000.csv'

# Sentences sent for classification
final_test_classify_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/train_data/TaskA_all_testdata_14966.csv'
# Labels from the classification results
clasify_result_file_path = '/home/jdwang/PycharmProjects/weiboStanceDetection/data_processing/result/cp_L_rf_1000tree_classify_label.csv'



data_util = DataUtil()

final_test_data = data_util.load_data(final_test_file_path)
print(final_test_data.head())
print(final_test_data.shape)
# quit()
# final_test_data = final_test_data[[]]
print(final_test_data[final_test_data['WORDS'].isnull()].shape)
print(final_test_data[final_test_data['WORDS'].isnull()])
final_test_data = final_test_data[final_test_data['WORDS'].notnull()]
data_util.save_data(final_test_data,'result/TaskA_all_testdata_15000_A.csv')
# print(final_test_data.tail())
# print(final_test_data.sort_values(by=['ID']).tail())
quit()

final_test_classify_data = data_util.load_data(final_test_classify_file_path)
class Reinforcer:
    def __init__(self):
        self.config = Config()
        self.du = DataUtil(self.config)
        self.sc = StockScraper(ASingleStockConfig())
        self.config.ACTION_NUM = len(self.config.actions)
        self.memories = []
        self.W1 = tf.get_variable('W1', [self.config.INPUT, self.config.M1])
        self.b1 = tf.get_variable('b1', [self.config.M1])
        self.W2 = tf.get_variable('W2', [self.config.M1, self.config.M2])
        self.b2 = tf.get_variable('b2', [self.config.M2])
        self.W3 = tf.get_variable('W3', [self.config.M2, 1])
        self.b3 = tf.get_variable('b3', [1])

        self.current_data = []
        self.current_state = []

        self.portfolio = {
            'fund': 500000,
            'stock_quantity': 50000,
            'current_stock_price': 0,
            'total': -1,
            'stock_value': 0
        }

        # self.init_op = tf.initialize_all_variables()
        self.init_placeholder()
        scores = self.batch_scoring_op()
        next_step_scores = self.batch_predict_op()
        self.add_loss_n_train_op(scores, next_step_scores)
        self.add_step_predict_op()
        self.saver = tf.train.Saver()

        self.init_op = tf.initialize_all_variables()

    def init_placeholder(self):
        self.states = tf.placeholder(tf.float32)
        self.rewards = tf.placeholder(tf.float32)
        self.states_next = tf.placeholder(tf.float32)

    def batch_scoring_op(self):
        x = tf.reshape(self.states,
                       (self.config.BATCH_SIZE, self.config.INPUT))
        scores = self.Q_network_op(x)
        return scores

    def add_loss_n_train_op(self, scores, next_scores):
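        # Q-learning target: reward + gamma * max_a' Q(next_state, a'); the squared TD error is summed over the batch and minimized with RMSProp.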
        self.predict_scores = self.config.gamma * tf.reduce_max(next_scores, 1)
        self.viewing_scores = scores
        '''sarsa reward better?'''
        self.losses = (self.rewards + self.predict_scores - scores)**2
        self.loss = tf.reduce_sum(self.losses)
        optimizer = tf.train.RMSPropOptimizer(self.config.lr)
        self.train_op = optimizer.minimize(self.loss)

    def add_step_predict_op(self):
        x = tf.reshape(self.states,
                       (self.config.ACTION_NUM, self.config.INPUT))
        scores = self.Q_network_op(x)
        self.prediction = tf.argmax(tf.reshape(scores,
                                               (-1, self.config.ACTION_NUM)),
                                    axis=1)[0]

    def Q_network_op(self, x):
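        # Two tanh hidden layers with dropout, then a linear output giving a scalar Q-value per input state.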
        fc1 = tf.matmul(x, self.W1) + self.b1
        tanh1 = tf.nn.tanh(fc1)
        tanh1 = tf.nn.dropout(tanh1, self.config.DROPOUT)
        fc2 = tf.matmul(tanh1, self.W2) + self.b2
        tanh2 = tf.nn.tanh(fc2)
        tanh2 = tf.nn.dropout(tanh2, self.config.DROPOUT)
        scores = tf.matmul(tanh2, self.W3) + self.b3
        scores = tf.squeeze(scores)
        return scores

    def batch_predict_op(self):
        x = tf.reshape(self.states_next,
                       (self.config.BATCH_SIZE * self.config.ACTION_NUM,
                        self.config.INPUT))
        Q_scores = self.Q_network_op(x)
        Q_scores = tf.reshape(Q_scores,
                              (self.config.BATCH_SIZE, self.config.ACTION_NUM))
        return Q_scores

    def build_feed_dict(self, random_memories):
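        # For each sampled memory, enumerate every allowed action and build the resulting candidate next state so next-step Q-values can be scored.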
        feed = {}
        feed[self.states] = [m[0] for m in random_memories]
        states_next = []
        feed[self.rewards] = [m[1] for m in random_memories]
        new_portfolios = [m[-1] for m in random_memories]
        new_datas = [m[-2] for m in random_memories]
        assert len(new_datas) == len(new_portfolios)
        for i in range(len(new_datas)):
            port = new_portfolios[i]
            data = new_datas[i]
            for action in self.config.actions:
                action = self.action_policy(action, port)
                port_to_be_evaluated = self.update_portfolio_after_action(
                    port, action)
                print "predicting...", port_to_be_evaluated, action
                state_to_be_evaluated = self.du.preprocess_state(
                    data, port_to_be_evaluated)
                states_next.append(state_to_be_evaluated)
        feed[self.states_next] = states_next
        return feed

    def action_policy(self, buy_quantity, portfolio):
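        # Clamp the requested trade to what is feasible: scale a buy down to what the fund can afford, and a sell down to the shares actually held.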
        stock_price = portfolio['current_stock_price']
        fund = portfolio['fund']
        stock_quantity = portfolio['stock_quantity']
        if buy_quantity > 0:
            if buy_quantity * stock_price > fund:
                quantity_max = fund / stock_price
                for action in self.config.actions[::-1]:
                    if action <= quantity_max:
                        buy_quantity = action
            return buy_quantity
        elif buy_quantity < 0:
            if -buy_quantity > stock_quantity:
                for action in self.config.actions:
                    if -action <= stock_quantity:
                        buy_quantity = action
            return buy_quantity
        else:
            return 0

    @staticmethod
    def update_portfolio_after_action(portfolio, action):
        # print 'action', action
        port = copy(portfolio)
        if action == 0:
            return port
        else:
            # print 'a', port['fund'], 'b', port['current_stock_price'], 'c',action
            port['fund'] = port['fund'] - port['current_stock_price'] * action
            port['stock_quantity'] += action
            port['stock_value'] += port['current_stock_price'] * action
            return port

    @staticmethod
    def update_portfolio_after_fetch_price(portfolio, new_price):
        port = copy(portfolio)
        port['current_stock_price'] = new_price
        port['stock_value'] = new_price * port['stock_quantity']
        port['total'] = port['stock_value'] + port['fund']
        return port

    @staticmethod
    def calc_total_with_different_price(portfolio, price):
        return portfolio['stock_quantity'] * price + portfolio['fund']

    @staticmethod
    # def calc_reward(new_portfolio, prev_portfolio):
    #     return 1000.0*(new_portfolio['total'] - prev_portfolio['total']) / prev_portfolio['total']
    def calc_reward(new_portfolio, prev_portfolio):
        '''reward compared to hold'''
        print "new, ", new_portfolio
        print "prev, ", prev_portfolio
        return new_portfolio[
            'total'] - Reinforcer.calc_total_with_different_price(
                prev_portfolio, new_portfolio['current_stock_price'])

    def run_epoch(self, session, save=None, load=None):
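        # Live trading loop: fetch a price tick, choose an action (epsilon-greedy), update the portfolio, store the transition, and train on a random minibatch once enough memories have accumulated.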
        if not os.path.exists('./save'):
            os.makedirs('./save')
        if load:
            self.saver.restore(session, load)
        else:
            session.run(self.init_op)

        while True:
            if self.portfolio['total'] == -1:
                init_data = self.sc.request_api()
                print init_data
                if init_data[self.config.open_price_ind]:
                    # assert init_data[self.config.open_price_ind] == init_data[self.config.current_ind]
                    self.portfolio['current_stock_price'] = init_data[
                        self.config.current_ind]
                    self.portfolio['stock_value'] = self.portfolio[
                        'stock_quantity'] * self.portfolio[
                            'current_stock_price']
                    self.portfolio['total'] = self.portfolio[
                        'stock_value'] + self.portfolio['fund']
                    self.current_data = init_data
                    self.current_state = self.du.preprocess_state(
                        init_data, self.portfolio)
                    self.config.INPUT = len(self.current_state)
                else:
                    print "market closed or stock halts"
                    sys.exit(0)
                print self.config.INPUT
            is_exploration = random.random()
            assert self.portfolio['current_stock_price'] != 0
            if is_exploration <= self.config.EPSILON:
                buy_quantity = random.choice(self.config.actions)
                print "random"
            else:
                candidates = []

                for action in self.config.actions:
                    action = self.action_policy(action, self.portfolio)
                    candidate_portfolio = self.update_portfolio_after_action(
                        self.portfolio, action)
                    candidate_state = self.du.preprocess_state(
                        self.current_data, candidate_portfolio)
                    candidates.append(candidate_state)
                max_q_ind = session.run(self.prediction,
                                        feed_dict={self.states: candidates})
                buy_quantity = self.config.actions[max_q_ind]
            '''fetch!!!'''
            # time.sleep(self.sc.config.time_interval)
            new_data = self.sc.request_api()
            '''update my portfolio & get reward'''
            port_before_action = copy(self.portfolio)
            new_portfolio = self.update_portfolio_after_action(
                self.portfolio, buy_quantity)
            if (new_portfolio['current_stock_price'] *
                    new_portfolio['stock_quantity'] +
                    new_portfolio['fund']) != new_portfolio['total']:
                print(
                    new_portfolio['current_stock_price'] *
                    new_portfolio['stock_quantity'] +
                    new_portfolio['fund']), new_portfolio['total']
                print "*&&&*^*^&*^(&*&^%*&^%*&^"
            self.portfolio = new_portfolio
            self.current_state = self.du.preprocess_state(
                self.current_data, self.portfolio)
            new_price = new_data[self.config.current_ind]
            new_portfolio = self.update_portfolio_after_fetch_price(
                new_portfolio, new_price)
            assert (new_portfolio['current_stock_price'] *
                    new_portfolio['stock_quantity'] +
                    new_portfolio['fund']) == new_portfolio['total']

            reward = self.calc_reward(new_portfolio, port_before_action)
            print "################### reward : ", reward, "####################"
            '''now current state is a state where price is old while action has been performed'''
            '''new_state is a state where price is new and with new portfolio, but has not made furthur action yet'''
            self.memories.append(
                (self.current_state, reward, new_data, new_portfolio))
            '''update data and portfolio'''
            self.current_data = new_data
            self.portfolio = new_portfolio
            print "action taken: ", buy_quantity
            print "current portfolio: ", new_portfolio
            print "total: ", new_portfolio['total']
            print "histroy: ", len(self.memories)
            print "wait for next tick ................\n"

            if len(self.memories) > 2 * self.config.BATCH_SIZE:
                random.shuffle(self.memories)
                batch = self.memories[:self.config.BATCH_SIZE]
                '''batch BS*I'''
                feed = self.build_feed_dict(batch)
                scores1, scores2, losses, loss, _ = session.run(
                    [self.predict_scores, self.viewing_scores, self.losses,
                     self.loss, self.train_op],
                    feed_dict=feed)
                print loss
Example #26
class Classifier:
    def __init__(self):
        self.config = Config()
        self.du = DataUtil()
        self.inv_map = {v: k for k, v in self.config.class_dict.iteritems()}

    def run_trainer(self):
        #load train set
        self.raw_sent, self.data, self.raw_labels = self.du.load_data_set()
        #load test set
        self.test_sent, self.test_data, self.test_raw_labels = self.du.load_data_set(
            'test')
        #shuffle train set
        self.raw_sent, self.data, self.raw_labels = shuffle(
            self.raw_sent, self.data, self.raw_labels)
        #train/test classes in integer
        self.classes = self.du.convert_raw_label_to_class(
            self.raw_labels, self.config.class_dict)
        self.test_classes = self.du.convert_raw_label_to_class(
            self.test_raw_labels, self.config.class_dict)
        #convert to one hot catagory
        self.test_labels = keras.utils.to_categorical(self.test_classes,
                                                      self.du.config.n_classes)
        self.labels = keras.utils.to_categorical(self.classes,
                                                 self.du.config.n_classes)

        #compile model
        self.model = self.build_model()
        self.model.compile(
            loss=keras.losses.categorical_crossentropy,
            optimizer=keras.optimizers.RMSprop(lr=self.config.lr),
            metrics=['accuracy'])
        self.train()
        self.evaluate()
        self.model.save(self.config.final_round_model_path)

    def run_prediction(self, sentence):
        emb_sent = self.du.prepare_predict_data(sentence)
        emb_sent = emb_sent.reshape(
            [1, self.config.max_sent_len, self.config.emb_dim])
        self.config.dropout = 0
        self.model = load_model(self.config.dl_model_path)
        pred, prob = self.predict(emb_sent)
        print pred
        pred = pred[0]
        prob = prob[0]
        response = self.inv_map[pred]
        return response, prob

    def build_model(self):
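        # 1-D convolution over the embedded sentence, an LSTM over the conv features, then a softmax Dense layer with L2 activity regularization.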
        input = Input(shape=(self.config.max_sent_len, self.config.emb_dim))
        conv_output = Conv1D(self.config.n_filter,
                             kernel_size=self.config.filter_size,
                             strides=1,
                             activation="relu")(input)
        lstm_output = LSTM(self.config.lstm_dim,
                           dropout=self.config.dropout)(conv_output)
        out = Dense(self.config.n_classes,
                    activity_regularizer=l2(self.config.l2_rate),
                    activation="softmax")(lstm_output)

        model = Model(inputs=[input], outputs=[out])
        return model

    def train(self):

        check = keras.callbacks.ModelCheckpoint(self.du.config.dl_model_path,
                                                monitor='val_acc',
                                                verbose=1,
                                                save_best_only=True,
                                                save_weights_only=False,
                                                mode='auto',
                                                period=1)
        self.model.fit(self.data,
                       self.labels,
                       batch_size=self.du.config.batch_size,
                       epochs=self.du.config.epochs,
                       verbose=1,
                       validation_split=0.1,
                       callbacks=[check])

    def predict(self, test_data):
        probs = self.model.predict(test_data)
        predictions = np.argmax(probs, axis=-1)
        return predictions, probs

    def evaluate(self):
        # self.config.dropout = 0
        # predictions, _ = self.predict(self.data)
        # comparison = (predictions == self.classes)
        # acc = np.mean(comparison)
        print self.model.evaluate(self.test_data, self.test_labels)
        self.generate_prediction_results(self.test_data, self.test_raw_labels,
                                         self.test_sent)

    def evaluate_on_model(self, model_path):
        self.test_sent, self.test_data, self.test_raw_labels = self.du.load_data_set(
            'test')
        self.test_classes = self.du.convert_raw_label_to_class(
            self.test_raw_labels, self.config.class_dict)
        self.test_labels = keras.utils.to_categorical(self.test_classes,
                                                      self.du.config.n_classes)

        self.model = load_model(model_path)
        print self.model.evaluate(self.test_data, self.test_labels)
        self.generate_prediction_results(self.test_data, self.test_raw_labels,
                                         self.test_sent)

    def generate_prediction_results(self, data, raw_labels, raw_sents):
        with open(self.config.result_path, 'w') as f:
            predictions = self.model.predict(data)
            predictions = np.argmax(predictions, axis=-1)
            '''argsort'''
            # top_n = predictions.argsort(axis=-1, order=)
            for i in range(len(predictions)):
                sent = raw_sents[i]
                ground_truth = raw_labels[i]
                pred = self.inv_map[predictions[i]]
                res = '\t'.join([sent, ground_truth, pred]) + '\n'
                f.write(res)
        print "file written"
Example #27
WORDVEC_FILEPATH = "wordvecs.txt"
TAGGED_NEWS_FILEPATH = "news_tagged_data.txt"
SAVED_MODEL_FILEPATH = "model_blstm_150_150_ep50.h5"
NEWS_DATA_FILEPATH = "news_tagged_data.txt"
EXTRA_LOGGING = False
PRINT_BAD = True

if __name__ == "__main__":

    if len(sys.argv) > 1:
        n_samples = int(sys.argv[1])
    else:
        n_samples = sys.maxint

    reader = DataUtil(WORDVEC_FILEPATH, TAGGED_NEWS_FILEPATH)
    nermodel = NERModel(reader)

    nermodel.load(SAVED_MODEL_FILEPATH)

    with open(NEWS_DATA_FILEPATH, 'r') as f:
        cur_sentence = []
        cur_tags = []

        samples_read = 0
        total_frames = 0
        total_matched_frames = 0
        total_correct_predictions = 0

        for line in f:
            line = line.strip()
Example #28
    def my_print(*lists):
        """
        Refuse to print if the last element of the argument list is False.
        """

        print(DataUtil.decode(lists))
# Setup flask server
server = flask.Flask(__name__)
app = dash.Dash("Charge_Tracker",
                external_stylesheets=[
                    "https://codepen.io/chriddyp/pen/bWLwgP.css"])
cache = Cache(app.server, config={
    'CACHE_TYPE': 'filesystem',
    'CACHE_DIR': 'cache-directory'
})
TIMEOUT = 2
app.config['suppress_callback_exceptions'] = True

## GLOBAL DEFINITIONS 


query_helper = DataUtil()

queries_sig={'signames_ecg':[], 'signals':[]}
queries_name={'signames_ecg':[]}
queries_name={'signames_ecg':['101', '103']}
queries_id={'id':['1']}

evnt_df = query_helper.getAllEvents()
# sig_df = query_helper.getECGSignal('50')
# x=list(range(len(sig_df.get_value(0,'ecg'))))
# y_sig= sig_df.get_value(0,'ecg')
# trace = []
# trace.append(go.Scatter(x=x, y=y_sig, mode='lines',
#                 marker={'size': 8, "opacity": 0.6, "line": {'width': 0.5}}, ))
# print("FIRST TRACE")
# print(trace)
Example #30
class Experiment:
    def __init__(self, config, sequence_length=20, reload_data=True):
        # Hyper Parameters
        self.sequence_length = sequence_length
        self.hidden_size = 128
        self.num_layers = 1

        self.config = config
        self.data = DataUtil(data_dir=config.data_dir,
                             vocab_dir=config.vocab_dir,
                             split_by_sentence=not config.split_by_section,
                             skip_list=config.skip_list)

        if not self.config.filtered:
            self.data.make_dir(self.config.output_dir + "/models/")

        if reload_data:
            for ds in self.config.textbook_data_sets:
                self.data.load_textbook_train_dev_data(
                    config.data_dir + 'medlit/train/' + ds,
                    config.data_dir + 'medlit/dev/' + ds)
            # train
            self.data.load_i2b2_train_data(train_base_dir=config.data_dir +
                                           '/i2b2_ehr/')
            # test
            self.data.load_test_data(ref_base_dir=config.data_dir +
                                     '/i2b2_ehr/')
            # dev
            self.data.load_test_data(ref_base_dir=config.data_dir +
                                     '/i2b2_ehr/',
                                     type='dev')

        else:
            self.data.load_split_data()

        self.data.make_dir(self.config.output_dir)

        log_file_name = strftime("log_%Y_%m_%d_%H_%M_%S", localtime())
        self.logger = self.setup_logger(self.config.output_dir +
                                        '/%s.txt' % log_file_name)

        if exists(config.vocab_dir + "/NaturalLang.pkl") and not reload_data:
            print("Loading vocab")
            self.data.load_vocab()
        else:
            print("Building vocab")
            self.data.build_vocab(self.data.textbook_train_data,
                                  pretrain=False)

        self.model = None
        self.use_cuda = torch.cuda.is_available()

        if not self.config.filtered:
            if self.config.model_type == 'gru_rnn':
                self.model = GRURNN(
                    self.config.embedding_size, self.hidden_size,
                    self.data.input_lang, self.data.pretrained_embeddings,
                    self.num_layers, self.data.input_lang.n_words,
                    self.data.output_lang.n_words, self.config.dropout)
            elif self.config.model_type == 'attn_gru_rnn':
                self.model = AttentionGRURNN(
                    self.config.embedding_size, self.hidden_size,
                    self.data.input_lang, self.data.pretrained_embeddings,
                    self.num_layers, self.data.input_lang.n_words,
                    self.data.output_lang.n_words, self.config.dropout)
            elif self.config.model_type == 'cnn':
                self.model = CNN(self.data.input_lang.n_words,
                                 self.data.output_lang.n_words,
                                 self.config.embedding_size,
                                 self.data.input_lang,
                                 self.data.pretrained_embeddings,
                                 self.config.dropout)

            self.epoch_start = 1

            if self.use_cuda:
                self.model = self.model.cuda()

    def setup_logger(self, log_file, level=logging.INFO):
        logger = logging.getLogger()
        logger.setLevel(level)
        handler = logging.FileHandler(log_file)
        formatter = logging.Formatter('%(asctime)s %(message)s')
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        return logger

    def log(self, info):
        print(info)
        if self.logger is not None:
            self.logger.info(info)

    def as_minutes(self, s):
        m = math.floor(s / 60)
        s -= m * 60
        return '%dm %ds' % (m, s)

    def time_since(self, since, percent):
        now = time.time()
        s = now - since
        es = s / percent
        rs = es - s
        return '%s (- %s)' % (self.as_minutes(s), self.as_minutes(rs))

    def train(self,
              data_setup,
              save_model_dir,
              print_every=20,
              plot_every=100,
              learning_rate=0.001):
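        # Train on a mixture of datasets given by data_setup (dataset name -> sampling ratio); supports optional downsampling, class-weighted loss, and transfer-learning re-initialization for the CNN.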
        start = time.time()

        plot_losses = []
        print_loss_total = 0
        plot_loss_total = 0

        if self.config.model_type == 'cnn' and self.config.transfer_learning:
            self.model.output_size = self.data.output_lang.n_words
            if self.config.reuse_embedding_layer_only:
                self.model.init_conv1_layer()
                self.model.init_conv2_layer()
                self.model.init_fc_layers()
            if self.config.reuse_embedding_conv1_layers:
                self.model.init_conv2_layer()
                self.model.init_fc_layers()
            if self.use_cuda:
                self.model = self.model.cuda()
        elif self.config.transfer_learning:
            self.model.freeze_layer("fc1")

        if self.config.optimizer == 'sgd':
            optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                         self.model.parameters()),
                                  lr=learning_rate,
                                  momentum=0.9)
        elif self.config.optimizer == 'adam':
            optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                          self.model.parameters()),
                                   lr=learning_rate)

        self.log('data_setup:' + str(data_setup))

        train_data = []

        for data_set in data_setup:
            data_ratio = data_setup[data_set]
            data = self.data.get_dataset(data_set)
            train_data += self.data.get_data_subset(data, data_ratio)
        print('len train_data:', len(train_data))
        print('training data examples:', train_data[:5])

        if self.config.downsampling:
            train_data = self.data.downsampling(
                train_data, number_samples=self.config.downsampling_size)

        num_train_data = len(train_data)
        print('num_train_data:', num_train_data)
        print('train_data:', train_data[:10])
        num_batches = int(
            np.ceil(num_train_data / float(self.config.batch_size)))
        self.log('num_batches: ' + str(num_batches))

        if self.config.weighted_loss:
            loss_weight = self.data.get_label_weight(train_data)
            if self.use_cuda:
                loss_weight = loss_weight.cuda()
        else:
            loss_weight = None

        max_dev_acc = 0

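        # Main epoch loop; the best dev accuracy decides which checkpoint is kept as best_model.pt.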
        for epoch in range(self.epoch_start, self.config.num_train_epochs + 1):
            batch_start = time.time()
            correct = 0
            total = 0

            random.shuffle(train_data)

            self.model.train()

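            # Visit the batches in a random order (a random permutation of the batch indices).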
            for cnt, i in enumerate(random.sample(range(num_batches),
                                                  num_batches),
                                    start=1):
                inputs, seq_lengths, targets, batch = self.data.construct_batch(
                    self.config.batch_size * i,
                    self.config.batch_size * (i + 1),
                    train_data,
                    fixed_length=True
                    if self.config.model_type == 'cnn' else False)

                if self.use_cuda:
                    inputs = inputs.cuda()
                    targets = targets.cuda()

                optimizer.zero_grad()

                if self.config.model_type == 'cnn':
                    outputs = self.model(inputs)  # for CNN
                elif self.config.model_type == 'attn_gru_rnn':
                    outputs = self.model(inputs, self.data.input_lang,
                                         seq_lengths)
                else:
                    outputs = self.model(inputs, seq_lengths)

                _, predicted = torch.max(outputs.data, dim=1)

                batch_correct = (predicted == targets.data).sum().item()
                total += targets.data.size(0)
                correct += batch_correct
                batch_train_acc = 100.0 * batch_correct / targets.data.size(0)

                loss = F.cross_entropy(outputs, targets, weight=loss_weight)
                loss.backward()
                optimizer.step()
                self.log(
                    "Epoch %d, batch %d / %d: train loss = %f, train accuracy = %f %%"
                    % (epoch, cnt, num_batches, loss.data.item(),
                       batch_train_acc))

                print_loss_total += loss.data.item()
                plot_loss_total += loss.data.item()

                if cnt % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    self.log('Average batch loss: %s' % str(print_loss_avg))
                    self.log(
                        self.time_since(batch_start, cnt * 1.0 / num_batches))

                if cnt % plot_every == 0:
                    plot_loss_avg = plot_loss_total / plot_every
                    plot_losses.append(plot_loss_avg)
                    plot_loss_total = 0
            self.log('Epoch %d is done' % epoch)
            self.log('Epoch %d Train Accuracy: %f %%' %
                     (epoch, 100.0 * correct / total))
            self.log(
                self.time_since(start,
                                epoch * 1.0 / self.config.num_train_epochs))

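            # Evaluate on the configured tuning (dev) sets after each epoch.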
            datasets = []
            print("TUNING SET IS: " + str(self.config.tuning_set))
            if 'ALL' in self.config.tuning_set or 'MedLit' in self.config.tuning_set:
                self.log("Test on MedLit Dev: ")
                datasets.append(self.data.TEXTBOOK_DEV)
            if 'ALL' in self.config.tuning_set or 'i2b2' in self.config.tuning_set:
                self.log("Test on i2b2 EHR Dev: ")
                datasets.append(self.data.i2b2_DEV)

            self.log("Tuning on:")
            self.log(datasets)
            dev_acc = self.test(datasets=datasets,
                                epoch=epoch,
                                calc_confusion_matrix=True)

            # save intermediate training results
            if dev_acc > max_dev_acc:
                save_path = save_model_dir + "/models/best_model.pt"
                torch.save(self.model, save_path)
                self.log('Best Model saved in file: %s' % save_path)
                max_dev_acc = dev_acc

                if 'i2b2' in self.config.test_set:
                    self.log("Test on i2b2 Test:")
                    self.test(datasets=[self.data.i2b2_TEST],
                              epoch=epoch,
                              print_test_results=True)

            save_path = save_model_dir + "/models/epoch_" + str(epoch) + ".pt"
            torch.save(self.model, save_path)
            self.log('Model saved in file: %s' % save_path)

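    # Evaluate the current (or reloaded) model on the given datasets, log accuracy/precision/recall/F1,
    # and optionally write a confusion matrix, classification report, and per-example predictions.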
    def test(self,
             datasets,
             epoch=-1,
             calc_confusion_matrix=True,
             generate_reports=True,
             print_test_results=False,
             print_examples=False):
        if self.model is None:
            self.log('Restoring model from ' + self.config.reload_model_file)

            if torch.cuda.is_available():
                self.model = torch.load(self.config.reload_model_file)
            else:
                self.model = torch.load(self.config.reload_model_file,
                                        map_location='cpu')

            self.log('Model is restored')

        self.model.eval()

        start = time.time()

        data = []

        dataset_name = '_'.join(datasets)

        for dataset in datasets:
            data.extend(self.data.get_dataset(dataset))

        if self.config.downsampling:
            # Downsample the combined evaluation data once.
            data = self.data.downsampling(data, number_samples=500)

        num_test_data = len(data)
        self.log("num_test_data: " + str(num_test_data))
        num_batches = int(
            np.ceil(num_test_data / float(self.config.batch_size)))
        self.log('num_batches: ' + str(num_batches))

        correct = 0
        total = 0
        loss = 0.0
        labels = []
        predictions = []
        examples = []

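        # Run the model batch by batch and collect labels and predictions.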
        for i in range(num_batches):

            inputs, seq_lengths, targets, batch = self.data.construct_batch(
                self.config.batch_size * i,
                self.config.batch_size * (i + 1),
                data,
                fixed_length=True
                if self.config.model_type == 'cnn' else False)

            if self.use_cuda:
                inputs = inputs.cuda()
                targets = targets.cuda()

            if self.config.model_type == 'cnn':
                outputs = self.model(inputs)  # for CNN
            elif self.config.model_type == 'attn_gru_rnn':
                outputs = self.model(inputs, self.data.input_lang, seq_lengths)
            else:
                outputs = self.model(inputs, seq_lengths)

            _, predicted = torch.max(outputs.data, dim=1)

            ordered = torch.sort(outputs.data)

            total += targets.data.size(0)
            correct += (predicted == targets.data).sum().item()
            labels.extend(targets.cpu().data.numpy().tolist())
            predictions.extend(predicted.cpu().numpy().tolist())

            loss += F.cross_entropy(outputs, targets).data.item()

            if print_examples or print_test_results:
                for k, d in enumerate(batch):
                    examples.append([
                        d[0],
                        d[1].replace('\r',
                                     ' ').replace('\n',
                                                  ' ').replace('\t', ' '),
                        d[2], d[3],
                        str(d[4]),
                        str(d[5]),
                        self.data.output_lang.get_word(
                            predicted[k].cpu().data.item()),
                        self.data.output_lang.get_word(
                            int(ordered[1][k][outputs.data.shape[1] - 2])),
                        self.data.output_lang.get_word(
                            int(ordered[1][k][outputs.data.shape[1] - 3]))
                    ])

        if print_examples:
            self.data.make_dir(self.config.output_dir + '/test_saved')
            self.log("Save examples to: " + self.config.output_dir +
                     '/test_saved')
            with open(
                    self.config.output_dir + '/test_saved/' + dataset_name +
                    '_epoch_%d.txt' % epoch, 'w') as f:
                f.write(
                    "#\tSentence\tTrue\tHeader String\tLocation\tLine\tPrediction 1\tPrediction 2\tPrediction 3\n"
                )
                for e in examples:
                    f.write('\t'.join(e) + '\n')

        self.log('Epoch %d ' % epoch + 'Time used: ' +
                 str(time.time() - start))
        self.log('Epoch %d ' % epoch + 'Test loss: %f' % loss)
        self.log('Epoch %d ' % epoch + 'Test Accuracy: %f %%' %
                 (100.0 * correct / total))
        self.log(
            'Epoch %d ' % epoch + 'Test Precision: %f %%' %
            (100.0 * precision_score(labels, predictions, average='micro')))
        self.log('Epoch %d ' % epoch + 'Test Recall: %f %%' %
                 (100.0 * recall_score(labels, predictions, average='micro')))
        self.log('Epoch %d ' % epoch + 'Test F1 Score: %f %%' %
                 (100.0 * f1_score(labels, predictions, average='micro')))

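        # Map label ids back to label strings for the confusion matrix and classification report.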
        text_labels = [self.data.output_lang.get_word(l) for l in labels]
        text_preds = [self.data.output_lang.get_word(l) for l in predictions]
        label_set = sorted(list(set(text_labels)))
        if calc_confusion_matrix:
            cm = confusion_matrix(text_labels, text_preds, labels=label_set)
            self.log('confusion_matrix for epoch %d: ' % epoch)
            header = '\t'.join(label_set)
            self.log(header)
            for i, row in enumerate(list(cm)):
                row = [str(num) for num in row]
                self.log('\t'.join([label_set[i]] + row))
            np.savetxt(self.config.output_dir + '/' + dataset_name +
                       '_confusion_matrix_epoch_%d.csv' % epoch,
                       cm,
                       fmt='%d',
                       header=header,
                       delimiter=',')
            self.log('Saved confusion matrix!')

        if generate_reports:
            reports = classification_report(text_labels,
                                            text_preds,
                                            labels=label_set,
                                            target_names=label_set,
                                            digits=4)
            self.log(reports)
            with open(
                    self.config.output_dir + '/' + dataset_name +
                    '_report_epoch_%d.txt' % epoch, 'w') as f:
                f.write(reports)
            self.log('Saved report!')

        if print_test_results:
            with open(
                    self.config.output_dir + '/' + dataset_name +
                    '_predictions_epoch_%d.json' % epoch, 'w') as f:
                json.dump(examples, f, indent=4, sort_keys=True)
        return 100.0 * correct / total

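    # Classify a single (header, text) example and return whether the prediction matches the target.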
    def test_one(self, header, text):
        if self.model is None:
            self.log('Restoring model from ' + self.config.reload_model_file)

            if torch.cuda.is_available():
                self.model = torch.load(self.config.reload_model_file)
            else:
                self.model = torch.load(self.config.reload_model_file,
                                        map_location='cpu')
            self.log('Model is restored')

        # Switch to eval mode (and move to the GPU when available) before scoring.
        self.model.eval()
        if self.use_cuda:
            self.model = self.model.cuda()

        inputs, seq_lengths, targets = self.data.construct_one(
            header,
            text,
            fixed_length=True if self.config.model_type == 'cnn' else False)

        if self.use_cuda:
            inputs = inputs.cuda()
            targets = targets.cuda()

        if self.config.model_type == 'cnn':
            outputs = self.model(inputs)  # for CNN
        elif self.config.model_type == 'attn_gru_rnn':
            outputs = self.model(inputs, self.data.input_lang, seq_lengths)
        else:
            outputs = self.model(inputs, seq_lengths)

        _, predicted = torch.max(outputs.data, dim=1)

        return predicted.cpu().numpy().tolist() == targets.cpu().data.numpy(
        ).tolist()
Beispiel #31
0
def main():

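    # Fine-tune BERT for clinical section classification on MedLit and i2b2 data, then evaluate.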
    data_set_names = {
        'WikipediaMedical': 'WM',
    }

    if args.data_sets == ['EHR']:
        args.textbook_data_sets = []
    else:
        args.textbook_data_sets = args.data_sets

    # if len(args.textbook_data_sets) == 0:
    #     base_dir = args.global_dir + '/' + '_'.join(args.data_sets)
    # else:
    #     base_dir = args.global_dir + '/' + '_'.join([data_set_names[ds] for ds in args.textbook_data_sets])

    data = DataUtil(data_dir=args.data_dir, vocab_dir=args.vocab_dir,
                    split_by_sentence=not args.split_by_section, skip_list=args.skip_list)
    if args.reload_data:
        # if self.config.textbook_data_ratio > 0:
        for ds in args.textbook_data_sets:
            data.load_textbook_train_dev_data(
                args.ref_data_dir + 'medlit/' + args.data_type + '/train/' + ds,
                args.ref_data_dir + 'medlit/' + args.data_type + '/dev/' + ds)
        # train
        data.load_i2b2_train_data(train_base_dir=args.ref_data_dir + 'i2b2_ehr/' + args.data_type)
        # test
        data.load_test_data(ref_base_dir=args.ref_data_dir + 'i2b2_ehr/' + args.data_type, i2b2=True)
        # dev
        data.load_test_data(ref_base_dir=args.ref_data_dir + 'i2b2_ehr/' + args.data_type, i2b2=True,
                                 type='dev')

    else:
        data.load_split_data()

    logger.info("MedLit Training data: " + str(len(data.textbook_train_data)))
    logger.info("MedLit Dev data: " + str(len(data.textbook_dev_data)))
    logger.info("i2b2 Training data: " + str(len(data.i2b2_train_data)))
    logger.info("i2b2 Dev data: " + str(len(data.i2b2_dev_data)))
    logger.info("i2b2 Test data: " + str(len(data.i2b2_test_data)))

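    # Device and (optional) distributed training setup.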
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # task_name = args.task_name.lower()
    #
    # if task_name not in processors:
    #     raise ValueError("Task not found: %s" % (task_name))

    #processor = processors[task_name]()
    num_labels = 11
    label_list = ['Allergies', 'Assessment and Plan', 'Chief Complaint', 'Examination', 'Family History', 'Findings',
                  'Medications', 'Past Medical History', 'Personal and Social history', 'Procedures',
                  'Review of Systems']

    logger.info("Num Labels: " + str(num_labels))
    logger.info("Labels: " + str(label_list))

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

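    # Collect training examples from the configured datasets and derive the number of optimization steps.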
    train_examples = []
    num_train_steps = None
    if args.do_train:
        for data_name in args.train_data:
            if data_name == "i2b2" or data_name == "ALL":
                if args.i2b2_data_ratio != 1:
                    train_examples.extend(data.get_data_subset(data.i2b2_train_data, args.i2b2_data_ratio))
                else:
                    train_examples.extend(data.i2b2_train_data)
            if data_name == "MedLit" or data_name == "ALL":
                train_examples.extend(data.textbook_train_data)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

    logger.info("Combined Train data: " + str(len(train_examples)))

    # Prepare model
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(
                                                              args.local_rank),
                                                          num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=t_total)

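    # Fine-tuning loop: track the best dev F1 and keep the corresponding model.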
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    best_f1 = 0
    best_model = model
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num train examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        dev_examples = []
        name = ""
        for data_name in args.tuning_set:
            name += data_name + "_"
            if data_name == "i2b2" or data_name == "ALL":
                random.shuffle(data.i2b2_dev_data)
                dev_examples.extend(data.i2b2_dev_data[:500])
            if data_name == "MedLit" or data_name == "ALL":
                random.shuffle(data.textbook_dev_data)
                dev_examples.extend(data.textbook_dev_data[:500])
        dev_features = convert_examples_to_features(
            dev_examples, label_list, args.max_seq_length, tokenizer)

        logger.info(" Num dev examples: " + str(len(dev_examples)))

        logger.info("EVAL on Pretrained model only: " + args.bert_model)
        run_eval(args, model, device, dev_examples, dev_features, 0, global_step,
                 name, label_list, save_results=False)

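        # Standard BERT fine-tuning with gradient accumulation and linear learning-rate warmup.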
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

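            # Evaluate on the dev set after each epoch and checkpoint whenever F1 improves.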
            loss = tr_loss / nb_tr_steps
            f1 = run_eval(args, model, device, dev_examples, dev_features, loss, global_step,
                          name, label_list, save_results=False)
            logger.info(str(epoch) + "/" + str(args.num_train_epochs) + ". loss: " + str(loss) + ", F1: " + str(f1))
            if f1 > best_f1:
                best_f1 = f1
                best_model = model
                output_model_file = os.path.join(args.output_dir, "pytorch_model" + str(epoch) + ".bin")
                logger.info("Saving best model with F1: " + str(best_f1))
                model_to_save = best_model.module if hasattr(
                    best_model, 'module') else best_model  # Only save the model itself
                torch.save(model_to_save.state_dict(), output_model_file)

    # Save a trained model
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")

    if args.do_train:
        logger.info("Saving best model with F1: " + str(best_f1))
        model_to_save = best_model.module if hasattr(best_model, 'module') else best_model  # Only save the model itself
        torch.save(model_to_save.state_dict(), output_model_file)
    else:
        model_state_dict = torch.load(os.path.join(args.bert_model, "pytorch_model.bin"))
        best_model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict,
                                                              num_labels=num_labels)
    best_model.to(device)

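    # Final evaluation of the best model on the configured test sets.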
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        loss = tr_loss / nb_tr_steps if args.do_train else None

        if "ALL" in args.test_set or "MedLit" in args.test_set:
            eval_examples = data.textbook_dev_data
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)

            run_eval(args, best_model, device, eval_examples, eval_features, loss, global_step, "medlit_dev",
                     label_list, print_examples=True)

        if "ALL" in args.test_set or "i2b2" in args.test_set:
            eval_examples = data.i2b2_test_data
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)

            run_eval(args, best_model, device, eval_examples, eval_features, loss, global_step, "i2b2_test",
                     label_list, print_examples=True)
Beispiel #32
0
    def __init__(self, config, sequence_length=20, reload_data=True):
        # Hyper Parameters
        self.sequence_length = sequence_length
        self.hidden_size = 128
        self.num_layers = 1

        self.config = config
        self.data = DataUtil(data_dir=config.data_dir,
                             vocab_dir=config.vocab_dir,
                             split_by_sentence=not config.split_by_section,
                             skip_list=config.skip_list)

        if not self.config.filtered:
            self.data.make_dir(self.config.output_dir + "/models/")

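        # Either reload the raw MedLit/i2b2 splits from disk or load previously saved splits.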
        if reload_data:
            for ds in self.config.textbook_data_sets:
                self.data.load_textbook_train_dev_data(
                    config.data_dir + 'medlit/train/' + ds,
                    config.data_dir + 'medlit/dev/' + ds)
            # train
            self.data.load_i2b2_train_data(train_base_dir=config.data_dir +
                                           '/i2b2_ehr/')
            # test
            self.data.load_test_data(ref_base_dir=config.data_dir +
                                     '/i2b2_ehr/')
            # dev
            self.data.load_test_data(ref_base_dir=config.data_dir +
                                     '/i2b2_ehr/',
                                     type='dev')

        else:
            self.data.load_split_data()

        self.data.make_dir(self.config.output_dir)

        log_file_name = strftime("log_%Y_%m_%d_%H_%M_%S", localtime())
        self.logger = self.setup_logger(self.config.output_dir +
                                        '/%s.txt' % log_file_name)

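        # Reuse a cached vocabulary when available; otherwise build it from the MedLit training data.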
        if exists(config.vocab_dir + "/NaturalLang.pkl") and not reload_data:
            print("Loading vocab")
            self.data.load_vocab()
        else:
            print("Building vocab")
            self.data.build_vocab(self.data.textbook_train_data,
                                  pretrain=False)

        self.model = None
        self.use_cuda = torch.cuda.is_available()

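        # Instantiate the configured architecture: GRU RNN, attention GRU RNN, or CNN.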
        if not self.config.filtered:
            if self.config.model_type == 'gru_rnn':
                self.model = GRURNN(
                    self.config.embedding_size, self.hidden_size,
                    self.data.input_lang, self.data.pretrained_embeddings,
                    self.num_layers, self.data.input_lang.n_words,
                    self.data.output_lang.n_words, self.config.dropout)
            elif self.config.model_type == 'attn_gru_rnn':
                self.model = AttentionGRURNN(
                    self.config.embedding_size, self.hidden_size,
                    self.data.input_lang, self.data.pretrained_embeddings,
                    self.num_layers, self.data.input_lang.n_words,
                    self.data.output_lang.n_words, self.config.dropout)
            elif self.config.model_type == 'cnn':
                self.model = CNN(self.data.input_lang.n_words,
                                 self.data.output_lang.n_words,
                                 self.config.embedding_size,
                                 self.data.input_lang,
                                 self.data.pretrained_embeddings,
                                 self.config.dropout)

            self.epoch_start = 1

            if self.use_cuda:
                self.model = self.model.cuda()