Beispiel #1
0
    def detect_lang(self, text):
        """Guess the single most likely language of ``text``.

        The text is wrapped in an evaluation ``Dataset`` and pushed through
        the model batch by batch. Each counted position casts one vote for
        the label the model predicted there.

        Args:
            text: Text to classify.

        Returns:
            A ``(language, share)`` tuple: the ``self.langs`` entry of the
            label with the most votes, and the fraction of all votes that
            label received (``0`` when nothing was counted).
        """
        datafile = Dataset(self.params,
                           None,
                           os.path.join('data', self.params.get('corpus_name'),
                                        'train'),
                           text_to_eval=text)

        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
        # type it used to alias.
        guesses = np.zeros(self.train_set.vocab_size()[1], dtype=int)
        total = 0
        while not datafile.is_finished():
            batch_xs, _, lengths = datafile.get_batch()

            outs = self.model.eval(self.session, batch_xs, lengths)

            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    # The first padded input ends the scan of column ``j``.
                    if batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break

                    predicted = outs[i][j]
                    guesses[predicted] += 1
                    total += 1

        best = np.argmax(guesses)
        acc = 0
        if total > 0:
            acc = float(guesses[best]) / float(total)

        return self.langs[datafile.get_target_name(best, type='orig')], acc
Beispiel #2
0
    def evaluate_string(self, text, print_per_character=False, languages=None):
        """Classify ``text`` and print the winning label with its vote share.

        Args:
            text: Text to classify.
            print_per_character: When true, the per-character traces are
                printed before the result.
                NOTE(review): the traces are never filled in, so this
                currently prints two empty lines.
            languages: Optional iterable of language identifiers. When
                given, a 0/1 mask over the target vocabulary is built from
                them and passed to ``model.eval`` as ``langs_mask``.
                Identifiers that map to ``UNK_ID`` are reported and skipped.

        Returns:
            None. ``[label_name, share]`` is printed instead.
        """
        eval_kwargs = {}
        if languages is not None:
            # ``np.int`` was removed in NumPy 1.24; use the builtin ``int``.
            langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=int)

            for lang in languages:
                lang_id = self.train_set.trg_vocab.get_id(lang)
                if lang_id == Vocabulary.Vocab.UNK_ID:
                    print("UNSUPPORTED LANGUAGE IN MODEL: " + lang)
                else:
                    langs_mask[lang_id] = 1
            eval_kwargs['langs_mask'] = langs_mask

        datafile = Dataset(self.params,
                           None,
                           "data/" + self.params.get("corpus_name") + "/train",
                           text_to_eval=text)

        guesses = np.zeros(self.train_set.vocab_size()[1], dtype=int)
        total = 0
        orig = ""
        classif = ""
        while not datafile.is_finished():
            dev_batch_xs, _, lengths = datafile.get_batch()

            # ``langs_mask`` is only passed when a language list was given.
            outs = self.model.eval(self.sess, dev_batch_xs, lengths,
                                   **eval_kwargs)

            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    # The first padded input ends the scan of column ``j``.
                    if dev_batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break

                    predicted = outs[i][j]
                    guesses[predicted] += 1
                    total += 1

        best = np.argmax(guesses)
        if print_per_character:
            print(orig)
            print(classif)

        accur = 0
        if total > 0:
            accur = float(guesses[best]) / float(total)

        print([datafile.get_target_name(best, type='name'), accur])
Beispiel #3
0
    def _init_train(self):
        '''
        Build both loaders that serve the "train" split.

        ``self.train_loader`` shuffles and uses ``config.batch_size``.
        ``self.eval_train_loader`` reads the split again with
        ``is_eval=True``, keeps the order and uses
        ``config.eval_batch_size``. Both collate with ``my_collate_fn``.
        '''
        shuffled_split = Dataset(self.dataset_reader.read_dataset("train"))
        self.train_loader = data.DataLoader(
            shuffled_split,
            batch_size=self.config.batch_size,
            shuffle=True,
            collate_fn=self.my_collate_fn)

        ordered_split = Dataset(
            self.dataset_reader.read_dataset("train", is_eval=True))
        self.eval_train_loader = data.DataLoader(
            ordered_split,
            batch_size=self.config.eval_batch_size,
            shuffle=False,
            collate_fn=self.my_collate_fn)
Beispiel #4
0
    def __init__(self,
                 sess,
                 trained_model=None,
                 params=None,
                 prepare_train_set=True):
        """Set up parameters, the training dataset, the labels and the model.

        Args:
            sess: Session passed to ``Model`` and used to restore weights.
            trained_model: Name of a saved model. When truthy, parameters
                are loaded from it and the weights are restored from
                ``models/<corpus_name>/<trained_model>``.
            params: Parameters to use when ``trained_model`` is not given.
            prepare_train_set: Whether to call ``prepare_data`` on the
                training set.
        """
        start = time.time()

        self.session = sess
        self.params = Parameters('PARAMS')

        if trained_model:
            self.params.load_params(trained_model)
            logging.info('Загружается модель {0}'.format(trained_model))
        else:
            self.params = params

        corpus_dir = os.path.join('data', self.params.get('corpus_name'))

        self.train_set = Dataset(self.params,
                                 os.path.join(corpus_dir, 'train'),
                                 only_eval=False)

        # Each labels line is "<key> <value>", split on the first space.
        self.langs = {}
        with open(os.path.join(corpus_dir, 'labels'), 'r') as f:
            for line in f:
                entry = line.strip()
                if not entry:
                    # A blank line used to crash with IndexError.
                    continue
                parts = entry.split(' ', 1)
                self.langs[parts[0]] = parts[1]

        if prepare_train_set:
            self.train_set.prepare_data(self.params.get('min_count'))

        self.model = Model(self.session, self.params,
                           self.train_set.vocab_size())

        if trained_model:
            self.model.saver.restore(
                self.session,
                os.path.join('models', self.params.get('corpus_name'),
                             trained_model))

        print('Модель подготовлена за {0} секунд'.format(
            int(time.time() - start)))
Beispiel #5
0
 def _init_dev(self):
     '''
     Create ``self.dev_loader`` for the "dev" split.

     The loader keeps the data order, batches by
     ``config.eval_batch_size`` and collates with ``my_collate_fn``.
     '''
     dev_split = Dataset(self.dataset_reader.read_dataset("dev"))
     loader_options = dict(batch_size=self.config.eval_batch_size,
                           shuffle=False,
                           collate_fn=self.my_collate_fn)
     self.dev_loader = data.DataLoader(dev_split, **loader_options)
Beispiel #6
0
 def _init_test(self):
     '''
     Create ``self.test_loader`` for the "test" split.

     The loader keeps the data order, batches by
     ``config.eval_batch_size`` and collates with ``my_collate_fn``.
     '''
     test_split = Dataset(self.dataset_reader.read_dataset("test"))
     loader_options = dict(batch_size=self.config.eval_batch_size,
                           shuffle=False,
                           collate_fn=self.my_collate_fn)
     self.test_loader = data.DataLoader(test_split, **loader_options)
def get_data(pre_win, post_win):
    """Collect step data around every event of the first loaded subject.

    For each event time, the window starts ``pre_win`` minutes before the
    event and spans ``pre_win + post_win`` minutes.

    Args:
        pre_win: Minutes before each event to include.
        post_win: Minutes after each event to include.

    Returns:
        ``(minutes, pids, bars)``: the window length, one participant id
        per event (always ``1``), and the result of
        ``get_steps_after_time`` for each event.
    """
    settings = setup(dataset='test', data_loc='./data/controlIntervention/', subject_n=3)
    dataset = Dataset(
        settings,
        trim=True,
        check=False,
        used_data_types=[DATA_TYPES.event, DATA_TYPES.fitbit],
    )

    minutes = post_win + pre_win
    participant = 0

    bars = [
        dataset.get_steps_after_time(
            event_time - timedelta(minutes=pre_win), minutes, participant)
        for event_time in dataset.subject_data[0].event_data.time
    ]

    # all events are same participant
    pids = [1 for _ in bars]

    return minutes, pids, bars
def get_fake_data(pre_win, post_win, minutes, pids, bars):
    """Subtract a day-earlier baseline from each real event window.

    For every event, a comparison window is sampled one day (plus
    ``pre_win`` minutes) before the event and subtracted from the matching
    entry of ``bars`` with ``list_subtract``.

    Args:
        pre_win: Minutes before each event that the window starts.
        post_win: Not used by this function.
        minutes: Window length passed to ``get_steps_after_time``.
        pids: Participant ids, returned unchanged.
        bars: Real per-event data, as produced by ``get_data``.

    Returns:
        ``(minutes, pids, diff_bars)``.
    """
    settings = setup(dataset='test', data_loc='./data/controlIntervention/', subject_n=3)
    dataset = Dataset(
        settings,
        trim=True,
        check=False,
        used_data_types=[DATA_TYPES.event, DATA_TYPES.fitbit],
    )

    participant = 0
    # Shift each event back a full day to get a random(ish) comparison time.
    fake_bars = [
        dataset.get_steps_after_time(
            event_time - timedelta(days=1, minutes=pre_win), minutes, participant)
        for event_time in dataset.subject_data[0].event_data.time
    ]

    diff_bars = [
        list_subtract(real, fake_bars[index])
        for index, real in enumerate(bars)
    ]

    return minutes, pids, diff_bars
Beispiel #9
0
    def __init__(self,
                 sess,
                 params,
                 trained_model=False,
                 prepare_train_set=True):
        """Create the training dataset and the model, optionally restoring it.

        Args:
            sess: Session passed to ``Model`` and to the saver.
            params: Parameter object; ``corpus_name`` and ``min_count`` are
                read from it.
            trained_model: When truthy, passed to ``saver.restore`` to load
                saved weights.
            prepare_train_set: Whether to call ``prepare_data`` on the
                training set.
        """
        started_at = time.time()
        self.sess = sess
        self.params = params

        train_path = "data/" + self.params.get("corpus_name") + "/train"
        self.train_set = Dataset(self.params,
                                 train_path,
                                 None,
                                 only_eval=False)

        if prepare_train_set:
            min_count = self.params.get("min_count")
            self.train_set.prepare_data(min_count)

        self.model = Model(sess, self.params, self.train_set.vocab_size())

        if trained_model:
            self.model.saver.restore(sess, trained_model)

        elapsed = str(int(time.time() - started_at))
        print("Модель подготовлена за " + elapsed + " секунд.")
Beispiel #10
0
def test(model, test_dataloader, device, distance):
    """Evaluate ``model`` on ``test_dataloader`` and record the mean loss.

    The average of all non-zero batch losses is appended to the module-level
    ``test_losses`` list and printed.

    Args:
        model: Network to evaluate; put into eval mode here.
        test_dataloader: Iterable of ``(spectrograms, targets, input_lens,
            target_lens, word_wise_target)`` batches.
        device: Device the inputs and padding are moved to.
        distance: Loss callable invoked as
            ``distance(output, targets, input_lengths, target_lengths)``.
    """
    model.eval()
    average_meter = AverageMeter()
    log_softmax = nn.LogSoftmax(dim=2)

    with torch.no_grad():
        for batch in test_dataloader:
            spectrograms, targets, _, _, word_wise_target = batch
            spectrograms, targets = Dataset.pad_batch(
                spectrograms=list(spectrograms),
                targets=list(targets)
            )
            spectrograms = spectrograms.to(device)

            # ==== forward ====
            # NOTE(review): ``this_model_train=True`` is passed during
            # evaluation as well; confirm this is intended.
            output = log_softmax(model(spectrograms, this_model_train=True))

            # adjust word wise targets: flatten to one column, then
            # transpose in place to a single row
            adjusted_targets = torch.stack([
                torch.Tensor([word_index])
                for target in word_wise_target
                for word_index in target
            ])
            adjusted_targets.transpose_(1, 0)

            tensor_len_delta = adjusted_targets.shape[1] - output.shape[0]
            if tensor_len_delta > 0:
                # Pad along dim 0 with zeros matching the remaining
                # dimensions and dtype of ``output``. The shape used to be
                # hard-coded to (delta, 1, 9896), which only worked for
                # batch size 1 and exactly 9896 classes.
                padding = torch.zeros(tensor_len_delta, *output.shape[1:],
                                      dtype=output.dtype).to(device)
                output = torch.cat((output, padding))

            loss = distance(output, adjusted_targets, (output.shape[0],), (adjusted_targets.shape[1],))

            # ==== log ====
            loss_value = loss.item()
            if loss_value != 0:
                average_meter.step(loss=loss_value)

    average_loss = average_meter.average()
    test_losses.append(average_loss)
    print(f'Test evaluation: Average loss: {average_loss}')
Beispiel #11
0
def train(model, train_dataloader, device, distance, optim, epoch, lr_scheduler, dataset):
    """Train ``model`` for one pass over ``train_dataloader``.

    After every optimizer step the learning rate is replaced by
    ``lr_scheduler.new_lr()``. Every 200 batches the running average of the
    non-zero losses is appended to the module-level ``train_losses`` list
    and printed.

    Args:
        model: Network to train; put into train mode here.
        train_dataloader: Iterable of ``(spectrograms, targets, input_lens,
            target_lens, word_wise_target)`` batches.
        device: Device the inputs and targets are moved to.
        distance: Loss callable invoked as
            ``distance(output, targets, input_lens, target_lens)``.
        optim: Optimizer whose ``param_groups`` receive the new rate.
        epoch: Epoch number, used for logging only.
        lr_scheduler: Object providing ``new_lr()``.
        dataset: Unused; kept so existing callers keep working.

    Returns:
        The learning rate set after the last batch, or the optimizer's
        current rate when the loader yields no batches.
    """
    model.train()
    average_meter = AverageMeter()
    log_softmax = nn.LogSoftmax(dim=2)

    # Start from the optimizer's current rate so an empty loader no longer
    # raises UnboundLocalError at the final ``return``.
    lr = optim.param_groups[0]['lr']

    for i, batch in enumerate(train_dataloader):
        spectrograms, targets, input_lens, target_lens, _ = batch
        spectrograms, targets = Dataset.pad_batch(
            spectrograms=list(spectrograms),
            targets=list(targets)
        )
        spectrograms = spectrograms.to(device)
        targets = targets.to(device)

        # ==== forward ====
        output = log_softmax(model(x=spectrograms, this_model_train=True))
        # reshape to '(input_sequence_len, batch_size, n_classes)' as described in
        # 'https://pytorch.org/docs/master/generated/torch.nn.CTCLoss.html'
        output = output.transpose(0, 1)
        loss = distance(output, targets, input_lens, target_lens)

        # ==== backward ====
        optim.zero_grad()
        loss.backward()
        optim.step()

        # ==== adjustments ====
        lr = lr_scheduler.new_lr()
        for param_group in optim.param_groups:
            param_group['lr'] = lr

        # ==== log ====
        loss_value = loss.item()
        if loss_value != 0:
            average_meter.step(loss=loss_value)
        if i % 200 == 0:
            average_loss = average_meter.average()
            train_losses.append(average_loss)
            print(f'Loss: {average_loss} | Batch: {i} / {len(train_dataloader)} | Epoch: {epoch} | lr: {lr}')

    return lr
Beispiel #12
0
    def test(self, dataset):
        """Run the model over the test split of ``dataset``.

        Args:
            dataset: Directory name below ``data/`` whose ``test`` file is
                evaluated.

        Returns:
            The element-wise sum of the second value returned by
            ``model.run`` over all batches (presumably correct/total
            counts; confirm against ``Model.run``). ``[0, 0]`` when there
            are no batches.
        """
        test_path = os.path.join('data', dataset, 'test')
        train_path = os.path.join('data', self.params.get('corpus_name'),
                                  'train')
        datafile = Dataset(self.params, test_path, train_path)
        datafile.prepare_data(self.params.get('min_count'))
        started_at = time.time()

        logging.info(
            'Тестирование начато. Датасет для тестирования - {0}.'.format(
                dataset))

        keep_everything = 1  # dropout value passed for evaluation
        totals = [0, 0]
        while True:
            if datafile.is_finished():
                break
            batch_xs, batch_ys, lengths = datafile.get_batch()
            _, batch_counts = self.model.run(self.session, batch_xs, batch_ys,
                                             lengths, keep_everything)
            totals = np.sum([totals, batch_counts], axis=0)

        elapsed = str(int(time.time() - started_at))
        logging.info('Тестирование закончено за {0} секунд'.format(elapsed))

        return totals
Beispiel #13
0
    def training(self, eval=None):
        """Train until a stop file, the stop hour or ``max_iters`` ends the run.

        At every ``steps_per_checkpoint`` steps (and when stopping) the
        model is scored and saved. Each time the training file is exhausted,
        the epoch's mean loss and mean accuracy are written as a summary and
        the file is restarted.

        Args:
            eval: Optional callable. When given, its return value is used as
                the checkpoint result and the dev set is not evaluated.
        """
        self.train_set.skip_n_lines(self.params.params["trained_lines"])

        dev_path = "data/" + self.params.get("corpus_name") + "/dev"
        train_path = "data/" + self.params.get("corpus_name") + "/train"
        dev = Dataset(self.params, dev_path, train_path)
        dev.prepare_data(self.params.get("min_count"))

        started_at = time.time()  # for counting the time
        last_checkpoint_at = time.time()
        logging.info("Training process begun.")
        stop = False
        epoch_losses = []
        epoch_accuracies = []

        # Keep training until reach max iterations
        while not stop:
            self.params.params["step"] += 1
            batch_xs, batch_ys, lengths = self.train_set.get_batch()
            loss, _ = self.model.run(self.sess, batch_xs, batch_ys, lengths,
                                     self.params.get("dropout"))
            epoch_losses.append(loss)

            stop = self.chech_stopfile("STOP_IMMEDIATELY")

            if time.strftime("%H") == self.params.get("time_stop"):
                stop = True

            at_checkpoint = (self.params.params["step"] %
                             self.params.get("steps_per_checkpoint") == 0)
            if at_checkpoint or stop:
                eval_started_at = time.time()
                corr = [0, 0]

                while not dev.is_finished() and eval is None:
                    dev_batch_xs, dev_batch_ys, lengths = dev.get_batch()
                    _, batch_counts = self.model.run(self.sess, dev_batch_xs,
                                                     dev_batch_ys, lengths, 1)
                    corr = np.sum([corr, batch_counts], axis=0)

                if eval is None:
                    # restart development data
                    dev.restart()
                    result = (corr[0] / corr[1]) * 100
                    epoch_accuracies.append(corr[0] / corr[1])
                else:
                    logging.info("Not testing on dev but on special function.")
                    result = eval()

                self.params.params[
                    "trained_lines"] = self.train_set.get_trained_lines()

                self.model.save(self.sess, self.params.params["step"], result)

                seen_examples = (self.params.params["step"] *
                                 self.params.get("batch_size"))
                seconds_per_step = ((eval_started_at - last_checkpoint_at) /
                                    self.params.get("steps_per_checkpoint"))
                total_minutes = int((time.time() - started_at) / 60)
                print(
                    "Iter {0}, Total correctness: {1} % {2}, time per step: {3} s, total time: {4} min, {5}"
                    .format(seen_examples, result, corr, seconds_per_step,
                            total_minutes, time.strftime("%H:%M:%S")))
                last_checkpoint_at = time.time()

                # if it already is True do not change it
                if not stop:
                    stop = self.chech_stopfile("STOP_MODEL")

                if self.params.params["step"] >= self.params.get("max_iters"):
                    stop = True

            # check if the file was not finished and if it was, start over
            if self.train_set.is_finished():
                feed = {
                    self.model.tf_loss_ph: np.mean(epoch_losses),
                    self.model.tf_accuracy_ph: np.mean(epoch_accuracies),
                }
                summ = self.sess.run(self.model.performance_summaries,
                                     feed_dict=feed)
                self.model.sum_writer.add_summary(summ,
                                                  self.params.get('epochs'))

                epoch_losses.clear()
                epoch_accuracies.clear()

                self.params.params["epochs"] += 1
                logging.info(
                    "Generator read training file completely and starts over")
                self.train_set.restart()

        print("Training finished in " + str(int(time.time() - started_at)) + " s")
Beispiel #14
0
def main(root, train_url='train-clean-100', test_url='test-clean'):
    """Train the speech model and save a model plus checkpoint every epoch.

    Builds the train and test loaders, the model, the Adam optimizer, a CTC
    loss and a cosine learning-rate schedule. It then trains for
    ``n_epochs`` epochs, writing the model, a resumable checkpoint and the
    loss curves after each one. A final ``.0`` model and checkpoint are
    written at the end.

    Args:
        root: Dataset root directory passed to ``Dataset``.
        train_url: Name of the training subset.
        test_url: Name of the test subset.
    """
    version = 5
    CONTINUE_TRAINING = False
    TRAIN_SPEECH_MODEL = True

    n_epochs = 20

    hyper_params_speech = {
        # ==== training hyper parameters ====
        'i_lr': 0.0005,
        'n_batches_warmup': 420,
        'batch_size': 15,
        # ==== model hyper parameters ====
        'n_res_cnn_layers': 4,
        'n_bi_gru_layers': 5,
        'bi_gru_dim': 512,
        'n_classes': 29,
        'n_features': 128,
        'dropout_p': 0.2,
        'd_audio_embedding': 128
    }

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # define dataset loaders

    train_dataset = Dataset(root=root, url=train_url, mode='train', n_features=hyper_params_speech['n_features'],
                            download=False)
    train_dataloader = torch.utils.data.DataLoader(
        dataset=train_dataset,
        batch_size=hyper_params_speech['batch_size'],
        shuffle=True,
        num_workers=0,
        collate_fn=collate_fn
    )

    test_dataset = Dataset(root=root, url=test_url, mode='test', n_features=hyper_params_speech['n_features'],
                           download=False)
    # The test loader is built but only used by the disabled ``test`` call
    # in the loop below.
    test_dataloader = torch.utils.data.DataLoader(
        dataset=test_dataset,
        batch_size=hyper_params_speech['batch_size'],
        shuffle=False,
        num_workers=0,
        collate_fn=collate_fn
    )

    # get models
    start_epoch = 1

    # word_distribution = train_dataset.create_word_distribution()

    if TRAIN_SPEECH_MODEL:
        # speech model
        speech_model = SpeechModel(
            n_res_cnn_layers=hyper_params_speech['n_res_cnn_layers'],
            n_bi_gru_layers=hyper_params_speech['n_bi_gru_layers'],
            bi_gru_dim=hyper_params_speech['bi_gru_dim'],
            n_classes=hyper_params_speech['n_classes'],
            n_features=hyper_params_speech['n_features'],
            dropout_p=hyper_params_speech['dropout_p'],
            device=device,
            dataset=train_dataset,
            d_audio_embedding=hyper_params_speech['d_audio_embedding']
        ).to(device)
        # speech_model = speech_model.apply(weights_init)

        # set up optimizer, loss function and learning rate scheduler
        params = [p for p in speech_model.parameters() if p.requires_grad]
        optim = torch.optim.Adam(params=params, lr=hyper_params_speech['i_lr'])       # amsgrad=True ?
        distance = nn.CTCLoss(blank=28).to(device)

        n_batches_warmup = hyper_params_speech['n_batches_warmup']
        if CONTINUE_TRAINING:
            speech_model, optim, start_epoch, hyper_params_speech['i_lr'] = load_checkpoint(checkpoint_path='models/asr/model_checkpoints/model_checkpoint_1.0.pth', model=speech_model, optim=optim)
            # n_batches_warmup = 0

        lr_scheduler = CosineLearningRateScheduler(i_lr=hyper_params_speech['i_lr'],
                                                   n_batches_warmup=n_batches_warmup,
                                                   n_total_batches=(len(train_dataloader) * n_epochs))

    # train
    for epoch in range(start_epoch, (n_epochs + start_epoch)):
        if TRAIN_SPEECH_MODEL:
            lr = train(model=speech_model, train_dataloader=train_dataloader, device=device, distance=distance,
                       optim=optim, epoch=epoch, lr_scheduler=lr_scheduler, dataset=train_dataset)
            # test(model=speech_model, test_dataloader=test_dataloader, device=device, distance=distance)

            torch.save(speech_model, f'models/asr/models/speech_model_{version}.{epoch}.pth')
            torch.save({
                # Store the epoch just completed, matching the file name.
                # This used to store ``n_epochs`` for every checkpoint.
                'epoch': epoch,
                'model_state_dict': speech_model.state_dict(),
                'optim_state_dict': optim.state_dict(),
                'lr': lr
            }, f'models/asr/model_checkpoints/speech_model_checkpoint_{version}.{epoch}.pth')

            plot_info_data = {
                'train_losses': train_losses,
                'test_losses': test_losses
            }
            with open(f'models/asr/plot_data/plot_data_speech_model_{version}_{epoch}', 'w') as plot_info_file:
                json.dump(plot_info_data, plot_info_file)

    if TRAIN_SPEECH_MODEL:
        # Final snapshot, saved under the ".0" suffix.
        # NOTE(review): 'epoch' here is the configured epoch count, not the
        # last epoch index when training was resumed; confirm what
        # load_checkpoint expects.
        torch.save(speech_model, f'models/asr/models/speech_model_{version}.0.pth')
        torch.save({
            'epoch': n_epochs,
            'model_state_dict': speech_model.state_dict(),
            'optim_state_dict': optim.state_dict(),
            'lr': lr
        }, f'models/asr/model_checkpoints/speech_model_checkpoint_{version}.0.pth')
import warnings

import pylab
import pandas

from src.settings import setup, QUALITY_LEVEL, DATA_TYPES
from src.data.mAvatar.Data import DAY_TYPE
from src.data.Dataset import Dataset


settings = setup(dataset='USF', data_loc='../subjects/', subject_n=0)

# Load the dataset with every warning silenced for the duration of the call.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    data = Dataset(settings, min_quality=QUALITY_LEVEL.acceptable, trim=True, check=True,
                   used_data_types=[DATA_TYPES.fitbit, DATA_TYPES.avatar_views], avatar_view_freq=60)

# Compare (major, minor) as integers. The previous string comparison was
# lexicographic, so '0.9.0' was wrongly treated as newer than '0.12.0'.
# ``pandas.__version__`` is used because ``pandas.version`` is gone in
# current releases.
_pandas_version = pandas.__version__
# true if software versions are good
UP_TO_DATE = tuple(int(part) for part in _pandas_version.split('.')[:2]) >= (0, 12)
if not UP_TO_DATE:
    # A parenthesised single-argument print works on Python 2 and 3 alike.
    print('\n\nWARN: Some analysis cannot be completed due to outdated pandas version ' + _pandas_version + '\n\n')


###################
### BEGIN plots ###
###################

if UP_TO_DATE:
    # correlation scatterplot
    import src.scatterplot as scatterplot
Beispiel #16
0
USE_HMM = False

model_path = '../../../models/asr/models/speech_model_4.11.pth'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Map the saved model onto the device chosen above. The hard-coded 'cuda'
# made loading fail on CPU-only machines despite the fallback.
model = torch.load(model_path, map_location=device)

root = 'data'
test_url = 'test-clean'
train_url = 'train-clean-100'

# NOTE(review): the "test" dataset is built from ``train_url`` and
# ``test_url`` is unused; confirm this is intended.
test_dataset = Dataset(root=root,
                       url=train_url,
                       mode='test',
                       n_features=128,
                       download=False)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=0,
                                              collate_fn=collate_fn)

hmm = HMM(root='data/hmm_data', n_states=29)

# Pad each single-sample batch and move the spectrograms to the device.
for i, data in enumerate(test_dataloader):
    spectrograms, targets, input_lens, target_lens, _ = data
    spectrograms, targets = Dataset.pad_batch(spectrograms=list(spectrograms),
                                              targets=list(targets))
    spectrograms = spectrograms.to(device)
Beispiel #17
0
class NNHelper(object):
    """Wrapper around ``Model`` for training, testing and language detection.

    One instance owns the session, the parameters, the training dataset, the
    label table (``self.langs``) and the model built from them.
    """

    def __init__(self,
                 sess,
                 trained_model=None,
                 params=None,
                 prepare_train_set=True):
        """Set up parameters, the training dataset, the labels and the model.

        Args:
            sess: Session passed to ``Model`` and used to restore weights.
            trained_model: Name of a saved model. When truthy, parameters
                are loaded from it and the weights are restored from
                ``models/<corpus_name>/<trained_model>``.
            params: Parameters to use when ``trained_model`` is not given.
            prepare_train_set: Whether to call ``prepare_data`` on the
                training set.
        """
        start = time.time()

        self.session = sess
        self.params = Parameters('PARAMS')

        if trained_model:
            self.params.load_params(trained_model)
            logging.info('Загружается модель {0}'.format(trained_model))
        else:
            self.params = params

        self.train_set = Dataset(self.params,
                                 self._corpus_path('train'),
                                 only_eval=False)

        # Each labels line is "<key> <value>", split on the first space.
        self.langs = {}
        with open(self._corpus_path('labels'), 'r') as f:
            for line in f:
                entry = line.strip()
                if not entry:
                    # A blank line used to crash with IndexError.
                    continue
                parts = entry.split(' ', 1)
                self.langs[parts[0]] = parts[1]

        if prepare_train_set:
            self.train_set.prepare_data(self.params.get('min_count'))

        self.model = Model(self.session, self.params,
                           self.train_set.vocab_size())

        if trained_model:
            self.model.saver.restore(
                self.session,
                os.path.join('models', self.params.get('corpus_name'),
                             trained_model))

        print('Модель подготовлена за {0} секунд'.format(
            int(time.time() - start)))

    def _corpus_path(self, name):
        """Return ``data/<corpus_name>/<name>`` for the configured corpus."""
        return os.path.join('data', self.params.get('corpus_name'), name)

    def _count_votes(self, text):
        """Run the model over ``text`` and tally the predicted labels.

        Returns:
            ``(datafile, guesses, total)``: the evaluation dataset built for
            ``text``, an integer array of votes per target label, and the
            number of votes cast.
        """
        datafile = Dataset(self.params,
                           None,
                           self._corpus_path('train'),
                           text_to_eval=text)

        # ``np.int`` was removed in NumPy 1.24; use the builtin ``int``.
        guesses = np.zeros(self.train_set.vocab_size()[1], dtype=int)
        total = 0
        while not datafile.is_finished():
            batch_xs, _, lengths = datafile.get_batch()

            outs = self.model.eval(self.session, batch_xs, lengths)

            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    # The first padded input ends the scan of column ``j``.
                    if batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break

                    guesses[outs[i][j]] += 1
                    total += 1

        return datafile, guesses, total

    def detect_langs(self, text, count):
        """Return up to ``count`` most voted languages for ``text``.

        Args:
            text: Text to classify.
            count: Maximum number of languages to report.

        Returns:
            A dict mapping language (from ``self.langs``) to its share of
            all votes. Labels without any vote are not included.
        """
        datafile, guesses, total = self._count_votes(text)

        result = {}
        for _ in range(count):
            if not guesses.any():
                break

            best = np.argmax(guesses)
            # ``total`` is positive here because at least one vote remains.
            acc = float(guesses[best]) / float(total)

            lang = self.langs[datafile.get_target_name(best, type='orig')]
            # Clear the winner so the next round picks the runner-up.
            guesses[best] = 0

            result[lang] = acc

        return result

    def detect_lang(self, text):
        """Guess the single most likely language of ``text``.

        Args:
            text: Text to classify.

        Returns:
            A ``(language, share)`` tuple; ``share`` is ``0`` when nothing
            was counted.
        """
        datafile, guesses, total = self._count_votes(text)

        best = np.argmax(guesses)
        acc = 0
        if total > 0:
            acc = float(guesses[best]) / float(total)

        return self.langs[datafile.get_target_name(best, type='orig')], acc

    def test(self, dataset):
        """Run the model over the test split of ``dataset``.

        Args:
            dataset: Directory name below ``data/`` whose ``test`` file is
                evaluated.

        Returns:
            The element-wise sum of the second value returned by
            ``model.run`` over all batches (presumably correct/total
            counts; confirm against ``Model.run``).
        """
        datafile = Dataset(self.params,
                           os.path.join('data', dataset, 'test'),
                           self._corpus_path('train'))
        datafile.prepare_data(self.params.get('min_count'))
        start = time.time()

        logging.info(
            'Тестирование начато. Датасет для тестирования - {0}.'.format(
                dataset))
        corr = [0, 0]
        while not datafile.is_finished():
            batch_xs, batch_ys, lengths = datafile.get_batch()

            dropout = 1
            _, out = self.model.run(self.session, batch_xs, batch_ys, lengths,
                                    dropout)
            corr = np.sum([corr, out], axis=0)

        logging.info('Тестирование закончено за {0} секунд'.format(
            int(time.time() - start)))

        return corr

    def train(self):
        """Train until a stop file, the stop hour or ``max_iters`` ends the run.

        At every ``steps_per_checkpoint`` steps (and when stopping) the dev
        set is evaluated and the model is saved. Each time the training file
        is exhausted, the epoch's mean loss and accuracy are written as a
        summary and the file is restarted.
        """
        self.train_set.skip_n_lines(self.params.get('trained_lines'))

        dev = Dataset(self.params,
                      self._corpus_path('dev'),
                      self._corpus_path('train'))
        dev.prepare_data(self.params.get('min_count'))
        start = time.time()
        cycle_time = time.time()
        logging.info('Процесс обучения запущен')
        stop = False
        loss_per_epoch = []
        accuracy_per_epoch = []

        while not stop:
            self.params.params['step'] += 1
            batch_xs, batch_ys, lengths = self.train_set.get_batch()
            loss, _ = self.model.run(self.session, batch_xs, batch_ys,
                                     lengths, self.params.get('dropout'))
            loss_per_epoch.append(loss)

            stop = self.check_stopfile('STOP_IMMEDIATELY')

            if time.strftime('%H') == self.params.get('time_stop'):
                stop = True

            if self.params.get('step') % self.params.get(
                    'steps_per_checkpoint') == 0 or stop:
                c_time = time.time()
                corr = [0, 0]

                while not dev.is_finished():
                    dev_batch_xs, dev_batch_ys, lengths = dev.get_batch()

                    dropout = 1
                    _, out = self.model.run(self.session, dev_batch_xs,
                                            dev_batch_ys, lengths, dropout)
                    corr = np.sum([corr, out], axis=0)

                # Rewind the dev data so the next checkpoint evaluates it
                # again instead of dividing 0 by 0.
                dev.restart()

                result = (corr[0] / corr[1]) * 100
                accuracy_per_epoch.append(float(corr[0]) / float(corr[1]))

                self.params.params[
                    'trained_lines'] = self.train_set.get_trained_lines()
                self.model.save(self.session, self.params.get('step'), result)

                # Fixed: ``self.paramsget`` raised AttributeError here.
                print('''Итерация: {0},
                Точность: {1}% {2},
                Времени на шаг: {3} секунд
                Время обучения: {4} минут
                Время: {5}'''.format(
                    self.params.get('step') * self.params.get('batch_size'),
                    result, corr, (c_time - cycle_time) /
                    self.params.get('steps_per_checkpoint'),
                    int((time.time() - start) / 60),
                    time.strftime('%H:%M:%S')))

                cycle_time = time.time()

                stop = stop or self.check_stopfile('STOP_MODEL')
                if self.params.get('step') >= self.params.get('max_iters'):
                    stop = True

            if self.train_set.is_finished():
                avg_loss = np.mean(loss_per_epoch)
                avg_test_accuracy = np.mean(accuracy_per_epoch)

                # Fixed: the session is stored as ``self.session``, not
                # ``self.sess``.
                summ = self.session.run(self.model.performance_summaries,
                                        feed_dict={
                                            self.model.tf_loss_ph:
                                            avg_loss,
                                            self.model.tf_accuracy_ph:
                                            avg_test_accuracy
                                        })
                self.model.sum_writer.add_summary(summ,
                                                  self.params.get('epochs'))

                loss_per_epoch.clear()
                accuracy_per_epoch.clear()

                self.params.params["epochs"] += 1
                logging.info("Эпоха {0} начата.".format(
                    self.params.get('epochs')))
                self.train_set.restart()

        print("Обучение закончено за " + str(int(time.time() - start)) +
              " секунд")

    def check_stopfile(self, filename):
        """Check ``filename`` for a stop command addressed to this corpus.

        A command is a line whose stripped text equals the configured
        ``corpus_name``. When one is found, every such line is removed from
        the file so the command is consumed.

        Args:
            filename: Path of the stop file to read.

        Returns:
            True when a stop command was found, otherwise False.
        """
        corpus_name = self.params.params["corpus_name"]

        with open(filename, mode="r") as stp:
            lines = stp.readlines()

        remaining = [line for line in lines if line.strip() != corpus_name]
        if len(remaining) == len(lines):
            return False

        logging.info("Stopping training on command from stopfile.")

        # remove command from file
        with open(filename, "w") as stp:
            stp.writelines(remaining)

        return True
Beispiel #18
0
    def train(self):
        """Train until a stop file, the stop hour or ``max_iters`` ends the run.

        At every ``steps_per_checkpoint`` steps (and when stopping) the dev
        set is evaluated and the model is saved. Each time the training file
        is exhausted, the epoch's mean loss and accuracy are written as a
        summary and the file is restarted.
        """
        self.train_set.skip_n_lines(self.params.get('trained_lines'))

        dev = Dataset(
            self.params,
            os.path.join('data', self.params.get('corpus_name'), 'dev'),
            os.path.join('data', self.params.get('corpus_name'), 'train'))
        dev.prepare_data(self.params.get('min_count'))
        start = time.time()
        cycle_time = time.time()
        logging.info('Процесс обучения запущен')
        stop = False
        loss_per_epoch = []
        accuracy_per_epoch = []

        while not stop:
            self.params.params['step'] += 1
            batch_xs, batch_ys, lengths = self.train_set.get_batch()
            loss, _ = self.model.run(self.session, batch_xs, batch_ys,
                                     lengths, self.params.get('dropout'))
            loss_per_epoch.append(loss)

            stop = self.check_stopfile('STOP_IMMEDIATELY')

            if time.strftime('%H') == self.params.get('time_stop'):
                stop = True

            if self.params.get('step') % self.params.get(
                    'steps_per_checkpoint') == 0 or stop:
                c_time = time.time()
                corr = [0, 0]

                while not dev.is_finished():
                    dev_batch_xs, dev_batch_ys, lengths = dev.get_batch()

                    dropout = 1
                    _, out = self.model.run(self.session, dev_batch_xs,
                                            dev_batch_ys, lengths, dropout)
                    corr = np.sum([corr, out], axis=0)

                # Rewind the dev data so the next checkpoint evaluates it
                # again instead of dividing 0 by 0.
                dev.restart()

                result = (corr[0] / corr[1]) * 100
                accuracy_per_epoch.append(float(corr[0]) / float(corr[1]))

                self.params.params[
                    'trained_lines'] = self.train_set.get_trained_lines()
                self.model.save(self.session, self.params.get('step'), result)

                # Fixed: ``self.paramsget`` raised AttributeError here.
                print('''Итерация: {0},
                Точность: {1}% {2},
                Времени на шаг: {3} секунд
                Время обучения: {4} минут
                Время: {5}'''.format(
                    self.params.get('step') * self.params.get('batch_size'),
                    result, corr, (c_time - cycle_time) /
                    self.params.get('steps_per_checkpoint'),
                    int((time.time() - start) / 60),
                    time.strftime('%H:%M:%S')))

                cycle_time = time.time()

                stop = stop or self.check_stopfile('STOP_MODEL')
                if self.params.get('step') >= self.params.get('max_iters'):
                    stop = True

            if self.train_set.is_finished():
                avg_loss = np.mean(loss_per_epoch)
                avg_test_accuracy = np.mean(accuracy_per_epoch)

                # Fixed: every other call in this method uses
                # ``self.session``; ``self.sess`` was inconsistent.
                summ = self.session.run(self.model.performance_summaries,
                                        feed_dict={
                                            self.model.tf_loss_ph:
                                            avg_loss,
                                            self.model.tf_accuracy_ph:
                                            avg_test_accuracy
                                        })
                self.model.sum_writer.add_summary(summ,
                                                  self.params.get('epochs'))

                loss_per_epoch.clear()
                accuracy_per_epoch.clear()

                self.params.params["epochs"] += 1
                logging.info("Эпоха {0} начата.".format(
                    self.params.get('epochs')))
                self.train_set.restart()

        print("Обучение закончено за " + str(int(time.time() - start)) +
              " секунд")
Beispiel #19
0
import pylab
import pandas

from src.settings import setup, QUALITY_LEVEL, DATA_TYPES
from src.data.mAvatar.Data import DAY_TYPE
from src.data.Dataset import Dataset

import re
import warnings

# Load the USF mAvatar dataset for the first subject.
settings = setup(dataset='USF', data_loc='../subjects/', subject_n=0)

# Warnings raised while the dataset is being built are suppressed; the
# filter is restored as soon as the ``with`` block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    data = Dataset(
        settings,
        min_quality=QUALITY_LEVEL.acceptable,
        trim=True,
        check=True,
        used_data_types=[DATA_TYPES.fitbit, DATA_TYPES.avatar_views],
        avatar_view_freq=60)

# Compare the pandas version numerically.  A plain string comparison is
# lexicographic and would rank e.g. '0.9.0' above '0.12.0'.
_pandas_version = (getattr(pandas, '__version__', None)
                   or pandas.version.version)
_pandas_major_minor = tuple(
    int(number) for number in re.findall(r'\d+', _pandas_version)[:2])

UP_TO_DATE = _pandas_major_minor >= (0, 12)  # true if software versions are good
if not UP_TO_DATE:
    # Single parenthesised argument: valid under both Python 2 and Python 3.
    print('\n\nWARN: Some analysis cannot be completed due to outdated pandas version ' + _pandas_version + '\n\n')

###################
### BEGIN plots ###
###################

if UP_TO_DATE:
    # correlation scatterplot
Beispiel #20
0
    def evaluate(self,
                 files,
                 max_langs_per_file,
                 allowed_langs,
                 output_file,
                 threashold,
                 eval_lines=False,
                 eval_blocks=False,
                 smoothing=0,
                 unknown=None,
                 separator=",",
                 code_swaps=None):
        """Classify every file in ``files`` and write the results.

        For each file the per-symbol model predictions are tallied into one
        counter per target class, and the top classes are written to
        ``output_file`` as ``<output_name><separator><language>`` lines.

        Args:
            files: iterable of ``[folder, output_name]`` or
                ``[folder, output_name, encoding]`` entries; the file read is
                ``folder + output_name``.
            max_langs_per_file: maximum number of languages written per file.
            allowed_langs: language codes that may be written; they are also
                switched on in the mask handed to the model.
            output_file: path of the UTF-8 result file (overwritten).
            threashold: minimum percentage a second or later language needs
                to be written; the best language is written regardless.
            eval_lines: vote per line; each line credits its winning class
                with the line length.
            eval_blocks: vote per batch column; each column credits its
                winning class with the number of symbols it holds.
            smoothing: when positive, two equal predictions ``smoothing``
                positions apart claim the whole span between them.
            unknown: label written when no language was written for a file;
                nothing is written in that case when it is ``None``.
            separator: string placed between file name and language.
            code_swaps: optional mapping replacing a language code on output.
        """
        n_classes = self.train_set.vocab_size()[1]

        # ``np.int`` was removed from NumPy; the builtin ``int`` is the
        # dtype it used to alias.
        langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=int)
        for allowed in self.train_set.get_tagging_classes():
            langs_mask[allowed] = 1

        for lang in allowed_langs:
            lang_id = self.train_set.trg_vocab.get_id(lang)
            if lang_id == Vocabulary.Vocab.UNK_ID:
                print("UNSUPPORTED LANGUAGE IN MODEL: " + lang)
            else:
                langs_mask[lang_id] = 1

        datafile = Dataset(self.params,
                           None,
                           "data/" + self.params.get("corpus_name") + "/train",
                           only_eval=True,
                           use_eol=eval_lines)

        if smoothing > 0:
            print("USING SMOOTHING OF {0}".format(smoothing))

        with open(output_file, encoding='utf-8', mode='w', buffering=1) as bal:
            for filename in files:
                # files has structure: [folder, outputing_name, possible_encoding]
                if len(filename) > 2:
                    datafile.restart(filename[0] + filename[1], filename[2])
                else:
                    datafile.restart(filename[0] + filename[1])

                guesses = np.zeros(n_classes, dtype=int)
                row = np.zeros(n_classes, dtype=int)
                row_length = 0
                total = 0
                smooth = []
                while not datafile.is_finished():
                    batch_xs, _, lengths = datafile.get_batch()
                    outs = self.model.eval(self.sess,
                                           batch_xs,
                                           lengths,
                                           langs_mask=langs_mask)
                    # ``outs`` and ``batch_xs`` are indexed [i][j]; each
                    # column j is walked until the first padding symbol.
                    for j in range(len(outs[0])):
                        block_guesses = np.zeros(n_classes, dtype=int)
                        block_length = 0
                        for i in range(len(outs)):
                            symbol = batch_xs[i][j]
                            if symbol == datafile.trg_vocab.PAD_ID:
                                break
                            predicted = outs[i][j]
                            total += 1
                            if eval_lines:
                                if symbol == datafile.trg_vocab.EOL_ID:
                                    # End of line: the winning class takes
                                    # credit for the whole line.
                                    guesses[np.argmax(row)] += row_length
                                    row = np.zeros(n_classes, dtype=int)
                                    row_length = 0
                                else:
                                    row[predicted] += 1
                                    row_length += 1
                            elif eval_blocks:
                                block_guesses[predicted] += 1
                                block_length += 1
                            elif smoothing > 0:
                                smooth.append(predicted)
                            else:
                                guesses[predicted] += 1

                        # Credit the exact number of symbols voted on rather
                        # than the leaked loop index, which was one short
                        # for columns without padding.
                        if eval_blocks and block_length > 0:
                            guesses[np.argmax(block_guesses)] += block_length

                # A final line without a terminating EOL symbol would
                # otherwise never be counted.
                if eval_lines and row_length > 0:
                    guesses[np.argmax(row)] += row_length

                if smoothing > 0:
                    # A ``while`` loop is required here: reassigning the
                    # index of a ``for ... in range`` loop does not skip
                    # iterations, so the span used to be counted twice.
                    pos = 0
                    while pos < len(smooth):
                        if (pos + smoothing < len(smooth)
                                and smooth[pos] == smooth[pos + smoothing]):
                            # if first and the last are the same, the inbetween should be too
                            guesses[smooth[pos]] += smoothing
                            pos += smoothing
                        else:
                            guesses[smooth[pos]] += 1
                            pos += 1

                langs = 0
                for class_id in np.argsort(-guesses):
                    if guesses[class_id] == 0 or langs == max_langs_per_file:
                        break
                    guess_name = datafile.get_target_name(class_id, "iso2")
                    percent = 100 * guesses[class_id] / total
                    if guess_name in allowed_langs:
                        if code_swaps is not None and guess_name in code_swaps:
                            guess_name = code_swaps[guess_name]
                        # Always write at least one language; later ones
                        # must reach the threshold.
                        if langs > 0 and percent < threashold:
                            break
                        bal.write(filename[1] + separator + guess_name + "\n")
                        langs += 1
                    else:
                        print(filename[1] + ", not allowed lang: " +
                              guess_name)
                if langs == 0 and unknown is not None:
                    # no language was outputted
                    bal.write(filename[1] + separator + unknown + "\n")
# from src.post_view_event_steps_bars import test_get_avg_list
# test_get_avg_list()

#knowMe.makePlots(type=PLOT_TYPES.bars, show=True, pre_win=10, post_win=40)
#knowMe.makePlots(type=PLOT_TYPES.bars, show=True)

if avatar:
    import re
    import warnings

    ### USF mAVATAR DATA LOADING ###
    settings = setup(dataset='USF', data_loc='../subjects/', subject_n=0)

    # Warnings raised while the dataset is being built are suppressed; the
    # filter is restored as soon as the ``with`` block exits.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        data = Dataset(
            settings,
            min_quality=QUALITY_LEVEL.acceptable,
            trim=True,
            check=True,
            used_data_types=[DATA_TYPES.fitbit, DATA_TYPES.avatar_views],
            avatar_view_freq=60)

    # Compare the pandas version numerically.  A plain string comparison is
    # lexicographic and would rank e.g. '0.9.0' above '0.12.0'.
    _pandas_version = (getattr(pandas, '__version__', None)
                       or pandas.version.version)
    _pandas_major_minor = tuple(
        int(number) for number in re.findall(r'\d+', _pandas_version)[:2])

    UP_TO_DATE = _pandas_major_minor >= (0, 12)  # true if software versions are good
    if not UP_TO_DATE:
        # Single parenthesised argument: valid under both Python 2 and 3.
        print('\n\nWARN: Some analysis cannot be completed due to outdated pandas version ' + _pandas_version + '\n\n')

    # comparison of events selected with/without overlap from mAvatar dataset
    # to demonstrate difference (especially at high time intervals like no-overlap for 3hrs around event)
    #plot_minutes(data, MINS=12*60, overlap_okay=True, shift=-6*60, edgecolor='none')
    #pylab.show()
    plot_minutes(data, MINS=60, overlap_okay=True, shift=-30, edgecolor='none')
    pylab.show()
Beispiel #22
0
class Architecture(object):
    def __init__(self,
                 sess,
                 params,
                 trained_model=False,
                 prepare_train_set=True):
        """Build the training dataset and the model, optionally restoring it.

        Args:
            sess: session object stored on the instance and handed to the
                model.
            params: parameter object; ``corpus_name`` and ``min_count`` are
                read from it through ``get``.
            trained_model: when truthy, passed to ``saver.restore`` as the
                checkpoint to restore.
            prepare_train_set: when true, ``prepare_data`` is called on the
                training set before the model is created.
        """
        started_at = time.time()
        self.sess = sess
        self.params = params

        train_path = "data/" + self.params.get("corpus_name") + "/train"
        self.train_set = Dataset(self.params,
                                 train_path,
                                 None,
                                 only_eval=False)
        if prepare_train_set:
            min_count = self.params.get("min_count")
            self.train_set.prepare_data(min_count)

        # The model is sized from the vocabulary of the training set.
        vocab_sizes = self.train_set.vocab_size()
        self.model = Model(sess, self.params, vocab_sizes)
        if trained_model:
            self.model.saver.restore(sess, trained_model)

        elapsed_seconds = int(time.time() - started_at)
        print("Модель подготовлена за " + str(elapsed_seconds) + " секунд.")

    def evaluate(self,
                 files,
                 max_langs_per_file,
                 allowed_langs,
                 output_file,
                 threashold,
                 eval_lines=False,
                 eval_blocks=False,
                 smoothing=0,
                 unknown=None,
                 separator=",",
                 code_swaps=None):
        """Classify every file in ``files`` and write the results.

        For each file the per-symbol model predictions are tallied into one
        counter per target class, and the top classes are written to
        ``output_file`` as ``<output_name><separator><language>`` lines.

        Args:
            files: iterable of ``[folder, output_name]`` or
                ``[folder, output_name, encoding]`` entries; the file read is
                ``folder + output_name``.
            max_langs_per_file: maximum number of languages written per file.
            allowed_langs: language codes that may be written; they are also
                switched on in the mask handed to the model.
            output_file: path of the UTF-8 result file (overwritten).
            threashold: minimum percentage a second or later language needs
                to be written; the best language is written regardless.
            eval_lines: vote per line; each line credits its winning class
                with the line length.
            eval_blocks: vote per batch column; each column credits its
                winning class with the number of symbols it holds.
            smoothing: when positive, two equal predictions ``smoothing``
                positions apart claim the whole span between them.
            unknown: label written when no language was written for a file;
                nothing is written in that case when it is ``None``.
            separator: string placed between file name and language.
            code_swaps: optional mapping replacing a language code on output.
        """
        n_classes = self.train_set.vocab_size()[1]

        # ``np.int`` was removed from NumPy; the builtin ``int`` is the
        # dtype it used to alias.
        langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=int)
        for allowed in self.train_set.get_tagging_classes():
            langs_mask[allowed] = 1

        for lang in allowed_langs:
            lang_id = self.train_set.trg_vocab.get_id(lang)
            if lang_id == Vocabulary.Vocab.UNK_ID:
                print("UNSUPPORTED LANGUAGE IN MODEL: " + lang)
            else:
                langs_mask[lang_id] = 1

        datafile = Dataset(self.params,
                           None,
                           "data/" + self.params.get("corpus_name") + "/train",
                           only_eval=True,
                           use_eol=eval_lines)

        if smoothing > 0:
            print("USING SMOOTHING OF {0}".format(smoothing))

        with open(output_file, encoding='utf-8', mode='w', buffering=1) as bal:
            for filename in files:
                # files has structure: [folder, outputing_name, possible_encoding]
                if len(filename) > 2:
                    datafile.restart(filename[0] + filename[1], filename[2])
                else:
                    datafile.restart(filename[0] + filename[1])

                guesses = np.zeros(n_classes, dtype=int)
                row = np.zeros(n_classes, dtype=int)
                row_length = 0
                total = 0
                smooth = []
                while not datafile.is_finished():
                    batch_xs, _, lengths = datafile.get_batch()
                    outs = self.model.eval(self.sess,
                                           batch_xs,
                                           lengths,
                                           langs_mask=langs_mask)
                    # ``outs`` and ``batch_xs`` are indexed [i][j]; each
                    # column j is walked until the first padding symbol.
                    for j in range(len(outs[0])):
                        block_guesses = np.zeros(n_classes, dtype=int)
                        block_length = 0
                        for i in range(len(outs)):
                            symbol = batch_xs[i][j]
                            if symbol == datafile.trg_vocab.PAD_ID:
                                break
                            predicted = outs[i][j]
                            total += 1
                            if eval_lines:
                                if symbol == datafile.trg_vocab.EOL_ID:
                                    # End of line: the winning class takes
                                    # credit for the whole line.
                                    guesses[np.argmax(row)] += row_length
                                    row = np.zeros(n_classes, dtype=int)
                                    row_length = 0
                                else:
                                    row[predicted] += 1
                                    row_length += 1
                            elif eval_blocks:
                                block_guesses[predicted] += 1
                                block_length += 1
                            elif smoothing > 0:
                                smooth.append(predicted)
                            else:
                                guesses[predicted] += 1

                        # Credit the exact number of symbols voted on rather
                        # than the leaked loop index, which was one short
                        # for columns without padding.
                        if eval_blocks and block_length > 0:
                            guesses[np.argmax(block_guesses)] += block_length

                # A final line without a terminating EOL symbol would
                # otherwise never be counted.
                if eval_lines and row_length > 0:
                    guesses[np.argmax(row)] += row_length

                if smoothing > 0:
                    # A ``while`` loop is required here: reassigning the
                    # index of a ``for ... in range`` loop does not skip
                    # iterations, so the span used to be counted twice.
                    pos = 0
                    while pos < len(smooth):
                        if (pos + smoothing < len(smooth)
                                and smooth[pos] == smooth[pos + smoothing]):
                            # if first and the last are the same, the inbetween should be too
                            guesses[smooth[pos]] += smoothing
                            pos += smoothing
                        else:
                            guesses[smooth[pos]] += 1
                            pos += 1

                langs = 0
                for class_id in np.argsort(-guesses):
                    if guesses[class_id] == 0 or langs == max_langs_per_file:
                        break
                    guess_name = datafile.get_target_name(class_id, "iso2")
                    percent = 100 * guesses[class_id] / total
                    if guess_name in allowed_langs:
                        if code_swaps is not None and guess_name in code_swaps:
                            guess_name = code_swaps[guess_name]
                        # Always write at least one language; later ones
                        # must reach the threshold.
                        if langs > 0 and percent < threashold:
                            break
                        bal.write(filename[1] + separator + guess_name + "\n")
                        langs += 1
                    else:
                        print(filename[1] + ", not allowed lang: " +
                              guess_name)
                if langs == 0 and unknown is not None:
                    # no language was outputted
                    bal.write(filename[1] + separator + unknown + "\n")

    def evaluate_dataset(self, source, allowed_languages=None):

        correct_all = 0
        total_all = 0
        with open(source, mode='r') as src:
            for l in src:
                if total_all % 1000 == 0:
                    print("processed lines ", total_all)
                entry = l.strip().split(' ', 1)
                if allowed_languages is not None:
                    guess = self.evaluate_string(entry[1],
                                                 languages=allowed_languages)
                else:
                    guess = self.evaluate_string(entry[1])
                total_all += 1
                if entry[0] == guess[0]:
                    correct_all += 1

        print("Accuracy all: {0} ({1}/{2})".format(correct_all / total_all,
                                                   correct_all, total_all))

    def evaluate_string(self, text, print_per_character=False, languages=None):
        """Classify ``text`` and report the most frequent predicted class.

        Predictions are tallied for every non-padding position; the class
        with the most votes wins and its share of all votes is reported as
        the confidence.

        Args:
            text: the string to classify.
            print_per_character: when true, two transcript lines are printed
                before the result.
                NOTE(review): the transcripts are never filled in, so two
                empty lines are printed - confirm the intended content.
            languages: optional collection of language labels the model is
                restricted to; labels unknown to the target vocabulary are
                reported and ignored.

        Returns:
            ``[name, share]`` - the winning class name and its share of the
            votes (0 when nothing was classified).  The same list is
            printed, as before.
        """
        langs_mask = None
        if languages is not None:
            # ``np.int`` was removed from NumPy; the builtin ``int`` is the
            # dtype it used to alias.
            langs_mask = np.zeros(self.model.vocab_sizes[1], dtype=int)
            for lang in languages:
                lang_id = self.train_set.trg_vocab.get_id(lang)
                if lang_id == Vocabulary.Vocab.UNK_ID:
                    print("UNSUPPORTED LANGUAGE IN MODEL: " + lang)
                else:
                    langs_mask[lang_id] = 1

        datafile = Dataset(self.params,
                           None,
                           "data/" + self.params.get("corpus_name") + "/train",
                           text_to_eval=text)

        guesses = np.zeros(self.train_set.vocab_size()[1], dtype=int)
        total = 0
        orig = ""
        classif = ""
        while not datafile.is_finished():
            batch_xs, _, lengths = datafile.get_batch()

            if langs_mask is not None:
                outs = self.model.eval(self.sess,
                                       batch_xs,
                                       lengths,
                                       langs_mask=langs_mask)
            else:
                outs = self.model.eval(self.sess, batch_xs, lengths)

            # Each column j is walked until the first padding symbol.
            for j in range(len(outs[0])):
                for i in range(len(outs)):
                    if batch_xs[i][j] == datafile.trg_vocab.PAD_ID:
                        break
                    guesses[outs[i][j]] += 1
                    total += 1

        best = np.argmax(guesses)
        if print_per_character:
            print(orig)
            print(classif)

        accur = 0
        if total > 0:
            accur = float(guesses[best]) / float(total)

        # Return the result as well as printing it: evaluate_dataset indexes
        # the value returned here.
        result = [datafile.get_target_name(best, type='name'), accur]
        print(result)
        return result

    def training(self, eval=None):
        """Run the training loop until a stop condition is met.

        Each iteration trains on one batch.  Every ``steps_per_checkpoint``
        steps (and on the final step) the model is scored, saved and a
        progress line is printed.  When the training set is exhausted, the
        epoch's average loss and accuracy are written as a summary and the
        set is restarted.

        Training stops when a stop file names this corpus, when the current
        hour equals the ``time_stop`` parameter, or when ``max_iters`` steps
        have been done.

        Args:
            eval: optional zero-argument callable; when given, its return
                value is used as the checkpoint score and the dev set is
                not evaluated.
        """
        self.train_set.skip_n_lines(self.params.params["trained_lines"])

        corpus_dir = "data/" + self.params.get("corpus_name")
        dev = Dataset(self.params, corpus_dir + "/dev", corpus_dir + "/train")
        dev.prepare_data(self.params.get("min_count"))
        start = time.time()  # for counting the time
        cycle_time = time.time()
        logging.info("Training process begun.")
        stop = False
        loss_per_epoch = []
        accuracy_per_epoch = []

        # Keep training until reach max iterations
        while not stop:
            self.params.params["step"] += 1
            batch_xs, batch_ys, lengths = self.train_set.get_batch()
            loss, _ = self.model.run(self.sess, batch_xs, batch_ys, lengths,
                                     self.params.get("dropout"))
            loss_per_epoch.append(loss)

            stop = self.chech_stopfile("STOP_IMMEDIATELY")

            if time.strftime("%H") == self.params.get("time_stop"):
                stop = True

            if self.params.params["step"] % self.params.get(
                    "steps_per_checkpoint") == 0 or stop:
                c_time = time.time()
                corr = [0, 0]

                if eval is not None:
                    logging.info("Not testing on dev but on special function.")
                    result = eval()
                else:
                    while not dev.is_finished():
                        dev_batch_xs, dev_batch_ys, dev_lengths = \
                            dev.get_batch()
                        # Dropout value 1 is passed for the dev pass.
                        _, out = self.model.run(self.sess, dev_batch_xs,
                                                dev_batch_ys, dev_lengths, 1)
                        corr = np.sum([corr, out], axis=0)

                    # restart development data
                    dev.restart()
                    # An empty dev set leaves corr at [0, 0]; report zero
                    # accuracy instead of dividing by zero.
                    accuracy = corr[0] / corr[1] if corr[1] else 0.0
                    result = accuracy * 100
                    accuracy_per_epoch.append(accuracy)

                self.params.params[
                    "trained_lines"] = self.train_set.get_trained_lines()

                self.model.save(self.sess, self.params.params["step"], result)

                print(
                    "Iter {0}, Total correctness: {1} % {2}, time per step: {3} s, total time: {4} min, {5}"
                    .format(
                        self.params.params["step"] *
                        self.params.get("batch_size"), result, corr,
                        (c_time - cycle_time) /
                        self.params.get("steps_per_checkpoint"),
                        int((time.time() - start) / 60),
                        time.strftime("%H:%M:%S")))
                cycle_time = time.time()

                # if it already is True do not change it
                stop = stop or self.chech_stopfile("STOP_MODEL")

                if self.params.params["step"] >= self.params.get("max_iters"):
                    stop = True

            # check if the file was not finished and if it was, start over
            if self.train_set.is_finished():
                avg_loss = np.mean(loss_per_epoch)
                # No accuracy is collected when an ``eval`` callback is used
                # or no checkpoint fell inside the epoch.  Keep the NaN that
                # np.mean([]) would yield, without its RuntimeWarning.
                if accuracy_per_epoch:
                    avg_test_accuracy = np.mean(accuracy_per_epoch)
                else:
                    avg_test_accuracy = float("nan")

                summ = self.sess.run(self.model.performance_summaries,
                                     feed_dict={
                                         self.model.tf_loss_ph:
                                         avg_loss,
                                         self.model.tf_accuracy_ph:
                                         avg_test_accuracy
                                     })
                self.model.sum_writer.add_summary(summ,
                                                  self.params.get('epochs'))

                loss_per_epoch.clear()
                accuracy_per_epoch.clear()

                self.params.params["epochs"] += 1
                logging.info(
                    "Generator read training file completely and starts over")
                self.train_set.restart()

        print("Training finished in " + str(int(time.time() - start)) + " s")

    def chech_stopfile(self, filename):
        stop = False
        with open(filename, mode="r") as stp:
            for line in stp:
                if line.strip() == self.params.params["corpus_name"]:
                    logging.info("Stopping training on command from stopfile.")

                    stop = True
                    break

        if stop:
            # remove command from file
            f = open(filename, "r")
            lines = f.readlines()
            f.close()

            f = open(filename, "w")
            for line in lines:
                if line.strip() != self.params.params["corpus_name"]:
                    f.write(line)
            f.close()

        return stop