def extract_pages(origin_file, pages_dir):
    errors = []

    wiki_file = open(origin_file, 'r')

    page_counter = 0

    letter = wiki_file.read(1)
    read_letters = ''
    while letter != '':
        read_letters += letter
        if read_letters[-6:] == '<page>':
            try:
                extract_page(wiki_file, pages_dir)
                page_counter += 1
                read_letters = ''

                if page_counter % 200 == 0:
                    print_inline(".")
                    if page_counter % 10000 == 0:
                        print_inline('\n' + str(page_counter) + ' pages')

            except Exception as e:
                errors.append((e, e.file_name))

        letter = wiki_file.read(1)

    wiki_file.close()

    print('Number of errors: ' + str(len(errors)))
    print(errors)
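Every example on this page calls a print_inline helper that is not shown. A minimal sketch, assuming it simply writes to stdout without a trailing newline and flushes so the progress dots appear immediately (some of the projects may instead use a variant that overwrites the current line with a carriage return):

import sys

def print_inline(text):
    # Write without a newline and flush so progress output shows up at once.
    sys.stdout.write(str(text))
    sys.stdout.flush()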
Example #2
    def __init__(self):
        dictionary = Word2Vec.load(params.dictionary_path)
        self.common_words = pickle.load(open(params.common_words_path, 'rb'))
        self.uncommon_words = pickle.load(open(params.uncommon_words_path, 'rb'))

        self.word_vectors = dictionary.wv
        del dictionary

        embeddings = word_vectors_2_embedding(self.word_vectors)
        self.embedding_table = nn.Embedding(*embeddings.shape)
        self.embedding_table.weight.data.copy_(torch.tensor(embeddings))
        self.embedding_table.weight.requires_grad = False

        print('Finish loading Word2Vec model! Size: ({},{})'.format(embeddings.shape[0], embeddings.shape[1]))

        self.embedding_size = embeddings.shape[1]

        self.net = GatedCNN(self.embedding_size, params.num_channels)
        if torch.cuda.is_available():
            self.net = self.net.cuda()
            self.net.load_state_dict(torch.load(params.save_path))
        else:
            self.net.load_state_dict(torch.load(params.save_path, map_location=lambda storage, loc: storage))
        self.net.eval()

        with open('assets/items.pickle', mode='rb') as fp:
            self.items = pickle.load(fp)

        if os.path.exists(params.embedding_path):
            with open(params.embedding_path, mode='rb') as fp:
                self.embedding_items = pickle.load(fp)
        else:
            batch = build_batch(self.items, self.word_vectors, 10, self.common_words, self.uncommon_words)

            self.embedding_items = {}
            count = 0
            for newsId, index_vector in batch.items():
                count += 1
                print_inline('Calculating item embedding {}/{}'.format(count, len(self.items)))

                try:
                    index_vector = torch.tensor(index_vector)
                    inputs = self.embedding_table(index_vector)
                    inputs = torch.FloatTensor(inputs)
                    inputs = inputs.unsqueeze(0).permute(0, 2, 1)  # (batch_size, embedding_size, seq_len)
                    if torch.cuda.is_available():
                        inputs = inputs.cuda()
                    inputs = Variable(inputs)
                    doc_embedding = self.net(inputs)[0].cpu().detach().numpy()
                    doc_embedding = doc_embedding / LA.norm(doc_embedding)
                    self.embedding_items[newsId] = doc_embedding
                except Exception as e:
                    print(e)
            with open(params.embedding_path, mode='wb') as fp:
                pickle.dump(self.embedding_items, fp, pickle.HIGHEST_PROTOCOL)

        embedding_items = np.array(list(self.embedding_items.values()))
        self.mean_vector, self.std_vector = np.mean(embedding_items, axis=0), np.std(embedding_items, axis=0)

        self.embedding_items = {k: (v - self.mean_vector) for k, v in self.embedding_items.items()}
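word_vectors_2_embedding is not defined in these snippets either. A plausible sketch, assuming it only exposes the gensim KeyedVectors weight matrix as a (vocab_size, embedding_dim) numpy array suitable for the nn.Embedding table above:

import numpy as np

def word_vectors_2_embedding(word_vectors):
    # Copy the Word2Vec weight matrix so the embedding table owns its own data.
    return np.array(word_vectors.vectors, dtype=np.float32)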
Example #3
 def optimize(self, nnet):
     timer = Stopwatch(verbose=False).start()
     self.total_epochs += self.max_epochs
     for i in range(self.max_epochs):
         self.epoch += 1
         if self.verbose:
             print_inline('Epoch {0:>{1}}/{2} '.format(
                 self.epoch, len(str(self.total_epochs)),
                 self.total_epochs))
         if self.verbose and self.early_stopping and nnet._X_val is not None:
             print_inline(' early stopping after {0} '.format(
                 self._early_stopping))
         losses = self.train_epoch(nnet)
         self.loss_history.append(losses)
         msg = 'elapsed: {0} sec'.format(
             width_format(timer.elapsed(), default_width=5,
                          max_precision=2))
         msg += ' - loss: {0}'.format(
             width_format(np.mean(losses), default_width=5,
                          max_precision=4))
         score = nnet._metric(nnet._y, nnet.validate())
         self.score_history.append(score)
         # TODO: change acc to metric name
         msg += ' - acc.: {0}'.format(
             width_format(score, default_width=6, max_precision=4))
         if nnet._X_val is not None:
             if self._early_stopping > 0 and self.epoch > 1:
                 self._early_stopping -= 1
             val_loss = nnet._loss(nnet._y_val,
                                   nnet.validate_proba(nnet._X_val))
             self.val_loss_history.append(val_loss)
             val_score = nnet._metric(nnet._y_val,
                                      nnet.validate(nnet._X_val))
             if self.epoch > 1 and val_score < 0.2 * self.val_score_history[
                     -1]:
                 return
             self.val_score_history.append(val_score)
             if self.epoch > 1 and val_score > nnet.best_val_score_:
                 nnet.best_val_score_ = val_score
                 nnet.best_epoch_ = self.epoch  # TODO move to optimizer
                 nnet._save_best_weights()
                 self._early_stopping = self.early_stopping  # reset counter
             msg += ' - val. loss: {0}'.format(
                 width_format(val_loss, default_width=5, max_precision=4))
             # TODO: fix acc.
             msg += ' - val. acc.: {0}'.format(
                 width_format(val_score, default_width=6, max_precision=4))
             if self._early_stopping == 0:
                 if self.verbose: print(msg)
                 return
         if self.verbose: print(msg)
         if self.epoch > 1 and self.plot:
             if not os.path.exists(self.plot_dirpath):
                 os.makedirs(self.plot_dirpath)
             plot_learning_curves(self.loss_history,
                                  self.score_history,
                                  self.val_loss_history,
                                  self.val_score_history,
                                  dirpath=self.plot_dirpath)
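width_format is another helper that is not shown; it appears to render a number with capped precision and pad it so the log columns line up. A hypothetical sketch (the real padding and truncation rules may differ):

def width_format(value, default_width=8, max_precision=3):
    # Format with at most max_precision decimals, padded to default_width.
    text = '{0:.{1}f}'.format(float(value), max_precision)
    return text.ljust(default_width)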
Example #4
 def train_epoch(self, X):
     mean_recons = []
     for i, X_batch in enumerate(self.batch_iter(X)):
         mean_recons.append(self.update(X_batch))
         if self.verbose and i % (len(X) // (self.batch_size * 16)) == 0:
             print_inline('.')
     if self.verbose: print_inline(' ')
     return np.mean(mean_recons)
Example #5
    def train_epoch(self, train_loader):
        self.model.train()

        epoch_iter = 0
        epoch_train_loss = 0.
        epoch_correct = 0
        epoch_total = 0
        epoch_acc = 0.
        epoch_train_loss_history = []

        for (X_batch,
             manip), (y_batch,
                      soft_logits) in progress_iter(iterable=train_loader,
                                                    verbose=self.verbose,
                                                    leave=True,
                                                    ncols=64,
                                                    desc='epoch'):
            if self.use_cuda:
                X_batch, y_batch = X_batch.cuda(), y_batch.cuda()
                manip = manip.cuda()
                soft_logits = soft_logits.cuda()
            X_batch, y_batch = Variable(X_batch), Variable(y_batch)
            manip = Variable(manip, requires_grad=False)
            soft_logits = Variable(soft_logits, requires_grad=False)
            self.optim.zero_grad()
            out = self.model((X_batch, manip))

            loss = self.loss_func(out, y_batch)
            if self.distill_cost > 1e-6:
                loss += 0.5 * self._get_distill_multiplier() * torch.mean(
                    (out - out.mean(1).view(-1, 1) - soft_logits)**2.)
            epoch_train_loss_history.append(loss.data[0])
            epoch_train_loss *= epoch_iter / (epoch_iter + 1.)
            epoch_train_loss += loss.data[0] / (epoch_iter + 1.)
            epoch_iter += 1

            _, y_pred = torch.max(out.data, 1)
            epoch_correct += y_pred.eq(y_batch.data).cpu().sum()
            epoch_total += y_batch.size(0)
            epoch_acc = epoch_correct / float(epoch_total)

            if self.verbose:
                s = "loss: {0:.4f} acc: {1:.4f}".format(
                    epoch_train_loss, epoch_acc)
                print_inline(s)

            loss.backward()  #create_graph=True, retain_graph=True)
            self.optim.step()

        # update global history
        self.train_loss_history.append(epoch_train_loss_history)
        self.train_acc_history.append(epoch_acc)

        # update cyclic LR if enabled
        if self.cyclic_lr:
            lrm = self._get_cyclic_lrm()
            self._mul_lr_by(lrm)
Example #6
 def train_epoch(self, nnet):
     self._setup(nnet)
     losses = []
     for X_batch, y_batch in nnet.batch_iter():
         if self.verbose: print_inline('.')
         loss = np.mean(nnet.update(X_batch, y_batch))
         self.update(nnet)
         nnet._max_norm_update()
         losses.append(loss)
      if self.verbose: print()
     return losses  # epoch losses
Example #7
def train(samples, word_vectors, net, optimizer, criterion, epoch):
    # shuffle train set
    random.shuffle(samples)
    acc_loss = 0.0
    total_step = 0
    len_samples = len(samples)
    n_batches = len_samples // params.batch_size
    if (len_samples - n_batches * params.batch_size) != 0:
        n_batches += 1

    for batch_idx, i in enumerate(range(n_batches), 1):
        start = i * params.batch_size
        end = start + params.batch_size
        batch_samples = samples[start:end]
        mini_batches = sample_prediction_point(batch_samples, word_vectors)

        batch_loss = 0.0
        for inputs, targets in mini_batches:
            inputs, targets = torch.LongTensor(inputs), torch.LongTensor(
                targets)
            bs = inputs.shape[0]
            labels = torch.cat([
                torch.ones(bs, params.n_positive),
                torch.zeros(bs, params.n_negative)
            ], 1)
            if torch.cuda.is_available():
                inputs, targets, labels = inputs.cuda(), targets.cuda(
                ), labels.cuda()
            inputs, targets, labels = Variable(inputs), Variable(
                targets), Variable(labels)
            # zero the parameter gradients
            optimizer.zero_grad()
            logits = net(inputs, targets)

            loss = criterion(logits, labels)
            batch_loss += loss.item()
            loss.backward()
            # torch.nn.utils.clip_grad_norm(net.network.cnn.parameters(), 0.25)
            optimizer.step()

        acc_loss += batch_loss
        total_step += len(mini_batches)
        print_inline(
            'Train Epoch: {} [{} / {} ({:.1f}%)]   Learning Rate: {}   Loss: {:.6f}'
            .format(epoch,
                    str(batch_idx).ljust(int(floor(log10(n_batches))), ' '),
                    n_batches, 100. * batch_idx / n_batches,
                    _get_learning_rate(optimizer)[0],
                    batch_loss / len(mini_batches)))

    acc_loss /= total_step
    return acc_loss
Example #8
def build_batch(items, wv, min_doc_length, common_words, uncommon_words):
    batch = {}
    count = 0

    docs_sentences = get_sentences(items)
    for newsId, sentences in docs_sentences.items():
        print_inline('Pre-process items {}/{}'.format(count, len(docs_sentences)))
        count += 1

        words = [w for s in sentences for w in s.strip().split()]
        if len(words) < min_doc_length:
            continue

        words_indices = [get_word_index(wv, word, common_words, uncommon_words) for word in words]
        batch[newsId] = words_indices

    return batch
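get_word_index is not shown. A sketch under the assumption that rare or out-of-vocabulary tokens collapse to a shared fallback row of the embedding table while everything else uses its Word2Vec vocabulary index; the project's actual mapping may differ:

def get_word_index(wv, word, common_words, uncommon_words, unk_index=0):
    # Assumed behaviour: uncommon or unknown tokens share one fallback index.
    if word in uncommon_words or word not in wv.vocab:
        return unk_index
    return wv.vocab[word].index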
Example #9
def preprocess_dataset(fields=None):
    if fields is None:
        fields = ['title_token', 'sapo_token', 'content_token', 'tag_token']

    assets_folder = 'assets'
    pathlib.Path(assets_folder).mkdir(parents=True, exist_ok=True)

    results = {}
    sentence_length_arr = []
    with open('dataset/items.txt', 'r') as fp:
        with open(assets_folder + '/items.txt', 'w') as fw:
            count = 0
            while True:
                line = fp.readline().strip()
                if not line:
                    break
                if random.random() > params.ratio:  # pick 20%
                    continue
                fw.write(line + os.linesep)
                count += 1
                print_inline(count)

                item = json.loads(line)
                for field in fields:
                    text = item.get(field)
                    if text is None:
                        continue
                    sentences = nltk.tokenize.sent_tokenize(text)
                    normalized_text = ""
                    for sentence in sentences:
                        sentence_length, sentence = normalize(sentence)
                        sentence_length_arr.append(sentence_length)
                        normalized_text += sentence + " . "
                    item[field] = normalized_text.strip(". ")
                results[item.get('newsId')] = item

    print("Average length of each sentence is: %.2f" %
          (sum(sentence_length_arr) / len(sentence_length_arr)))
    del sentence_length_arr

    with open(assets_folder + '/items.pickle', 'wb') as fp:
        pickle.dump(results, fp, pickle.HIGHEST_PROTOCOL)
Example #10
def iterate_over_files():
    file_counter = 0

    for root, dirs, files in os.walk(PAGES_DIR):
        for f in files:

            page_path = os.path.join(root, f)
            page_content = read_file(page_path)

            text = markup_formatter.format_text(page_content)

            text_dir = get_dir(f)
            text_path = os.path.join(text_dir, f)

            save_file(text_path, text)

            file_counter += 1

            if file_counter % 200 == 0:
                print_inline('.')
                if file_counter % 10000 == 0:
                    print_inline('\n' + str(file_counter) + ' pages')
Example #11
def generate_pair(filepath, block_size, add_noise):
    base_log = 'Processing ' + filepath + ' - '
    print_inline(base_log + 'reading raw')
    raw_image = dcraw.read_raw(filepath)
    chunk_matrix = split_image_into_chunks(raw_image)
    subsampled_chunks = []
    new_original_chunks = []
    print_inline(base_log + 'subsampling ' +
                 str(len(chunk_matrix) * len(chunk_matrix[0])) + ' chunks')
    for row_index, row in enumerate(chunk_matrix):
        subsampled_row = []
        new_original_row = []
        for index, chunk in enumerate(row):
            if add_noise:
                [noise_params] = estimator.estimate_noise(chunk)
                # Clip the noise to a reasonable interval
                noise_params = [
                    max(NOISE_A_MIN_VALUE,
                        min(NOISE_A_MAX_VALUE, noise_params[0])),
                    max(NOISE_B_MIN_VALUE,
                        min(NOISE_B_MAX_VALUE, noise_params[1]))
                ]
            numpy_image = np.matrix.transpose(np.array(chunk))
            subsampled_chunk = subsample_image(numpy_image, block_size)
            groundtruth_chunk = Image.fromarray(subsampled_chunk, mode='RGB')
            input_chunk = three_channel_to_bayer(groundtruth_chunk)
            if add_noise:
                numpy_input_chunk = np.matrix.transpose(np.array(input_chunk))
                noised_input_chunk = estimator.apply_noise(
                    numpy_input_chunk, noise_params[0], noise_params[1])
                input_chunk = Image.fromarray(
                    np.matrix.transpose(noised_input_chunk), mode='L')
            subsampled_row.append(groundtruth_chunk)
            new_original_row.append(input_chunk)
        subsampled_chunks.append(subsampled_row)
        new_original_chunks.append(new_original_row)

    groundtruth_image = join_chunks_into_image(subsampled_chunks)
    new_original_image = join_chunks_into_image(new_original_chunks)

    # Generated images must have even dimensions.
    width, height = groundtruth_image.size
    if width % 2 != 0 or height % 2 != 0:
        new_width = width if width % 2 == 0 else (width - 1)
        new_height = height if height % 2 == 0 else (height - 1)
        groundtruth_image = groundtruth_image.crop(
            (0, 0, new_width, new_height))
        new_original_image = new_original_image.crop(
            (0, 0, new_width, new_height))

    print_inline(base_log + 'creating initial image')
    return new_original_image, groundtruth_image
Example #12
def print_formatted(datas):
    """Pretty print JSON DATA

    Argument:

        datas: dictionary of data
    """
    if not datas:
        print("No data")
        exit(1)

    if isinstance(datas, list):
        # get all zones
        # API /zone without :identifier
        hr()
        print('%-20s %-8s %-12s'
              % ('name', 'type', 'notified_serial'))
        hr()
        for record in datas:

            # print 'NAME'
            utils.print_inline("%(name)-20s" % record)

            # print 'TYPE' of SOA record
            utils.print_inline("%(type)-8s" % record)

            if record.get('notified_serial'):
                print("%(notified_serial)s" % record)
            else:
                print('')

        exit(0)

    elif datas.get('records'):
        print("domain: %(name)s" % datas)

        if datas.get('type') == 'MASTER' and datas.get('notified_serial'):
            print("serial: %(notified_serial)s" % datas)

        print("DNS   : %(type)s" % datas)

        # print header
        hr()
        print('%-33s %-5s %-25s %-5s %-3s'
              % ('name', 'type', 'content', 'ttl', 'prio'))
        hr()

        for record in datas.get('records'):

            # print 'NAME'
            utils.print_inline("%(name)-33s" % record)

            # print 'TYPE' of SOA record
            if record.get('type') == 'SOA':
                print("%(type)-5s" % record)

            # print 'TYPE' of non SOA record
            else:
                utils.print_inline("%(type)-5s" % record)

            # print 'CONTENT' of non SOA
            if record.get('type') == 'SOA':
                utils.print_inline(">\t\t%(content)-25s " % record)

            # print 'CONTENT' of SOA record
            else:
                utils.print_inline("%(content)-25s" % record)

            # print TTL, and PRIORITY for MX, SRV record
            if record.get('priority'):
                utils.print_inline("%(ttl)5s" % record)
                print("%(priority)2s" % record)

            # print ttl for non SOA record
            else:
                print("%(ttl)5s " % record)

        hr()

    elif datas.get('identifier'):
        # for template
        print("identifier : %(identifier)s" % datas)
        print("description: %(description)s" % datas)
        hr()
        print('%-33s %-5s %-25s %-5s %-3s'
              % ('name', 'type', 'content', 'ttl', 'prio'))

        for record in datas.get('entries'):

            # print 'NAME'
            utils.print_inline("%(name)-33s" % record)

            # print 'TYPE' for SOA
            if record.get('type') == 'SOA':
                print("%(type)-5s" % record)

            # print 'TYPE' for non SOA
            else:
                utils.print_inline("%(type)-5s" % record)

            # print 'CONTENT' for SOA
            if record.get('type') == 'SOA':
                utils.print_inline("> %(content)-25s " % record)

            # print 'CONTENT' for non SOA
            else:
                utils.print_inline("%(content)-24s" % record)

            # print 'TTL', and 'PRIORITY'
            if record.get('priority') is not None:
                utils.print_inline("%(ttl)5s" % record)
                print("%(priority)2s" % record)

            # print
            else:
                print("%(ttl)5s " % record)
        hr()
    else:
        print("No match records")
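hr() is an undefined helper in this listing; it presumably just prints a horizontal rule between the table sections, along the lines of:

def hr(width=78, char='-'):
    # Horizontal rule used to separate the printed tables.
    print(char * width)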
Example #13
def main():
    start_at = time.time()
    dictionary = Word2Vec.load(params.dictionary_path)
    word_vectors = dictionary.wv
    del dictionary

    embedding = word_vectors_2_embedding(word_vectors)
    print('Finish loading Word2Vec model! Size: ({},{})'.format(
        embedding.shape[0], embedding.shape[1]))

    net = UnsupervisedCNNEmbeddingNetwork(embedding,
                                          params.num_channels,
                                          pos=params.n_positive,
                                          neg=params.n_negative)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net.to(device)

    net.train()
    # optimizer = optim.SGD([{
    #     'params': net.module.network.cnn.parameters() if torch.cuda.device_count() > 1 else net.network.cnn.parameters()
    # }, {
    #     'params': net.module.network.fc.parameters() if torch.cuda.device_count() > 1 else net.network.fc.parameters(),
    #     'weight_decay': params.weight_decay
    # }], lr=params.learning_rate, momentum=params.momentum)

    optimizer = optim.Adadelta([{
        'params': net.network.cnn.parameters()
    }, {
        'params': net.network.fc.parameters(),
        'weight_decay': params.weight_decay
    }],
                               lr=params.learning_rate,
                               rho=0.9,
                               eps=1e-06)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode='min',
                                                     factor=0.3,
                                                     patience=1,
                                                     min_lr=1e-3,
                                                     verbose=True)
    criterion = nn.BCEWithLogitsLoss()

    try:
        data_path = 'assets/data.pickle'
        pathlib.Path(data_path).parent.mkdir(parents=True, exist_ok=True)

        if os.path.exists(data_path):
            with open(data_path, mode='rb') as fp:
                samples = pickle.load(fp)
        else:
            items = pickle.load(open('assets/items.pickle', mode='rb'))
            common_words = pickle.load(open(params.common_words_path, 'rb'))
            uncommon_words = pickle.load(open(params.uncommon_words_path,
                                              'rb'))

            min_doc_length = params.min_offset + params.n_positive
            samples = build_batch(items, word_vectors, min_doc_length,
                                  common_words, uncommon_words)
            samples = list(samples.values())
            del items, common_words, uncommon_words

            with open(data_path, mode='wb') as fp:
                pickle.dump(samples, fp, pickle.HIGHEST_PROTOCOL)
        print('\nNumber of samples: %d' % len(samples))

        print("Training...")
        for epoch in range(1, params.n_epochs +
                           1):  # loop over the dataset multiple times
            acc_loss = train(samples, word_vectors, net, optimizer, criterion,
                             epoch)
            # print statistics
            print_inline('[{:3d}] loss: {:.5f} - learning rate: {}\n'.format(
                epoch, acc_loss,
                _get_learning_rate(optimizer)[0]))

            # Save the model if the training loss is the best we've seen so far.
            if not scheduler.best or scheduler.is_better(
                    acc_loss, scheduler.best):
                with open(params.save_path, 'wb') as f:
                    torch.save(net.network.state_dict(), f)
            scheduler.step(acc_loss)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
        print('-' * 89)
    finally:
        end_at = time.time()
    print("start at: {}\nend_at: {}\nruntime: {} min".format(
        time.ctime(start_at), time.ctime(end_at), (end_at - start_at) / 60))
    print('Finished Training\n')
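_get_learning_rate is not part of the snippet; a minimal sketch, assuming it reads the current learning rate of each parameter group from the torch optimizer (the [0] above would then be the first group):

def _get_learning_rate(optimizer):
    # One learning rate per parameter group.
    return [param_group['lr'] for param_group in optimizer.param_groups]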
Example #14
    def _fit(self, X):
        if not self._initialized:
            layer = FullyConnected(self.n_hidden,
                                   bias=0.,
                                   random_seed=self.random_seed)
            layer.setup_weights(X.shape)
            self.W = layer.W
            self.vb = np.zeros(X.shape[1])
            self.hb = layer.b
            self._dW = np.zeros_like(self.W)
            self._dvb = np.zeros_like(self.vb)
            self._dhb = np.zeros_like(self.hb)
            self._rng = RNG(self.random_seed)
        self._rng.reseed()
        timer = Stopwatch(verbose=False).start()
        for _ in range(self.n_epochs):
            self.epoch += 1
            if self.verbose:
                print_inline('Epoch {0:>{1}}/{2} '.format(
                    self.epoch, len(str(self.n_epochs)), self.n_epochs))

            if isinstance(self.learning_rate, str):
                S, F = map(float, self.learning_rate.split('->'))
                self._learning_rate = S + (F - S) * (
                    1. - np.exp(-(self.epoch - 1.) / 8.)) / (
                        1. - np.exp(-(self.n_epochs - 1.) / 8.))
            else:
                self._learning_rate = self.learning_rate

            if isinstance(self.momentum, str):
                S, F = map(float, self.momentum.split('->'))
                self._momentum = S + (F - S) * (
                    1. - np.exp(-(self.epoch - 1) / 4.)) / (
                        1. - np.exp(-(self.n_epochs - 1) / 4.))
            else:
                self._momentum = self.momentum

            mean_recon = self.train_epoch(X)
            if mean_recon < self.best_recon:
                self.best_recon = mean_recon
                self.best_epoch = self.epoch
                self.best_W = self.W.copy()
                self.best_vb = self.vb.copy()
                self.best_hb = self.hb.copy()
                self._early_stopping = self.early_stopping
            msg = 'elapsed: {0} sec'.format(
                width_format(timer.elapsed(), default_width=5,
                             max_precision=2))
            msg += ' - recon. mse: {0}'.format(
                width_format(mean_recon, default_width=6, max_precision=4))
            msg += ' - best r-mse: {0}'.format(
                width_format(self.best_recon, default_width=6,
                             max_precision=4))
            if self.early_stopping:
                msg += ' {0}*'.format(self._early_stopping)
            if self.verbose:
                print(msg)
            if self._early_stopping == 0:
                return
            if self.early_stopping:
                self._early_stopping -= 1
Example #15
def print_formatted(datas):
    """Pretty print JSON DATA

    Argument:

        datas: dictionary of data
    """
    if not datas:
        print("No data")
        exit(1)

    if isinstance(datas, list):
        # get all zones
        # API /zone without :identifier
        hr()
        print('%-20s %-8s %-12s' % ('name', 'type', 'notified_serial'))
        hr()
        for record in datas:

            # print 'NAME'
            utils.print_inline("%(name)-20s" % record)

            # print 'TYPE' of SOA record
            utils.print_inline("%(type)-8s" % record)

            if record.get('notified_serial'):
                print("%(notified_serial)s" % record)
            else:
                print('')

        exit(0)

    elif datas.get('records'):
        print("domain: %(name)s" % datas)

        if datas.get('type') == 'MASTER' and datas.get('notified_serial'):
            print("serial: %(notified_serial)s" % datas)

        print("DNS   : %(type)s" % datas)

        # print header
        hr()
        print('%-33s %-5s %-25s %-5s %-3s' %
              ('name', 'type', 'content', 'ttl', 'prio'))
        hr()

        for record in datas.get('records'):

            # print 'NAME'
            utils.print_inline("%(name)-33s" % record)

            # print 'TYPE' of SOA record
            if record.get('type') == 'SOA':
                print("%(type)-5s" % record)

            # print 'TYPE' of non SOA record
            else:
                utils.print_inline("%(type)-5s" % record)

            # print 'CONTENT' of non SOA
            if record.get('type') == 'SOA':
                utils.print_inline(">\t\t%(content)-25s " % record)

            # print 'CONTENT' of SOA record
            else:
                utils.print_inline("%(content)-25s" % record)

            # print TTL, and PRIORITY for MX, SRV record
            if record.get('priority'):
                utils.print_inline("%(ttl)5s" % record)
                print("%(priority)2s" % record)

            # print ttl for non SOA record
            else:
                print("%(ttl)5s " % record)

        hr()

    elif datas.get('identifier'):
        # for template
        print("identifier : %(identifier)s" % datas)
        print("description: %(description)s" % datas)
        hr()
        print('%-33s %-5s %-25s %-5s %-3s' %
              ('name', 'type', 'content', 'ttl', 'prio'))

        for record in datas.get('entries'):

            # print 'NAME'
            utils.print_inline("%(name)-33s" % record)

            # print 'TYPE' for SOA
            if record.get('type') == 'SOA':
                print("%(type)-5s" % record)

            # print 'TYPE' for non SOA
            else:
                utils.print_inline("%(type)-5s" % record)

            # print 'CONTENT' for SOA
            if record.get('type') == 'SOA':
                utils.print_inline("> %(content)-25s " % record)

            # print 'CONTENT' for non SOA
            else:
                utils.print_inline("%(content)-24s" % record)

            # print 'TTL', and 'PRIORITY'
            if record.get('priority') is not None:
                utils.print_inline("%(ttl)5s" % record)
                print("%(priority)2s" % record)

            # print
            else:
                print("%(ttl)5s " % record)
        hr()
    else:
        print("No match records")
Example #16
def main():
    dictionary = Word2Vec.load(params.dictionary_path)
    common_words = pickle.load(open(params.common_words_path, 'rb'))
    uncommon_words = pickle.load(open(params.uncommon_words_path, 'rb'))

    word_vectors = dictionary.wv
    del dictionary

    embeddings = word_vectors_2_embedding(word_vectors)
    embedding_table = nn.Embedding(*embeddings.shape)
    embedding_table.weight.data.copy_(torch.tensor(embeddings))
    embedding_table.weight.requires_grad = False

    print('Finish loading Word2Vec model! Size: ({},{})'.format(
        embeddings.shape[0], embeddings.shape[1]))

    embedding_size = embeddings.shape[1]

    test_words = ['đẹp', 'Ronaldo', 'Covid']
    for test_word in test_words:
        _, test_word = normalize(test_word)
        print("*" * 90)
        print("Keywords sharing the same context as the word: %s" % test_word)
        test_word_embedding = word_vectors[test_word]

        scores = np.matmul(embeddings, test_word_embedding)
        print([
            word_vectors.index2word[top_idx]
            for top_idx in np.argsort(scores)[-2:-12:-1]
        ])

        print("*" * 90)

    net = GatedCNN(embedding_size, params.num_channels)
    if torch.cuda.is_available():
        net = net.cuda()
        net.load_state_dict(torch.load(params.save_path))
    else:
        net.load_state_dict(
            torch.load(params.save_path,
                       map_location=lambda storage, loc: storage))
    net.eval()

    with open('assets/items.pickle', mode='rb') as fp:
        items = pickle.load(fp)

    if os.path.exists(params.embedding_path):
        with open(params.embedding_path, mode='rb') as fp:
            embedding_items = pickle.load(fp)
    else:
        batch = build_batch(items, word_vectors, common_words, uncommon_words)
        # save and clean
        del common_words, uncommon_words

        embedding_items = {}
        count = 0
        for newsId, index_vector in batch.items():
            count += 1
            print_inline('Calculating item embedding {}/{}'.format(
                count, len(items)))

            try:
                index_vector = torch.tensor(index_vector)
                inputs = embedding_table(index_vector)
                inputs = torch.FloatTensor(inputs)
                inputs = inputs.unsqueeze(0).permute(
                    0, 2, 1)  # (batch_size, embedding_size, seq_len)
                if torch.cuda.is_available():
                    inputs = inputs.cuda()
                inputs = Variable(inputs)
                doc_embedding = net(inputs)[0].cpu().detach().numpy()
                doc_embedding = doc_embedding / LA.norm(doc_embedding)
                embedding_items[newsId] = doc_embedding
            except Exception as e:
                print(e)
        with open(params.embedding_path, mode='wb') as fp:
            pickle.dump(embedding_items, fp, pickle.HIGHEST_PROTOCOL)

    def item_sim(id1, id2):
        return np.dot(embedding_items.get(id1, np.zeros(embedding_size)),
                      embedding_items.get(id2,
                                          np.zeros(embedding_size))).item()

    while True:
        item_id = input("\nEnter the article ID: ").strip()
        if item_id == "":
            break
        if item_id not in embedding_items:
            print("This ID does not exist")
            continue
        print("Article under consideration: " + items[item_id]['title_token'])

        def custom_comparator(id1, id2):
            score = item_sim(item_id, id2) - item_sim(item_id, id1)
            if score > 0:
                return 1
            if score == 0:
                return 0
            return -1

        candidate_items = embedding_items.copy()
        candidate_items.pop(item_id)

        sorted_ids = sorted(candidate_items.keys(),
                            key=functools.cmp_to_key(
                                lambda id1, id2: custom_comparator(id1, id2)))

        print("Top 10 suggested related articles:")

        # Skip candidates whose normalized title duplicates one already shown
        # (including the query article itself).
        count = 0
        i = 0
        title_set = {normalize(items[item_id]['title_token'])}
        while count < 10 and i < len(sorted_ids):
            candidate_id = sorted_ids[i]
            i += 1
            title = normalize(items[candidate_id]['title_token'])
            if title in title_set:
                continue
            count += 1
            title_set.add(title)
            print("{}. {}".format(count, items[candidate_id]['title_token']))
Example #17
    def fit(self, X, y):
        timer = Stopwatch(verbose=False).start()
        X, y = self._check_X_y(X, y)
        unique_params = self.unique_params()
        tts = TrainTestSplitter(**self.train_test_splitter_params)
        number_of_combinations = self.number_of_combinations()
        total_iter = self.n_splits * number_of_combinations
        current_iter_width = len(str(total_iter))

        if self.verbose:
            print("Training {0} on {1} samples x {2} features.".format(
                self.model.model_name(), *X.shape))
            print("{0}-fold CV for each of {1} params combinations == {2} fits ...\n"
                  .format(self.n_splits, number_of_combinations, total_iter))

        # initialize `cv_results_`
        self.cv_results_['mean_score'] = []
        self.cv_results_['std_score'] = []
        self.cv_results_['params'] = []
        for k in range(self.n_splits):
            self.cv_results_['split{0}_score'.format(k)] = []
            self.cv_results_['split{0}_train_time'.format(k)] = []
            self.cv_results_['split{0}_test_time'.format(k)] = []
        for param_name in unique_params:
            self.cv_results_['param_{0}'.format(param_name)] = ma.array([])

        current_iter = 0
        if self.refit:
            # for each param combination fit consequently on each fold
            # to obtain mean score across splits as soon as possible
            for params_index, params in enumerate(self.gen_params()):

                # set params and add to `cv_results_`
                self.model.reset_params().set_params(**params)
                self.cv_results_['params'].append(params)

                for param_name in unique_params:
                    cv_key = 'param_{0}'.format(param_name)
                    mask = [int(not param_name in params)]
                    to_concat = ma.array([params.get(param_name, None)],
                                         mask=mask)
                    self.cv_results_[cv_key] = ma.concatenate(
                        (self.cv_results_[cv_key], to_concat))
                splits_scores = []
                for split_index, (train, test) in enumerate(
                        tts.k_fold_split(y,
                                         n_splits=self.n_splits,
                                         stratify=True)):
                    # verbosing
                    if self.verbose:
                        current_iter += 1
                        t = "iter: {0:{1}}/{2} ".format(
                            current_iter, current_iter_width, total_iter)
                        t += '+' * (split_index + 1) + '-' * (self.n_splits -
                                                              split_index - 1)
                        print_inline(t)
                    # fit and evaluate
                    with Stopwatch(verbose=False) as s:
                        self.model.fit(X[train], y[train])
                    self.cv_results_['split{0}_train_time'.format(
                        split_index)].append(s.elapsed())
                    with Stopwatch(verbose=False) as s:
                        score = self.model.evaluate(X[test], y[test])
                    self.cv_results_['split{0}_test_time'.format(
                        split_index)].append(s.elapsed())
                    # score = self.scoring(y[test], y_pred)
                    splits_scores.append(score)
                    # add score to `cv_results_`
                    self.cv_results_['split{0}_score'.format(
                        split_index)].append(score)
                    # verbosing
                    if self.verbose:
                        print_inline(" elapsed: {0} sec".format(
                            width_format(timer.elapsed(), default_width=7)))
                        if split_index < self.n_splits - 1:
                            t = ""
                            if self.best_score_ > -np.inf:
                                t += " - best acc.: {0:.4f} at {1}" \
                                    .format(self.best_score_, self.best_params_)
                            else:
                                t += "   ..."
                            print(t)

                # compute mean and std score
                mean_score = np.mean(splits_scores)
                std_score = np.std(splits_scores)

                self.cv_results_['mean_score'].append(mean_score)
                self.cv_results_['std_score'].append(std_score)
                # update 'best' attributes
                if mean_score > self.best_score_:
                    self.best_index_ = params_index
                    self.best_score_ = mean_score
                    self.best_std_ = std_score
                    self.best_params_ = params
                    self.best_model_ = self.model
                    if self.save_models:
                        self.best_model_.save(filepath=os.path.join(
                            self.dirpath, self._best_model_name()),
                                              **self.save_params)
                # verbosing
                if self.verbose:
                    print_inline(
                        " - mean acc.: {0:.4f} +/- 2 * {1:.3f}\n".format(
                            mean_score, std_score))

        else:  # if self.refit == False
            # fit for each fold and then evaluate on each combination
            # of params
            for split_index, (train, test) in enumerate(
                    tts.k_fold_split(y, n_splits=self.n_splits,
                                     stratify=True)):
                current_best_score = -np.inf
                current_best_params = None
                for params_index, params in enumerate(self.gen_params()):
                    # set params
                    self.model.reset_params().set_params(**params)
                    # fit model (only once per split)
                    if params_index == 0:
                        with Stopwatch(verbose=False) as s:
                            self.model.fit(X[train], y[train])
                    # on first split add params to `cv_results_`
                    if split_index == 0:
                        # store params' values
                        self.cv_results_['params'].append(params)
                        for param_name in unique_params:
                            cv_key = 'param_{0}'.format(param_name)
                            mask = [int(not param_name in params)]
                            to_concat = ma.array(
                                [params.get(param_name, None)], mask=mask)
                            self.cv_results_[cv_key] = ma.concatenate(
                                (self.cv_results_[cv_key], to_concat))
                    # write training time
                    self.cv_results_['split{0}_train_time'.format(split_index)]\
                        .append(s.elapsed() if params_index == 0 else 0.)
                    # evaluate
                    with Stopwatch(verbose=False) as s:
                        score = self.model.evaluate(X[test], y[test])
                    self.cv_results_['split{0}_test_time'.format(
                        split_index)].append(s.elapsed())
                    # score = self.scoring(y[test], y_pred)
                    # add score to `cv_results_`
                    cv_key = 'split{0}_score'.format(split_index)
                    self.cv_results_[cv_key].append(score)
                    # update "current" best score and params
                    current_mean_score = np.mean([
                        self.cv_results_['split{0}_score'.format(k)]
                        [params_index] for k in range(split_index + 1)
                    ])
                    if current_mean_score > current_best_score:
                        current_best_score = current_mean_score
                        current_best_params = params
                    # verbosing
                    if self.verbose:
                        current_iter += 1
                        t = "iter: {0:{1}}/{2} ".format(
                            current_iter, current_iter_width, total_iter)
                        t += '+' * (split_index + 1) + '-' * (self.n_splits -
                                                              split_index - 1)
                        t += " elapsed: {0} sec".format(
                            width_format(timer.elapsed(), default_width=7))
                        if split_index < self.n_splits - 1:
                            t += " - best acc.: {0:.4f}  [{1}/{2} splits] at {3}"\
                                 .format(current_best_score, split_index + 1, self.n_splits, current_best_params)
                        print_inline(t)
                        if split_index < self.n_splits - 1: print()
                    # after last split ...
                    if split_index == self.n_splits - 1:
                        # ... compute means, stds
                        splits_scores = [
                            self.cv_results_['split{0}_score'.format(k)]
                            [params_index] for k in range(self.n_splits)
                        ]
                        mean_score = np.mean(splits_scores)
                        std_score = np.std(splits_scores)
                        self.cv_results_['mean_score'].append(mean_score)
                        self.cv_results_['std_score'].append(std_score)
                        # ... and update best attributes
                        if mean_score > self.best_score_:
                            self.best_index_ = params_index
                            self.best_score_ = mean_score
                            self.best_std_ = std_score
                            self.best_params_ = params
                            self.best_model_ = self.model
                            if self.save_models:
                                self.best_model_.save(filepath=os.path.join(
                                    self.dirpath, self._best_model_name()),
                                                      **self.save_params)
                        # verbosing
                        if self.verbose:
                            print_inline(
                                " - best acc.: {0:.4f} +/- 2 * {1:.3f} at {2}\n"
                                .format(self.best_score_, self.best_std_,
                                        self.best_params_))

        # convert lists to np.ndarray
        for key in (['mean_score', 'std_score', 'params'] + [
                'split{0}_{1}'.format(k, s) for k in range(self.n_splits)
                for s in ('score', 'train_time', 'test_time')
        ]):
            self.cv_results_[key] = np.asarray(self.cv_results_[key])
        return self