Ejemplo n.º 1
0
def load_config(config_file='./myconfig.json'):
    """Load options from ``config_file``, creating it with defaults if absent.

    Parameters
    ----------
    config_file : str
        Path of the JSON config file to read (created with default values
        when it does not exist yet).

    Returns
    -------
    Option
        Option wrapper around the (possibly freshly written) config file.
    """
    if not os.path.exists(config_file):
        config_data = OrderedDict()

        ##################################################################
        # Default data locations.
        config_data["data_dir"] = '~/usb/project/kakao_arena/data'
        config_data["dataset_dir"] = '~/usb/project/kakao_arena/dataset'

        ##################################################################
        # Kakao-arena default hyperparameters.
        config_data["unigram_hash_size"] = 100000
        config_data["min_word_length"] = 2
        config_data["max_word_length"] = 31
        config_data["max_len"] = 32
        config_data["db_chunk_size"] = 100000
        config_data["num_workers"] = 10
        # NOTE(review): key looks like a typo for "num_predict_workers";
        # kept as-is because downstream readers may depend on this spelling.
        config_data["num_preidct_workers"] = 2
        config_data["embd_size"] = 128
        config_data["lr"] = 1e-4
        config_data["num_epochs"] = 100
        config_data["batch_size"] = 1024
        ##################################################################

        # The with-statement closes the file; the original's extra
        # fp.close() after the block was redundant and is removed.
        with open(config_file, 'w') as fp:
            json.dump(config_data, fp, ensure_ascii=False, indent=4)

    return Option(config_file)
Ejemplo n.º 2
0
def main():
    """Train HMCN, keep the best checkpoint, then predict on the dev set.

    Relies on module-level globals defined elsewhere in this file:
    ``train_loader`` / ``valid_loader`` / ``dev_loader``, ``continue_train``,
    ``best_model_path``, ``save_model_path`` and ``result_path``.
    """
    opt = Option('./config.json')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Model, loss and optimizer.
    model = HMCN(opt).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=opt.lr,
                                 betas=(0.9, 0.999))
    num_params = sum(p.numel() for p in model.parameters())
    print('Total # of params: {:,}'.format(num_params))

    # Optionally resume from the previous best checkpoint.
    # (Fixed: `continue_train == True` -> truthiness test.)
    if continue_train:
        model.load_state_dict(torch.load(best_model_path))

    best_loss = 100000.
    for epoch in range(opt.num_epochs):
        train(opt, train_loader, model, criterion, optimizer, epoch)
        val_loss = evaluate(opt,
                            valid_loader,
                            model,
                            criterion,
                            make_file=False)

        # Keep only the weights with the lowest validation loss so far.
        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), best_model_path)
            print('model saved at loss: %.4f' % (best_loss))

        # Periodic snapshot every 5 epochs.
        if (epoch + 1) % 5 == 0:
            torch.save(model.state_dict(),
                       save_model_path + '_E%d.pth' % (epoch + 1))

    # Reload the best weights and write dev-set predictions to result_path.
    model.load_state_dict(torch.load(best_model_path))
    evaluate(opt, dev_loader, model, criterion, make_file=True)

    # Collect the full dev pid order. Fixed: close the HDF5 handle promptly
    # via a context manager (the original left the file open).
    pid_order = []
    with h5py.File('./data/dev/data.h5py', 'r') as h5:
        pid_order.extend(h5['dev']['pid'][::])

    no_ans = '{pid}\t-1\t-1\t-1\t-1'
    with open(result_path, 'r') as f:
        file_len = len(f.readlines())
        print('total prediction length:', file_len)
    # Any pid beyond what the model predicted gets an explicit no-answer row.
    with open(result_path, 'a') as f:
        pid_none = pid_order[file_len:]
        for pid in pid_none:
            f.write(no_ans.format(pid=pid))
            f.write('\n')
    print('created file at %s' % (result_path))
Ejemplo n.º 3
0
import time
import traceback
from multiprocessing import Pool

from gensim.models import Doc2Vec
from elasticsearch5 import Elasticsearch
import tqdm
import fire
import h5py
import numpy as np
import six
from six.moves import cPickle
import pandas as pd

from misc import get_logger, Option
opt = Option('./config.json')

es = Elasticsearch(hosts=opt.es_host)  #TODO conf


class Reader(object):
    def __init__(self, data_path_list, div, begin_offset, end_offset):
        """Remember the data files and the [begin, end) record window this
        reader is responsible for (offsets may be None for open-ended)."""
        self.data_path_list = data_path_list
        self.div = div
        self.begin_offset = begin_offset
        self.end_offset = end_offset

    def is_range(self, i):
        if self.begin_offset is not None and i < self.begin_offset:
            return False
        if self.end_offset is not None and self.end_offset <= i:
Ejemplo n.º 4
0
# CLI options: where to read shuffled chunks from and where to write tfrecords.
parser.add_argument('--input_root',
                    default='/data/output/tmp',
                    help='folder to load shuffled chunks')
parser.add_argument('--output_root',
                    default='/data/output',
                    help='folder to save tfrecords')
parser.add_argument('--shuffle',
                    # argparse delivers strings; accept "true"/"True"/... as True.
                    type=lambda x: (str(x).lower() == 'true'),
                    default=True,
                    help='shuffle indices in chunks')
args = parser.parse_args()

# Ensure the output directory exists before any writer opens a file there.
if not os.path.exists(args.output_root):
    os.makedirs(args.output_root)

opt = Option("./config.json")

# Input chunk file naming depends on whether chunks were shuffled upstream.
final_format = "%s_splitted.chunk.%02d"
if args.shuffle:
    final_format = "%s_shuffled.chunk.%02d"
# Output tfrecord name encodes vocab size and max sequence length,
# e.g. "train-100000-max32.00.tfrecord".
tfrecord_format = "%s" + (
    "-%d-max%d" % (opt.unigram_hash_size, opt.max_len)) + ".%02d.tfrecord"

def _bytes_feature(value):
    """Wrap a single bytes value as a tf.train.Feature (bytes_list of one)."""
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)


def _int64_feature(value):
    """Wrap an iterable of ints as a tf.train.Feature.

    Note: unlike _bytes_feature, *value* is passed through as-is, so it must
    already be a sequence of ints.
    """
    int_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int_list)
Ejemplo n.º 5
0
# See the License for the specific language governing permissions and
# limitations under the License.

import tensorflow as tf

import keras
from keras.models import Model
from keras.layers.merge import dot
from keras.layers import Dense, Input
from keras.layers.core import Reshape

from keras.layers.embeddings import Embedding
from keras.layers.core import Dropout, Activation

from misc import get_logger, Option
opt = Option('shopping-classification/config.json')


def top1_acc(x, y):
    """Top-1 categorical accuracy, usable as a Keras metric callable."""
    acc = keras.metrics.top_k_categorical_accuracy(x, y, k=1)
    return acc


class TextOnly:
    def __init__(self):
        # Logger for this model; 'textonly' tags its log lines.
        self.logger = get_logger('textonly')

    def get_model(self, num_classes, activation='sigmoid'):
        max_len = opt.max_len
        voca_size = opt.unigram_hash_size + 1

        with tf.device('/gpu:0'):
Ejemplo n.º 6
0
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

from keras.models import load_model
from keras.callbacks import ModelCheckpoint

from datetime import datetime
from misc import get_logger, Option
from shutil import copyfile
import sklearn.metrics as sklm

config_file_path = './config.json'
# Keep a copy of the active config next to the saved model artifacts.
copyfile(config_file_path, 'model/config.json')     # backup config file (overwrite)

opt = Option(config_file_path)
# Parsed JSON loaded from the path given by opt.cate1 (category metadata).
# NOTE(review): relies on `json` and `os` being imported elsewhere in this file.
cate1 = json.loads(open(opt.cate1, 'r').read())
# Pin which GPU(s) TensorFlow/Keras may use, as configured in the options.
os.environ["CUDA_VISIBLE_DEVICES"] = opt.gpu


class Classifier():
    def __init__(self):
        # Logger for this classifier's progress messages.
        self.logger = get_logger('Classifier')
        # Number of output classes; 0 until set later (presumably once the
        # dataset/label space is known — confirm against callers).
        self.num_classes = 0

    def get_sample_generator(self, ds, batch_size):
        left = 0
        limit = ds['uni'].shape[0]

        while True:
            right = min(left + batch_size, limit)
Ejemplo n.º 7
0
 def __init__(self, conf, verbose=False):
     self.logger = get_logger()
     self.verbose = verbose
     self.status = CpCybos.get_instance()
     self.opt = Option(conf)