def list_all_containers():
     port = START_PORT
     for i in range(NUM_WORKER_NODES):
         logger.info("Containers on Node co_node_{}: ".format(port))
         (out, err) = CLIUtils.run(LIST_POD_CONTAINERS_CMD.format(port))
         logger.info('list of containers: %s', out)
         port = port + 1
 def check_label_distribution(self, input_y):
     _label_distribution = np.sum(np.array(input_y), 0)
     empty_class_ = [
         i for i in range(_label_distribution.shape[0])
         if _label_distribution[i] == 0
     ]  # classes with zero samples
     self.kurtosis = stats.kurtosis(_label_distribution)
     self.normal_std = np.std(_label_distribution) / np.sum(
         _label_distribution)
     logger.info("check input_y kurtosis {}".format(self.kurtosis))
     logger.info("check input_y class: {} and normal_std is {}".format(
         empty_class_, self.normal_std))
     if len(empty_class_) == 0:  # no empty classes: every label is covered
         self.shuffle = False
     else:
         self.shuffle = True
     if self.normal_std > 0.3:
         # fixme: handle extreme class imbalance
         self.imbalance_level = 2
     elif self.normal_std > 0.07:
         # fixme: handle moderate class imbalance
         self.imbalance_level = 1
     else:
         # fixme: classes are balanced
         self.imbalance_level = 0
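A minimal, self-contained sketch of the same normalized-std heuristic on toy one-hot labels; the 0.3/0.07 thresholds mirror the method above, while the class counts and sample data are made up for illustration:

import numpy as np

def imbalance_level(one_hot_labels):
    # normalized std of per-class counts, as in check_label_distribution
    counts = np.sum(np.array(one_hot_labels), axis=0)
    normal_std = np.std(counts) / np.sum(counts)
    if normal_std > 0.3:
        return 2  # extreme imbalance
    elif normal_std > 0.07:
        return 1  # moderate imbalance
    return 0  # roughly balanced

balanced = np.eye(4)[np.random.randint(0, 4, size=1000)]
skewed = np.eye(4)[np.random.choice(4, size=1000, p=[0.9, 0.05, 0.03, 0.02])]
print(imbalance_level(balanced), imbalance_level(skewed))  # typically: 0 2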
Example #3
 def __init__(self, metadata):
     """
     Args:
       metadata: an AutoDLMetadata object. Its definition can be found in
           AutoDL_ingestion_program/dataset.py
     """
     self.done_training = False
     self.metadata = metadata
     self.domain = infer_domain(metadata)
     train_data_dir = self.metadata.get_dataset_name() + "/*"
     test_data_dir = self.metadata.get_dataset_name().replace("train", "test") + "/*"
     test_metadata_filename = self.metadata.get_dataset_name().replace("train", "test") + "/metadata.textproto"
     warmup_train_cmd = "cat {} >/dev/null".format(train_data_dir)
     warmup_test_cmd = "cat {} >/dev/null".format(test_data_dir)
     # logger.info("Note: test_metadata_filename={}, cmd={},{} AutoDL_G_CONF: {}".format(test_metadata_filename, warmup_train_cmd, warmup_test_cmd, autodl_g_conf_repr))
     logger.info(
         "Note: test_metadata_filename={}, cmd={},{}".format(
             test_metadata_filename, warmup_train_cmd, warmup_test_cmd
         )
     )
     os.system(warmup_train_cmd)
     os.system(warmup_test_cmd)
     logger.info("Note:The inferred domain of current dataset is: {}.".format(self.domain))
     # Domain inference and model initialization
     # DomainModel = DOMAIN_TO_MODEL[self.domain]
     DomainModel = meta_domain_2_model(self.domain)
     # self.domain_metadata = get_domain_metadata(metadata, self.domain)
     # logger.info("Note:The domain metadata is {}".format(self.domain_metadata))
     self.domain_model = DomainModel(self.metadata)
     self.has_exception = False
     self.y_pred_last = None
Example #4
 def start(self, seconds):
     if self._context.sensor.is_high():
         self._blink_handler.cancel()
         automationhat.light.power.off()
         self._context.transition_to(RunningState(self._context, seconds))
     else:
         logger.info('The water tank is empty!')
Example #5
    def test_logger_info(self, mock_log):
        obj = object()
        logger.info('test', request=obj)

        self.assertEqual(mock_log.call_args, (
            ('[{0}] test'.format(id(obj)), ),
            { 'extra': {} },
        ))
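The test above implies a logger wrapper that prefixes each message with the id() of an optional request object and always forwards an 'extra' dict to the underlying logger. A hedged sketch of one way to satisfy that contract (class and attribute names are assumptions, not the project's API):

import logging

class RequestLogger:
    def __init__(self, name=__name__):
        self._log = logging.getLogger(name)

    def info(self, msg, request=None, **kwargs):
        # mirror the mocked call: args == ('[<id>] msg',), kwargs == {'extra': {}}
        if request is not None:
            msg = '[{0}] {1}'.format(id(request), msg)
        self._log.info(msg, extra=kwargs.pop('extra', {}))

logger = RequestLogger()

Example #6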
    def decide_first_num(self):
        snoop_data_num = min(0.01 * self.train_num,
                             FIRST_SNOOP_DATA_NUM)  # first pass reads at most 700 samples
        snoop_X, snoop_Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(
            snoop_data_num)
        label_coverage, normal_std = self.check_label_coverage(snoop_Y)
        self.check_input_length(snoop_X[:FIRST_SNOOP_DATA_NUM])
        logger.info("label_coverage is {}".format(label_coverage))
        if normal_std > 0.3:
            dataset_read_num = min(5000, int(0.1 * self.train_num))
        else:
            if self.class_num == 2 and self.train_num <= 50000:
                if label_coverage == 1.0:  # all classes covered
                    dataset_read_num = max(int(0.01 * self.train_num), 500)
                    # lower bound for small datasets
                    if self.train_num <= 10000:
                        dataset_read_num = min(
                            5000, self.domain_metadata["class_num"] * 3000)
                else:  # some classes missing in snoop sample; data may be ordered by label
                    dataset_read_num = min(5000, int(0.1 * self.train_num))

            elif self.class_num == 2 and self.train_num > 50000:
                if label_coverage == 1.0:  # all classes covered
                    # datasets up to 100k: take 1%; above 100k: cap at 1000
                    dataset_read_num = min(int(0.01 * self.train_num), 1000)
                else:  # some classes missing in snoop sample; data may be ordered by label
                    dataset_read_num = min(5000, int(0.1 * self.train_num))

            ########################### multi-class ######################################
            elif self.class_num > 2 and self.train_num <= 50000:
                if label_coverage == 1.0:  # all classes covered
                    dataset_read_num = min(
                        int((2 / self.class_num) * self.train_num), 1000)
                    # lower bound for small datasets
                    if self.train_num <= 10000:
                        dataset_read_num = min(
                            5000, self.domain_metadata["class_num"] * 3000)
                else:
                    dataset_read_num = min(5000, int(0.1 * self.train_num))
            elif self.class_num > 2 and self.train_num > 50000:
                if label_coverage == 1.0:  # all classes covered
                    # datasets up to 100k: take 1%; above 100k: cap at 1500
                    dataset_read_num = min(
                        int((2 / self.class_num) * self.train_num), 1500)
                else:  # some classes missing in snoop sample; data may be ordered by label
                    dataset_read_num = min(5000, int(0.1 * self.train_num))
            ########################### multi-class ######################################
                if self.domain_metadata[
                        "language"] == "ZH" and self.check_len <= 40:
                    dataset_read_num += min(2000, 0.1 * self.train_num)

        X, Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(
            dataset_read_num)
        X = X + snoop_X
        Y = np.concatenate([Y, snoop_Y], axis=0)
        return dataset_read_num, X, Y
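Example #7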
 def make_get_request(self, router_name: str):
     url = f'{self.base_url}{router_name}'
     try:
         logger.info(f'request {url}')
         output = urllib.request.urlopen(url).read().decode()
         logger.debug(f'output: {output}')
         objs = json.loads(output)
         return objs
     except Exception as ex:
         logger.exception(f'error on {url}', exc_info=ex)
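Note that make_get_request falls through its except branch, so it implicitly returns None on failure. A hedged usage sketch guarding against that; the retry policy and router name are illustrative, not part of the original API:

import time

def get_with_retry(client, router_name, attempts=3, delay=1.0):
    # retry the request a few times; return [] if every attempt fails
    for _ in range(attempts):
        objs = client.make_get_request(router_name)
        if objs is not None:
            return objs
        time.sleep(delay)
    return []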
Example #8
def initialize_model(model):
    for pr_name, p in model.named_parameters():
        if 'albert_embeddings' in pr_name:
            p.requires_grad = False
        # p.data.uniform_(-opt.param_init, opt.param_init)
        elif 'rezero_alpha' in pr_name:
            logger.info('{} is rezero param'.format(pr_name))
            nn.init.zeros_(p)
        else:
            if p.dim() == 1:
                # p.data.zero_()
                p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
            else:
                nn.init.xavier_normal_(p, math.sqrt(3))
        logger.info("{}: requires_grad {}".format(pr_name, p.requires_grad))
Example #9
def __process_instances_for_albert(instances: List[SquadInstance], albert_tokenizer: AlbertTokenizer):
    new_instances = []
    for instance in instances:
        src_limit = max_seq_len - 2
        if len(instance.src) > src_limit:
            logger.info("src={} exceeds {}".format(len(instance.src), src_limit))
        src = [albert_tokenizer.bos_token] + instance.src[:src_limit] + [albert_tokenizer.eos_token]
        ans = instance.ans + [albert_tokenizer.eos_token]
        tgt = [albert_tokenizer.bos_token] + instance.tgt + [albert_tokenizer.eos_token]
        bio = ['O'] + instance.bio[:src_limit] + ['O']
        ner = [PAD] + instance.ner[:src_limit] + [PAD]
        case = [PAD] + instance.case[:src_limit] + [PAD]
        pos = [PAD] + instance.pos[:src_limit] + [PAD]
        new_instances.append(SquadInstance(src=src, tgt=tgt, bio=bio, case=case, ner=ner, pos=pos, ans=ans))
    return new_instances
Example #10
    def train(self, dataset, remaining_time_budget=None):
        """Train method of domain-specific model."""
        # Convert training dataset to necessary format and
        # store as self.domain_dataset_train
        logger.info("Note:train_process  model.py starts train")

        try:
            # Train the model
            self.domain_model.train(dataset, remaining_time_budget)
            # Update self.done_training
            self.done_training = self.domain_model.done_training

        except Exception as exp:
            self.has_exception = True
            self.done_training = True
            error("Error, model_train exp={}, done_traning={}".format(exp, self.done_training))
Example #11
    def train(self, dataset, remaining_time_budget=None):
        """Train method of domain-specific model."""
        # Convert training dataset to necessary format and
        # store as self.domain_dataset_train
        logger.info("Note: speech_train_process  model.py starts train")
        as_timer("train_start")

        # load tf_train_dataset for the first time.
        self.tf_dataset_trainsformer.init_train_tfds(dataset, self.train_num)

        if self.domain in ["speech"]:
            # Train the model with light model.
            if self.main_train_loop_num < speech_ms_mlp_conf.lightwei_train_end_loop:
                # fixme: needs to be autotuned.
                ds_take_size = min(int(self.train_num * speech_ds_tds_conf.sample_ratio[self.main_train_loop_num]), self.class_num * 50)

                # self.domain_dataset_train = self.tf_dataset_trainsformer.get_speech_train_dataset(ds_take_size)

                # self.domain_model.train(self.domain_dataset_train, remaining_time_budget=remaining_time_budget)
                self.domain_model.train(self.tf_dataset_trainsformer.get_speech_train_dataset(ds_take_size), remaining_time_budget=remaining_time_budget)

                logger.info(
                    "Note: domain={}, main_train_loop_num={}, light_model train finished.".format(
                        self.domain, self.main_train_loop_num
                    )
                )
                as_timer("speech_model_basic_train")

            if self.main_train_loop_num >= speech_ms_mlp_conf.midwei_train_start_loop:
                self.speech_widsom_model.train(
                    # (self.domain_dataset_train["x"], self.domain_dataset_train["y"]), remaining_time_budget
                    self.tf_dataset_trainsformer.get_speech_train_dataset_full(), remaining_time_budget
                )
                logger.info("Note: start wisdom at np, main_train_loop_num={}".format( self.main_train_loop_num))
                as_timer("speech_tr34_train")

            logger.info("Note:time_train model.py domain_model train finished.")

            # Update self.done_training
            self.done_training = self.domain_model.done_training
            self.main_train_loop_num += 1
            # print(as_timer)
            as_timer("train_end")
            logger.info(as_timer)
        else:
            logger.error("Note: Domain is not Speech!")
Example #12
    def test(self, dataset, remaining_time_budget=None):
        """Test method of domain-specific model."""
        # Convert test dataset to necessary format and
        # store as self.domain_dataset_test
        # self.set_domain_dataset(dataset, is_training=False)

        as_timer("test_start")
        # init tf_test_dataset for the first time.
        self.tf_dataset_trainsformer.init_test_tfds(dataset)

        self.domain_dataset_test, self.X_test = self.tf_dataset_trainsformer.get_speech_test_dataset()

        # As the original metadata doesn't contain number of test examples, we
        # need to add this information
        if self.domain in ["text", "speech"] and (not self.domain_metadata["test_num"] >= 0):
            self.domain_metadata["test_num"] = len(self.X_test)
        logger.info("Note:test_process test domain metadata is {}".format(self.domain_metadata))

        # Make predictions
        if self.domain in ["speech"]:
            if (
                self.main_train_loop_num
                <= speech_ms_mlp_conf.midwei_train_start_loop + speech_ms_mlp_conf.midwei_predict_block_loop
            ):
                Y_pred = self.domain_model.test(self.domain_dataset_test, remaining_time_budget=remaining_time_budget)
                logger.info(
                    "Note: speech pasa_model, speech_main_train_loop={}, speech_main_test_loop={}".format(
                        self.main_train_loop_num, self.main_test_loop_num
                    )
                )
                # Update self.done_training
                self.done_training = self.domain_model.done_training

            else:
                Y_pred = self.speech_widsom_model.test(
                    self.domain_dataset_test, remaining_time_budget=remaining_time_budget
                )
                logger.info(
                    "Note: speech dw_model, train_loop={}, test_loop={}".format( self.main_train_loop_num, self.main_test_loop_num)
                )
                # Update self.done_training
                self.done_training = self.speech_widsom_model.done_training

            as_timer("test_end")
            logger.info(as_timer)
        else:
            logger.error("Note: Domain is not Speech!")

        self.main_test_loop_num += 1

        return Y_pred
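Example #13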
    def test(self, dataset, remaining_time_budget=None):
        """Test method of domain-specific model."""
        # Convert test dataset to necessary format and
        # store as self.domain_dataset_test
        start = time.time()

        self.tf_dataset_trainsformer.init_test_tfds(dataset)
        end = time.time()
        self.domain_model.time_record["init_test_tfds"] = end - start

        self.set_domain_dataset(dataset, is_training=False)

        # As the original metadata doesn't contain number of test examples, we
        # need to add this information
        if self.domain in ['text', 'speech'] and \
                (not self.domain_metadata['test_num'] >= 0):
            self.domain_metadata['test_num'] = len(self.X_test)
        logger.info("Note:test_process test domain metadata is {}".format(
            self.domain_metadata))

        # Make predictions
        logger.info("call num is {}".format(self.call_num))
        if self.call_num == -1:
            # Y_pred = self.domain_model.test_first_svm(self.domain_dataset_test,
            #                             remaining_time_budget=remaining_time_budget)
            Y_pred = self.domain_model.test(
                self.domain_dataset_test,
                remaining_time_budget=remaining_time_budget)
            self.call_num += 1
        else:
            Y_pred = self.domain_model.test(
                self.domain_dataset_test,
                remaining_time_budget=remaining_time_budget)
        if "test_num" not in self.domain_model.feature_dict:
            self.domain_model.feature_dict["test_num"] = self.domain_metadata[
                'test_num']

        # Update self.done_training
        self.done_training = self.domain_model.done_training

        return Y_pred
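Example #14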
    def __init__(self, metadata):
        """
        Args:
          metadata: an AutoDLMetadata object. Its definition can be found in
              AutoDL_ingestion_program/dataset.py
        """
        self.done_training = False
        self.metadata = metadata
        self.first_round_sample_maxnum = 200
        self.call_num = -1  # 0
        self.domain_dataset_train_dict = {"x": [], "y": np.array([])}
        # self.domain = infer_domain(metadata)
        self.domain = "text"
        logger.info("Note:The AutoDL_G_CONF: {}".format(autodl_g_conf_repr))
        logger.info("Note:The inferred domain of current dataset is: {}." \
                    .format(self.domain))

        # Domain inference and model initialization
        # DomainModel = DOMAIN_TO_MODEL[self.domain]
        DomainModel = meta_domain_2_model(self.domain)

        self.domain_metadata = get_domain_metadata(metadata, self.domain)
        self.class_num = self.domain_metadata["class_num"]
        self.train_num = self.domain_metadata["train_num"]

        logger.info("Note:The domain metadata is {}".format(
            self.domain_metadata))
        self.domain_model = DomainModel(self.domain_metadata)
        # init for nlp
        self.nlp_index_to_token = None
        self.nlp_sep = None
        self.init_nlp()
        self.domain_model.vocab = self.vocabulary
        self.shuffle = False
        self.check_len = 0
        self.imbalance_level = -1
        # for tf_dataset.

        self.tf_dataset_trainsformer = TfDatasetTransformer(
            if_train_shuffle=False, config=config)
        self.tf_dataset_trainsformer.init_nlp_data(self.nlp_index_to_token,
                                                   self.nlp_sep)
        self.time_record = {}
        self.seq_len = []
        self.first_round_X = []
        self.first_round_Y = np.array([])
        self.X_test_raw = None
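Example #15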
    def train(self, dataset, remaining_time_budget=None):
        """Train method of domain-specific model."""
        # Convert training dataset to necessary format and
        # store as self.domain_dataset_train
        logger.info("Note:train_process  model.py starts train")
        # if self.call_num==0:
        #     dataset = dataset.shuffle(min(1000, self.train_num))
        start = time.time()
        self.tf_dataset_trainsformer.init_train_tfds(dataset, self.train_num)
        end = time.time()
        self.time_record["init_train_tfds"] = end - start

        if "train_num" not in self.domain_model.feature_dict:
            self.domain_model.feature_dict["train_num"] = self.train_num
            self.domain_model.feature_dict["class_num"] = self.class_num
            self.domain_model.feature_dict["language"] = self.domain_metadata[
                'language']

        self.set_domain_dataset(dataset, is_training=True)
        logger.info(
            "Note:train_process  model.py set domain dataset finished, domain_model train starts."
        )
        self.domain_model.time_record = self.time_record
        # Train the model

        # print("check domain_y", self.domain_dataset_train_dict["y"].shape)
        if self.call_num == -1:
            # self.domain_model.train_first_svm(self.domain_dataset_train_dict["x"], self.domain_dataset_train_dict["y"],
            #                     remaining_time_budget=remaining_time_budget)
            self.domain_model.train(
                self.domain_dataset_train_dict["x"],
                self.domain_dataset_train_dict["y"],
                remaining_time_budget=remaining_time_budget)
        else:

            self.domain_model.train(
                self.domain_dataset_train_dict["x"],
                self.domain_dataset_train_dict["y"],
                remaining_time_budget=remaining_time_budget)
            self.call_num += 1

        logger.info(
            "Note:train_process  model.py domain_model train finished.")

        # Update self.done_training
        self.done_training = self.domain_model.done_training
Example #16
    def __init__(self, metadata):
        """
    Args:
      metadata: an AutoDLMetadata object. Its definition can be found in
          AutoDL_ingestion_program/dataset.py
    """
        self.done_training = False
        self.metadata = metadata
        # self.domain = infer_domain(metadata)
        self.domain = "speech"
        # logger.info("Note:The AutoDL_G_CONF: {}".format(autodl_g_conf_repr))
        logger.info("Note:The inferred domain of current dataset is: {}.".format(self.domain))
        # Domain inference and model initialization
        # DomainModel = DOMAIN_TO_MODEL[self.domain]
        DomainModel = meta_domain_2_model(self.domain)
        self.domain_metadata = get_domain_metadata(metadata, self.domain)
        self.class_num = self.domain_metadata["class_num"]
        self.train_num = self.domain_metadata["train_num"]

        logger.info("Note:The domain metadata is {}".format(self.domain_metadata))
        self.domain_model = DomainModel(self.domain_metadata)

        # fixme: add incremental data updates.
        self.speech_widsom_model = ASpeechWidsomModel(self.domain_metadata)
        self.speech_wisdom_dataset_train = None
        logger.info("Note:Init Speech Wisdom solution, is {}".format(self.domain_metadata))
        self.main_train_loop_num = 0
        self.main_test_loop_num = 0
        #
        self.raw_tf_train_dataset = None
        self.dataset_sample_size = None
        self.dataset_read_num_second = None
        self.data_all_np_x_list = list()
        self.data_all_np_y_array = None
        self.ds_incr_flag = True  # whether the dataset still remains to be sampled incrementally
        self.domain_dataset_train = None
        self.domain_dataset_test = None

        # for tf_dataset.
        self.tf_dataset_trainsformer = TfDatasetTransformer(if_train_shuffle=speech_ds_tds_conf.if_shuffle)
        as_timer("model_speech_init")
Example #17
 def __init__(self, event_loop=None):
     if not event_loop:
         event_loop = asyncio.get_event_loop()
     self._context = Context(event_loop)
     logger.info('Initial state: {}'.format(self._context.state))
Example #18
 def start(self, seconds):
     logger.info('No water!')
Example #19
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from hardware import Sprinkler
from log_utils import logger
import asyncio

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    sprinkler = Sprinkler(event_loop=loop)

    def tick():
        sprinkler.start(10)

    scheduler = AsyncIOScheduler(event_loop=loop)
    scheduler.add_job(tick, 'interval', hours=1)
    scheduler.start()
    logger.info('Scheduler started')

    try:
        loop.run_forever()
    except (KeyboardInterrupt, SystemExit):
        pass
    finally:
        logger.info('Scheduler stopped')
Example #20
num_nodes = 5
num_replicas = 3
port = START_PORT
# for i in range(int(num_replicas)):
#     cont_port = START_PORT
#     image_name = "nginx"
#     # for i in range(int(num_replicas)):
#     #     cont_name = "{}_{}".format(image_name, cont_port)
#     #     (out, err) = CLIUtils.run(START_POD_CONTAINER_CMD.format(port, cont_port, image_name, cont_name))
#     #     logger.debug('started container id: {}'.format(out))
#         # cont_port = cont_port + 1
#     cont_name = "{}_{}".format(image_name, cont_port)
#     (out, err) = CLIUtils.run(START_POD_CONTAINER_CMD.format(port, cont_port, image_name, cont_name))
#     logger.debug('started container id: {}'.format(out))
#     port = port + 1
(out, err) = CLIUtils.run(
    START_POD_CONTAINER_CMD.format(cont_spec["node"],
                                   cont_spec["container"]["port"],
                                   cont_spec["container"]["image"],
                                   cont_spec["container"]["name"]))
logger.debug('started container id: {}'.format(out))

port = START_PORT
for i in range(int(num_nodes)):
    logger.info("Containers on Node co_node_{}: ".format(port))
    (out, err) = CLIUtils.run(LIST_POD_CONTAINERS_CMD.format(port))
    logger.info('list of containers: %s', out)
    port = port + 1
Example #21
#!/usr/bin/python3
import sys
from log_utils import logger
from cli_utils import CLIUtils
from constants import *
from NodeAgent import NodeAgent
import pika
import datetime

if len(sys.argv) < 2:
    logger.info("Enter the number of nodes to bring up")
    logger.info("./bringup_nodes.py <num_nodes>")
    sys.exit(1)
num_nodes = sys.argv[1]

# create exchange
connection = pika.BlockingConnection(
    pika.ConnectionParameters('coqueue', 5672, '/',
                              pika.PlainCredentials("root", "root123")))
channel = connection.channel()
channel.exchange_declare('co_topic')

port = START_PORT
for i in range(int(num_nodes)):
    node_name = "co_node_{}".format(port)
    na = NodeAgent(node_name)
    na.nodes.insert_one({
        "name": node_name,
        "description": "worker node",
        "heart_beat_time": datetime.datetime.utcnow(),
        "free_mem": 10,
 def list_containers_on_node(node):
     logger.info("Containers on Node co_node_{}: ".format(node))
     (out, err) = CLIUtils.run(LIST_POD_CONTAINERS_CMD.format(node))
     logger.info('list of containers: %s', out)
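Example #23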
    def set_domain_dataset(self, dataset, is_training=True):
        """Recover the dataset in corresponding competition format (esp. AutoNLP
        and AutoSpeech) and set corresponding attributes:
          self.domain_dataset_train
          self.domain_dataset_test
        according to `is_training`.
        """
        # self.dataset = None
        if is_training:
            subset = 'train'
        else:
            subset = 'test'
        attr_dataset = 'domain_dataset_{}'.format(subset)

        if not hasattr(self, attr_dataset):
            logger.info(
                "Note: Begin recovering dataset format in the original " +
                "competition for the subset: {}...".format(subset))
            if self.domain == 'text':
                if DM_DS_PARAS.text.if_sample and is_training:

                    # dataset_read_num = min(5000, self.domain_metadata["class_num"] * 3000)
                    # if self.train_num >= 10000:
                    #     dataset_read_num = min(dataset_read_num, int(0.1 * self.train_num))
                    dataset_read_num, X, Y = self.decide_first_num()
                    logger.info(
                        "Note: set_domain_dataset text, dataset sampling, shuffle and take starts, train_read_num = {}"
                        .format(dataset_read_num))

                    # Get X, Y as lists of NumPy array
                    start = time.time()
                    self.check_label_distribution(np.array(Y))
                    end = time.time()
                    self.time_record["check_label_distribution"] = end - start

                    self.domain_model.imbalance_level = self.imbalance_level

                    feature_dict["check_len"] = float(self.check_len)
                    feature_dict["kurtosis"] = float(self.kurtosis)
                    feature_dict["first_detect_normal_std"] = float(
                        self.normal_std)
                    feature_dict["imbalance_level"] = self.imbalance_level
                    feature_dict["is_shuffle"] = self.shuffle

                    logger.info(
                        "Note: update domain model imbalance level after first detect!"
                    )
                    # if self.check_len <= 40 or self.normal_std>=0.2:
                    #     dataset_read_num += min(0.2*self.train_num, 12000)

                    if self.shuffle and self.domain_metadata[
                            "language"] == "ZH":
                        self.shuffle = False
                        dataset_read_num = int(0.4 * self.train_num)
                        start = time.time()
                        _X, _Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(
                            dataset_read_num)
                        X = X + _X
                        Y = np.concatenate([Y, _Y], axis=0)
                        end = time.time()
                        self.time_record[
                            "get_nlp_train_dataset_new"] = end - start

                        _label_distribution = np.sum(Y, 0)
                        occu_class_ = [
                            i for i in range(_label_distribution.shape[0])
                            if _label_distribution[i] != 0
                        ]  # label classes seen so far

                        if len(occu_class_) >= 2:
                            pass
                        else:
                            # read an extra 20% of the samples
                            dataset_read_num = int(0.2 * self.train_num)
                            logger.info(
                                "Use extra 20% sample: Class num < 2 for ZH data!"
                            )
                            _X, _Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(
                                dataset_read_num)
                            X = X + _X
                            Y = np.concatenate([Y, _Y], axis=0)
                            _label_distribution = np.sum(Y, 0)
                            occu_class_ = [
                                i for i in range(_label_distribution.shape[0])
                                if _label_distribution[i] != 0
                            ]
                            if len(occu_class_) < 2:
                                logger.info(
                                    "Use extra 100% sample: Class num < 2!")
                                dataset_read_num = int(self.train_num)
                                _X, _Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(
                                    dataset_read_num)
                                X = X + _X
                                Y = np.concatenate([Y, _Y], axis=0)

                    ######################### original English shuffle logic ####################
                    if self.shuffle:
                        logger.info(
                            "Note: start shuffle dataset due to not enough labels!"
                        )
                        # redo take
                        start = time.time()
                        del self.tf_dataset_trainsformer
                        self.tf_dataset_trainsformer = TfDatasetTransformer(
                            if_train_shuffle=True, config=config)
                        end = time.time()
                        self.time_record[
                            "del trainsformer and init"] = end - start

                        start = time.time()
                        shuffle_size = max(int(0.5 * (self.train_num)), 10000)

                        shuffle_dataset = dataset.shuffle(shuffle_size)
                        end = time.time()
                        self.time_record["shuffle dataset"] = end - start

                        start = time.time()
                        self.tf_dataset_trainsformer.init_train_tfds(
                            shuffle_dataset, self.train_num, pad_num=20)
                        end = time.time()
                        self.time_record["init_new_train_tfds"] = end - start

                        start = time.time()
                        X, Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(
                            dataset_read_num)

                        _label_distribution = np.sum(Y, 0)
                        occu_class_ = [
                            i for i in range(_label_distribution.shape[0])
                            if _label_distribution[i] != 0
                        ]  # label classes seen so far
                        if len(occu_class_) >= 2:
                            pass
                        else:
                            logger.info(
                                "Use extra 100% sample: Class num < 2 for EN data!"
                            )
                            dataset_read_num = int(1 * (self.train_num))
                            _X, _Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(
                                dataset_read_num)
                            X = X + _X
                            Y = np.concatenate([Y, _Y], axis=0)
                        end = time.time()
                        self.time_record[
                            "get_nlp_train_dataset_new"] = end - start

                        logger.info("Note: finish take after shuffle dataset")
                    ###################################################################

                    logger.info(
                        "Note: set_domain_dataset text, dataset sampling, shuffle and take ends, train_read_num = {}"
                        .format(dataset_read_num))
                    # self.domain_model.vocab = self.vocabulary
                    self.domain_model.avg_word_per_sample = float(
                        len(self.vocabulary) /
                        self.domain_metadata["train_num"])
                    if "avg_word_per_sample" not in feature_dict:
                        feature_dict[
                            "avg_word_per_sample"] = self.domain_model.avg_word_per_sample
                    self.domain_model.feature_dict = feature_dict
                    logger.info(
                        "Note: vocab size is {} and avg_word_per_sample is {}".
                        format(len(self.domain_model.vocab),
                               self.domain_model.avg_word_per_sample))
                elif not is_training:
                    start = time.time()
                    pad_num = 20
                    logger.info("pad num is {}".format(pad_num))
                    X, Y = self.tf_dataset_trainsformer.get_nlp_test_dataset(
                        pad_num=pad_num)
                    # self.X_test_raw = X
                    end = time.time()
                    self.domain_model.time_record[
                        "get_nlp_test_dataset_numpy_test"] = end - start

                if is_training:
                    self.first_round_X = X
                    self.first_round_Y = Y
                # Construct the corpus
                start = time.time()

                # on the first call, use raw ids instead of converting to a word corpus
                if self.call_num == 0:
                    corpus = []
                    seq_len = []

                    for _x in X:
                        _x = _x[_x != -1]
                        num_words = max(int(_x.shape[0] * 0.1), 301)
                        _x = _x[:num_words]
                        _x = _x.astype(str)
                        tokens = _x.tolist()
                        document = self.nlp_sep.join(tokens)
                        corpus.append(document)

                    logger.info("USE id as corpus {}")
                else:
                    corpus, seq_len = to_corpus(X, self.index_to_token,
                                                self.nlp_sep)
                    logger.info("USE word as corpus {}")

                end = time.time()
                self.seq_len = seq_len
                if is_training:
                    logger.info("to_corpus_train cost {}".format(end - start))
                    self.domain_model.time_record[
                        "to_corpus_train"] = end - start
                else:
                    logger.info("to_corpus_test cost {}".format(end - start))
                    self.domain_model.time_record[
                        "to_corpus_test"] = end - start
                # Construct the dataset for training or test
                if is_training:
                    labels = np.array(Y)
                    cnt = np.sum(np.count_nonzero(labels, axis=1), axis=0)
                    print("Check multi-label cnt {}".format(cnt))
                    if cnt > labels.shape[0]:
                        print("Check multi-label: True")
                        self.domain_model.multi_label = True
                        # self.domain_model.fasttext_embeddings_index = None
                        self.domain_model.db_model = None
                        self.domain_model.ft_model = None
                    domain_dataset = corpus, labels
                    # Set the attribute
                    self.domain_dataset_train_dict["x"] = corpus
                    self.domain_dataset_train_dict["y"] = labels
                else:
                    domain_dataset = corpus
                    # Set the attribute
                    self.domain_dataset_train_dict["x"] = corpus
                    self.X_test = corpus

                setattr(self, attr_dataset, domain_dataset)

            elif self.domain == 'speech':
                # Set the attribute
                setattr(self, attr_dataset, dataset)

            elif self.domain in ['image', 'video', 'tabular']:
                setattr(self, attr_dataset, dataset)
            else:
                raise ValueError("The domain {} doesn't exist.".format(
                    self.domain))

        else:
            if subset == 'test':
                if self.X_test_raw:
                    self.domain_dataset_test, test_seq_len = to_corpus(
                        self.X_test_raw, self.index_to_token, self.nlp_sep)
                self.X_test_raw = None
                return

            if self.domain == 'text':
                if DM_DS_PARAS.text.if_sample and is_training:
                    if self.domain_model.multi_label:
                        self.domain_model.use_multi_svm = True
                        self.domain_model.start_cnn_call_num = 2
                        dataset_read_num = self.train_num
                        if dataset_read_num > 50000:
                            dataset_read_num = 50000
                            logger.info(" Set Upper limit!")
                    else:
                        if self.imbalance_level >= 1:
                            dataset_read_num = self.train_num
                            self.domain_model.use_multi_svm = False
                            self.domain_model.start_cnn_call_num = 1
                            if dataset_read_num > 50000:
                                dataset_read_num = 50000
                                logger.info(" Set Upper limit!")
                        else:
                            self.domain_model.use_multi_svm = True
                            if self.call_num <= self.domain_model.start_first_stage_call_num - 1:
                                dataset_read_num = 3000
                                if self.check_len <= 40 or self.normal_std >= 0.2:
                                    dataset_read_num += min(
                                        int(0.2 * self.train_num), 12000)
                            else:
                                # dataset_read_num = int(self.domain_metadata["train_num"] * linear_sampling_func(self.call_num))
                                if self.call_num == self.domain_model.start_first_stage_call_num:
                                    dataset_read_num = int(
                                        0.9 *
                                        self.domain_metadata["train_num"])
                                    if dataset_read_num > 50000:
                                        dataset_read_num = 50000
                                else:
                                    if self.train_num <= 55555:
                                        dataset_read_num = 4000
                                    else:
                                        dataset_read_num = 5500

                    logger.info(
                        "Note: set_domain_dataset text, dataset sampling, shuffle and take starts, train_read_num = {}"
                        .format(dataset_read_num))
                    # Get X, Y as lists of NumPy array
                    start = time.time()
                    X, Y = self.tf_dataset_trainsformer.get_nlp_train_dataset(
                        dataset_read_num)
                    end = time.time()
                    # if self.call_num == 0:
                    #     logger.info("Use first round data!")
                    #     X = self.first_round_X + X
                    #     Y = np.concatenate([self.first_round_Y, Y], axis=0)
                    if self.call_num == 1:
                        self.time_record[
                            "get_nlp_train_dataset_to_numpy call_num=1"] = end - start
                    logger.info(
                        "Note: set_domain_dataset text, dataset sampling, shuffle and take ends, train_read_num = {}"
                        .format(dataset_read_num))

                # Construct the corpus
                corpus = []
                start = time.time()

                corpus, seq_len = to_corpus(X, self.index_to_token,
                                            self.nlp_sep)
                end = time.time()
                self.seq_len.extend(seq_len)
                # self.time_record["to_corpus when call_num=1"] = end-start
                if "avg_length" not in self.domain_model.feature_dict:
                    self.domain_model.feature_dict["avg_length"] = int(
                        np.average(self.seq_len))
                    self.domain_model.feature_dict["max_length"] = int(
                        np.max(self.seq_len))
                    self.domain_model.feature_dict["min_length"] = int(
                        np.min(self.seq_len))
                    self.domain_model.feature_dict["seq_len_std"] = int(
                        np.std(self.seq_len))

                if self.domain_model.max_length == 0:
                    if int(np.max(self.seq_len)) <= 301:
                        self.domain_model.max_length = int(np.max(
                            self.seq_len))
                        self.domain_model.bert_check_length = int(
                            np.max(self.seq_len))

                    else:
                        self.domain_model.max_length = int(
                            np.average(self.seq_len))
                        self.domain_model.bert_check_length = int(
                            np.average(self.seq_len))

                    self.domain_model.seq_len_std = int(np.std(self.seq_len))
                if self.seq_len:
                    logger.info(
                        "Note: set domain_model max_length = {}".format(
                            self.domain_model.max_length))

                    logger.info(
                        "Note: check domain_model max_length = {}".format(
                            int(np.max(self.seq_len))))
                    logger.info(
                        "Note: check domain_model max_length std = {}".format(
                            int(np.std(self.seq_len))))
                # Construct the dataset for training or test
                if is_training:
                    labels = np.array(Y)
                    domain_dataset = corpus, labels
                    print("\n upadte domain_dataset \n")
                    print("check domain_dataset_train_dict y:", labels.shape)
                    self.domain_dataset_train_dict["x"] = corpus
                    self.domain_dataset_train_dict["y"] = labels
                    # print(self.domain_dataset_train)
                    # self.domain_dataset_train = domain_dataset
                else:
                    domain_dataset = corpus
Example #24
def train(config, model, optim:Optim, train_instances, dev_instances, word_vocab, bio_vocab, feat_vocab):
    model.train()
    start_time = time.time()
    batch_num = 0
    num_trial = 0
    report_start_time = start_time
    report_loss, report_words_num = 0, 0
    for epoch in range(config['epoch']):
        for batch in batch_iter(train_instances, config['batch_size'],
                                                     word_vocab=word_vocab,
                                                     bio_vocab=bio_vocab,
                                                     feat_vocab=feat_vocab):
            logger.debug("src_tokens\n{}".format(batch.src))
            logger.debug("ans_tokens\n{}".format(batch.ans))
            batch_num += 1
            model.zero_grad()
            gen_output = model(batch)  # (tgt_len-1, B, vocab)
            gen_output = gen_output.transpose(0, 1).contiguous() # (B, tgt_len-1, vocab)
            lprobs = torch.log_softmax(gen_output, dim=-1)
            batch_size = gen_output.size(0)
            if config['max_out_cpy']:
                gold = torch.tensor([x[1:] for x in batch.tgt_extended_index], dtype=torch.long, device=model.device)
            else:
                gold = torch.tensor([x[1:] for x in batch.tgt_index], dtype=torch.long, device=model.device)
                # (B, tgt_len-1)
            batch_loss = lloss(lprobs, gold, ignore_index=word_vocab.pad_idx)
            if config['ulloss']:
                batch_loss += config['ulloss_weight'] * ulloss(lprobs, gold, ignore_index=word_vocab.pad_idx)
            if config['seq_ulloss'] and torch.rand(1).item() < config['seq_ulloss_rate']:
                batch_loss += ulloss_seq(lprobs, config['seq_ulloss_ngram'], config['seq_ulloss_seq_type'],
                                         mask_p=config['seq_ulloss_mask_p'])
            report_loss += batch_loss.item()
            report_words_num += sum(batch.tgt_len) - batch_size
            batch_loss.backward()
            optim.step()

            if batch_num % config['log_per_batches'] == 0:
                logger.info('epoch {}|batch {}|avg.loss {:.4f}|ppl {:.3f}|lr {}|t {}|total t {}'.format(
                    epoch,
                    batch_num,
                    report_loss/report_words_num,
                    math.exp(report_loss / report_words_num),
                    optim.lr,
                    user_friendly_time_since(report_start_time),
                    user_friendly_time_since(start_time)
                ))
                report_loss = report_words_num = 0
                report_start_time = time.time()

            if batch_num > config['start_validate_after_batches'] and batch_num % config['validate_per_batches'] == 0:
                ppl = evaluate_ppl(model, dev_instances,
                                           word_vocab=word_vocab,
                                           bio_vocab=bio_vocab,
                                           feat_vocab=feat_vocab)
                if optim.is_better(ppl):
                    model.save(config['model_save_path'])
                    logger.info("model saved!")
                hit_trial = optim.update_lr(ppl)
                optim.metric_history.append(ppl)
                logger.info('eval ppl {}|patience {}|current lr {}|best metric {}'.format(
                    ppl, optim.patience, optim.lr, optim.best_metric))
                if hit_trial:
                    num_trial += 1
                    logger.info("hit trial: [{}]".format(num_trial))
                    if num_trial >= config['max_num_trial']:
                        logger.info("early stop")
                        exit(0)
                    logger.info('restoring parameters')
                    state = torch.load(config['model_save_path'])
                    model.load_state_dict(state['model_state'])
                    model.to(device)
                import random
                test_instances = random.sample(train_instances, 100)
                bleus = evaluate_bleu(model, test_instances, config, word_vocab)
                logger.info("BLEU_1 {} BLEU_2 {} BLEU_3 {} BLEU_4 {} BLEU {}".format(*bleus))
Example #25
            logger.info('{} is rezero param'.format(pr_name))
            nn.init.zeros_(p)
        else:
            if p.dim() == 1:
                # p.data.zero_()
                p.data.normal_(0, math.sqrt(6 / (1 + p.size(0))))
            else:
                nn.init.xavier_normal_(p, math.sqrt(3))
        logger.info("{}: requires_grad {}".format(pr_name, p.requires_grad))


if __name__ == '__main__':
    init_logger(level='info', log_file='train.log')
    config = load_config()
    device = torch.device('cpu') if config['gpu'] < 0 else torch.device('cuda:{}'.format(config['gpu']))
    logger.info("training with param:\n{}".format(config))
    logger.info("training with device: {}".format(device))
    if config['albert']:
        word_vocab = AlbertVocab(config['albert_model_name'], cache_dir=config['albert_cache_dir'])
    else:
        word_vocab = load_word_vocab('squad_out/train.txt.vocab.word', config['vocab_size'])
    logger.info(word_vocab)
    bio_vocab = load_bio_vocab('squad_out/train.txt.vocab.bio')
    logger.info(bio_vocab)
    feat_vocab = load_feat_vocab('squad_out/train.txt.vocab.feat')
    logger.info(feat_vocab)
    train_instances = load_instances('squad_out/train.ins')
    dev_instances = load_instances('squad_out/dev.ins')
    if config['model'] == 'nmt':
        model = NMT(word_vocab, bio_vocab, feat_vocab, config['word_embed_size'],
                    config['bio_embed_size'], config['feat_embed_size'],
Example #26
def translate(model,
              instances,
              config,
              word_vocab,
              predict_save_path=None,
              predict_atten_engy_path=None):
    """

    :param model:
    :param instances:
    :param beam_size:
    :param max_decode_step:
    :param vocabs:
    :return: List[List[str]], the translated result for each instance
    """
    was_training = model.training
    model.eval()

    vocabs = {
        'word_vocab': model.word_vocab,
        'bio_vocab': model.bio_vocab,
        'feat_vocab': model.feat_vocab
    }
    max_decode_step = config['max_decode_step']
    dec_method = config['dec_method']
    beam_size = config['beam_size']
    nucleus_p = config['nucleus_p']
    logger.info("translate using method {}".format(dec_method))
    copy_hypothesis = []
    no_copy_hypothesis = []
    atten_engy = []
    total_completed = 0
    with torch.no_grad():
        for batch in tqdm(batch_iter(instances, 1, shuffle=False, **vocabs),
                          total=len(instances)):
            if dec_method == 'beam_search':
                instance_hypothesis, has_completed = model.beam_search(
                    batch, beam_size, max_decode_step)
            elif dec_method == 'nucleus_sampling':
                instance_hypothesis, has_completed = model.nucleus_sampling(
                    batch, max_decode_step, nucleus_p=nucleus_p)
            else:
                raise Exception(
                    "decoding method {} is not supported".format(dec_method))
            total_completed += int(has_completed)
            copy_hypothesis.append(instance_hypothesis[0][0])
            no_copy_hypothesis.append(instance_hypothesis[0][1])
            atten_engy.append(instance_hypothesis[0][2])
    if was_training:
        model.train(was_training)
    if predict_save_path:
        obj = []
        for idx, instance in enumerate(instances):
            obj.append({
                'idx': idx,
                'context': " ".join(instance.src),
                'ans': " ".join(instance.ans),
                'gold': " ".join(instance.tgt),
                'no_copy_predict': " ".join(no_copy_hypothesis[idx]),
                'predict': " ".join(copy_hypothesis[idx])
            })
        json.dump(obj, open(predict_save_path, 'w'), indent=2)
    if predict_atten_engy_path:
        obj = []
        for idx, (engy, instance, hypothesis) in enumerate(
                zip(atten_engy, instances, copy_hypothesis)):
            obj.append({
                'idx': idx,
                'decode_engy': str(engy),
                'src_tokens': ' '.join(instance.src),
                'output_tokens': ' '.join(hypothesis)
            })
        json.dump(obj, open(predict_atten_engy_path, 'w'), indent=2)
    logger.info("{} of {} is completed hypothesis".format(
        total_completed, len(instances)))
    return copy_hypothesis
Example #27
from hardware import Sprinkler
from log_utils import logger
import asyncio

loop = asyncio.get_event_loop()
s = Sprinkler(loop)

loop.call_later(1, s.start, 30)

logger.info('starting')
loop.run_forever()
Example #28
 def transition_to(self, state):
     logger.info('State transition: {} -> {}'.format(self.state, state))
     self.state = state
Example #29
                if len(new_src_token_list) > 0:
                    final_tokens['src'].extend(new_src_token_list)
                    final_tokens['bio'].extend([bio_tokens[idx]] * len(new_src_token_list))
                    final_tokens['case'].extend([case_tokens[idx]] * len(new_src_token_list))
                    final_tokens['ner'].extend([ner_tokens[idx]] * len(new_src_token_list))
                    final_tokens['pos'].extend([pos_tokens[idx]] * len(new_src_token_list))
                else:
                    print("zero: {} {}".format(src_token, new_src_token_list))
            for tgt_token in tgt_tokens:
                final_tokens['tgt'].extend(albert_tokenizer.tokenize(tgt_token))
        else:
            final_tokens = {'src': src_tokens, 'tgt': tgt_tokens, 'bio': bio_tokens,
                            'case': case_tokens, 'ner': ner_tokens, 'pos': pos_tokens}
        final_tokens['ans'] = __extract_answer_from_src_and_bio(final_tokens['src'], final_tokens['bio'])
        if len(final_tokens['src']) > max_src_len:
            logger.info("trimmed seq length {} to {}".format(len(final_tokens['src']), max_src_len))
        final_tokens['src'] = final_tokens['src'][:max_src_len]
        final_tokens['tgt'] = final_tokens['tgt'][:max_src_len]
        final_tokens['bio'] = final_tokens['bio'][:max_src_len]
        final_tokens['ner'] = final_tokens['ner'][:max_src_len]
        final_tokens['case'] = final_tokens['case'][:max_src_len]
        final_tokens['pos'] = final_tokens['pos'][:max_src_len]
        final_tokens['ans'] = final_tokens['ans'][:max_src_len]
        instance = SquadInstance(**final_tokens)
        instances.append(instance)
    return instances


if __name__ == '__main__':
    init_logger(level='debug')
    config = load_config()
Example #30
                zip(atten_engy, instances, copy_hypothesis)):
            obj.append({
                'idx': idx,
                'decode_engy': str(engy),
                'src_tokens': ' '.join(instance.src),
                'output_tokens': ' '.join(hypothesis)
            })
        json.dump(obj, open(predict_atten_engy_path, 'w'), indent=2)
    logger.info("{} of {} is completed hypothesis".format(
        total_completed, len(instances)))
    return copy_hypothesis


if __name__ == '__main__':
    config = load_config()
    init_logger(log_file='evaluate.log')
    device = torch.device('cpu') if config['gpu'] < 0 else torch.device(
        'cuda:{}'.format(config['gpu']))
    if config['model'] == 'nmt':
        model = NMT.load(config['model_save_path'])
        model.to(device)
    else:
        model = QGModel.load(config['model_save_path'], device)

    test_instances = load_instances(config['save_dir'] + '/test.ins')
    bleus = evaluate_bleu(model, test_instances, config, model.word_vocab,
                          config['predict_save_path'])
    logger.info(
        '\nBLEU_1: {}\nBLEU_2: {}\nBLEU_3: {}\nBLEU_4: {}\nBLEU: {}'.format(
            *bleus))