Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l',
                        '--layout',
                        dest="layout",
                        required=True,
                        help="layout.yaml")
    parser.add_argument('-c',
                        '--config',
                        dest="config",
                        required=True,
                        help="cluster configuration")
    parser.add_argument('-o',
                        '--output',
                        dest="output",
                        required=True,
                        help="output directory")
    args = parser.parse_args()

    output_path = os.path.expanduser(args.output)

    layout = load_yaml_config(args.layout)
    config = load_yaml_config(args.config)

    masters, workers = get_masters_workers_from_layout(layout)
    head_node = masters[0]

    # fill in cpu, memory, computing_device information in both masters and workers
    # we assume the layout file the user gives is correct
    all_machines = masters + workers
    for machine in all_machines:
        sku_info = layout['machine-sku'][machine['machine-type']]
        # use math.ceil to guarantee the memory volume
    # e.g. if the user sets 999.1MB, we make sure there are 1000MB to avoid scheduling issues
        machine['memory_mb'] = math.ceil(
            parse_quantity(sku_info['mem']) / 1024 / 1024)
        machine['cpu_vcores'] = sku_info['cpu']['vcore']
        if 'computing-device' in sku_info:
            machine['computing_device'] = sku_info['computing-device']

    # add each machine to its computing device group
    computing_device_groups = defaultdict(list)
    for machine in all_machines:
        sku_info = layout['machine-sku'][machine['machine-type']]
        if 'computing-device' in sku_info:
            device_type = sku_info['computing-device']['type']
            computing_device_groups[device_type].append(machine['hostname'])

    environment = {
        'masters': masters,
        'workers': workers,
        'cfg': config,
        'head_node': head_node,
        'computing_device_groups': computing_device_groups,
    }

    map_table = {"env": environment}

    generate_template_file("quick-start/pre-check.yml.template",
                           "{0}/pre-check.yml".format(output_path), map_table)
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l',
                        '--layout',
                        dest="layout",
                        required=True,
                        help="layout.yaml")
    parser.add_argument('-c',
                        '--config',
                        dest="config",
                        required=True,
                        help="cluster configuration")
    args = parser.parse_args()

    layout = load_yaml_config(args.layout)
    cluster_config = load_yaml_config(args.config)
    try:
        validate_layout_schema(layout)
    except Exception as exp:
        logger.error("layout.yaml schema validation failed: \n %s", exp)
        sys.exit(1)

    if not check_layout(layout, cluster_config):
        logger.error("layout.yaml validation against the cluster configuration failed")
        sys.exit(1)

    logger.info("layout.yaml schema validation succeeded.")
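
validate_layout_schema and check_layout are not defined in this snippet. As a rough sketch, the schema step could be built on jsonschema against a hypothetical, much-reduced schema (the real schema is not shown here):

# Hypothetical sketch; the actual validator and schema may differ.
import jsonschema

LAYOUT_SCHEMA = {
    "type": "object",
    "required": ["machine-sku", "machine-list"],  # assumed minimal requirements
}

def validate_layout_schema(layout):
    # Raises jsonschema.exceptions.ValidationError on mismatch,
    # which the caller above logs and turns into sys.exit(1).
    jsonschema.validate(instance=layout, schema=LAYOUT_SCHEMA)
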
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l',
                        '--layout',
                        dest="layout",
                        required=True,
                        help="layout.yaml")
    parser.add_argument('-c',
                        '--config',
                        dest="config",
                        required=True,
                        help="cluster configuration")
    parser.add_argument('-o',
                        '--output',
                        dest="output",
                        required=True,
                        help="output directory")
    args = parser.parse_args()

    output_path = os.path.expanduser(args.output)
    layout = load_yaml_config(args.layout)
    cluster_config = load_yaml_config(args.config)

    masters, workers = get_masters_workers_from_layout(layout)
    head_node = masters[0]

    # Hivedscheduler is enabled by default.
    # But if the user sets enable_hived_scheduler to false manually,
    # we should disable it.
    if cluster_config.get('enable_hived_scheduler') is False:
        hived_config = {}
    else:
        hived_config = get_hived_config(layout, cluster_config)

    environment = {
        'masters': masters,
        'workers': workers,
        'cfg': cluster_config,
        'head_node': head_node,
        'hived': hived_config
    }

    map_table = {"env": environment}

    generate_template_file(
        os.path.abspath(
            os.path.join(
                os.path.abspath(__file__),
                '../../quick-start/services-configuration.yaml.template')),
        "{0}/services-configuration.yaml".format(output_path), map_table)
Example #4
def main():
    config = load_yaml_config("config.yml")
    tokenizer_path = config["data"]["tokenizer_path"]
    seq_length = config["model"]["seq_length"]
    pb_path = config["model"]["pb_path"]
    label_size = config["model"]["label_size"]
    kl_threshold = config["model"]["kl_threshold"]

    with open(tokenizer_path, "rb") as f:
        tokenizer = pickle.load(f)

    with tf.Session() as sess:
        tf.saved_model.loader.load(sess, [tag_constants.SERVING], pb_path)
        graph = tf.get_default_graph()

        input_x = graph.get_tensor_by_name("input_x:0")
        keep_prob = graph.get_tensor_by_name("dropout_keep_prob:0")
        pred = graph.get_tensor_by_name("softmaxLayer/probs:0")

        while True:
            trackString = input("Enter a track sequence: ")
            if trackString == "exit":
                print("Exiting detection")
                exit(0)

            track = trackString.strip().split(" ")
            if len(track) < 2:
                print("A track must contain at least 2 points")
                continue

            unknown = [x for x in track if x not in tokenizer.word_index]
            if unknown:
                print("Track points {} do not exist".format(unknown))
                continue

            tokenizerSeq = tokenizer.texts_to_sequences([trackString])[0]
            for i in range(1, len(tokenizerSeq)):
                label = tokenizerSeq[i]
                feature = tokenizerSeq[max(i - 4, 0):i]
                if len(feature) < seq_length:
                    feature = [1] * (seq_length - len(feature)) + feature
                feature_reshape = np.array(feature).reshape(-1, 4)
                logits = sess.run(pred, feed_dict={
                    input_x: feature_reshape,
                    keep_prob: 1})
                # multi-class cross entropy of the prediction vs. the true next point
                loss = sess.run(tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=[label]))[0]
                # KL divergence against a (smoothed) one-hot vector of the true label
                onehot = [1e-6] * label_size
                onehot[label] = 1
                kl = stats.entropy(logits[0], onehot)
                # flag the step as abnormal when the KL divergence exceeds the threshold
                abnormal = "[{}]".format(" " if kl <= kl_threshold else "×")
                print("{} cross entropy: {:.4f}, KL divergence: {:.4f}, track: {} => {}".format(
                    abnormal, loss, kl, track[max(i - 4, 0):i], track[i]))
            print("")
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l', '--layout', dest="layout", required=True,
                        help="layout.yaml")
    parser.add_argument('-c', '--config', dest="config", required=True,
                        help="cluster configuration")
    parser.add_argument('-o', '--output', dest="output", required=True,
                        help="output directory")
    args = parser.parse_args()

    output_path = os.path.expanduser(args.output)

    layout = load_yaml_config(args.layout)
    cluster_config = load_yaml_config(args.config)

    masters, workers = get_masters_workers_from_layout(layout)
    head_node = masters[0]

    environment = {
        'masters': masters,
        'workers': workers,
        'cfg': cluster_config,
        'head_node': head_node
    }

    map_table = {
        "env": environment
    }
    generate_template_file(
        "quick-start/hosts.yml.template",
        "{0}/hosts.yml".format(output_path),
        map_table
    )
    generate_template_file(
        "quick-start/prophet.yml.template",
        "{0}/prophet.yml".format(output_path),
        map_table
    )
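
get_masters_workers_from_layout is also not shown in these snippets. A sketch under the assumption that each entry of layout['machine-list'] carries pai-master / pai-worker flags (the key names are assumptions):

# Hypothetical sketch; the real layout.yaml format may use different keys.
def get_masters_workers_from_layout(layout):
    masters, workers = [], []
    for machine in layout.get('machine-list', []):
        if machine.get('pai-master') == 'true':
            masters.append(machine)
        if machine.get('pai-worker') == 'true':
            workers.append(machine)
    return masters, workers
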
Example #6
def download_file(local, remote):
    config = load_yaml_config('certificate.yml')
    session = boto3.session.Session(
        aws_access_key_id=config.AWS_ACCESS_ID,
        aws_secret_access_key=config.AWS_SECRET_KEY)
    s3 = session.resource('s3')

    try:
        s3.Bucket(config.AWS_BUCKET).download_file(remote, local)
        print("Download " + remote + " as " + local)
        return True
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            print("The object does not exists")
        print("Download fail.")
        return False
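
Note that download_file (and upload_file in Example #8) reads the loaded YAML through attribute access (config.AWS_ACCESS_ID) rather than dict indexing as in the other examples. If load_yaml_config returns a plain dict, a thin wrapper like this hypothetical one would be required:

# Hypothetical variant returning attribute-style access to top-level keys.
from types import SimpleNamespace
import yaml

def load_yaml_config_as_namespace(path):
    with open(path, "r") as f:
        return SimpleNamespace(**yaml.safe_load(f))
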
Example #7
def train():
    config = load_yaml_config("config.yml")
    display_step = config["model"]["display_step"]
    evaluate_step = config["model"]["evaluate_step"]
    save_step = config["model"]["save_step"]
    checkpoint_path = config["model"]["checkpoint_path"]
    pickle_path = config["data"]["pickle_path"]
    pb_path = config["model"]["pb_path"]
    model = TodAutoEncoder(config)
    print(model.input_x)
    print(model.loss)

    with open(pickle_path, "rb") as f:
        _ = pickle.load(f)
        _, sparse_test = pickle.load(f)
    card, sparse = zip(*sparse_test)
    test = dense_transform(list(sparse))

    sess = get_session()
    sess.run(tf.global_variables_initializer())

    batch_data = get_batch()
    for batch in batch_data:
        _, loss_train, step = model.step(sess, batch)
        if step % display_step == 0:
            print("step: %d => loss: %.4f" % (step, loss_train))
        if step % evaluate_step == 0:
            _, loss_test, _ = model.step(sess, test)
            print("{0:-^30}".format("evaluation loss: %.4f" % loss_test))
            print("")
        if step % save_step == 0:
            model.save(sess, checkpoint_path)
    model.save(sess, checkpoint_path)

    shutil.rmtree(pb_path, ignore_errors=True)
    builder = tf.saved_model.builder.SavedModelBuilder(pb_path)
    inputs = {'input_x': tf.saved_model.utils.build_tensor_info(model.input_x)}
    outputs = {'output': tf.saved_model.utils.build_tensor_info(model.loss)}
    signature = tf.saved_model.signature_def_utils.build_signature_def(
        inputs=inputs,
        outputs=outputs,
        method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)

    builder.add_meta_graph_and_variables(sess, [tag_constants.SERVING], {'my_signature': signature})
    builder.save()
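
dense_transform is not defined in these snippets. Given that the pickled samples are scipy sparse matrices (see the dok_matrix construction in Example #12), a plausible sketch is:

# Hypothetical sketch: flatten each sparse matrix into one dense row.
import numpy as np

def dense_transform(sparse_list):
    return np.array([mat.toarray().reshape(-1) for mat in sparse_list])
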
Example #8
def upload_file(local, remote):
    config = load_yaml_config('certificate.yml')
    session = boto3.session.Session(
        aws_access_key_id=config.AWS_ACCESS_ID,
        aws_secret_access_key=config.AWS_SECRET_KEY)
    s3 = session.client('s3')

    try:
        s3.upload_file(local, config.AWS_BUCKET, remote)
        '''
        s3.put_object(Bucket=config.AWS_BUCKET,
                      Key=remote,
                      Body=local)
        '''
        print("Upload " + local + " as " + remote)
        return True
    except Exception:
        print("Upload failed.")
        return False
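
A short usage sketch for the two S3 helpers; the local paths and object keys are placeholders:

# Example usage of upload_file / download_file (placeholder paths).
if upload_file("model/checkpoint.tar.gz", "backups/checkpoint.tar.gz"):
    print("Backup stored in S3.")

if not download_file("restore/checkpoint.tar.gz", "backups/checkpoint.tar.gz"):
    print("Restore failed, keeping the local copy.")
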
Example #9
def train():
    config = load_yaml_config("config.yml")
    embedding_size = config["model"]["embedding_size"]
    min_count = config["model"]["min_count"]
    sg = config["model"]["sg"]
    output_path = config["data"]["output_path"]
    window = config["model"]["window"]
    embedding_model = config["model"]["embedding_model"]
    embedding_path = config["model"]["embedding_path"]

    sentences = word2vec.LineSentence(output_path)
    model = word2vec.Word2Vec(sentences,
                              size=embedding_size,
                              window=window,
                              min_count=min_count,
                              sg=sg,
                              workers=multiprocessing.cpu_count())
    print("word counts: %d" % len(model.wv.vocab.keys()))
    model.save(embedding_model)
    model.wv.save_word2vec_format(embedding_path, binary=False)
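
The embedding written above can be reloaded later with the matching gensim API; a short sketch (the path and query token are placeholders):

# Sketch: reload the text-format vectors written by model.wv.save_word2vec_format.
from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format("embedding.txt", binary=False)  # embedding_path from config.yml
print(wv.most_similar("some_track_point", topn=5))  # placeholder token
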
Example #10
def get_batch():
    config = load_yaml_config("config.yml")
    pickle_path = config["data"]["pickle_path"]
    epochs = config["model"]["epochs"]
    batch_size = config["model"]["batch_size"]

    sparse_list = load_train_pkl(pickle_path)
    card, sparse = zip(*sparse_list)
    sparse = list(sparse)
    data_length = len(sparse)

    # repeat for the configured number of epochs
    for epoch in range(epochs):
        # reshuffle the samples at the start of each epoch
        random.shuffle(sparse)
        # slice into full batches; a trailing partial batch is dropped
        for batch in range(0, data_length, batch_size):
            if batch + batch_size <= data_length:
                sparse_batch = sparse[batch:(batch + batch_size)]
                # convert the sparse batch into a dense matrix
                dense_list = dense_transform(sparse_batch)

                yield dense_list
Example #11
def main():
    config = load_yaml_config("config.yml")
    pickle_path = config["data"]["pickle_path"]
    pb_path = config["model"]["pb_path"]

    with open(pickle_path, "rb") as f:
        _ = pickle.load(f)
        sparse_train, sparse_test = pickle.load(f)
    card_st = sparse_train + sparse_test

    with tf.Session() as sess:
        tf.saved_model.loader.load(sess, [tag_constants.SERVING], pb_path)
        graph = tf.get_default_graph()

        input_x = graph.get_tensor_by_name("input_x:0")
        pred = graph.get_tensor_by_name("cost/absolute_difference/value:0")
        # prediction: compute the reconstruction loss for each card
        with open("tod_result.csv", "w", encoding="utf8") as f:
            f.write("{},{}\n".format("cardno", "loss"))

            for card, st_sparse in card_st:
                st_dense = st_sparse.toarray().reshape(1, -1)
                res = sess.run(pred, feed_dict={input_x: st_dense})
                f.write("{},{}\n".format(card, res))
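
The loop writes one reconstruction loss per card. A short follow-up sketch that ranks cards by loss to surface the most anomalous ones (pandas is an assumption; any CSV reader would do):

# Sketch: rank cards by reconstruction loss, highest (most anomalous) first.
import pandas as pd

result = pd.read_csv("tod_result.csv")
print(result.sort_values("loss", ascending=False).head(20))
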
Example #12
    sparse_list = []
    for card, st_dict in tfidfCstDict.items():
        # create a sparse space-time matrix for this card
        sparse_mat = dok_matrix((loc_length, hour_length), dtype=np.float32)
        for space, hour_dict in st_dict.items():
            for hour, tfidf in hour_dict.items():
                space_index = locToIndex[space]
                sparse_mat[space_index, hour] = tfidf
        sparse_list.append((card, sparse_mat))

    return sparse_list, locToIndex


def dump_pickel():
    tfidfCstDict, locCountDict, hourCountDict = get_space_time_dict()
    sparse_list, locToIndex = get_space_time_sparse(tfidfCstDict, locCountDict, hourCountDict)
    # train_test_split
    sparse_train, sparse_test = train_test_split(sparse_list, test_size=0.2)

    with open(pickle_path, "wb") as f:
        pickle.dump(locToIndex, f)
        pickle.dump((sparse_train, sparse_test), f, protocol=0)


if __name__ == "__main__":
    config = load_yaml_config("config.yml")
    file_path = config["data"]["file_path"]
    pickle_path = config["data"]["pickle_path"]
    dump_pickel()
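
dump_pickel writes two pickle records in sequence: first the location index, then the (train, test) tuple. That is why the readers in Examples #7 and #11 call pickle.load twice and discard the first result. A minimal matching reader, for reference:

# Sketch of reading back what dump_pickel wrote (pickle_path as in config.yml).
import pickle

with open(pickle_path, "rb") as f:
    locToIndex = pickle.load(f)                 # first record: location index
    sparse_train, sparse_test = pickle.load(f)  # second record: train/test split
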
Example #13
# Use None or 0 if you want to return all possible neighbors within the selected distance.
config.batch_size = 32
# Training batch size of fnn models.
config.epochs = [50, 300]
config.epochs_train2 = 300
config.epochs_interval = 50
# epochs is a list of length 2 giving the range of epochs after which to stop training the M1 models and train a new model M2.
# M1's training will stop after epochs[0]+n*interval such that n>0 and epochs[0]+n*interval<=epochs[1].
# M2's training will last epochs_train2 epochs.
config.epochs_interval_evaluation = 1
# M2's training pauses every epochs_interval_evaluation epochs to evaluate performance.
# M1's training pauses to evaluate performance only if test1=True.
config.folds_number = 10
# Number of K-fold CV folds.
config.embedding_name = "tuned_embedding"
# The embedding to be used. There must be a directory containing the embedding in data folder.
config.test1 = False
# True if you want to evaluate M1's performance on the test set during training. Use False to skip the evaluation.
config.OUTPUTS_DIR = None
# The base path in which test outputs will be saved. Set to None if you want to store them in the project's directory.
config.embedding_dict_to_use = None
# If you want to use the dictionary of another embedding, set this parameter to that embedding's name. Use None otherwise.
# There must be a directory containing the embedding in data folder.

config = load_yaml_config(
    config,
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "coherence_test_config.yaml"
    ),
)
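
Note that Examples #13 and #15 call load_yaml_config with two arguments (a config object plus a YAML path), unlike the single-argument form used elsewhere. A hypothetical sketch of that variant, assuming it overrides attributes of the config object with values from the YAML file when it exists:

# Hypothetical sketch of the two-argument load_yaml_config variant.
import os
import yaml

def load_yaml_config(config, yaml_path):
    if os.path.exists(yaml_path):
        with open(yaml_path, "r") as f:
            overrides = yaml.safe_load(f) or {}
        for key, value in overrides.items():
            setattr(config, key, value)
    return config
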
Example #14
from lettuce import step, world
from nose.tools import assert_equals, assert_true, assert_false
import utils
import os
import bunch.special

path = os.path.abspath(__file__)
dir_path = os.path.dirname(path)
utils.init(dir_path)
config_file = os.path.join(dir_path, "config.yaml")
config = utils.load_yaml_config(config_file)
bunch_working_dir = dir_path


def dump(obj):
    for attr in dir(obj):
        print "obj.%s = %s" % (attr, getattr(obj, attr))


mysql_admin = config['db']['admin']
mysql_admin_pwd = config['db']['admin_pwd']


class step_assert(object):
    def __init__(self, step):
        self.step = step

    def assert_true(self, expr):
        msg = 'Step "%s" failed ' % self.step.sentence
        assert_true(expr, msg)
Example #15
config = Config()
config.ocean_traits = [0, 1, 2, 3, 4]
# OCEAN personality traits to which the embedding is tuned: O:0, C:1, E:2, A:3, N:4
config.epochs_number = 10
# NLP model's training epochs
config.num_reviews = 1500000
# number of reviews to use for training (training set + test set)
config.voc_dim = 6 * 10**4
# number of terms in the tuned embedding
config.train_zeros = False
# use True if you want to train the weights that represent padding tokens, use False otherwise.
config.output_type = "mean"
# target of the model: 'mean' or 'sum' of known terms' scores in the review.
config.shuffle = True
# if True, reviews from the Yelp dataset will be shuffled before extracting num_reviews reviews.
# if False, the first num_reviews reviews of the Yelp dataset will be extracted.
config.features_config = [100, int(100 / 2), int(100 / 4)]
# configuration of NLP model's architecture: features, filters and hidden units.
config.embedding_name = "new_tuned_embedding"
# name of the dir to be created that stores the tuned embedding.
config.load_reviews_from_scratch = False
# use False if you have already loaded and stored reviews, use True if you want to reload and re-store them.
config.tune_embedding = True
# use True to train the model, use False otherwise (e.g. if you just want to load reviews).

config = load_yaml_config(
    config,
    os.path.join(os.path.dirname(os.path.abspath(__file__)),
                 "tune_embedding_config.yaml"),
)
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-l',
                        '--layout',
                        dest="layout",
                        required=True,
                        help="layout.yaml")
    parser.add_argument('-c',
                        '--config',
                        dest="config",
                        required=True,
                        help="cluster configuration")
    parser.add_argument('-o',
                        '--output',
                        dest="output",
                        required=True,
                        help="output directory")
    args = parser.parse_args()

    output_path = os.path.expanduser(args.output)

    layout = load_yaml_config(args.layout)
    cluster_config = load_yaml_config(args.config)

    masters, workers = get_masters_workers_from_layout(layout)
    head_node = masters[0]

    if cluster_config.get('openpai_kube_network_plugin') != 'weave':
        count_input = 0
        while True:
            user_input = input(
                "Is your cluster in the Azure cloud? (Y/N) (case sensitive)"
            )
            if user_input == "N":
                break
            if user_input == "Y":
                break
            print(" Please type Y or N. It's case sensitive.")
            count_input = count_input + 1
            if count_input == 3:
                logger.error(
                    "You have given an invalid answer 3 times. Stopping the operation."
                )
                sys.exit(1)
        if user_input == "Y" \
            and ('openpai_kube_network_plugin' not in cluster_config or cluster_config['openpai_kube_network_plugin'] == 'calico'):
            logger.error(
                "Azure does not support calico, please change the openpai_kube_network_plugin to weave"
            )
            logger.error(
                "https://docs.projectcalico.org/reference/public-cloud/azure#why-doesnt-azure-support-calico-networking"
            )
            sys.exit(1)

    environment = {
        'masters': masters,
        'workers': workers,
        'cfg': cluster_config,
        'head_node': head_node
    }

    map_table = {"env": environment}
    generate_template_file("quick-start/hosts.yml.template",
                           "{0}/hosts.yml".format(output_path), map_table)
    generate_template_file("quick-start/openpai.yml.template",
                           "{0}/openpai.yml".format(output_path), map_table)
Example #17
from lettuce import step, world
from nose.tools import assert_equals, assert_true, assert_false
import utils
import os
import bunch.special

path = os.path.abspath(__file__)
dir_path = os.path.dirname(path)
utils.init(dir_path)
config_file = os.path.join(dir_path, "config.yaml")
config = utils.load_yaml_config(config_file)
bunch_working_dir = dir_path

def dump(obj):
  for attr in dir(obj):
    print "obj.%s = %s" % (attr, getattr(obj, attr))

mysql_admin = config['db']['admin']
mysql_admin_pwd = config['db']['admin_pwd']

class step_assert(object):
    def __init__(self, step):
        self.step = step
    
    def assert_true(self, expr):
        msg = 'Step "%s" failed ' % self.step.sentence
        assert_true(expr, msg)
        
    def assert_false(self, expr):
        msg = 'Step "%s" failed ' % self.step.sentence
        assert_false(expr, msg)
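
A short sketch of how step_assert is typically used inside a lettuce step definition; the step sentence and the check itself are placeholders:

# Hypothetical usage of step_assert in a lettuce step definition.
@step(u'the MySQL admin user is "(.*)"')
def check_mysql_admin(step, user):
    step_assert(step).assert_true(user == mysql_admin)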