def uploadMatrix(self, matrix_path):
    matrix_list = load_matrix(matrix_path)
    conn_matrix = chaosmonkey_pb2.ConnMatrix()
    for row in matrix_list:
        matrix_row = conn_matrix.rows.add()
        for col in row:
            matrix_row.vals.append(col)
    self._upload_to_server(self.configs, conn_matrix)
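All of these examples depend on a project-specific load_matrix helper that is not shown on this page, and its signature differs from project to project. For the simplest case, a plain text file of numbers such as this first example appears to expect, a hypothetical reader might look like the following sketch (illustrative only, not taken from any of the projects):

def load_matrix(matrix_path):
    # Hypothetical helper (not from the project above): parse a text file
    # with one whitespace-separated row of numbers per line into a list of
    # lists of floats.
    matrix = []
    with open(matrix_path) as f:
        for line in f:
            line = line.strip()
            if line:
                matrix.append([float(value) for value in line.split()])
    return matrix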
Example #2
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        super(NMT, self).__init__()

        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab
        src_vocab_size = len(vocab.src)
        self.tgt_vocab_size = len(vocab.tgt)
        self.DECODER_PAD_IDX = self.vocab.tgt.word2id['<pad>']

        # initialize neural network layers...
        # could add drop-out and bidirectional arguments
        # could also change the units to GRU
        src_weights_matrix = load_matrix("data/cc.400k.de.300.vec", self.vocab.src.word2id.keys(), self.embed_size)
        self.encoder_embed = self.create_emb_layer(src_vocab_size, src_weights_matrix)
        self.NUM_LAYER = 2
        self.NUM_DIR = 2
        self.BIDIR = self.NUM_DIR == 2

        self.encoder_lstm = nn.LSTM(embed_size, hidden_size, num_layers=self.NUM_LAYER, bidirectional=self.BIDIR)
        tgt_weights_matrix = load_matrix("data/cc.400k.en.300.vec", self.vocab.tgt.word2id.keys(), self.embed_size)
        self.decoder_embed = self.create_emb_layer(self.tgt_vocab_size, tgt_weights_matrix)
        decoder_hidden_size = self.NUM_DIR * hidden_size
        self.decoder_lstm = nn.LSTM(decoder_hidden_size + embed_size, decoder_hidden_size, num_layers=self.NUM_LAYER)
        # W_a for attention
        self.decoder_W_a = nn.Linear(self.NUM_DIR * hidden_size, decoder_hidden_size, bias=False)
        # W_c for attention
        self.decoder_W_c = nn.Linear(self.NUM_DIR * hidden_size + decoder_hidden_size, decoder_hidden_size, bias=False)
        self.decoder_log_softmax = nn.LogSoftmax(dim=2)
        self.decoder_softmax = nn.Softmax(dim=2)
        self.dropout = nn.Dropout(p=self.dropout_rate)
        self.tanh = nn.Tanh()

        weights = torch.ones(self.tgt_vocab_size)
        weights[0] = 0
        self.criterion = nn.NLLLoss(weight=weights)
        # W_s for attention
        self.decoder_W_s = nn.Linear(decoder_hidden_size, self.tgt_vocab_size, bias=False)

        # initialize the parameters using uniform distribution
        for param in self.parameters():
            nn.init.uniform_(param.data, a=-0.1, b=0.1)
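The create_emb_layer method is called above but not included in this snippet. A common implementation, and only an assumption here since the original project's version may freeze the weights or handle missing vectors differently, wraps the pretrained matrix in an nn.Embedding:

    def create_emb_layer(self, vocab_size, weights_matrix):
        # Sketch only: build an embedding layer and copy in the pretrained
        # vectors returned by load_matrix (assumed shape: vocab_size x embed_size).
        emb_layer = nn.Embedding(vocab_size, self.embed_size)
        emb_layer.weight.data.copy_(torch.as_tensor(weights_matrix, dtype=torch.float))
        return emb_layer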
Example #3
def load_model(i, dim, model_file):
    others = None
    for j in range(10):
        current = utils.load_matrix(model_file)
        if j == i:
            target = current
        elif others is None:
            others = current
        else:
            temp = np.vstack((others, current))
            others = temp
    return target, others
Example #4
def _needleman_wunsch(seq1, seq2, matrix_filename="similarity_matrix/BLOSUM_62"):
    rows, columns = len(seq1), len(seq2)
    score_matrix = np.zeros((rows + 1, columns + 1))
    similarity_matrix = load_matrix(matrix_filename)

    for i in range(rows + 1):
        score_matrix[i][0] = cfg.gap_penalty * i
    for j in range(columns + 1):
        score_matrix[0][j] = cfg.gap_penalty * j
    for i in range(1, rows + 1):
        for j in range(1, columns + 1):
            match = score_matrix[i - 1][j - 1] + match_score(seq1[i - 1], seq2[j - 1], similarity_matrix)
            delete = score_matrix[i - 1][j] + cfg.gap_penalty
            insert = score_matrix[i][j - 1] + cfg.gap_penalty
            score_matrix[i][j] = max(match, delete, insert)

    align1, align2 = '', ''
    i, j = rows, columns
    # Trace back from the bottom-right cell; the alignments are built from
    # right to left (last residues first) and are not reversed afterwards.
    while i > 0 and j > 0:
        score_current = score_matrix[i][j]
        score_diagonal = score_matrix[i - 1][j - 1]
        score_up = score_matrix[i - 1][j]
        score_left = score_matrix[i][j - 1]

        if score_current == score_diagonal + match_score(seq1[i - 1], seq2[j - 1], similarity_matrix):
            align1 += seq1[i - 1]
            align2 += seq2[j - 1]
            i -= 1
            j -= 1
        elif score_current == score_up + cfg.gap_penalty:
            align1 += seq1[i - 1]
            align2 += '-'
            i -= 1
        elif score_current == score_left + cfg.gap_penalty:
            align1 += '-'
            align2 += seq2[j - 1]
            j -= 1

    while i > 0:
        align1 += seq1[i - 1]
        align2 += '-'
        i -= 1
    while j > 0:
        align1 += '-'
        align2 += seq2[j - 1]
        j -= 1

    return align1, align2, count_similarity(align1, align2, min(len(seq1), len(seq2)))
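Assuming cfg.gap_penalty and the BLOSUM matrix file referenced above are in place, a call would look like this (the sequences here are arbitrary example strings, not from the original project):

# Illustrative call only; returns the two aligned strings plus a similarity count.
align1, align2, similarity = _needleman_wunsch("HEAGAWGHEE", "PAWHEAE")
print(align1)
print(align2)
print(similarity)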
Example #5
def combine_matrices(key_idx_files,
                     mat_files,
                     cui_to_term_f="./similarities/cui_to_term.pkl"):
    # Initialize set of unique keys
    unique_keys = set()
    key_idxs = []
    mats = []
    # Load all key to index mappings and matrices
    for i in range(len(key_idx_files)):
        keys_i, mat_i = load_matrix(key_idx_files[i],
                                    mat_files[i],
                                    map_cui_to_term=True,
                                    cui_to_term_f=cui_to_term_f)
        key_to_idx_i = {k: v for v, k in enumerate(keys_i)}
        unique_keys.update(set(keys_i))
        mats.append(mat_i)
        key_idxs.append(key_to_idx_i)

    # Create master list of unique keys and master mapping
    unique_keys = list(unique_keys)
    key_to_idx = {k: v for v, k in enumerate(unique_keys)}

    # Initialize a matrix of -1's
    mat = [[-1 for _ in range(len(unique_keys))]
           for _ in range(len(unique_keys))]
    # Iterate over all word pairs
    for i in range(len(unique_keys)):
        word_1 = unique_keys[i]
        for j in range(len(unique_keys)):
            word_2 = unique_keys[j]
            # Get similarity values from all matrices
            vals = [
                get_from_matrix(mats[k], key_idxs[k], word_1, word_2)
                for k in range(len(mats))
            ]
            # Get values that are not -1
            non_neg_vals = [val for val in vals if val >= 0]
            # If there are no non-negative values, insert -1 in the matrix
            if not non_neg_vals:
                mat[i][j] = -1
            # Otherwise, insert the median of the values into the matrix
            else:
                mat[i][j] = statistics.median(non_neg_vals)
    # Save to pickle files
    with open('test_matrix.pkl', 'wb') as mat_file:
        pickle.dump(mat, mat_file)
    with open('test_key_idx.pkl', 'wb') as pkl_file:
        pickle.dump(key_to_idx, pkl_file)
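get_from_matrix is assumed by this example but not defined here; a minimal version consistent with how it is called (a -1 sentinel when either key is missing from a matrix) could look like this:

def get_from_matrix(mat, key_to_idx, word_1, word_2):
    # Sketch only: return the stored similarity for (word_1, word_2),
    # or -1 when either word is absent from this particular matrix.
    if word_1 not in key_to_idx or word_2 not in key_to_idx:
        return -1
    return mat[key_to_idx[word_1]][key_to_idx[word_2]]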
Example #6
def _smith_waterman(seq1, seq2, matrix_filename="similarity_matrix/BLOSUM_62"):
    rows, columns = len(seq1), len(seq2)
    score_matrix = np.zeros((rows + 1, columns + 1))
    pointer = np.zeros((rows + 1, columns + 1))
    similarity_matrix = load_matrix(matrix_filename)

    max_score = 0
    for i in range(1, rows + 1):
        for j in range(1, columns + 1):
            score_diagonal = score_matrix[i - 1][j - 1] + match_score(seq1[i - 1], seq2[j - 1], similarity_matrix)
            score_up = score_matrix[i - 1][j] + cfg.gap_penalty
            score_left = score_matrix[i][j - 1] + cfg.gap_penalty
            score_matrix[i][j] = max(0, score_left, score_up, score_diagonal)
            if score_matrix[i][j] == 0:
                pointer[i][j] = 0  # 0 means end of the path
            if score_matrix[i][j] == score_up:
                pointer[i][j] = 1  # 1 means trace up
            if score_matrix[i][j] == score_left:
                pointer[i][j] = 2  # 2 means trace left
            if score_matrix[i][j] == score_diagonal:
                pointer[i][j] = 3  # 3 means trace diagonal
            if score_matrix[i][j] >= max_score:
                max_i = i
                max_j = j
                max_score = score_matrix[i][j]

    align1, align2 = '', ''

    i, j = max_i, max_j

    while pointer[i][j] != 0:
        if pointer[i][j] == 3:
            align1 += seq1[i - 1]
            align2 += seq2[j - 1]
            i -= 1
            j -= 1
        elif pointer[i][j] == 2:
            align1 += '-'
            align2 += seq2[j - 1]
            j -= 1
        elif pointer[i][j] == 1:
            align1 += seq1[i - 1]
            align2 += '-'
            i -= 1

    return align1, align2, count_similarity(align1, align2, min(len(seq1), len(seq2)))
Example #7
def prepare(freq, ss):
    # load data
    (X_train_weather, X_train_energy, y_train, X_valid_weather, X_valid_energy,
     y_valid, X_test_weather, X_test_energy,
     y_test) = load_matrix(APT_CSV % apt_name, freq, season[ss])

    print('Training data:')
    print('\tX_weather:', X_train_weather.shape)
    print('\tX_energy:', X_train_energy.shape)
    print('\ty:', y_train.shape)

    print('Validation data:')
    print('\tX_weather:', X_valid_weather.shape)
    print('\tX_energy:', X_valid_energy.shape)
    print('\ty:', y_valid.shape)

    energy_dim, weather_dim = X_train_energy.shape[1], X_train_weather.shape[1]

    return (X_train_energy, X_train_weather, y_train, X_valid_energy,
            X_valid_weather, y_valid, X_test_energy, X_test_weather, y_test,
            energy_dim, weather_dim)
Example #8
def run_experiment(data: MovieLensDataset,
                   sparse=True,
                   grad_sensibility=1e-8,
                   param_sensibility=1e-16,
                   num_experiments=1,
                   warmup=0,
                   workers=8):
    date = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    # try to load matrices first
    try:
        print("Loading train and test split from /tmp/..")
        trainX = load_matrix(f'trainX_{"sparse" if sparse else "full"}',
                             sparse)
        testX = load_matrix(f'testX_{"sparse" if sparse else "full"}', sparse)
    except Exception:
        print("Loading failed, generating train-test split now..")
        # 5% test size
        test_set_size = data.n_ratings // 20
        # trainX, testX = data.train_test_split(test_set_size, workers)
        trainX, testX = data.train_test_split_simple(test_set_size)
        print("Saving train and test set to /tmp/ first..")
        save_matrix(f'trainX_{"sparse" if sparse else "full"}', trainX)
        save_matrix(f'testX_{"sparse" if sparse else "full"}', testX)

    # print(trainX.shape, testX.shape)
    # optional warmup
    for _ in range(warmup):
        u = init_vector(data.n_users, normalize=True)
        v = init_vector(data.n_movies, normalize=True)
        args = [u, v, trainX]
        als = ALSSparse(*args) if sparse else ALS(*args)
        u, v = als.fit(eps_g=grad_sensibility)

    stats = {}
    start = time.time()
    for i in range(num_experiments):
        u = init_vector(data.n_users, normalize=True)
        v = init_vector(data.n_movies, normalize=True)
        args = [u, v, trainX]
        als = ALSSparse(*args) if sparse else ALS(*args)
        # run Alternating Least Squares algorithm
        u, v = als.fit(eps_g=grad_sensibility, eps_params=param_sensibility)
        # average results
        stats = average_stats(stats, als.stats, i + 1)
    end = time.time()
    # additional context info that does not depend on experiment results
    stats['number_of_ratings'] = trainX.getnnz(
    ) if sparse else np.count_nonzero(trainX)
    stats['dataset_path'] = data.path
    stats['grad_sensibility'] = grad_sensibility
    stats['param_sensibility'] = param_sensibility
    stats['theta_diff_sensibility'] = 1e-10
    stats['num_experiments'] = num_experiments
    stats['warmup_cycles'] = warmup
    stats['experiments_total_runtime'] = end - start
    stats['date'] = date
    stats['train_mse'] = als.function_eval() / stats['number_of_ratings']
    print("Train Mean Squared error is:", stats['train_mse'])

    # free memory before testing
    del trainX
    del data

    # test on test set
    test_mse = evaluate(als.u, als.v, testX, "sparse" if sparse else "full")

    stats['test_mse'] = test_mse
    # save results
    print("Saving results..")
    with open(f'data/als_{"sparse" if sparse else "full"}_{date}.json',
              'w') as f:
        json.dump(stats, f, indent=4)

    return als
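A hypothetical driver for the function above, with an assumed ratings file path (MovieLensDataset with mode='sparse' is used the same way in Example #13 below):

# Hypothetical usage; the dataset path is an assumption, not from the project.
data = MovieLensDataset('data/ml-latest-small/ratings.csv', mode='sparse')
als = run_experiment(data, sparse=True, num_experiments=3, warmup=1)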
Example #9
if ask_for.tour != [None]:
    for tour in ask_for.tour:
        with open(tour) as fd:
            trajs.append(utils.load_points(fd))

if ask_for.notsp:
    if ask_for.tour == [None] or not ask_for.pheromones:
        LOGN(
            "If you do not want to solve the TSP, you must provide a solution tour (--tour) and a pheromones matrix (--pheromones)"
        )
        sys.exit(error_codes["NO-TSP"])

    if ask_for.pheromones:
        with open(ask_for.pheromones) as fd:
            phero = utils.load_matrix(fd)

else:
    LOGN("Solve the TSP with an Ant Colony Algorithm")

    LOGN("\tConvert the segment list into an adjacency list graph")
    G = graph.graph_of(penrose_segments)

    LOGN("\tCompute a tour")
    # max_it = 10
    max_it = 2
    # num_ants = 10 #* depth
    num_ants = 2  #* depth
    decay = 0.1
    w_heur = 2.5
    w_local_phero = 0.1
Example #10
def __init__(self):
    global conn_mat
    conn_mat = load_matrix('matrix')
Example #11
def train(rank, args):
    if rank is None:
        is_distributed = False
        rank = 0
    else:
        is_distributed = True

    if is_distributed:
        utils.setuplogger()
        dist.init_process_group('nccl',
                                world_size=args.nGPU,
                                init_method='env://',
                                rank=rank)

    torch.cuda.set_device(rank)

    news, news_index, category_dict, subcategory_dict, word_dict = read_news(
        os.path.join(args.train_data_dir, 'news.tsv'), args, mode='train')

    news_title, news_category, news_subcategory = get_doc_input(
        news, news_index, category_dict, subcategory_dict, word_dict, args)
    news_combined = np.concatenate([
        x
        for x in [news_title, news_category, news_subcategory] if x is not None
    ],
                                   axis=-1)

    if rank == 0:
        logging.info('Initializing word embedding matrix...')

    embedding_matrix, have_word = utils.load_matrix(args.glove_embedding_path,
                                                    word_dict,
                                                    args.word_embedding_dim)
    if rank == 0:
        logging.info(f'Word dict length: {len(word_dict)}')
        logging.info(f'Have words: {len(have_word)}')
        logging.info(
            f'Missing rate: {(len(word_dict) - len(have_word)) / len(word_dict)}'
        )

    module = importlib.import_module(f'model.{args.model}')
    model = module.Model(args, embedding_matrix, len(category_dict),
                         len(subcategory_dict))

    if args.load_ckpt_name is not None:
        ckpt_path = utils.get_checkpoint(args.model_dir, args.load_ckpt_name)
        checkpoint = torch.load(ckpt_path, map_location='cpu')
        model.load_state_dict(checkpoint['model_state_dict'])
        logging.info(f"Model loaded from {ckpt_path}.")

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    if args.enable_gpu:
        model = model.cuda(rank)

    if is_distributed:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[rank])

    # if rank == 0:
    #     print(model)
    #     for name, param in model.named_parameters():
    #         print(name, param.requires_grad)

    data_file_path = os.path.join(args.train_data_dir,
                                  f'behaviors_np{args.npratio}_{rank}.tsv')

    dataset = DatasetTrain(data_file_path, news_index, news_combined, args)
    dataloader = DataLoader(dataset, batch_size=args.batch_size)

    logging.info('Training...')
    for ep in range(args.start_epoch, args.epochs):
        loss = 0.0
        accuracy = 0.0
        for cnt, (log_ids, log_mask, input_ids,
                  targets) in enumerate(dataloader):
            if args.enable_gpu:
                log_ids = log_ids.cuda(rank, non_blocking=True)
                log_mask = log_mask.cuda(rank, non_blocking=True)
                input_ids = input_ids.cuda(rank, non_blocking=True)
                targets = targets.cuda(rank, non_blocking=True)

            bz_loss, y_hat = model(log_ids, log_mask, input_ids, targets)
            loss += bz_loss.data.float()
            accuracy += utils.acc(targets, y_hat)
            optimizer.zero_grad()
            bz_loss.backward()
            optimizer.step()

            if cnt != 0 and cnt % args.log_steps == 0:
                logging.info(
                    '[{}] Ed: {}, train_loss: {:.5f}, acc: {:.5f}'.format(
                        rank, cnt * args.batch_size, loss.data / cnt,
                        accuracy / cnt))

            if rank == 0 and cnt != 0 and cnt % args.save_steps == 0:
                ckpt_path = os.path.join(args.model_dir,
                                         f'epoch-{ep+1}-{cnt}.pt')
                torch.save(
                    {
                        'model_state_dict': {
                            '.'.join(k.split('.')[1:]): v
                            for k, v in model.state_dict().items()
                        } if is_distributed else model.state_dict(),
                        'category_dict': category_dict,
                        'word_dict': word_dict,
                        'subcategory_dict': subcategory_dict
                    }, ckpt_path)
                logging.info(f"Model saved to {ckpt_path}.")

        logging.info('Training finished.')

        if rank == 0:
            ckpt_path = os.path.join(args.model_dir, f'epoch-{ep+1}.pt')
            torch.save(
                {
                    'model_state_dict': {
                        '.'.join(k.split('.')[1:]): v
                        for k, v in model.state_dict().items()
                    } if is_distributed else model.state_dict(),
                    'category_dict': category_dict,
                    'subcategory_dict': subcategory_dict,
                    'word_dict': word_dict,
                }, ckpt_path)
            logging.info(f"Model saved to {ckpt_path}.")
Example #12
trajs = []

if ask_for.tour != [None]:
    for tour in ask_for.tour:
        with open(tour) as fd:
            trajs.append( utils.load_points(fd) )

if ask_for.notsp:
    if ask_for.tour == [None] or not ask_for.pheromones:
        LOGN( "If you do not want to solve the TSP, you must provide a solution tour (--tour) and a pheromones matrix (--pheromones)" )
        sys.exit(error_codes["NO-TSP"])

    if ask_for.pheromones:
        with open(ask_for.pheromones) as fd:
            phero = utils.load_matrix(fd)

else:
    LOGN( "Solve the TSP with an Ant Colony Algorithm" )

    LOGN( "\tConvert the segment list into an adjacency list graph" )
    G = graph.graph_of( penrose_segments )

    LOGN( "\tCompute a tour" )
    # max_it = 10
    max_it = 2
    # num_ants = 10 #* depth
    num_ants = 2 #* depth
    decay = 0.1
    w_heur = 2.5
    w_local_phero = 0.1
Example #13
    args.add_argument(
        '--non-negative',
        help=
        'Setting this will solve least-squares with nonnegativity constraints',
        action='store_true',
        default=False)
    args.add_argument(
        '-w',
        '--n-workers',
        help='Number of workers used to split dataset into test-train',
        type=int,
        default=8)
    args = args.parse_args()

    try:
        print("Loading train and test split from /tmp/..")
        trainX = load_matrix('trainX_sparse', True)
        testX = load_matrix('testX_sparse', True)
    except Exception:
        print("Loading failed, generating train-test split now..")
        dataset = MovieLensDataset(args.dataset_path, mode='sparse')
        # 5% test size
        test_set_size = dataset.n_ratings // 20
        trainX, testX = dataset.train_test_split_simple(test_set_size)
        print("Saving train and test set to /tmp/ first..")
        save_matrix('trainX_sparse', trainX)
        save_matrix('testX_sparse', testX)

    # train matrix to csv
    print("Storing train/test matrix to csv..")
    sparse_matrix_to_csv('/tmp/trainX.csv', trainX)

def __init__(self):
    global conn_mat
    conn_mat = load_matrix('matrix')
    self.logger = self.set_log()