Example no. 1
    def __init__(self, params, vocab_en, vocab_trans):
        super(Lstm, self).__init__()
        self.emb_dim = params.emb_dim
        self.hidden_dim = params.hidden_dim
        self.dropout = params.dropout
        self.bidirection = params.bidirection

        self.emb_file_en = params.emb_file_en
        self.emb_file_trans = params.emb_file_trans
        self.n_words_en = vocab_en.n_words
        self.n_words_trans = vocab_trans.n_words

        # English embedding layer
        self.embedding_en = nn.Embedding(self.n_words_en,
                                         self.emb_dim,
                                         padding_idx=PAD_INDEX)
        # load English embedding
        embedding_en = load_embedding(self.emb_file_en)
        self.embedding_en.weight.data.copy_(torch.FloatTensor(embedding_en))

        # Transfer language embeddings
        self.embedding_trans = nn.Embedding(self.n_words_trans,
                                            self.emb_dim,
                                            padding_idx=PAD_INDEX)
        # load transfer language embedding
        embedding_trans = load_embedding(self.emb_file_trans)
        self.embedding_trans.weight.data.copy_(
            torch.FloatTensor(embedding_trans))

        # LSTM layers
        self.lstm = nn.LSTM(self.emb_dim,
                            self.hidden_dim,
                            dropout=self.dropout,
                            bidirectional=self.bidirection,
                            batch_first=True)
Example no. 2
    def __init__(self, params, vocab):
        super(Lstm, self).__init__()
        self.n_layer = params.n_layer
        self.emb_dim = params.emb_dim
        self.n_words = vocab.n_words
        self.hidden_dim = params.hidden_dim
        self.dropout = params.dropout
        self.bidirection = params.bidirection
        self.freeze_emb = params.freeze_emb
        self.emb_file = params.emb_file

        # embedding layer
        self.embedding = nn.Embedding(self.n_words,
                                      self.emb_dim,
                                      padding_idx=PAD_INDEX)
        # load embedding
        if self.emb_file.endswith("npy"):
            embedding = load_embedding_from_npy(self.emb_file)
        else:
            embedding = load_embedding(vocab, self.emb_dim, self.emb_file)
        self.embedding.weight.data.copy_(torch.FloatTensor(embedding))

        # LSTM layers
        self.lstm = nn.LSTM(self.emb_dim,
                            self.hidden_dim,
                            num_layers=self.n_layer,
                            dropout=self.dropout,
                            bidirectional=self.bidirection,
                            batch_first=True)
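The examples in this file call several different `load_embedding` helpers. For the `(vocab, emb_dim, emb_file, ...)` signature used in Examples no. 2 and 4 through 11, a minimal sketch consistent with the call sites might look like the code below; it is an assumption inferred from usage, not the original implementation, and the vocab attributes `word2index` and `n_words` are likewise assumed. The fourth argument clearly varies per repository (an OOV-vector file in Examples no. 5, 7 and 10, a `usechar` flag in Example no. 6), so the sketch covers only the OOV-file variant. Examples no. 1, 3 and 12 call unrelated single-argument loaders from their own modules.

import numpy as np

def load_embedding_from_npy(emb_file):
    # Assumed behavior: the .npy file already holds the full (n_words, emb_dim) matrix,
    # e.g. one produced by the gen_embs_for_vocab helpers in Examples no. 5, 7 and 10.
    return np.load(emb_file)

def load_embedding(vocab, emb_dim, emb_file, oov_emb_file=None):
    # Hypothetical sketch: build an (n_words, emb_dim) matrix from a GloVe/fastText-style
    # text file ("word v1 ... vd" per line). Words absent from the file keep zero vectors;
    # the optional OOV file mirrors the four-argument calls in Examples no. 5, 7 and 10.
    embedding = np.zeros((vocab.n_words, emb_dim), dtype=np.float32)
    for path in [p for p in (emb_file, oov_emb_file) if p]:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                parts = line.rstrip().split(" ")
                word, values = parts[0], parts[1:]
                if word in vocab.word2index and len(values) == emb_dim:
                    embedding[vocab.word2index[word]] = np.asarray(values, dtype=np.float32)
    return embedding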
Example no. 3
    def __initialize(self):
        sd_path = self.configs.symbol_dict.path + '.yml'
        emb_path = self.configs.embedding_table.path + '.parquet'
        logger.info('Loading symbol_dict from {}'.format(sd_path))
        self.sd = read_yaml(sd_path)
        logger.info('Loading emb_table from {}'.format(emb_path))
        self.emb = load_embedding(emb_path)
Example no. 4
    def __init__(self, params, vocab_en):
        super(Lstm4pretr, self).__init__()
        self.n_layer = params.n_layer
        self.n_words_en = vocab_en.n_words
        self.emb_dim = params.emb_dim
        self.hidden_dim = params.hidden_dim
        self.dropout = params.dropout
        self.bidirection = params.bidirection
        self.embnoise = params.embnoise
        self.emb_file_en = params.emb_file_en

        # embedding layer
        self.embedding_en = nn.Embedding(self.n_words_en,
                                         self.emb_dim,
                                         padding_idx=PAD_INDEX)
        # load embedding
        embedding_en = load_embedding(vocab_en, self.emb_dim, self.emb_file_en)
        self.embedding_en.weight.data.copy_(torch.FloatTensor(embedding_en))

        # LSTM layers
        self.lstm = nn.LSTM(self.emb_dim,
                            self.hidden_dim,
                            num_layers=self.n_layer,
                            dropout=self.dropout,
                            bidirectional=self.bidirection,
                            batch_first=True)
Example no. 5
def gen_embs_for_vocab():
    from src.datareader import datareader
    from src.utils import load_embedding, init_experiment
    from config import get_params
    params = get_params()
    logger = init_experiment(params, logger_filename=params.logger_filename)

    _, vocab = datareader()
    embedding = load_embedding(vocab, 300, "/data/sh/glove.6B.300d.txt",
                               "/data/sh/coachdata/snips/emb/oov_embs.txt")
    np.save("/data/sh/coachdata/snips/emb/slu_embs.npy", embedding)
Example no. 6
    def __init__(self, params, vocab):
        super(BiLSTMTagger, self).__init__()
        self.embedding = nn.Embedding(vocab.n_words, params.emb_dim, padding_idx=0)
        embedding = load_embedding(vocab, params.emb_dim, params.emb_file, params.usechar)
        self.embedding.weight.data.copy_(torch.FloatTensor(embedding))

        self.dropout = params.dropout
        self.lstm = nn.LSTM(params.emb_dim,
                            params.lstm_hidden_dim,
                            num_layers=params.n_layer,
                            dropout=params.dropout,
                            bidirectional=True,
                            batch_first=True)

        self.linear = nn.Linear(params.lstm_hidden_dim * 2, params.num_tag)
        self.crf_layer = CRF(params.num_tag)
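Only __init__ is shown above; a forward pass wiring these layers together might look roughly like the sketch below. It is not the original code: the dropout placement and the choice to return raw emissions for the CRF layer are assumptions.

    def forward(self, x):
        # x: LongTensor of token indices, shape (batch_size, seq_len)
        embedded = self.embedding(x)                       # (batch, seq_len, emb_dim)
        embedded = nn.functional.dropout(embedded, p=self.dropout,
                                         training=self.training)
        lstm_out, _ = self.lstm(embedded)                  # (batch, seq_len, 2 * hidden_dim)
        emissions = self.linear(lstm_out)                  # (batch, seq_len, num_tag)
        return emissions  # per-token tag scores, to be scored/decoded by self.crf_layer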
Example no. 7
def gen_embs_for_vocab():
    from src.slu.datareader import datareader
    from src.utils import load_embedding, init_experiment
    from config import get_params

    params = get_params()
    logger = init_experiment(params, logger_filename=params.logger_filename)

    _, vocab = datareader()
    embedding = load_embedding(vocab, 300, "PATH_OF_THE_WIKI_EN_VEC",
                               "../data/snips/emb/oov_embs.txt")
    np.save("../data/snips/emb/slu_embs.npy", embedding)
Example no. 8
    def __init__(self, params, vocab_en, vocab_trans):
        super(Lstm, self).__init__()
        self.n_layer = params.n_layer
        self.vocab_en = vocab_en
        self.vocab_trans = vocab_trans
        self.emb_dim = params.emb_dim
        self.hidden_dim = params.hidden_dim
        self.dropout = params.dropout
        self.bidirection = params.bidirection
        self.embnoise = params.embnoise
        self.emb_file_en = params.emb_file_en
        self.emb_file_trans = params.emb_file_trans

        if not params.tar_only or params.zs:
            # embedding layer
            self.embedding_en = nn.Embedding(self.vocab_en.n_words,
                                             self.emb_dim,
                                             padding_idx=PAD_INDEX)
            # load embedding
            embedding_en = load_embedding(vocab_en, self.emb_dim,
                                          self.emb_file_en)
            self.embedding_en.weight.data.copy_(
                torch.FloatTensor(embedding_en))

        self.embedding_trans = nn.Embedding(self.vocab_trans.n_words,
                                            self.emb_dim,
                                            padding_idx=PAD_INDEX)
        # load embedding
        embedding_trans = load_embedding(vocab_trans, self.emb_dim,
                                         self.emb_file_trans)
        self.embedding_trans.weight.data.copy_(
            torch.FloatTensor(embedding_trans))

        # LSTM layers
        self.lstm = nn.LSTM(self.emb_dim,
                            self.hidden_dim,
                            num_layers=self.n_layer,
                            dropout=self.dropout,
                            bidirectional=self.bidirection,
                            batch_first=True)
Example no. 9
def transfer(params, trans_lang):
    # initialize experiment
    logger = init_experiment(params, logger_filename=params.logger_filename)
    logger.info("============== Evaluate Zero-Shot on %s ==============" %
                trans_lang)

    # dataloader
    _, _, dataloader_test, vocab = get_dataloader(params, lang=trans_lang)

    # get word embedding
    emb_file = params.emb_file_es if trans_lang == "es" else params.emb_file_th
    embedding = load_embedding(vocab, params.emb_dim, emb_file)

    # evaluate zero-shot
    evaluate_transfer = EvaluateTransfer(params, dataloader_test, embedding,
                                         vocab.n_words)
    intent_acc, slot_f1 = evaluate_transfer.evaluate()
    logger.info("Intent ACC: %.4f. Slot F1: %.4f." % (intent_acc, slot_f1))
Example no. 10
def gen_embs_for_vocab():
    _, _, _, vocab = datareader()
    embedding = load_embedding(vocab, 300, "PATH_OF_THE_WIKI_EN_VEC", "../data/ner/emb/oov_embs.txt")

    np.save("../data/ner/emb/ner_embs.npy", embedding)
Example no. 11
def get_oov_words():
    _, _, _, vocab = datareader()
    _ = load_embedding(vocab, 300, "PATH_OF_THE_WIKI_EN_VEC")
Example no. 12
def main():
    """Load the graph, create the embeddings, evaluate them with link prediction and save the results."""

    args = parse_args()

    graph = utils.load_graph(args.weighted, args.directed, args.input)
    utils.print_graph_info(graph, "original graph")

    graph.remove_nodes_from(list(nx.isolates(graph)))
    utils.print_graph_info(graph, "graph without isolates")

    edge_splitter_test = EdgeSplitter(graph)

    graph_test, X_test_edges, y_test = edge_splitter_test.train_test_split(
        p=args.test_percentage, method="global")

    edge_splitter_train = EdgeSplitter(graph_test, graph)
    graph_train, X_edges, y = edge_splitter_train.train_test_split(
        p=args.train_percentage, method="global")
    X_train_edges, X_model_selection_edges, y_train, y_model_selection = train_test_split(
        X_edges, y, train_size=0.75, test_size=0.25)

    logger.info(f'\nEmbedding algorithm started.')
    start = time.time()

    embedding.create_embedding(args, graph_train)
    time_diff = time.time() - start
    logger.info(f'\nEmbedding algorithm finished in {time_diff:.2f} seconds.')

    embeddings = utils.load_embedding(args.output)

    logger.info(f'\nEmbedding evaluation started.')
    start = time.time()
    results = evaluation.evaluate(args.classifier, embeddings, X_train_edges,
                                  y_train, X_model_selection_edges,
                                  y_model_selection)

    time_diff = time.time() - start
    logger.info(f'Embedding evaluation finished in {time_diff:.2f} seconds.')

    best_result = max(results, key=lambda result: result["roc_auc"])

    logger.info(
        f"\nBest roc_auc_score on train set using '{best_result['binary_operator'].__name__}': {best_result['roc_auc']}."
    )

    logger.info(f'\nEmbedding algorithm started.')
    start = time.time()

    embedding.create_embedding(args, graph_test)
    time_diff = time.time() - start
    logger.info(f'\nEmbedding algorithm finished in {time_diff:.2f} seconds.')

    embedding_test = utils.load_embedding(args.output)

    roc_auc, average_precision, accuracy, f1 = evaluation.evaluate_model(
        best_result["classifier"], embedding_test,
        best_result["binary_operator"], X_test_edges, y_test)

    logger.info(
        f"Scores on test set using '{best_result['binary_operator'].__name__}'."
    )
    logger.info(f"roc_auc_score: {roc_auc}")
    logger.info(f"average_precision_score: {average_precision}")
    logger.info(f"accuracy_score: {accuracy}")
    logger.info(f"f1_score on test set using: {f1}\n")

    if args.results:
        evaluation.save_evaluation_results(
            args.dataset, args.method, args.classifier,
            (roc_auc, average_precision, accuracy, f1), args.results)
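In this graph link-prediction pipeline, utils.load_embedding(args.output) reads back whatever file embedding.create_embedding wrote. Assuming that file is in word2vec text format (an assumption; the actual format depends on the embedding method used), a minimal loader could look like this:

from gensim.models import KeyedVectors

def load_embedding(path):
    # Hypothetical sketch: read a word2vec-format file ("<node_id> <v1> ... <vd>" per line)
    # and return a node-id -> vector mapping for the downstream edge-feature operators.
    keyed_vectors = KeyedVectors.load_word2vec_format(path)
    return {node: keyed_vectors[node] for node in keyed_vectors.index_to_key}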