Code example #1
    def __init__(self,
                 input_file_path: str,
                 gold_file_path: str,
                 max_char_len: int = 256,
                 is_crf: bool = False,
                 TASK: str = 'BIS'):
        configure_workspace()
        # Run on the GPU when available, otherwise fall back to the CPU.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.max_len = max_char_len
        # Parse the input/gold files, then build the character vocabulary
        # and the label encoding for the requested tagging scheme.
        self.parse_dataset(input_file_path, gold_file_path)
        self.get_unigrams()
        self.create_vocabulary(is_crf)
        self.encode_labels(TASK)
        self.encoded_data = None
        self.vocab_size = len(self.char2idx)
        self.out_vocab_size = len(self.label2idx)

        print(f'Input Vocabulary Size: {self.vocab_size}')
        print(f'Output Vocabulary Size: {self.out_vocab_size}')
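The constructor above chains file parsing, vocabulary construction, and label encoding, then caches the input and output vocabulary sizes. A minimal usage sketch, assuming the enclosing class is called DatasetParser and using made-up file paths (both the class name and the paths are hypothetical, not taken from the snippet):

# Hypothetical instantiation; the class name and paths are assumptions.
parser = DatasetParser(
    input_file_path='data/train.input.txt',
    gold_file_path='data/train.gold.txt',
    max_char_len=256,
    is_crf=False,
    TASK='BIS',
)
print(parser.vocab_size, parser.out_vocab_size)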
Code example #2
    print("==========Validation Dataset==========")
    dev_file_path = join(DATA_PATH, 'dev.tsv')
    validation_set = TSVDatasetParser(dev_file_path, max_len=80, is_crf=crf_model)
    validation_set.encode_dataset(training_set.word2idx, training_set.labels2idx, training_set.char2idx)

    print("==========Testing Dataset==========")
    test_file_path = join(DATA_PATH, 'test.tsv')
    testing_set = TSVDatasetParser(test_file_path, max_len=80, is_crf=crf_model)
    testing_set.encode_dataset(training_set.word2idx, training_set.labels2idx, training_set.char2idx)

    return training_set, validation_set, testing_set


if __name__ == '__main__':
    RESOURCES_PATH = join(getcwd(), 'resources')
    configure_workspace(seed=1873337)
    crf_model = False
    train_dataset, dev_dataset, test_dataset = prepare_data(crf_model)

    batch_size = 64
    pretrained_embeddings = None

    # Load 300-dimensional pretrained fastText vectors (wiki.en.vec)
    # for the words in the training vocabulary.
    embeddings_path = join(RESOURCES_PATH, 'wiki.en.vec')
    pretrained_embeddings = load_pretrained_embeddings(embeddings_path,
                                                       train_dataset.word2idx,
                                                       300, is_crf=crf_model)

    name_ = 'LSTM'
    hp = HyperParameters(name_, train_dataset.word2idx,
                         train_dataset.labels2idx,
                         pretrained_embeddings,
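The main block above obtains its embedding matrix through a project-specific load_pretrained_embeddings helper. As a rough illustration of what such a loader typically does with a fastText .vec file ("word v1 ... v300" per line, preceded by a header), here is a minimal sketch; the function name, out-of-vocabulary handling, and random initialisation are assumptions, not the repository's implementation:

import numpy as np

def load_pretrained_embeddings_sketch(path, word2idx, embedding_dim):
    # Start from small random vectors so out-of-vocabulary words stay initialised.
    embeddings = np.random.normal(0, 0.1, (len(word2idx), embedding_dim)).astype(np.float32)
    with open(path, encoding='utf-8') as vec_file:
        for line in vec_file:
            parts = line.rstrip().split(' ')
            if len(parts) != embedding_dim + 1:
                continue  # skip the header line and malformed rows
            word, vector = parts[0], parts[1:]
            if word in word2idx:
                embeddings[word2idx[word]] = np.asarray(vector, dtype=np.float32)
    return embeddings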
Code example #3
    dataset.update(train_x=bert_inputs,
                   train_y=train_labels,
                   tokenizer=tokenizer)
    return dataset


if __name__ == '__main__':
    # Initialize session
    sess = tf.Session()
    # Reduce logging output.
    tf.logging.set_verbosity(tf.logging.INFO)

    params = parse_args()

    config_params = configure_workspace()
    elmo = config_params["use_elmo"]

    dataset = process_bert_data()
    vocabulary_size = dataset.get("vocabulary_size")
    output_size = dataset.get("output_size")
    pos_vocab_size = dataset.get("pos_vocab_size")
    lex_vocab_size = dataset.get("lex_vocab_size")

    model = multitask_attention_model(output_size, pos_vocab_size,
                                      lex_vocab_size, config_params)
    # max_seq_len = 512
    # model = attention_model(output_size, max_seq_len, config_params)

    # Instantiate variables
    initialize_vars(sess)
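The snippet ends by calling a project-specific initialize_vars(sess) helper. In the common TensorFlow 1.x pattern for Keras models that wrap a BERT/TF-Hub layer, that helper initialises local, global, and table variables and then registers the session with Keras; a sketch under that assumption (not necessarily the repository's exact code):

import tensorflow as tf
from tensorflow.keras import backend as K

def initialize_vars(sess):
    # Initialise every variable created by the BERT layer, then bind the session to Keras.
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)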
Code example #4
    # Fetching the ground truth of the data
    ground_truth = []
    ground_truth_path = input_path.replace("data.xml", "gold.key.txt")
    with open(ground_truth_path, encoding='utf-8', mode='r') as ground_truth_file:
        lines = ground_truth_file.read().splitlines()
        for line in lines:
            sense_key = line.split()[1]
            ground_truth.append(sense_key)

    # Compute F1_Score
    _, _, f1score, _ = precision_recall_fscore_support(ground_truth, _predictions, average='micro')
    print(f'{model._name} F1_score: {f1score}')


def predictions_scorer():
    cmd = f"javac {os.path.join(os.getcwd(), 'resources', 'scorer.java')}"
    process = Popen(cmd.split(), stdout=PIPE)
    outs, errs = process.communicate()
    print(outs, errs)
    # TODO: COMPLETE IMPLEMENTATION OF FUNCTION SCORER


if __name__ == '__main__':
    configure_workspace()
    input_path = os.path.join(os.getcwd(), 'data', 'evaluation',
                              'multilingual_eval', 'semeval2013.de.data.xml')
    output_path = os.path.join(os.getcwd(), 'resources', 'multilingual_output.txt')
    resources_path = os.path.join(os.getcwd(), 'resources')
    lang = 'en'
    predict_multilingual(input_path, output_path, resources_path, lang)
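predictions_scorer above only compiles the Java scorer and leaves the TODO open. A hedged sketch of how the compile-then-run pattern could be completed with subprocess; the Scorer class name, its command-line arguments, and the parameter names below are assumptions rather than the scorer's actual interface:

import os
from subprocess import Popen, PIPE

def predictions_scorer_sketch(gold_key_path, predictions_path):
    scorer_dir = os.path.join(os.getcwd(), 'resources')
    # Compile scorer.java, as in the function above.
    compile_cmd = f"javac {os.path.join(scorer_dir, 'scorer.java')}"
    outs, errs = Popen(compile_cmd.split(), stdout=PIPE, stderr=PIPE).communicate()
    # Run the compiled class on the gold keys and the predictions file
    # ('Scorer' and the argument order are hypothetical).
    run_cmd = f"java -cp {scorer_dir} Scorer {gold_key_path} {predictions_path}"
    outs, errs = Popen(run_cmd.split(), stdout=PIPE, stderr=PIPE).communicate()
    print(outs.decode(), errs.decode())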