Example 1
def main(args):
    """Visualization of contexts, questions, and colored answer spans."""

    # Load dataset, and optionally shuffle.
    dataset = QADataset(args, args.path)
    samples = dataset.samples
    if args.shuffle:
        random.shuffle(samples)

    vis_samples = samples[:args.samples]

    print()
    print('-' * RULE_LENGTH)
    print()

    # Download the English model and build the pipeline once, outside the
    # loop, so they are not re-initialized for every sample.
    stanza.download('en')
    en_nlp = stanza.Pipeline('en')

    # Visualize samples.
    for (qid, context, question, answer_start, answer_end) in vis_samples:
        cxt = _build_string(context)
        print(cxt)
        en_doc = en_nlp(cxt)

        for i, sent in enumerate(en_doc.sentences):
            print(f"[Sentence {i+1}]")
            for word in sent.words:
                print("{:12s}\t{:12s}\t{:6s}\t{:d}\t{:12s}".format(
                    word.text, word.lemma, word.pos, word.head, word.deprel))
            print("")

        print("Mention text\tType\tStart-End")
        for ent in en_doc.ents:
            print("{}\t{}\t{}-{}".format(ent.text, ent.type, ent.start_char,
                                         ent.end_char))
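
Examples 1 and 2 call a _build_string helper that is not shown. A minimal
sketch, assuming contexts and questions are stored as token lists (the real
helper may differ):

def _build_string(tokens):
    # Join whitespace-tokenized text back into a printable string.
    return ' '.join(tokens)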
Example 2
def main(args):
    """Visualization of contexts, questions, and colored answer spans."""

    # Load dataset, and optionally shuffle.
    dataset = QADataset(args, args.path)
    samples = dataset.samples
    if args.shuffle:
        random.shuffle(samples)

    # print("NUMBER OF TOTAL POSSIBLE SAMPLES:", len(samples))
    vis_samples = samples[args.start:args.start + args.samples]

    print()
    print('-' * RULE_LENGTH)
    print()

    # Visualize samples.
    for (qid, context, question, answer_start, answer_end) in vis_samples:
        print('[METADATA]')
        print(f'path = \'{args.path}\'')
        print(f'question id = {qid}')
        print()

        print('[CONTEXT]')
        print(_color_context(context, answer_start, answer_end))
        print()

        print('[QUESTION]')
        print(_build_string(question))
        print()

        print('[ANSWER]')
        print(_build_string(context[answer_start:(answer_end + 1)]))
        print()

        print('-' * RULE_LENGTH)
        print()
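
Example 2 also assumes a _color_context helper. A minimal sketch using ANSI
escape codes to highlight the answer span (hypothetical; the real helper is
not shown):

def _color_context(tokens, answer_start, answer_end):
    # Wrap the answer span in red so it stands out in the terminal.
    colored = []
    for i, token in enumerate(tokens):
        if answer_start <= i <= answer_end:
            colored.append('\033[91m' + token + '\033[0m')
        else:
            colored.append(token)
    return ' '.join(colored)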
Example 3
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)
    print()

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    local_rank = args.local_rank
    # world_size = torch.cuda.device_count() # assume all local GPUs

    # Set up distributed process group
    rank = setup_dist(local_rank)

    # Set up datasets.
    train_dataset = QADataset(args, args.train_path)
    dev_dataset = QADataset(args, args.dev_path)

    # Create vocabulary and tokenizer.
    vocabulary = Vocabulary(train_dataset.samples, args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    print(f'vocab words = {len(vocabulary)}')

    # Print number of samples.
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)

    num_pretrained = model.load_pretrained_embeddings(
        vocabulary, args.embedding_path
    )
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(
        f'initialized {num_pretrained}/{len(vocabulary)} '
        f'embeddings ({pct_pretrained}%)'
    )
    print()

    model = model.to(rank)
    model = DDP(model, device_ids=[rank], output_device=rank)

    if args.resume and args.model_path:
        map_location = {"cuda:0": "cuda:{}".format(rank)}
        model.load_state_dict(torch.load(args.model_path, map_location=map_location))

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')
    print(model)
    print()

    if args.do_train:
        # Track training statistics for checkpointing.
        eval_history = []
        best_eval_loss = float('inf')

        # Begin training.
        for epoch in range(1, args.epochs + 1):
            # Perform training and evaluation steps.
            try:
                train_loss = train(args, epoch, model, train_dataset)
            except RuntimeError:
                print(f'NCCL Wait Timeout, rank: \'{args.local_rank}\' (exit)')
                exit(1)
            eval_loss = evaluate(args, epoch, model, dev_dataset)

            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model.
            if rank == 0:
                eval_history.append(eval_loss < best_eval_loss)
                if eval_loss < best_eval_loss:
                    best_eval_loss = eval_loss
                    torch.save(model.state_dict(), args.model_path)
                
                print(
                    f'epoch = {epoch} | '
                    f'train loss = {train_loss:.6f} | '
                    f'eval loss = {eval_loss:.6f} | '
                    f"{'saving model!' if eval_history[-1] else ''}"
                )

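                # NOTE: only rank 0 checks the early-stop condition and
                # breaks; the other ranks receive no signal here and would
                # keep training. A real run would broadcast this decision.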
                # If early stopping conditions are met, stop training.
                if _early_stop(args, eval_history):
                    suffix = 's' if args.early_stop > 1 else ''
                    print(
                        f'no improvement after {args.early_stop} epoch{suffix}. '
                        'early stopping...'
                    )
                    print()
                    cleanup_dist()
                    break

    if args.do_test and rank == 0:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = (
            'python3 evaluate.py '
            f'--dataset_path {args.dev_path} '
            f'--output_path {args.output_path}'
        )
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
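
Example 3 calls setup_dist and cleanup_dist helpers that are not shown. A
minimal sketch of the usual pattern (an assumption, not the author's actual
code), relying on environment variables set by a launcher such as torchrun:

import torch
import torch.distributed as dist

def setup_dist(local_rank):
    # Reads MASTER_ADDR, MASTER_PORT, RANK, and WORLD_SIZE from the
    # environment; NCCL is the standard backend for multi-GPU training.
    dist.init_process_group(backend='nccl')
    torch.cuda.set_device(local_rank)
    return dist.get_rank()

def cleanup_dist():
    dist.destroy_process_group()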
Example 4
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    # Set up datasets.
    train_dataset = QADataset(args, args.train_path, is_train=True)
    dev_dataset = QADataset(args, args.dev_path, is_train=False)
    print("Start creating vocabulary and tokenizer")

    # Create vocabulary and tokenizer.
    vocabulary = Vocabulary(
        train_dataset.samples + train_dataset.culled_samples, args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    print(f'vocab words = {len(vocabulary)}')

    # Print number of samples.
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)
    num_pretrained = model.load_pretrained_embeddings(vocabulary,
                                                      args.embedding_path)
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(f'initialized {num_pretrained}/{len(vocabulary)} '
          f'embeddings ({pct_pretrained}%)')
    print()

    if args.use_gpu:
        model = cuda(args, model)

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')

    if args.do_train:
        # Track training statistics for checkpointing.
        eval_history = []
        best_eval_loss = float('inf')

        # Begin training.
        for epoch in range(1, args.epochs + 1):
            # Perform training and evaluation steps.
            train_loss = train(args, epoch, model, train_dataset)
            eval_loss = evaluate(args, epoch, model, dev_dataset)

            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model.
            eval_history.append(eval_loss < best_eval_loss)
            if eval_loss < best_eval_loss:
                best_eval_loss = eval_loss
                torch.save(model.state_dict(), args.model_path)

            print(f'epoch = {epoch} | '
                  f'train loss = {train_loss:.6f} | '
                  f'eval loss = {eval_loss:.6f} | '
                  f"{'saving model!' if eval_history[-1] else ''}")

            # If early stopping conditions are met, stop training.
            if _early_stop(args, eval_history):
                suffix = 's' if args.early_stop > 1 else ''
                print(f'no improvement after {args.early_stop} epoch{suffix}. '
                      'early stopping...')
                print()
                break

    if args.do_test:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = ('python3 evaluate.py '
                    f'--dataset_path {args.dev_path} '
                    f'--output_path {args.output_path}')
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
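
These training mains share an _early_stop helper and an eval_history list of
per-epoch "improved?" booleans. A minimal sketch consistent with that usage
(hypothetical; the real helper is not shown):

def _early_stop(args, eval_history):
    # Stop once none of the last `early_stop` epochs improved eval loss.
    return (len(eval_history) >= args.early_stop
            and not any(eval_history[-args.early_stop:]))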
Example 5
                    help='learning rate for ensemble')
parser.add_argument('--weight_decay', type=float, default=5e-3)
parser.add_argument('--num_epochs', type=int, default=200)
parser.add_argument('--batch_size', type=int, default=500)
parser.add_argument('--root', type=str, default='../movie-data')

args = parser.parse_args()
seed = 1234734614
torch.manual_seed(seed)
if args.use_cuda:
    torch.cuda.manual_seed(seed)

# dataset
dataset = QADataset(dataset='StackExchange',
                    questionFile='QuestionFeatures.tsv',
                    answerFile='AnswerFeatures.tsv',
                    userFile='UserFeatures.tsv',
                    rootFolder=args.root)
print "Dataset read", len(dataset)

PosClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '1']
NegClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '0']
print "Positive samples", len(PosClass)
questions = dataset.trainPairs['QuestionId'].unique()

if len(PosClass) > len(NegClass):
    NegClass_Sample = NegClass
else:
    NegClass_Sample = NegClass.sample(n=len(PosClass))
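
A common next step after this class balancing (an assumption; not shown in
the original) is to concatenate the two classes and shuffle:

import pandas as pd

balanced = pd.concat([PosClass, NegClass_Sample])
balanced = balanced.sample(frac=1, random_state=seed).reset_index(drop=True)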
Example 6
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)
    print()

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    # Set up datasets.
    train_dataset = QADataset(args, args.train_path)
    dev_dataset = QADataset(args, args.dev_path)

    # Create vocabulary and tokenizer.
    if args.vocab_path is not None:
        print("loading vocabulary from file at {}".format(args.vocab_path))
        vocabulary = Vocabulary(train_dataset.samples,
                                args.vocab_size,
                                load_from_file=True,
                                filepath=args.vocab_path)
    else:
        print("constructing the vocab from dataset examples")
        vocabulary = Vocabulary(train_dataset.samples, args.vocab_size)

    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    args.char_vocab_size = vocabulary.numCharacters()
    print(f'vocab words = {len(vocabulary)}')
    print(f'num characters = {args.char_vocab_size}')

    # Print number of samples.
    num_train_samples = len(train_dataset)
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)
    num_pretrained = model.load_pretrained_embeddings(vocabulary,
                                                      args.embedding_path)
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(f'initialized {num_pretrained}/{len(vocabulary)} '
          f'embeddings ({pct_pretrained}%)')
    print()

    if args.use_gpu:
        model = cuda(args, model)

    # load the model from previous checkpoint
    if args.finetune >= 1:
        print("preparing to load {} as base model".format(args.init_model))
        model.load_state_dict(torch.load(args.init_model, map_location='cpu'))

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')
    print(model)
    print()

    if args.do_train:
        # create tensorboard summary writer
        train_writer = tb.SummaryWriter(
            log_dir=os.path.join(args.logdir, args.run + "_train"))
        valid_writer = tb.SummaryWriter(
            log_dir=os.path.join(args.logdir, args.run + "_valid"))

        # Track training statistics for checkpointing.
        eval_history = []
        best_eval_loss = float('inf')

        # Begin training.
        for epoch in range(1, args.epochs + 1):
            # Perform training and evaluation steps.
            train_loss = train(args, epoch, model, train_dataset, train_writer,
                               num_train_samples)
            eval_loss = evaluate(args, epoch, model, dev_dataset)

            # write the loss to tensorboard
            valid_writer.add_scalar("valid_loss", eval_loss, global_step=epoch)

            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model.
            eval_history.append(eval_loss < best_eval_loss)
            if eval_loss < best_eval_loss:
                best_eval_loss = eval_loss
                torch.save(model.state_dict(), args.model_path)

            print(f'epoch = {epoch} | '
                  f'train loss = {train_loss:.6f} | '
                  f'eval loss = {eval_loss:.6f} | '
                  f"{'saving model!' if eval_history[-1] else ''}")

            # If early stopping conditions are met, stop training.
            if _early_stop(args, eval_history):
                suffix = 's' if args.early_stop > 1 else ''
                print(f'no improvement after {args.early_stop} epoch{suffix}. '
                      'early stopping...')
                print()
                break

    if args.do_test:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = ('python3 evaluate.py '
                    f'--dataset_path {args.dev_path} '
                    f'--output_path {args.output_path}')
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
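
Example 6 passes train_writer into train() and logs eval loss directly. A
self-contained sketch of the SummaryWriter pattern it relies on (dummy
values and a hypothetical logdir; the real train() body is not shown):

from torch.utils import tensorboard as tb

writer = tb.SummaryWriter(log_dir='runs/demo_train')
for epoch, loss in enumerate([0.9, 0.7, 0.6], start=1):  # dummy losses
    writer.add_scalar('train_loss', loss, global_step=epoch)
writer.close()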
Example 7
parser = argparse.ArgumentParser(description='PyTorch Credibility Prediction Model')
parser.add_argument('--use_cuda', dest='use_cuda', default=False, action='store_true')
parser.add_argument('-lr', '--learning_rate', type=float, default=0.001,
                    help='learning rate for FeedForward')
parser.add_argument('--weight_decay', type=float, default=5e-4)
parser.add_argument('--num_epochs', type=int, default=2)
parser.add_argument('--batch_size', type=int, default=500)
parser.add_argument('--root', type=str, default='/home/github/UserCredibility/movie-data')
glove_path = '/Users/kanika/Documents/glove.6B/glove.6B.50d.txt'
args = parser.parse_args()
seed = 1234734614
torch.manual_seed(seed)
if args.use_cuda:
    torch.cuda.manual_seed(seed)

dataset = QADataset(dataset='StackExchange',
                    questionFile='QuestionFeatures.tsv',
                    answerFile='AnswerFeatures.tsv',
                    userFile='UserFeatures.tsv',
                    rootFolder=args.root)


print "Dataset read", len(dataset)

# Sample the dataset.
PosClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '1']
NegClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '0']
print("Positive samples", len(PosClass))
questions = dataset.trainPairs['QuestionId'].unique()

if len(PosClass) > len(NegClass):
    NegClass_Sample = NegClass
else:
    NegClass_Sample = NegClass.sample(n=len(PosClass))
Example 8
args = parser.parse_args()
seed = 1234734614
torch.manual_seed(seed)
if args.use_cuda:
    torch.cuda.manual_seed(seed)

if args.use_content:
    print("Using content embeddings")

device = torch.device("cuda:1" if args.use_cuda else "cpu")
print(device)
# dataset
dataset = QADataset(dataset=args.dataset,
                    questionFile='QuestionFeatures.tsv',
                    answerFile='AnswerFeatures.tsv',
                    userFile='UserFeatures.tsv',
                    rootFolder=os.path.join(args.root, args.dataset))
print "Dataset read", args.dataset, len(dataset)

PosClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '1']
NegClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '0']
print "Positive samples", len(PosClass)
questions = dataset.trainPairs['QuestionId'].unique()

if len(PosClass) > len(NegClass):
    NegClass_Sample = NegClass
else:
    NegClass_Sample = NegClass.sample(n=len(PosClass))
Example 9
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)
    print("args type: ", type(args))
    print()

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    if args.bio:
        print("training on bio dataset")
        train_dataset = QADataset(args, args.train_path)
        dev_dataset = QADataset(args, args.dev_path)
        bio_len = len(train_dataset.elems)  # len == 1504
        print("bio data size: ", bio_len)
        random.shuffle(train_dataset.elems)
        # Take the dev half before truncating the train list; slicing the
        # already truncated list would leave the dev set empty.
        split = bio_len // 2
        dev_dataset.elems = train_dataset.elems[split:]
        train_dataset.elems = train_dataset.elems[:split]
    else:
        # Set up datasets
        train_dataset = QADataset(
            args, args.train_path)  # len == 18885, vocab_size == 50004
        dev_dataset = QADataset(
            args, args.dev_path)  # len == 2067, vocab words == 24987

    if args.domain_adaptive:
        # NewsQA dataset
        print("domain adaptive training")
        news_train = QADataset(args, "datasets/newsqa_train.jsonl.gz"
                               )  # len == 11428, vocab words == 24989
        news_dev = QADataset(
            args,
            "datasets/newsqa_dev.jsonl.gz")  # len == 638, vocab words == 18713
        bio = QADataset(
            args,
            "datasets/bioasq.jsonl.gz")  # len == 1504, vocab words == 18715

        train_dataset.elems = train_dataset.elems + news_train.elems + news_dev.elems

    print("total dataset size: ", len(train_dataset.elems))

    # Create vocabulary and tokenizer.
    vocabulary = Vocabulary(train_dataset.samples, args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    print(f'vocab words = {len(vocabulary)}')

    # Print number of samples.
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)
    num_pretrained = model.load_pretrained_embeddings(vocabulary,
                                                      args.embedding_path)
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(f'initialized {num_pretrained}/{len(vocabulary)} '
          f'embeddings ({pct_pretrained}%)')
    print()

    if args.use_gpu:
        model = cuda(args, model)

    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')
    print(model)
    print()

    if args.do_train:
        # Track training statistics for checkpointing.
        eval_history = []
        best_eval_loss = float('inf')

        # Begin training.
        for epoch in range(1, args.epochs + 1):
            if args.use_EDA_aug:
                # randomly shuffle the data 1st
                print("shuffling dataset")
                random.shuffle(train_dataset.samples)
                # Perform augmentation on the training data
                print("performing augmentation on dataset...")
                train_dataset_copy = deepcopy(train_dataset)
                print("prob for char aug is: ", args.char_aug)
                augmented_train_dataset = EDA(train_dataset_copy,
                                              sr_prob=0.33,
                                              rd_prob=0.05,
                                              rs_prob=0.10,
                                              ri_prob=0.10,
                                              r_shuffle_prob=0.10,
                                              r_backtrans_prob=0.0,
                                              char_aug=args.char_aug)
            else:
                print("no augmentation")

            # Perform training and evaluation steps.
            if args.use_EDA_aug:
                # Train on the augmented set; evaluate on the unchanged dev set.
                print("training on augmented dataset")
                a = random.randint(0, 3)
                print("random num gen: ", a)
                print("context ex of augmented data: " +
                      augmented_train_dataset.elems[a]['context'])
                print("sample ex of augmented data: " + " ".join(
                    [token
                     for token in augmented_train_dataset.samples[a][1]]))
                assert augmented_train_dataset != train_dataset
                train_loss = train(args, epoch, model, augmented_train_dataset)
            else:
                print("training on normal dataset")
                train_loss = train(args, epoch, model, train_dataset)
            eval_loss = evaluate(args, epoch, model, dev_dataset)

            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model.
            eval_history.append(eval_loss < best_eval_loss)
            if eval_loss < best_eval_loss:
                best_eval_loss = eval_loss
                torch.save(model.state_dict(), args.model_path)

            print(f'epoch = {epoch} | '
                  f'train loss = {train_loss:.6f} | '
                  f'eval loss = {eval_loss:.6f} | '
                  f"{'saving model!' if eval_history[-1] else ''}")

            # If early stopping conditions are met, stop training.
            if _early_stop(args, eval_history):
                suffix = 's' if args.early_stop > 1 else ''
                print(f'no improvement after {args.early_stop} epoch{suffix}. '
                      'early stopping...')
                print()
                break

    if args.do_test:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = ('python3 evaluate.py '
                    f'--dataset_path {args.dev_path} '
                    f'--output_path {args.output_path}')
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
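
Example 9's EDA call bundles standard Easy Data Augmentation operations
(synonym replacement, random deletion/swap/insertion) behind per-op
probabilities. A minimal sketch of one such op, random deletion
(illustrative only; the project's EDA implementation is not shown):

import random

def random_deletion(tokens, p=0.05):
    # Drop each token with probability p, but never return an empty list.
    kept = [t for t in tokens if random.random() > p]
    return kept if kept else [random.choice(tokens)]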
Example 10
en_tok_path = encparams["tokenizer_path"]
en_tokenizer = BertTokenizerFast(os.path.join(en_tok_path, "vocab.txt"))
de_tok_path = decparams["tokenizer_path"]
de_tokenizer = BertTokenizerFast(os.path.join(de_tok_path, "vocab.txt"))

# Init the dataset
train_en_file = globalparams["train_en_file"]
train_de_file = globalparams["train_de_file"]
valid_en_file = globalparams["valid_en_file"]
valid_de_file = globalparams["valid_de_file"]

enc_maxlength = encparams["max_length"]
dec_maxlength = decparams["max_length"]

batch_size = modelparams["batch_size"]
train_dataset = QADataset(train_en_file, train_de_file, en_tokenizer,
                          de_tokenizer, enc_maxlength, dec_maxlength)
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, shuffle=False,
    drop_last=True, num_workers=1, collate_fn=train_dataset.collate_function)

valid_dataset = QADataset(valid_en_file, valid_de_file, en_tokenizer,
                          de_tokenizer, enc_maxlength, dec_maxlength)
valid_dataloader = torch.utils.data.DataLoader(
    dataset=valid_dataset, batch_size=batch_size, shuffle=False,
    drop_last=True, num_workers=1, collate_fn=valid_dataset.collate_function)
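
A minimal sketch of the collate_function these DataLoaders assume
(hypothetical; the dataset's real implementation is not shown): tokenize a
batch of (source, target) string pairs into padded tensors.

def collate_function(batch):
    src_texts, tgt_texts = zip(*batch)
    src = en_tokenizer(list(src_texts), padding=True, truncation=True,
                       max_length=enc_maxlength, return_tensors='pt')
    tgt = de_tokenizer(list(tgt_texts), padding=True, truncation=True,
                       max_length=dec_maxlength, return_tensors='pt')
    return src, tgt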

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

print("Loading models ..")
vocabsize = encparams["vocab_size"]
max_length = encparams["max_length"]
encoder_config = BertConfig(
Example 11
                    default=False,
                    action='store_true')
parser.add_argument('-lr', '--learning_rate', type=float, default=0.001,
                    help='learning rate for FeedForward')
parser.add_argument('--num_epochs', type=int, default=100)
parser.add_argument('--root', type=str, default='/home/knarang2/StackExchange')
glove_path = '/Users/kanika/Documents/glove.6B/glove.6B.50d.txt'
args = parser.parse_args()
seed = 1234734614
torch.manual_seed(seed)
if args.use_cuda:
    torch.cuda.manual_seed(seed)

dataset = QADataset(dataset='StackExchange',
                    questionFile='QuestionFeatures.tsv',
                    answerFile='AnswerFeatures.tsv',
                    userFile='UserFeatures.tsv',
                    rootFolder=args.root)

vectorizer = IndexVectorizer(min_frequency=10)
textDataset = SubjObjDataset(
    os.path.join(args.root, "pairText_merge_processed.tsv"), vectorizer)
word2idx = textDataset.vectorizer.word2idx
embeddings = load_glove_embeddings(glove_path, word2idx)
print("#WORDS in the Vocabulary", len(word2idx))

print "Dataset read", len(dataset)

# Sample the dataset.
Example 12
from load_embeddings import GloveVector, getWeightMatrix
from data import QADataset, get_dataloader
from model import ARC1
from loss import marginLoss
import numpy as np


document_set = './datasets/DataSet_query_document/document_set.json'
tweet_set = './datasets/tweet2new/id2twitter.json'
news_set = './datasets/tweet2new/id2new.json'
query_set = './datasets/DataSet_query_document/query_set.json'
query2document_train = './datasets/DataSet_query_document/query2document_train.json'
tw2ne_train = './datasets/tweet2new/tw2ne_train.json'
tw2ne_test = './datasets/tweet2new/tw2ne_test.json'
query2document_test = './datasets/DataSet_query_document/query2document_test.json'
qa_dataset_train = QADataset(query2document_train, document_set, query_set)
dataset_sizes_train = len(qa_dataset_train)

tn_dataset_train = QADataset(tw2ne_train, news_set, tweet_set)
tn_dataset_sizes_train = len(tn_dataset_train)

tn_dataset_test = QADataset(tw2ne_test, news_set, tweet_set)
tn_dataset_sizes_test = len(tn_dataset_test)

qa_dataset_test = QADataset(query2document_test, document_set, query_set)
dataset_sizes_test = len(qa_dataset_test)
train_dataloader = get_dataloader(tn_dataset_train, 100)
test_dataloader = get_dataloader(tn_dataset_test, 1)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
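
A minimal sketch of the get_dataloader helper imported above (hypothetical;
the real version in data.py is not shown):

from torch.utils.data import DataLoader

def get_dataloader(dataset, batch_size):
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)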
Example 13
BATCH_SIZE = args.batch_size
timestamp = str(int(time.time()))
seed = 999
torch.manual_seed(seed)
if args.use_cuda:
    torch.cuda.manual_seed(seed)

os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
# torch.device object used throughout this script
DEVICE = torch.device("cuda:0" if args.use_cuda else "cpu")

# dataset
dataset = QADataset(dataset=args.dataset,
                    questionFile='QuestionFeatures.tsv',
                    answerFile='AnswerFeatures.tsv',
                    userFile='UserFeatures.tsv',
                    rootFolder=os.path.join(args.root, args.dataset))
print "Dataset read", len(dataset), args.dataset

PosClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '1']
NegClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '0']
print "Positive samples", len(PosClass)
questions = dataset.trainPairs['QuestionId'].unique()

if len(PosClass) > len(NegClass):
    NegClass_Sample = NegClass
else:
    NegClass_Sample = NegClass.sample(n=len(PosClass))