Example #1
def build_specter_input(hf_dataset: str,
                        aspect,
                        fold: Union[str, int],
                        output_path: str,
                        override: bool = False) -> None:
    """
    Run with: $ ./data_cli.py build_specter_input paperswithcode_aspects task 1 ./output/specter_input/1

    Builds the following files (needed for SPECTER training):
    - data.json containing the document ids and their relationships.
    - metadata.json containing a mapping of document ids to textual fields (e.g., title, abstract).
    - train.txt, val.txt, test.txt containing the document ids of the train/val/test sets (one doc id per line).

    Data structure:
    - count = 5 (positive pair, i.e., same aspect)
    - count = 1 (negative pair) => ignore

    :param aspect: Aspect used to select the train/test split (e.g., task)
    :param hf_dataset: Name or path of the Hugging Face dataset
    :param fold: Cross-validation fold
    :param output_path: Directory to which the SPECTER input files are written
    :param override: Continue even if the output path exists already
    :return:
    """
    nlp_cache_dir = './data/nlp_cache'

    if os.path.exists(output_path) and not override:
        logger.error(f'Output path exists already: {output_path}')
        sys.exit(1)
    else:
        os.makedirs(output_path, exist_ok=True)

    train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                            name='relations',
                            cache_dir=nlp_cache_dir,
                            split=get_train_split(aspect, fold))

    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')

    # metadata
    metadata = {}
    for doc in docs_ds:
        metadata[doc['paper_id']] = {
            'paper_id': doc['paper_id'],
            'title': doc['title'],
            'abstract': doc['abstract'],
        }
    logger.info('Writing metadata')
    with open(os.path.join(output_path, 'metadata.json'), 'w') as f:
        json.dump(metadata, f)

    # train/val/test ids
    train_doc_ids = set()
    test_doc_ids = set()

    # data
    data = defaultdict(dict)
    for pair in train_ds:
        # TODO include negative samples?
        count = 5 if pair['label'] == 'y' else 1
        data[pair['from_paper_id']][pair['to_paper_id']] = {'count': count}

        train_doc_ids.add(pair['from_paper_id'])
        train_doc_ids.add(pair['to_paper_id'])

    for pair in test_ds:
        count = 5 if pair['label'] == 'y' else 1
        data[pair['from_paper_id']][pair['to_paper_id']] = {'count': count}

        test_doc_ids.add(pair['from_paper_id'])
        test_doc_ids.add(pair['to_paper_id'])

    logger.info('Writing data')
    with open(os.path.join(output_path, 'data.json'), 'w') as f:
        json.dump(data, f)

    train_doc_ids = list(train_doc_ids)
    full_test_doc_ids = list(test_doc_ids)
    random.shuffle(full_test_doc_ids)

    split_at = int(0.1 * len(full_test_doc_ids))

    val_doc_ids = full_test_doc_ids[:split_at]
    test_doc_ids = full_test_doc_ids[split_at:]

    logger.info('Writing train/val/test')
    with open(os.path.join(output_path, 'train.txt'), 'w') as f:
        for i in train_doc_ids:
            f.write(i + '\n')
    with open(os.path.join(output_path, 'val.txt'), 'w') as f:
        for i in val_doc_ids:
            f.write(i + '\n')
    with open(os.path.join(output_path, 'test.txt'), 'w') as f:
        for i in test_doc_ids:
            f.write(i + '\n')

    logger.info('done')
Example #2
def main():
    # Auto-environment
    env = get_env()

    parser = HfArgumentParser(
        (ModelArguments, TrainingArguments, ExperimentArguments))
    model_args, training_args, experiment_args = parser.parse_args_into_dataclasses(
    )
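    # The argument dataclasses are defined elsewhere in the repo; judging from their usage in
    # this function, ExperimentArguments provides (among others): hf_dataset, aspect, cv_fold,
    # hf_dataset_cache_dir, dataset_limit, doc_id_col, doc_a_col, doc_b_col, label_col,
    # binary_classification, multi_label, classification_threshold, max_length, spacy_model,
    # rnn_hidden_size, rnn_type, rnn_num_layers, rnn_dropout and save_predictions.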

    # Adjust output with folds and model name
    #TODO disabled
    # training_args.output_dir = os.path.join(training_args.output_dir, str(experiment_args.cv_fold), model_args.get_model_name())

    # Model path from env
    if not os.path.exists(model_args.model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_args.model_name_or_path)):
        model_args.model_name_or_path = os.path.join(
            env['bert_dir'], model_args.model_name_or_path)

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Dataset args
    label_classes = get_label_classes_from_hf_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset))
    num_labels = len(label_classes)

    if num_labels > 1 and experiment_args.binary_classification:
        # In binary classification we have only a single label (with y in [0, 1])
        num_labels = 1
        logger.warning(f'Forcing label classes to binary: {label_classes}')

    columns = ['input_ids', 'attention_mask', 'token_type_ids',
               'labels']  # Input to transformers.forward

    # Build dataset for splits
    train_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_train_split(experiment_args.aspect, experiment_args.cv_fold))
    test_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_test_split(experiment_args.aspect, experiment_args.cv_fold))
    docs_ds = load_dataset(get_local_hf_dataset_path(
        experiment_args.hf_dataset),
                           name='docs',
                           cache_dir=experiment_args.hf_dataset_cache_dir,
                           split=datasets.Split('docs'))

    # Forced limit
    if experiment_args.dataset_limit > 0:
        logger.info(
            f'Train and test datasets limited to {experiment_args.dataset_limit} samples'
        )

        train_ds = Dataset(train_ds.data[:experiment_args.dataset_limit])
        test_ds = Dataset(test_ds.data[:experiment_args.dataset_limit])

    # Build ID => Doc mapping
    doc_id2doc = {doc[experiment_args.doc_id_col]: doc for doc in docs_ds}

    if model_args.model_name_or_path.startswith('baseline-rnn'):
        # Load Spacy as tokenizer
        spacy_nlp = spacy.load(experiment_args.spacy_model,
                               disable=["tagger", "ner", "textcat"])

        if experiment_args.multi_label:
            # Baseline models
            model = RNNForMultiLabelSequenceClassification(
                word_vectors=get_vectors_from_spacy_model(spacy_nlp),
                hidden_size=experiment_args.rnn_hidden_size,
                rnn=experiment_args.rnn_type,
                num_labels=num_labels,
                num_layers=experiment_args.rnn_num_layers,
                dropout=experiment_args.rnn_dropout,
            )
        else:
            raise NotImplementedError(
                'RNN baseline is only available for multi label classification'
            )

        tokenizer = None

    else:
        # Load pretrained Transformers models and tokenizers
        model_config = AutoConfig.from_pretrained(
            model_args.model_name_or_path,
            num_labels=num_labels,
            cache_dir=model_args.cache_dir)

        # No need for spacy
        spacy_nlp = None

        if 'longformer' in model_args.model_name_or_path:
            # TVM: a custom CUDA kernel implementation of our sliding window attention (works only on GPU)
            model_config.attention_mode = 'tvm'

            # override tokenizer name if not set
            if model_args.tokenizer_name is None:
                roberta_path = os.path.join(env['bert_dir'], 'roberta-base')
                model_args.tokenizer_name = roberta_path if os.path.exists(
                    roberta_path) else 'roberta-base'

                logger.info(
                    f'Overriding tokenizer: {model_args.tokenizer_name}')

            # override max length
            experiment_args.max_length = 4096
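            # 4096 matches the maximum sequence length of the pretrained Longformer
            # (longformer-base-4096).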

        if experiment_args.multi_label:
            model_cls = AutoModelForMultiLabelSequenceClassification
        else:
            model_cls = AutoModelForSequenceClassification

        model = model_cls.from_pretrained(model_args.model_name_or_path,
                                          config=model_config,
                                          cache_dir=model_args.cache_dir)
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name
            if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        # Set token limit if defined by model (for Longformer)
        if model.config.max_position_embeddings > 0:
            tokenizer.model_max_length = model.config.max_position_embeddings

    # Init helper
    dpt = DocRelTrainerHelper(
        id2doc=doc_id2doc,
        transformers_tokenizer=tokenizer,
        spacy_nlp=spacy_nlp,
        label_classes=label_classes,
        binary_classification=experiment_args.binary_classification,
        doc_a_col=experiment_args.doc_a_col,
        doc_b_col=experiment_args.doc_b_col,
        label_col=experiment_args.label_col,
        text_from_doc_func=get_non_empty_text_from_doc,
        classification_threshold=experiment_args.classification_threshold,
        max_length=experiment_args.max_length,
        multi_label=experiment_args.multi_label,
    )

    logger.info('Converting to features (doc mapping, tokenize, ...)')

    # Build hash from settings for caching
    data_settings_hash = hashlib.md5(
        dataclasses.asdict(experiment_args).__str__().encode("utf-8") +
        dataclasses.asdict(model_args).__str__().encode("utf-8")).hexdigest()
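    # The digest is stable for identical experiment/model settings, so repeated runs with the
    # same settings reuse the .arrow cache files below instead of re-tokenizing.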

    train_tensor_ds = train_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(
            experiment_args.hf_dataset_cache_dir,
            "cache-train-" + data_settings_hash + ".arrow"))
    train_tensor_ds.set_format(type='torch', columns=columns)

    test_tensor_ds = test_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(
            experiment_args.hf_dataset_cache_dir,
            "cache-test-" + data_settings_hash + ".arrow"))
    test_tensor_ds.set_format(type='torch', columns=columns)

    logger.info(f'Dataset columns: {columns}')
    logger.info(f'Train sample: {train_ds[0]}')
    logger.debug(f'- as tensor: {train_tensor_ds[0]}')

    logger.info(f'Test sample: {test_ds[0]}')
    logger.debug(f'- as tensor: {test_tensor_ds[0]}')

    # Load models weights (when no training but predictions)
    model_weights_path = os.path.join(training_args.output_dir,
                                      'pytorch_model.bin')

    if not training_args.do_train and experiment_args.save_predictions:
        logger.info(
            f'Loading existing model weights from disk: {model_weights_path}')
        if os.path.exists(model_weights_path):
            model.load_state_dict(torch.load(model_weights_path))
        else:
            logger.error('Weights file does not exist!')

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tensor_ds,
        eval_dataset=test_tensor_ds,
        data_collator=DocRelDataCollator(),
        #prediction_loss_only=False,
        compute_metrics=dpt.compute_metrics,
    )

    # Log additional config (to Weights & Biases)
    if is_wandb_available():
        extra_config = {}
        extra_config.update(dataclasses.asdict(experiment_args))
        extra_config.update(dataclasses.asdict(model_args))

        wandb.config.update(extra_config, allow_val_change=True)

    if training_args.do_train:
        logger.info('Training started...')
        trainer.train()

        if isinstance(model, PreTrainedModel):
            trainer.save_model()
            tokenizer.save_pretrained(training_args.output_dir)

        elif isinstance(model, nn.Module):  # RNN model
            torch.save(model.state_dict(), model_weights_path)

    if experiment_args.save_predictions:
        logger.info('Predicting...')

        predictions = trainer.predict(test_tensor_ds)

        df = dpt.get_df_from_predictions(test_ds,
                                         docs_ds,
                                         predictions,
                                         exclude_columns=['abstract'])

        # Save results to disk
        df.to_csv(os.path.join(training_args.output_dir, 'results.csv'),
                  index=False)
        with open(os.path.join(training_args.output_dir, 'metrics.json'), 'w') as f:
            json.dump(predictions.metrics, f)

    logger.info('Done')
Example #3
    def get_evaluation_df(name, doc_model, hf_dataset, aspect,
                          fold) -> Tuple[DataFrame, Dict]:
        # Init dataframe
        metrics = [
            'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
            'precision', 'recall', 'avg_p', 'reciprocal_rank', 'ndcg'
        ]
        df = pd.DataFrame([],
                          columns=['name', 'aspect', 'fold', 'top_k'] +
                          metrics)

        # Dataset
        test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                               name='relations',
                               cache_dir='./data/nlp_cache',
                               split=get_test_split(aspect, fold))

        logger.info(f'Test samples: {len(test_ds):,}')

        # Unique paper IDs in test set
        test_paper_ids = set(test_ds['from_paper_id']).union(
            set(test_ds['to_paper_id']))

        logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
        logger.info(f'Examples: {list(test_paper_ids)[:10]}')

        # Relevance mapping
        doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
        for row in test_ds:
            if row['label'] == 'y':
                a = row['from_paper_id']
                b = row['to_paper_id']
                doc_id2related_ids[a].add(b)
                doc_id2related_ids[b].add(a)

        # Filter for documents in test set
        test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
        test_doc_ids = []
        test_doc_vectors = []
        missed_doc_ids = 0

        for doc_id in doc_model.vocab:
            if doc_id in test_paper_ids:
                vec = doc_model.get_vector(doc_id)
                if len(vec) != doc_model.vector_size:
                    raise ValueError(
                        f'Test document has invalid shape: {doc_id} => {vec.shape}'
                    )

                test_doc_ids.append(doc_id)
                test_doc_vectors.append(vec)
            else:
                missed_doc_ids += 1
                # logger.warning(f'Document ID is not part of test set: {doc_id} ({type(doc_id)})')

        if len(test_doc_ids) != len(test_doc_vectors):
            raise ValueError(
                f'Test document ID count does not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}'
            )

        logger.info(
            f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})'
        )
        logger.info(f'Test document vectors: {len(test_doc_vectors)}')

        test_doc_model.add(test_doc_ids, test_doc_vectors)
        test_doc_model.init_sims(replace=True)
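        # init_sims(replace=True) L2-normalizes the vectors in place, so most_similar() below
        # effectively ranks candidates by cosine similarity.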

        logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

        # Actual evaluation
        # k2eval_rows = defaultdict(list)
        seed_ids_without_recommendations = []
        max_top_k = max(top_ks)
        eval_rows = {top_k: defaultdict(list)
                     for top_k in top_ks
                     }  # top_k => metric_name => list of values

        seed_id2ret_docs = {}

        for seed_id in tqdm(
                test_paper_ids,
                desc=f'Evaluation ({name},aspect={aspect},fold={fold})'):
            try:
                rel_docs = doc_id2related_ids[seed_id]
                max_ret_docs = [
                    d
                    for d, score in test_doc_model.most_similar(seed_id,
                                                                topn=max_top_k)
                ]
                seed_id2ret_docs[seed_id] = max_ret_docs

                for top_k in top_ks:
                    ret_docs = max_ret_docs[:top_k]
                    rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                    if ret_docs and rel_docs:
                        # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                        precision = rel_ret_docs_count / len(ret_docs)

                        # Recall = No. of relevant documents retrieved / No. of total relevant documents
                        recall = rel_ret_docs_count / len(rel_docs)

                        # Avg. precision (for MAP)
                        avg_p = get_avg_precision(ret_docs, rel_docs)

                        # Reciprocal rank (for MRR)
                        reciprocal_rank = get_reciprocal_rank(
                            ret_docs, rel_docs)
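                        # (Assuming the usual definitions: AP averages precision@i over the
                        # ranks i of relevant hits; RR is 1/rank of the first relevant hit.)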

                        # # NDCG@k
                        predicted_relevance = [
                            1 if ret_doc_id in rel_docs else 0
                            for ret_doc_id in ret_docs
                        ]
                        true_relevances = [1] * len(rel_docs)
                        ndcg_value = compute_dcg_at_k(
                            predicted_relevance, top_k) / compute_dcg_at_k(
                                true_relevances, top_k)
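                        # Assuming compute_dcg_at_k(rels, k) = sum_i rels[i] / log2(i + 2) over
                        # the top k; dividing by the ideal DCG of the true relevances gives NDCG@k.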

                        # Save metrics
                        eval_rows[top_k]['retrieved_docs'].append(
                            len(ret_docs))
                        eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                        eval_rows[top_k]['relevant_retrieved_docs'].append(
                            rel_ret_docs_count)
                        eval_rows[top_k]['precision'].append(precision)
                        eval_rows[top_k]['recall'].append(recall)
                        eval_rows[top_k]['avg_p'].append(avg_p)
                        eval_rows[top_k]['reciprocal_rank'].append(
                            reciprocal_rank)
                        eval_rows[top_k]['ndcg'].append(ndcg_value)

            except (IndexError, ValueError, KeyError) as e:
                seed_ids_without_recommendations.append(seed_id)

                logger.warning(
                    f'Cannot retrieve recommendations for #{seed_id}: {e}')

        logger.info(
            f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
        )

        # Summarize evaluation
        for top_k in top_ks:
            try:
                row = [name, aspect, fold, top_k]
                for metric in metrics:
                    # mean over all metrics
                    values = eval_rows[top_k][metric]
                    if len(values) > 0:
                        row.append(np.mean(values))
                    else:
                        row.append(None)

                df.loc[len(df)] = row

            except ValueError as e:
                logger.error(
                    f'Cannot summarize row: {top_k} {fold} {metrics} {e}')

        return df, seed_id2ret_docs
Example #4
def evaluate_vectors(hf_dataset: str, aspect: str, input_path: str, name: str,
                     folds: Union[str, list], top_ks: Union[str, list],
                     output_path: str):
    """

    Run with: $ ./eval_cli.py evaluate_vectors paperswithcode_aspects task ./output/pwc_doc_id2st.txt --name=sentence_transformers --folds=1,2,3,4 --top_ks=5,10,25,50 --output_path=./output/eval.csv

    :param aspect: Aspect used to select the test split (e.g., task)
    :param folds: Cross-validation folds (comma-separated string or list)
    :param top_ks: Top-k cut-offs for the evaluation (comma-separated string or list)
    :param name: Name under which the results are stored in the output CSV
    :param hf_dataset: Name or path of the Hugging Face dataset
    :param input_path: Path to the document vectors in word2vec text format
    :param output_path: Path to the output CSV with the evaluation results
    :return:
    """

    if isinstance(folds, str):
        folds = folds.split(',')
    elif isinstance(folds, int):
        folds = [folds]

    if isinstance(top_ks, str):
        top_ks = [int(k) for k in top_ks.split(',')]
    elif isinstance(top_ks, int):
        top_ks = [top_ks]

    logger.info(f'Folds: {folds}')
    logger.info(f'Top-Ks: {top_ks}')

    if len(folds) < 1:
        logger.error('No folds provided')
        return

    if len(top_ks) < 1:
        logger.error('No top-k values provided')
        return

    # Load documents
    doc_model = KeyedVectors.load_word2vec_format(input_path)
    logger.info(f'Document vectors: {doc_model.vectors.shape}')

    # Normalize vectors
    doc_model.init_sims(replace=True)

    # Init dataframe
    metrics = [
        'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
        'precision', 'recall', 'avg_p', 'reciprocal_rank'
    ]
    df = pd.DataFrame([], columns=['name', 'fold', 'top_k'] + metrics)

    # Iterate over folds
    for fold in folds:
        logger.info(f'Current fold: {fold}')

        # Dataset
        test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                               name='relations',
                               cache_dir='./data/nlp_cache',
                               split=get_test_split(aspect, fold))

        logger.info(f'Test samples: {len(test_ds):,}')

        # Unique paper IDs in test set
        test_paper_ids = set(test_ds['from_paper_id']).union(
            set(test_ds['to_paper_id']))

        logger.info(f'Test paper IDs: {len(test_paper_ids):,}')
        logger.info(f'Examples: {list(test_paper_ids)[:10]}')

        # Relevance mapping
        doc_id2related_ids = defaultdict(set)  # type: Dict[str, Set[str]]
        for row in test_ds:
            if row['label'] == 'y':
                a = row['from_paper_id']
                b = row['to_paper_id']
                doc_id2related_ids[a].add(b)
                doc_id2related_ids[b].add(a)

        # Filter for documents in test set
        test_doc_model = KeyedVectors(vector_size=doc_model.vector_size)
        test_doc_ids = []
        test_doc_vectors = []
        missed_doc_ids = 0

        for doc_id in doc_model.vocab:
            if doc_id in test_paper_ids:
                vec = doc_model.get_vector(doc_id)
                if len(vec) != doc_model.vector_size:
                    raise ValueError(
                        f'Test document has invalid shape: {doc_id} => {vec.shape}'
                    )

                test_doc_ids.append(doc_id)
                test_doc_vectors.append(vec)
            else:
                missed_doc_ids += 1
                # logger.warning(f'Document ID is not part of test set: {doc_id} ({type(doc_id)})')

        if len(test_doc_ids) != len(test_doc_vectors):
            raise ValueError(
                f'Test document ID count does not match vector count: {len(test_doc_ids)} vs {len(test_doc_vectors)}'
            )

        logger.info(
            f'Test document IDs: {len(test_doc_ids)} (missed {missed_doc_ids})'
        )
        logger.info(f'Test document vectors: {len(test_doc_vectors)}')

        test_doc_model.add(test_doc_ids, test_doc_vectors)
        test_doc_model.init_sims(replace=True)

        logger.info(f'Test document vectors: {test_doc_model.vectors.shape}')

        # Actual evaluation
        # k2eval_rows = defaultdict(list)
        seed_ids_without_recommendations = []
        max_top_k = max(top_ks)
        eval_rows = {top_k: defaultdict(list)
                     for top_k in top_ks
                     }  # top_k => metric_name => list of values

        for seed_id in tqdm(test_paper_ids, desc=f'Evaluation (fold={fold})'):
            try:
                rel_docs = doc_id2related_ids[seed_id]
                max_ret_docs = [
                    d
                    for d, score in test_doc_model.most_similar(seed_id,
                                                                topn=max_top_k)
                ]
                for top_k in top_ks:
                    ret_docs = max_ret_docs[:top_k]
                    rel_ret_docs_count = len(set(ret_docs) & set(rel_docs))

                    if ret_docs and rel_docs:
                        # Precision = No. of relevant documents retrieved / No. of total documents retrieved
                        precision = rel_ret_docs_count / len(ret_docs)

                        # Recall = No. of relevant documents retrieved / No. of total relevant documents
                        recall = rel_ret_docs_count / len(rel_docs)

                        # Avg. precision (for MAP)
                        avg_p = get_avg_precision(ret_docs, rel_docs)

                        # Reciprocal rank (for MRR)
                        reciprocal_rank = get_reciprocal_rank(
                            ret_docs, rel_docs)

                        # # NDCG@k
                        # predicted_relevance = [1 if ret_doc_id in rel_docs else 0 for ret_doc_id in ret_docs]
                        # true_relevances = [1] * len(rel_docs)
                        # ndcg_value = self.compute_dcg_at_k(predicted_relevance, top_k) / self.compute_dcg_at_k(true_relevances, top_k)

                        # Save metrics
                        eval_rows[top_k]['retrieved_docs'].append(
                            len(ret_docs))
                        eval_rows[top_k]['relevant_docs'].append(len(rel_docs))
                        eval_rows[top_k]['relevant_retrieved_docs'].append(
                            rel_ret_docs_count)
                        eval_rows[top_k]['precision'].append(precision)
                        eval_rows[top_k]['recall'].append(recall)
                        eval_rows[top_k]['avg_p'].append(avg_p)
                        eval_rows[top_k]['reciprocal_rank'].append(
                            reciprocal_rank)

            except (IndexError, ValueError, KeyError) as e:
                seed_ids_without_recommendations.append(seed_id)

                logger.warning(
                    f'Cannot retrieve recommendations for #{seed_id}: {e}')

        logger.info(
            f'Completed with {len(eval_rows[top_ks[0]][metrics[0]]):,} rows (missed {len(seed_ids_without_recommendations):,})'
        )

        # Summarize evaluation
        for top_k in top_ks:
            try:
                row = [name, fold, top_k]
                for metric in metrics:
                    # mean over all metrics
                    values = eval_rows[top_k][metric]
                    if len(values) > 0:
                        row.append(np.mean(values))
                    else:
                        row.append(None)

                df.loc[len(df)] = row

            except ValueError as e:
                logger.error(
                    f'Cannot summarize row: {top_k} {fold} {metrics} {e}')

            #
            #
            # df = pd.DataFrame(k2eval_rows[top_k],
            #                   columns=['seed_id', 'retrieved_docs', 'relevant_docs', 'relevant_retrieved_docs',
            #                            'precision', 'recall', 'avg_p', 'reciprocal_rank'])
            #
            # print(df.mean())
            #
            # print(df.mean().to_frame().transpose().iloc[0])

    logger.info(f'Writing {len(df)} rows to {output_path}')

    if os.path.exists(output_path):
        # Append new rows to evaluation file
        df.to_csv(output_path, mode='a', header=False, index=False)
    else:
        # Write new files
        df.to_csv(output_path, header=True, index=False)

    logger.info('Done')
Example #5
def train(model_name_or_path: str,
          hf_dataset: str,
          aspect: str,
          fold: Union[int, str],
          output_path: str,
          train_epochs: int = 3,
          train_batch_size: int = 25,
          eval_batch_size: int = 32,
          evaluation_steps: int = 5000,
          train_on_test: bool = False,
          loss: str = 'multiple_negatives_ranking',
          override: bool = False):
    """

    # $MODEL_NAME $HF_DATASET $ASPECT $FOLD $OUTPUT_DIR --train_epochs=3 --train_batch_size=$TRAIN_BATCH_SIZE --eval_batch_size=$EVAL_BATCH_SIZE

    Run with:
    $ export CUDA_VISIBLE_DEVICES=1
    $ ./sentence_transformer_cli.py train scibert-scivocab-uncased paperswithcode_task_docs 1 ./output/st_scibert/1 --train_epochs=3 --train_batch_size=25 --eval_batch_size=32


    :param loss: Training loss function (choices: multiple_negatives_ranking, cosine)
    :param train_on_test: If True, joint training on train and test set (validation disabled)
    :param aspect: Aspect used to select the train/test split (e.g., task)
    :param evaluation_steps: Run the evaluator every n training steps
    :param train_epochs: Number of training epochs
    :param model_name_or_path: Name of or path to the pretrained Transformer model
    :param hf_dataset: Name or path of the Hugging Face dataset
    :param fold: Cross-validation fold
    :param output_path: Directory to which the trained model is saved
    :param train_batch_size: Batch size for training
    :param eval_batch_size: Batch size for evaluation
    :param override: Continue even if the output path exists already
    :return:
    """

    top_ks = [5, 10, 25, 50]
    # cuda_device = -1

    # hf_dataset = 'paperswithcode_task_docs'
    # model_name_or_path = 'scibert-scivocab-uncased'
    # fold = 1
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'

    # train_batch_size = 25
    # eval_batch_size = 32
    # override = False

    # output_path = './output/pwc_task_st/1/sci-bert'
    # output_path = os.path.join(output_path, str(fold), model_name_or_path)  # output/1/sci-bert

    if os.path.exists(output_path) and not override:
        logger.error(f'Stop. Output path exists already: {output_path}')
        sys.exit(1)

    # if cuda_device >= 0:
    #     os.environ["CUDA_VISIBLE_DEVICES"] = str(cuda_device)

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    word_embedding_model = Transformer(model_name_or_path,
                                       max_seq_length=max_token_length)
    pooling_model = Pooling(
        word_embedding_model.get_word_embedding_dimension())
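    # Pooling() with only the embedding dimension defaults to mean pooling over the token
    # embeddings, producing one fixed-size vector per document.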

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    # tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

    # dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                            name='relations',
                            cache_dir=nlp_cache_dir,
                            split=get_train_split(aspect, fold))
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    # filter for positive labels only
    train_ds = train_ds.filter(lambda row: row['label'] == 'y')

    logger.info(f'After filtering: {len(train_ds):,}')

    # joint training on train and test?
    if train_on_test:
        #
        # import pyarrow
        # from datasets.arrow_dataset import Dataset
        #
        # full_ds_table = pyarrow.concat_tables([train_ds.data, test_ds.data])
        # full_ds = Dataset(arrow_table=full_ds_table)
        raise NotImplementedError('TODO Evaluator')
    else:
        # standard training on the train set only (test set used for evaluation)
        train_sds = DocumentPairSentencesDataset(docs_ds,
                                                 train_ds,
                                                 model,
                                                 max_length=max_token_length,
                                                 forced_length=0)
        train_sds.tokenize_all_docs()

        evaluator = NearestNeighborsEvaluator(model,
                                              docs_ds,
                                              test_ds,
                                              top_ks=top_ks,
                                              batch_size=eval_batch_size,
                                              show_progress_bar=True)

    if loss == 'cosine':
        train_loss = losses.CosineSimilarityLoss(model)
    elif loss == 'multiple_negatives_ranking':
        # A nice advantage of MultipleNegativesRankingLoss is that it only requires positive pairs
        # https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/quora_duplicate_questions
        train_loss = losses.MultipleNegativesRankingLoss(model)
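        # For each positive pair (a_i, b_i) in a batch, the other b_j (j != i) serve as
        # in-batch negatives, so no explicit negative samples are required.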
    else:
        raise ValueError(f'Unsupported loss function: {loss}')

    train_dl = DataLoader(train_sds, shuffle=True, batch_size=train_batch_size)

    # Training
    model.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=train_epochs,  # try 1-4
        warmup_steps=100,
        evaluator=evaluator,
        evaluation_steps=evaluation_steps,  # increase to 5000 (full dataset => 20k steps)
        output_path=output_path,
        output_path_ignore_not_empty=True)

    logger.info('Training done')
Example #6
def build_vectors(st_output_path: str,
                  hf_dataset: str,
                  aspect: str,
                  fold: Union[int, str],
                  include_all_docs: bool = False,
                  override: bool = False):
    """

    :param override: Overwrite the output file if it exists already
    :param include_all_docs: Also generate vectors for documents from the training data (not only test documents)
    :param st_output_path: Path to the Sentence Transformer model
    :param hf_dataset: Hugging Face dataset path or name
    :param aspect: Aspect used to select the test split (e.g., task)
    :param fold: Cross-validation fold
    :return:
    """
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'

    out_fn = 'pwc_id2vec__all_docs.w2v.txt' if include_all_docs else 'pwc_id2vec.w2v.txt'
    out_fp = os.path.join(st_output_path, out_fn)

    if not os.path.exists(st_output_path):
        logger.error(
            f'Sentence Transformer directory does not exist: {st_output_path}')
        return

    if os.path.exists(out_fp) and not override:
        logger.error(
            f'Output path exists already and override is disabled: {out_fp}')
        return

    # Inference for best model
    best_model = SentenceTransformer(st_output_path)
    best_model.get_sentence_embedding_dimension()

    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    test_sds = DocumentPairSentencesDataset(docs_ds, test_ds, best_model)

    if include_all_docs:
        # use all document ids
        input_paper_ids = set(docs_ds['paper_id'])
        logger.info(f'All documents in corpus: {len(input_paper_ids):,}')

    else:
        # generate vectors from unique test documents only
        input_paper_ids = set(test_ds['from_paper_id']).union(
            set(test_ds['to_paper_id']))

    with open(out_fp, 'w') as f:
        # header
        f.write(
            f'{len(input_paper_ids)} {best_model.get_sentence_embedding_dimension()}\n'
        )
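        # word2vec text format: the header above is "<num_vectors> <dim>", each body line below
        # is "<paper_id> <v_1> ... <v_dim>" (loadable again via KeyedVectors.load_word2vec_format,
        # as done in evaluate_vectors).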

        # body
        for paper_id in tqdm(input_paper_ids, desc='Inference'):
            vec = [
                str(v) for v in best_model.encode(test_sds.get_text_from_doc(
                    paper_id),
                                                  show_progress_bar=False)
            ]

            assert len(vec) == best_model.get_sentence_embedding_dimension()

            vec_str = ' '.join(vec)
            line = f'{paper_id} {vec_str}\n'
            f.write(line)
            # break
    logger.info(f'Encoded {len(input_paper_ids):,} documents into {out_fp}')