Example 1
import numpy as np
import pandas as pd
import torch
from nltk.tokenize import sent_tokenize

# NOTE: load_claim_extraction_model, ClaimCrfPredictor, CrfPubmedRCTReader,
# MODEL_PATH and WEIGHT_PATH are assumed to be provided by the surrounding
# repository.


def extract_claims(data: pd.DataFrame,
                   model_path: str = MODEL_PATH,
                   weight_path: str = WEIGHT_PATH,
                   col_name: str = "sentence"):
    """
    Extract claims from the given column of a dataset.

    :param data: input DataFrame containing the sentences
    :param model_path: location of the model; a local path or a download link
    :param weight_path: location of the model weights; a local path or a download link
    :param col_name: name of the column on which claims are to be identified; should not be "sentences"
    :return: the input DataFrame with extracted "claims" and a "claim_flag"
             column (1 if the sentence contains a claim, 0 otherwise)
    """
    model = load_claim_extraction_model(model_path, weight_path)
    # print("MODEL LOADED!!!")  # noqa: T001
    reader = CrfPubmedRCTReader()
    claim_predictor = ClaimCrfPredictor(model, dataset_reader=reader)

    df = data
    if col_name not in df.columns:
        return None
    df_sentence = df.copy()

    # NOTE(alpha_darklord): The model returns a 0/1 label for each sentence,
    # indicating whether it is a claim; best_paths is used to read off these
    # labels. Sentences labelled 1 are then collected into a list stored in
    # the "claims" column.
    df_sentence["sentences"] = df_sentence[col_name]
    df_sentence["sentences"] = df_sentence.sentences.apply(sent_tokenize)
    df_sentence['pred'] = df_sentence.sentences.apply(
        lambda x: claim_predictor.predict_json({'sentences': x}))
    df_sentence['best_paths'] = df_sentence.pred.apply(
        lambda x: model.crf.viterbi_tags(
            torch.FloatTensor(x['logits']).unsqueeze(0),
            torch.LongTensor(x['mask']).unsqueeze(0)))
    df_sentence['p_claims'] = df_sentence['best_paths'].apply(
        lambda x: 100 * np.array(x[0][0]))
    df_sentence['claims'] = df_sentence.apply(
        lambda x: np.extract(x['p_claims'], x['sentences']), axis=1)
    df_claims = df_sentence[df_sentence.claims.str.len() > 0]
    del df_sentence
    # NOTE(alpha_darklord): explode() expands the list stored in the "claims"
    # column into one row per individual claim.
    df_updated = df_claims[[col_name, "claims"]].explode("claims")
    df_updated["claim_flag"] = 1
    df_merged = df.merge(df_updated, on=[col_name], how="left")
    df_merged["claim_flag"].fillna(0, inplace=True)
    return df_merged
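
A minimal usage sketch (assuming MODEL_PATH and WEIGHT_PATH resolve and the
NLTK "punkt" tokenizer data is installed); the DataFrame contents are
illustrative:

import nltk
nltk.download("punkt")  # sent_tokenize requires the punkt models

df_in = pd.DataFrame({
    "sentence": [
        "We conclude that drug X reduces mortality.",
        "Patients were recruited from three hospitals.",
    ]
})
df_out = extract_claims(df_in, col_name="sentence")
if df_out is not None:
    print(df_out[["sentence", "claims", "claim_flag"]])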
Example 2
    discourse_predictor = Predictor.from_archive(archive_,
                                                 'discourse_crf_predictor')

    # Freeze the pretrained discourse model, swap in a binary claim CRF head,
    # and load the fine-tuned claim weights below.
    model = discourse_predictor._model
    for param in model.parameters():
        param.requires_grad = False
    num_classes, constraints, include_start_end_transitions = 2, None, False
    model.crf = ConditionalRandomField(
        num_classes,
        constraints,
        include_start_end_transitions=include_start_end_transitions)
    model.label_projection_layer = TimeDistributed(
        Linear(2 * EMBEDDING_DIM, num_classes))
    model.load_state_dict(
        torch.load(cached_path(WEIGHT_PATH), map_location='cpu'))
    reader = CrfPubmedRCTReader()
    claim_predictor = ClaimCrfPredictor(model, dataset_reader=reader)
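
A quick sanity-check sketch for the rewired predictor (the abstract text is
made up, and nltk's sent_tokenize and torch are assumed imported as in
Example 1; 1 = claim, 0 = not a claim):

example_sents = sent_tokenize(
    "We studied 120 patients. Drug X significantly reduced mortality.")
pred = claim_predictor.predict_json({'sentences': example_sents})
tags = model.crf.viterbi_tags(
    torch.FloatTensor(pred['logits']).unsqueeze(0),
    torch.LongTensor(pred['mask']).unsqueeze(0))[0][0]
print(list(zip(example_sents, tags)))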


def parse_pubmed_xml(pmid):
    """
    Parse article information for the given PMID
    """
    url = PUBMED_URL % pmid
    page = urllib.request.urlopen(url).read()
    tree = html.fromstring(page)
    abstract = ''
    for e in tree.xpath('//abstract/abstracttext'):
        if e is not None:
            abstract += stringify_children(e).strip()
    # NOTE: the source fragment is truncated here; a plausible completion of
    # the title extraction and return value, following the lxml pattern above:
    title = ' '.join(e.text_content() for e in tree.xpath('//articletitle'))
    return title, abstract


    # (separate fragment, from the model class) decode() maps predicted label
    # indices back to label strings; the method signature is reconstructed.
    @overrides
    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        output_dict["labels"] = [[
            self.vocab.get_token_from_index(label, namespace='labels')
            for label in instance_labels
        ] for instance_labels in output_dict["labels"]]
        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {
            metric_name: metric.get_metric(reset)
            for metric_name, metric in self.metrics.items()
        }
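
For context, a sketch of how a metrics dict like this is typically consumed
in a validation loop (the printout format is illustrative):

metrics = model.get_metrics(reset=True)  # reset accumulators once per epoch
print(', '.join(f'{name}: {value:.3f}' for name, value in metrics.items()))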


if __name__ == '__main__':
    claim_reader = ClaimAnnotationReaderJSON()
    discourse_reader = CrfPubmedRCTReader()
    claim_train_dataset = claim_reader.read(cached_path(TRAIN_PATH))
    claim_validation_dataset = claim_reader.read(cached_path(VALIDATION_PATH))
    discourse_train_dataset = discourse_reader.read(
        cached_path(DISCOURSE_TRAIN_PATH))
    discourse_validation_dataset = discourse_reader.read(
        cached_path(DISCOURSE_VALIDATION_PATH))
    vocab = Vocabulary.from_instances(claim_train_dataset +
                                      claim_validation_dataset +
                                      discourse_train_dataset +
                                      discourse_validation_dataset)
    discourse_dict = {
        'RESULTS': 0,
        'METHODS': 1,
        'CONCLUSIONS': 2,
        'BACKGROUND': 3,
        # NOTE: the fragment is cut off here; the PubMed RCT label set
        # suggests 'OBJECTIVE': 4 as the remaining entry.
        'OBJECTIVE': 4,
    }
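
A small sketch of how this label-to-index mapping would be applied (the label
sequence is illustrative):

labels = ['BACKGROUND', 'METHODS', 'RESULTS', 'CONCLUSIONS']
label_ids = [discourse_dict[label] for label in labels]
print(label_ids)  # [3, 1, 0, 2]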
Example 4
    for param in model.parameters():
        param.requires_grad = False
    num_classes, constraints, include_start_end_transitions = 2, None, False
    model.classifier_feedforward._linear_layers = ModuleList([
        torch.nn.Linear(2 * EMBEDDING_DIM, EMBEDDING_DIM),
        torch.nn.Linear(EMBEDDING_DIM, num_classes)
    ])
    model.crf = ConditionalRandomField(
        num_classes,
        constraints,
        include_start_end_transitions=include_start_end_transitions)
    model.label_projection_layer = TimeDistributed(
        Linear(2 * EMBEDDING_DIM, num_classes))
    model.load_state_dict(torch.load(cached_path(WEIGHT_PATH)))

    reader = CrfPubmedRCTReader()
    claim_predictor = ClaimCrfPredictor(model, dataset_reader=reader)

    fixture_path = os.path.join('..', 'pubmed-rct', 'PubMed_200k_RCT',
                                'fixtures_crf.json')
    examples = read_json(fixture_path)
    pred_list = []
    for example in examples:
        sentences = sent_tokenize(example['abstract'])
        instance = reader.text_to_instance(sents=sentences)
        pred = claim_predictor.predict_instance(instance)
        logits = torch.FloatTensor(pred['logits'])
        best_paths = model.crf.viterbi_tags(
            torch.FloatTensor(pred['logits']).unsqueeze(0),
            torch.LongTensor(pred['mask']).unsqueeze(0))
        pred_list.append(best_paths[0][0])
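
The tag sequences collected in pred_list can then be mapped back to claim
sentences; a sketch for the last processed abstract (assumes the loop above
has run):

claim_sents = [sent for sent, tag in zip(sentences, pred_list[-1])
               if tag == 1]
print(claim_sents)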