Example 1
 def test_blank_value_is_valid(self):
     """A blank value is treated as a real value."""
     # One aligned group with two extractors; ext1 supplies blank fields.
     blank_ref = Reference(title="", year=2011, pages="")
     named_ref = Reference(title="Matt", year=2011, pages="")
     records = [[["ext1", blank_ref], ["ext2", named_ref]]]
     aligned_probs = beliefs.validate(records)
     # Blank fields must still receive a non-zero probability.
     ext1_probs = dict(aligned_probs[0])['ext1']
     self.assertGreater(ext1_probs['title'], 0)
     self.assertGreater(ext1_probs['pages'], 0)
Example 2
def _merge_stage(stage_name: str, stage_fn, *args):
    """Run one merge-pipeline stage, wrapping any failure in RuntimeError.

    Parameters
    ----------
    stage_name : str
        Human-readable stage label used in the error message
        (e.g. ``'Alignment'``).
    stage_fn : callable
        The stage implementation to invoke with ``*args``.

    Returns
    -------
    object
        Whatever ``stage_fn`` returns.

    Raises
    ------
    RuntimeError
        If ``stage_fn`` raises; the original exception is chained as the
        cause.
    """
    try:
        return stage_fn(*args)
    except Exception as e:
        raise RuntimeError('%s failed: %s' % (stage_name, e)) from e


def merge_records(records: Dict[str, List[Reference]],
                  extractor_priors: list = EXTRACTORS) \
        -> Tuple[List[Reference], float]:
    """
    Merge extracted references into a single authoritative set of references.

    Takes a list of reference metadata records (each formatted according to the
    schema) and reconciles them with each other to form one primary record for
    each item. First step is to match the lists against each other using
    similarity measures. Then, for each individual record we combine the
    possible fields and augment them with possible external information to form
    a single record.

    Parameters
    ----------
    records : dict
        The reference records from multiple extraction services/lookup
        services. Keys are extractor names, values are lists of references
        (dict). E.g. ``{"cermine": [references], "grobid": [references]}``.
    extractor_priors : list
        Represents prior level of trust in field output for each extractor.

    Returns
    -------
    tuple
        A two-tuple, as produced by :func:`normalize.filter_records`: the
        authoritative reference metadata (a list in which each item is a
        single cited reference, as a ``dict``) and a score (``float``).

    Raises
    ------
    RuntimeError
        If any pipeline stage (alignment, validation, arbitration, or
        filtering) fails; the underlying exception is chained as the cause.
    """
    N_extractions = len(records)
    # Normalize each extractor's output before attempting alignment.
    records = {extractor: normalize.normalize_records(extraction)
               for extractor, extraction in records.items()}
    aligned_records = _merge_stage('Alignment', align.align_records, records)
    aligned_probabilities = _merge_stage('Validation', beliefs.validate,
                                         aligned_records)
    arbitrated_records = _merge_stage('Arbitration', arbitrate.arbitrate_all,
                                      aligned_records, aligned_probabilities,
                                      extractor_priors, N_extractions)
    # filter_records returns (records, score); return the tuple unchanged.
    return _merge_stage('Filtering', normalize.filter_records,
                        arbitrated_records)
Example 3
 def test_full_records(self):
     """Test that :func:`.validate` returns sensical probabilities."""
     aligned_probs = beliefs.validate(self.aligned_records)
     self.assertEqual(len(aligned_probs), len(self.aligned_records),
                      "Return data should have the same shape as input")
     for probs, records in zip(aligned_probs, self.aligned_records):
         # Each entry pairs an extractor name with its metadata; keep
         # only the metadata halves.
         _, these_probs = zip(*probs)
         _, these_records = zip(*records)
         self.assertEqual(
             len(these_probs), len(these_records),
             "Return data should have the same shape as input")
         # Every field value must be a proper probability in [0, 1].
         for metadatum in these_probs:
             for prob_value in metadatum.values():
                 self.assertIsInstance(prob_value, float,
                                       "Values should be probs (float)")
                 self.assertGreaterEqual(prob_value, 0.0,
                                         "Probability never less than 0.")
                 self.assertLessEqual(prob_value, 1.0,
                                      "Probability never more than 1.")
Example 4
    def test_extraction(self):
        """Run the full extraction + merge pipeline on one sample PDF."""
        from references.process import extract
        from references.services import refextract, cermine, grobid
        from references.process.merge import align, arbitrate, priors, beliefs, \
            normalize
        pdf_path = 'evaluation/pdfs/0801.0012.pdf'
        document_id = '0801.0012'

        # Fan out to each extraction service for the same PDF.
        services = [('cermine', cermine),
                    ('grobid', grobid),
                    ('refextract', refextract)]
        extractions = {name: service.extract_references(pdf_path)
                       for name, service in services}

        # Normalize each extractor's raw output.
        extractions = {
            extractor: normalize.normalize_records(extracted)
            for extractor, extracted in extractions.items()
        }

        # Align -> validate -> arbitrate (3 extractors) -> filter.
        aligned_records = align.align_records(extractions)
        aligned_probabilities = beliefs.validate(aligned_records)
        arbitrated_records = arbitrate.arbitrate_all(aligned_records,
                                                     aligned_probabilities,
                                                     priors.EXTRACTORS, 3)
        final_records, score = normalize.filter_records(arbitrated_records)
Example 5
        # NOTE(review): this fragment continues a definition that starts
        # before this excerpt; `f` is presumably an open CSV file handle
        # from an enclosing `with` block — confirm against the full file.
        raw = [row for row in csv.reader(f)]

    # Build one dict per data row, keyed by the header row (raw[0]).
    # Rows whose column count differs from the header are dropped.
    referenceCounts = [{k: row[i] for i, k in enumerate(raw[0])}
                       for row in raw if len(row) == len(raw[0])]

    for row in referenceCounts:

        # Skip entries whose PDF does not exist on disk.
        full_path = os.path.join(basepath, row['pdf'])
        if not os.path.exists(full_path):
            continue
        # Document id is the PDF filename minus its 4-char extension.
        document_id = row['pdf'][:-4]
        print('Extracting %s' % document_id)

        # Run every extractor and print its reference count next to the
        # expected count from the CSV ('N' column) for manual comparison.
        extractions = extract.extract(full_path, document_id)
        for extractor, refs in extractions.items():
            print(extractor, len(refs), row['N'])

        N_extractions = len(extractions)
        aligned_records = align.align_records(extractions)

        print('aligned', len(aligned_records), row['N'])

        # Merge pipeline: validate -> arbitrate -> filter, the same stages
        # used by merge_records above.
        aligned_probabilities = beliefs.validate(aligned_records)
        arbitrated_records = arbitrate.arbitrate_all(aligned_records,
                                                     aligned_probabilities,
                                                     priors.EXTRACTORS,
                                                     N_extractions)
        final_records, score = normalize.filter_records(arbitrated_records)
        print('final', len(final_records), row['N'], score)
        print('--')