def test_simple_records(self):
        """Fake records from three extractors align into the expected groups."""
        # Raw (unaligned) references, keyed by the extractor that made them.
        extracted = {
            'ext1': [
                Reference(title='Matt', year=2011),
                Reference(title='Erick', year=2013),
            ],
            'ext2': [
                Reference(title='Matt', year=2011),
            ],
            'ext3': [
                Reference(title='John', year=2010),
                Reference(title='Eric', year=2013),
            ]
        }

        # Each expected group is a list of [extractor, reference] pairs.
        matt_group = [
            ["ext1", Reference(title='Matt', year=2011)],
            ["ext2", Reference(title='Matt', year=2011)],
        ]
        eric_group = [
            ["ext1", Reference(title='Erick', year=2013)],
            ["ext3", Reference(title='Eric', year=2013)],
        ]
        john_group = [
            ["ext3", Reference(title='John', year=2010)],
        ]
        expected_groups = [matt_group, eric_group, john_group]

        calculated_groups = align.align_records(extracted)

        # NOTE: zip() stops at the shorter sequence, so a difference in the
        # number of groups is not detected by this loop.
        for expected, calculated in zip(expected_groups, calculated_groups):
            self.assertDictEqual(dict(expected), dict(calculated))
Beispiel #2
0
 def setUp(self):
     """Prepare per-extractor references and per-extractor field priors."""
     ext1_refs = [
         Reference(source='Matthew', volume='uuddlrlrba', year=2011),
         Reference(source='Erick P', volume='babaudbalrba', year=2013),
     ]
     ext2_refs = [
         Reference(source='Matthew', volume='uuddlrlrbaba', year=2011),
     ]
     ext3_refs = [
         Reference(source='Johnathan', volume='start', year=2010),
         Reference(source='Eric Pe', volume='babaudbalrba', year=2013),
     ]
     # References keyed by the name of the extractor that produced them.
     self.simple_docs = {
         'ext1': ext1_refs,
         'ext2': ext2_refs,
         'ext3': ext3_refs,
     }

     # One (extractor, {field: prior}) pair per extractor.
     self.priors = [
         ('ext1', dict(source=0.9, volume=0.6, year=0.1)),
         ('ext2', dict(source=0.8, volume=0.7, year=0.99)),
         ('ext3', dict(source=0.2, volume=0.9, year=0.001)),
     ]
 def test_blank_value_is_valid(self):
     """Blank field values still receive a belief greater than zero."""
     blank_title_ref = Reference(title="", year=2011, pages="")
     named_title_ref = Reference(title="Matt", year=2011, pages="")
     # A single aligned record holding one reference from each extractor.
     records = [[["ext1", blank_title_ref], ["ext2", named_title_ref]]]

     aligned_probs = beliefs.validate(records)

     ext1_probs = dict(aligned_probs[0])['ext1']
     self.assertGreater(ext1_probs['title'], 0)
     self.assertGreater(ext1_probs['pages'], 0)
    def setUp(self):
        """Prepare three small groups of already-aligned records."""
        # Each group is a list of [extractor, reference] pairs.
        matt_group = [
            ["ext1", Reference(title="Matt", year=2011)],
            ["ext2", Reference(title="Matt", year=2011)],
        ]
        eric_group = [
            ["ext1", Reference(title="Erick", year=2013)],
            ["ext3", Reference(title="Eric", year=2013)],
        ]
        john_group = [
            ["ext3", Reference(title="John", year=2010)],
        ]
        self.aligned_records = [matt_group, eric_group, john_group]
 def setUp(self):
     """Load aligned records from a JSON fixture into ``Reference`` objects.

     The fixture is read as a list of records, each record being a list of
     ``(extractor, data)`` pairs where ``data`` supplies the keyword
     arguments for :class:`Reference`.
     """
     json_aligned = 'tests/data/1704.01689v1.aligned.json'
     # Decode explicitly as UTF-8 so the result does not depend on the
     # platform's locale default.
     with open(json_aligned, encoding='utf-8') as f:
         raw_records = json.load(f)
     self.aligned_records = [[(extractor, Reference(**data))
                              for extractor, data in record]
                             for record in raw_records]
Beispiel #6
0
def extract_identifiers(text: str) -> Reference:
    """
    Collect arXiv ID, DOI and ISBN metadata found in a text selection.

    Parameters
    ----------
    text : str
        Raw text from which to extract arXiv ids, DOIs and ISBNs.

    Returns
    -------
    metadata : :class:`Reference`
        A reference built only from what was found in ``text``:

        - ``arxiv_id``: the first arXiv match (each match is reduced with
          ``longest_string``), if any;
        - ``doi``: the first DOI match, if any;
        - ``identifiers``: one :class:`Identifier` with
          ``identifier_type='ISBN'`` per ISBN-10 match followed by one per
          ISBN-13 match, if any matched.
    """
    found: Dict[str, Any] = {}

    arxiv_matches = [
        longest_string(match)
        for match in re.findall(REGEX_ARXIV_FLEXIBLE, text)
    ]
    if arxiv_matches:
        # Only the first arXiv ID is kept even when several are present.
        found['arxiv_id'] = arxiv_matches[0]

    doi_matches = re.findall(REGEX_DOI, text)
    if doi_matches:
        found['doi'] = doi_matches[0]

    # ISBN-10 matches come first, then ISBN-13; both share the 'ISBN' type.
    isbn_matches = (re.findall(REGEX_ISBN_10, text)
                    + re.findall(REGEX_ISBN_13, text))
    isbn_identifiers: List[Identifier] = [
        Identifier(identifier_type='ISBN', identifier=match)  # type: ignore
        for match in isbn_matches
    ]
    if isbn_identifiers:
        found['identifiers'] = isbn_identifiers

    return Reference(**found)  # type: ignore
Beispiel #7
0
def transform(refextract_metadatum: dict) -> Reference:
    """
    Restructure refextract output to match internal extraction struct.

    Parameters
    ----------
    refextract_metadatum : dict
        RefExtract output for a single reference.

    Returns
    -------
    :class:`Reference`
        Reference with ``reftype`` set to ``'citation'``, the fields listed
        in ``FIELD_MAPPINGS`` (first element of each non-empty value), and
        ``identifiers`` / ``authors`` built from the corresponding input
        entries when those keys are present.
    """
    metadatum: Dict[str, Any] = {'reftype': 'citation'}
    for re_key, key in FIELD_MAPPINGS:
        value = refextract_metadatum.get(re_key)
        if value:
            metadatum[key] = value[0]  # All refextract values are lists.
    if 'identifiers' in refextract_metadatum:
        # Read from the *input*: the output dict may have no 'identifiers'
        # entry at this point (or only a single mapped element).
        metadatum['identifiers'] = [
            Identifier(**ident)  # type: ignore
            for ident in refextract_metadatum['identifiers']
        ]
    if 'author' in refextract_metadatum:
        metadatum['authors'] = [
            Author(fullname=author)  # type: ignore
            for author in refextract_metadatum['author']
        ]
    return Reference(**metadatum)  # type: ignore
Beispiel #8
0
def digest(metadata: Reference) -> str:
    """
    Create a single string that represents the record.

    It does so by recursively digesting the structure, taking any strings in a
    list or dictionary value and combining them into a word list (single
    string). Every intermediate result is passed through ``clean_text`` with
    ``numok=True``.

    Parameters
    ----------
    metadata : :class:`.Reference`
        Single record. Lists and dicts are digested recursively; anything
        else is digested via its ``str()`` representation.

    Returns
    -------
    digest : string
    """
    # Keys whose values are left out of the digest.
    badkeys = ['raw', 'doi', 'identifiers', 'identifier', 'reftype']

    if isinstance(metadata, list):
        return clean_text(' '.join([digest(item) for item in metadata]),
                          numok=True)

    if isinstance(metadata, dict):
        # Dict-like records may expose ``to_dict()``; plain dicts (e.g.
        # nested values reached through recursion) do not, so fall back to
        # the mapping itself instead of raising AttributeError.
        to_dict = getattr(metadata, 'to_dict', None)
        fields = to_dict() if callable(to_dict) else metadata
        return clean_text(' '.join([
            digest(value) for key, value in fields.items()
            if key not in badkeys
        ]), numok=True)

    # NOTE(review): an object that is not a dict subclass takes this path
    # even if it has ``to_dict()`` -- confirm that is intended for Reference.
    return clean_text(str(metadata), numok=True)
 def test_get_calls_datastore_session(self, retrieve_mock):
     """``extracted_references.get`` invokes the retrieve mock exactly once."""
     ref = Reference(raw='asdf')
     # Canned reference set handed back by the mocked retrieve call.
     stored_set = ReferenceSet(
         document_id='fooid123',
         references=[ref],
         version='0.1',
         score=0.9,
         created=datetime.now(),
         updated=datetime.now(),
     )
     retrieve_mock.return_value = stored_set

     extracted_references.get('arxiv:1234.5678', ref.identifier)

     self.assertEqual(retrieve_mock.call_count, 1)
Beispiel #10
0
def format_grobid_output(output: bytes) -> List[Reference]:
    """
    Transform GROBID output to internal metadata struct.

    Take the output of GROBID and return the metadata in the format expected by
    the references schema. For a description of TEI, Text Encoding Initiative
    (the format of the XML), see the documentation on the website
    (particularly, the bibliography section):

    http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-biblStruct.html

    Parameters
    ----------
    output : bytes
        The UTF-8 encoded XML body returned by the GROBID API call.

    Returns
    -------
    metadata : list
        One :class:`Reference` per ``biblStruct`` element found under the
        first ``listBibl`` element of the document.

    Raises
    ------
    IndexError
        If the document contains no ``listBibl`` element.
    """
    filestring = io.StringIO(output.decode('utf-8'))
    root = xml.etree.ElementTree.parse(filestring).getroot()
    _xml_set_ns(root)

    # Make sure we are only dealing with the final reference list: take the
    # first listBibl element without materializing the whole iterator.
    listbbl = next(root.iter(tag=xt('listBibl')), None)
    if listbbl is None:
        msg = 'GROBID output does not contain references'
        logger.error(msg)
        # IndexError is kept for callers that already catch it.
        raise IndexError(msg)

    # ========================================================================
    # iterate over the references in that list
    references = []
    for bbl in listbbl.iter(tag=xt('biblStruct')):
        # Build the blank template afresh for every reference so that the
        # mutable 'identifiers' and 'authors' values are never shared
        # between references.
        reference = {
            'identifiers': [{'identifier_type': '', 'identifier': ''}],
            'raw': '', 'volume': '', 'issue': '', 'pages': '', 'reftype': '',
            'doi': '', 'authors': [], 'title': '', 'year': '', 'source': '',
        }
        reference.update(_xml_format_biblStruct(bbl))
        references.append(Reference(**reference))   # type: ignore

    return references
Beispiel #11
0
def _select(pooled: dict) -> Tuple[Reference, float]:
    """
    Select the most likely values given their pooled weights.

    Parameters
    ----------
    pooled : dict
        Maps each field name to a mapping of candidate value -> pooled
        weight.

    Returns
    -------
    tuple
        The :class:`Reference` built from the highest-weighted value of each
        field, and ``_score(result)`` multiplied by the mean of the winning
        normalized weights.
    """
    result = {}
    max_probs = []
    for field, counts in pooled.items():
        if len(counts) == 0:
            continue
        # Feature-normalize across distinct values. The total is computed
        # once per field rather than once per candidate value.
        total = sum(counts.values())
        if not total > 0:
            # Nothing usable to normalize by; skip this field.
            continue
        values = tuple(counts)
        norm_prob = tuple(count / total for count in counts.values())

        result[field] = _cast_value(field, values[argmax(norm_prob)])
        if field == 'authors':
            result[field] = _fix_authors(result[field])
        max_probs.append(max(norm_prob))
    ref = Reference(**result)  # type: ignore
    # NOTE(review): if every field was skipped, max_probs is empty here;
    # confirm how ``mean`` behaves for an empty sequence.
    return ref, _score(result) * mean(max_probs)
Beispiel #12
0
def calculate_belief(reference: Reference) -> dict:
    """
    Score how believable each field of a single record is.

    Parameters
    ----------
    reference : :class:`.Reference`
        A single reference metadata record.

    Returns
    -------
    beliefs : dict
        Maps each key of ``reference.to_dict()`` to a float. Falsy values
        score ``1.0``; other values score the average result of the
        functions registered for that key in ``BELIEF_FUNCTIONS`` (``unity``
        when none are registered).
    """
    beliefs = {}

    for field_name, field_value in reference.to_dict().items():
        if field_value:
            validators: list = BELIEF_FUNCTIONS.get(field_name, [unity])
            total = 0.
            for validator in validators:
                # A failing validator contributes nothing to the total but
                # still counts in the average; it must not derail the rest.
                try:
                    total += validator(field_value)
                except Exception as e:
                    logger.error('Validation for %s failed with: %s',
                                 field_name, e)
            beliefs[field_name] = total / len(validators)
        else:
            # Blank values are perfectly plausible, and there isn't much
            # else that we can say about them.
            beliefs[field_name] = 1.
    return beliefs
Beispiel #13
0
def format_scienceparse_output(output: dict) -> List[Reference]:
    """
    Generate :class:`.Reference`s from ScienceParse output.

    Parameters
    ----------
    output : dict
        The output of the ScienceParse API call, structured dict of metadata

    Returns
    -------
    metadata : list
        List of :class:`.Reference` instances, one per entry in
        ``output['references']``.

    Raises
    ------
    KeyError
        If ``output`` has no ``'references'`` key.
    """
    if 'references' not in output:
        msg = 'ScienceParse output does not contain references'
        logger.error(msg)
        raise KeyError(msg)

    references = []
    for ref in output['references']:
        # Blank author lines are skipped; each remaining line is split into
        # (given names, surname) by parse_auth_line.
        authors = [
            Author(givennames=first, surname=last)  # type: ignore
            for first, last in (
                parse_auth_line(auth)
                for auth in ref.get('authors', []) if auth
            )
        ]
        # Only stringify a year that is actually present; str(None) would
        # otherwise store the literal text 'None' as the year.
        year = ref.get('year')
        reference = Reference(  # type: ignore
            title=ref.get('title'),
            year=None if year is None else str(year),
            source=ref.get('venue'),
            authors=authors)
        references.append(reference)

    return references
Beispiel #14
0
 def setUp(self):
     """Prepare three (reference, score) pairs with increasing scores."""
     titled_scores = (('bar', 0.1), ('bat', 0.4), ('ipsum', 0.9))
     self.records = [(Reference(title=title), score)
                     for title, score in titled_scores]
Beispiel #15
0
def cxml_format_document(root: ET.Element) -> List[Reference]:
    """
    Convert a CERMINE XML element into a list of references.

    Each ``ref`` element under ``root`` becomes one :class:`Reference` with
    the fields ``authors``, ``raw``, ``title``, ``source``, ``year``,
    ``volume``, ``pages`` and ``issue`` taken from the XML, plus any
    identifiers that the regex extractor finds in the raw reference line.

    Parameters
    ----------
    root : ET.Element
        reference xml root from CERMINE

    Returns
    -------
    references : list
        One :class:`Reference` per ``ref`` element, in document order.
    """
    # Output field name -> callable that extracts it from a ``ref`` element.
    reference_constructor: Dict[str, Callable] = {
        'authors': _cxml_ref_authors,
        'raw': _cxml_format_reference_line,
        'title': _cxml_element_func('article-title'),
        'source': _cxml_element_func('source'),
        'year': _cxml_element_func('year'),
        'volume': _cxml_element_func('volume'),
        'pages': _cxml_element_func('fpage'),
        'issue': _cxml_element_func('issue'),
    }

    # FIXME: CERMINE does not extract 'identifiers', 'reftype' or 'doi';
    # get these somehow?

    references = []
    for refroot in root.iter(tag='ref'):
        reference = {
            key: func(refroot)
            for key, func in reference_constructor.items()
        }

        # add regex extracted information to the metadata (not CERMINE's)
        rawline = reference.get('raw', '') or ''
        partial = regex_identifiers.extract_identifiers(rawline)

        reference['identifiers'] = [
            Identifier(**ident)  # type: ignore
            for ident in reference.get('identifiers', [])
        ]
        # Tolerate a missing (None) identifier list on the regex result.
        reference['identifiers'] += partial.identifiers or []
        references.append(Reference(**reference))  # type: ignore

    return references