def test_simple_records(self):
    """Align a handful of fake records and compare with the known answer."""
    extractions = {
        'ext1': [
            Reference(title='Matt', year=2011),
            Reference(title='Erick', year=2013),
        ],
        'ext2': [
            Reference(title='Matt', year=2011),
        ],
        'ext3': [
            Reference(title='John', year=2010),
            Reference(title='Eric', year=2013),
        ],
    }

    # Each expected record is a list of [extractor, reference] pairs.
    expected_records = [
        [
            ["ext1", Reference(title='Matt', year=2011)],
            ["ext2", Reference(title='Matt', year=2011)],
        ],
        [
            ["ext1", Reference(title='Erick', year=2013)],
            ["ext3", Reference(title='Eric', year=2013)],
        ],
        [
            ["ext3", Reference(title='John', year=2010)],
        ],
    ]

    actual_records = align.align_records(extractions)

    # Pairs are compared as extractor -> reference mappings, so the order
    # of the pairs inside a record does not matter.
    for expected, actual in zip(expected_records, actual_records):
        self.assertDictEqual(dict(expected), dict(actual))
def setUp(self):
    """Prepare per-extractor references and per-field priors."""
    ext1_refs = [
        Reference(source='Matthew', volume='uuddlrlrba', year=2011),
        Reference(source='Erick P', volume='babaudbalrba', year=2013),
    ]
    ext2_refs = [
        Reference(source='Matthew', volume='uuddlrlrbaba', year=2011),
    ]
    ext3_refs = [
        Reference(source='Johnathan', volume='start', year=2010),
        Reference(source='Eric Pe', volume='babaudbalrba', year=2013),
    ]
    self.simple_docs = {
        'ext1': ext1_refs,
        'ext2': ext2_refs,
        'ext3': ext3_refs,
    }

    # One (extractor, {field: prior}) pair per extractor.
    self.priors = [
        ('ext1', {'source': 0.9, 'volume': 0.6, 'year': 0.1}),
        ('ext2', {'source': 0.8, 'volume': 0.7, 'year': 0.99}),
        ('ext3', {'source': 0.2, 'volume': 0.9, 'year': 0.001}),
    ]
def test_blank_value_is_valid(self):
    """Blank field values must still receive a belief greater than zero."""
    with_blank_title = Reference(title="", year=2011, pages="")
    with_title = Reference(title="Matt", year=2011, pages="")
    records = [[["ext1", with_blank_title], ["ext2", with_title]]]

    validated = beliefs.validate(records)

    # The first validated record maps extractor name -> field beliefs.
    ext1_beliefs = dict(validated[0])['ext1']
    self.assertGreater(ext1_beliefs['title'], 0)
    self.assertGreater(ext1_beliefs['pages'], 0)
def setUp(self):
    """Build three small aligned records to test against."""
    matt_record = [
        ["ext1", Reference(title="Matt", year=2011)],
        ["ext2", Reference(title="Matt", year=2011)],
    ]
    erick_record = [
        ["ext1", Reference(title="Erick", year=2013)],
        ["ext3", Reference(title="Eric", year=2013)],
    ]
    john_record = [
        ["ext3", Reference(title="John", year=2010)],
    ]
    self.aligned_records = [matt_record, erick_record, john_record]
def setUp(self):
    """Load aligned records from the JSON fixture of a real extraction."""
    fixture_path = 'tests/data/1704.01689v1.aligned.json'
    with open(fixture_path) as fixture:
        raw_records = json.load(fixture)

    # Each raw record is a sequence of (extractor, field-dict) pairs; the
    # field dicts are turned into Reference instances.
    loaded = []
    for raw_record in raw_records:
        record = []
        for extractor, data in raw_record:
            record.append((extractor, Reference(**data)))
        loaded.append(record)
    self.aligned_records = loaded
def extract_identifiers(text: str) -> Reference:
    """
    Collect identifier metadata found in a piece of text.

    Parameters
    ----------
    text : str
        Raw text to search for arXiv ids, DOIs and ISBNs.

    Returns
    -------
    metadata : :class:`Reference`
        A reference built only from the fields that were found:

        - ``arxiv_id``: the first arXiv id match, after each match has been
          reduced with ``longest_string``;
        - ``doi``: the first DOI match;
        - ``identifiers``: one ``Identifier`` of type ``'ISBN'`` per ISBN
          match, ISBN-10 matches first, then ISBN-13 matches.

    """
    fields: Dict[str, Any] = {}

    arxiv_matches = [
        longest_string(match)
        for match in re.findall(REGEX_ARXIV_FLEXIBLE, text)
    ]
    if arxiv_matches:
        # Only the first arXiv id is kept, even when several are present.
        fields['arxiv_id'] = arxiv_matches[0]

    doi_matches = re.findall(REGEX_DOI, text)
    if doi_matches:
        fields['doi'] = doi_matches[0]

    # Both ISBN flavours are stored under the same identifier type.
    isbn_matches = [
        *re.findall(REGEX_ISBN_10, text),
        *re.findall(REGEX_ISBN_13, text),
    ]
    isbn_identifiers: List[Identifier] = [
        Identifier(identifier_type='ISBN', identifier=match)  # type: ignore
        for match in isbn_matches
    ]
    if isbn_identifiers:
        fields['identifiers'] = isbn_identifiers

    return Reference(**fields)  # type: ignore
def transform(refextract_metadatum: dict) -> Reference:
    """
    Restructure refextract output to match internal extraction struct.

    Parameters
    ----------
    refextract_metadatum : dict
        RefExtract output for a single reference.

    Returns
    -------
    :class:`Reference`
        Reference with ``reftype`` set to ``'citation'``, the fields named
        in ``FIELD_MAPPINGS`` copied over, and ``identifiers`` / ``authors``
        converted to ``Identifier`` / ``Author`` instances when the
        corresponding input keys are present.

    """
    metadatum: Dict[str, Any] = {'reftype': 'citation'}
    for re_key, key in FIELD_MAPPINGS:
        value = refextract_metadatum.get(re_key)
        if value:
            # All refextract values are lists; keep the first entry only.
            metadatum[key] = value[0]

    # Read identifiers from the *input* record. The half-built output dict
    # either lacks the key (KeyError) or holds a single unwrapped element
    # left there by the mapping loop above.
    if 'identifiers' in refextract_metadatum:
        metadatum['identifiers'] = [
            Identifier(**ident)     # type: ignore
            for ident in refextract_metadatum['identifiers']
        ]
    if 'author' in refextract_metadatum:
        metadatum['authors'] = [
            Author(fullname=author)     # type: ignore
            for author in refextract_metadatum['author']
        ]
    return Reference(**metadatum)   # type: ignore
def digest(metadata: Reference) -> str:
    """
    Create a single string that represents the record.

    The structure is digested recursively: list items and mapping values
    are digested individually, joined with spaces, and passed through
    ``clean_text`` (with ``numok=True``). Anything else is converted with
    ``str`` before cleaning.

    Parameters
    ----------
    metadata : :class:`.Reference`
        Single record. Lists, mappings and scalar values are also accepted,
        which is what makes the recursion work.

    Returns
    -------
    digest : str

    """
    # Keys whose values are left out of the digest.
    badkeys = ['raw', 'doi', 'identifiers', 'identifier', 'reftype']

    if isinstance(metadata, list):
        return clean_text(
            ' '.join([digest(item) for item in metadata]), numok=True
        )

    if isinstance(metadata, dict):
        # Plain dicts (e.g. nested values reached through recursion) have no
        # ``to_dict``; fall back to the mapping itself instead of raising
        # AttributeError.
        to_dict = getattr(metadata, 'to_dict', None)
        fields = to_dict() if callable(to_dict) else metadata
        return clean_text(' '.join([
            digest(value) for key, value in fields.items()
            if key not in badkeys
        ]), numok=True)

    # NOTE(review): objects that are neither list nor dict -- including any
    # record type that is not a dict subclass -- are digested via str().
    return clean_text(str(metadata), numok=True)
def test_get_calls_datastore_session(self, retrieve_mock):
    """Calling ``extracted_references.get`` hits the retrieve mock once."""
    reference = Reference(raw='asdf')
    stored_set = ReferenceSet(
        document_id='fooid123',
        references=[reference],
        version='0.1',
        score=0.9,
        created=datetime.now(),
        updated=datetime.now(),
    )
    retrieve_mock.return_value = stored_set

    extracted_references.get('arxiv:1234.5678', reference.identifier)

    self.assertEqual(retrieve_mock.call_count, 1)
def format_grobid_output(output: bytes) -> List[Reference]:
    """
    Transform GROBID output to internal metadata struct.

    Take the output of GROBID and return the metadata in the format expected
    by the references schema. For a description of TEI, Text Encoding
    Initiative (the format of the XML), see the documentation on the website
    (particularly, the bibliography section):
    http://www.tei-c.org/release/doc/tei-p5-doc/en/html/ref-biblStruct.html

    Parameters
    ----------
    output : bytes
        UTF-8 encoded XML returned by the GROBID API call.

    Returns
    -------
    metadata : list
        One :class:`Reference` per ``biblStruct`` element found in the first
        ``listBibl`` element of the document.

    Raises
    ------
    IndexError
        If the document contains no ``listBibl`` element.

    """
    filestring = io.StringIO(output.decode('utf-8'))
    root = xml.etree.ElementTree.parse(filestring).getroot()
    _xml_set_ns(root)

    # Make sure we are only dealing with the final reference list: take the
    # first ``listBibl`` element without materialising the whole iterator.
    listbbl = next(root.iter(tag=xt('listBibl')), None)
    if listbbl is None:
        msg = 'GROBID output does not contain references'
        logger.error(msg)
        raise IndexError(msg)

    references = []
    for bbl in listbbl.iter(tag=xt('biblStruct')):
        # Build the defaults afresh for every reference. A shallow copy of a
        # shared template would hand the same ``identifiers`` / ``authors``
        # list objects to every reference that does not override them.
        reference = {
            'identifiers': [{'identifier_type': '', 'identifier': ''}],
            'raw': '',
            'volume': '',
            'issue': '',
            'pages': '',
            'reftype': '',
            'doi': '',
            'authors': [],
            'title': '',
            'year': '',
            'source': '',
        }
        reference.update(_xml_format_biblStruct(bbl))
        references.append(Reference(**reference))   # type: ignore
    return references
def _select(pooled: dict) -> Tuple[Reference, float]:
    """
    Select the most likely values given their pooled weights.

    Parameters
    ----------
    pooled : dict
        Maps each field name to a mapping of candidate value -> weight.

    Returns
    -------
    tuple
        The :class:`Reference` built from the winning value of each field,
        and a score: ``_score`` of the selected fields multiplied by the
        mean of the winning normalized weights.

    """
    result = {}
    max_probs = []
    for field, counts in pooled.items():
        if not counts:
            continue

        # Feature-normalize across distinct values. The total is computed
        # once per field; fields whose weights do not sum to a positive
        # number carry no information and are skipped.
        total = sum(counts.values())
        if not total > 0:
            continue

        values = tuple(counts)
        norm_prob = tuple(count / total for count in counts.values())

        result[field] = _cast_value(field, values[argmax(norm_prob)])
        if field == 'authors':
            result[field] = _fix_authors(result[field])
        max_probs.append(max(norm_prob))

    ref = Reference(**result)   # type: ignore
    # NOTE(review): when no field contributed, ``max_probs`` is empty and the
    # outcome depends on how ``mean`` treats an empty sequence.
    return ref, _score(result) * mean(max_probs)
def calculate_belief(reference: Reference) -> dict:
    """
    Score how believable each field of a single record is.

    Parameters
    ----------
    reference : :class:`.Reference`
        A single reference metadata record.

    Returns
    -------
    beliefs : dict
        Maps every key of ``reference.to_dict()`` to a float. Blank values
        score ``1.0``; other values get the average of the results of the
        functions registered for that key in ``BELIEF_FUNCTIONS`` (or of
        ``unity`` when none are registered).

    """
    scores = {}
    for field, field_value in reference.to_dict().items():
        if field_value:
            validators: list = BELIEF_FUNCTIONS.get(field, [unity])
            total = 0.
            for validator in validators:
                # A failing validator contributes nothing to the total but
                # still counts in the average; it must not derail the rest.
                try:
                    total += validator(field_value)
                except Exception as exc:
                    logger.error('Validation for %s failed with: %s',
                                 field, exc)
            scores[field] = total / len(validators)
        else:
            # Blank values are perfectly plausible, and there is little
            # else to say about them.
            scores[field] = 1.
    return scores
def format_scienceparse_output(output: dict) -> List[Reference]:
    """
    Generate :class:`.Reference`s from ScienceParse output.

    Parameters
    ----------
    output : dict
        The output of the ScienceParse API call, structured dict of metadata.

    Returns
    -------
    metadata : list
        List of :class:`.Reference` instances, one per entry under the
        ``'references'`` key.

    Raises
    ------
    KeyError
        If ``output`` has no ``'references'`` key.

    """
    if 'references' not in output:
        msg = 'ScienceParse output does not contain references'
        logger.error(msg)
        raise KeyError(msg)

    references = []
    for ref in output['references']:
        # Blank author lines are skipped; ``or []`` also tolerates an
        # explicit null in place of the author list.
        name_pairs = [
            parse_auth_line(auth)
            for auth in (ref.get('authors') or [])
            if auth
        ]
        authors = [
            Author(givennames=first, surname=last)  # type: ignore
            for first, last in name_pairs
        ]

        # Only stringify a year that is actually present; str(None) would
        # store the literal text 'None' as the year.
        year = ref.get('year')
        reference = Reference(  # type: ignore
            title=ref.get('title'),
            year=None if year is None else str(year),
            source=ref.get('venue'),
            authors=authors,
        )
        references.append(reference)
    return references
def setUp(self):
    """Create a few (reference, score) pairs to test against."""
    titles_and_scores = (('bar', 0.1), ('bat', 0.4), ('ipsum', 0.9))
    self.records = [
        (Reference(title=title), score)
        for title, score in titles_and_scores
    ]
def cxml_format_document(root: ET.Element) -> List[Reference]:
    """
    Convert a CERMINE XML element into reference documents.

    For example, CERMINE metadata for one reference looks like:

        {
            "author": {"givenname": "Matt", "surname": "Bierbaum"},
            "journal": "arxiv",
            "article-title": "Some bad paper",
            "year": 2017,
            "volume": 1,
            "page": 1
        }

    Parameters
    ----------
    root : ET.Element
        Reference XML root from CERMINE.

    Returns
    -------
    references : list
        One :class:`Reference` per ``ref`` element under ``root``, built
        from the CERMINE metadata plus any identifiers found by regex in
        the raw reference line.

    """
    # Maps each output field to the callable that extracts it from a ``ref``
    # element.
    reference_constructor: Dict[str, Callable] = {
        'authors': _cxml_ref_authors,
        'raw': _cxml_format_reference_line,
        'title': _cxml_element_func('article-title'),
        'source': _cxml_element_func('source'),
        'year': _cxml_element_func('year'),
        'volume': _cxml_element_func('volume'),
        'pages': _cxml_element_func('fpage'),
        'issue': _cxml_element_func('issue'),
    }

    # FIXME: CERMINE does not extract identifiers, reftype or doi; the
    # identifiers below come only from the regex pass over the raw line.

    references = []
    for refroot in root.iter(tag='ref'):
        reference = {
            key: func(refroot) for key, func in reference_constructor.items()
        }

        # Add regex-extracted information to the metadata (not CERMINE's).
        rawline = reference.get('raw', '') or ''
        partial = regex_identifiers.extract_identifiers(rawline)

        # The constructor table never produces an 'identifiers' entry, so
        # the list starts from the regex results alone. ``or []`` keeps this
        # working should ``partial.identifiers`` be None.
        reference['identifiers'] = list(partial.identifiers or [])

        references.append(Reference(**reference))   # type: ignore
    return references