Ejemplo n.º 1
0
def read_s2_excerpt(ex):
    """
    Builds a Citation from a single flattened s2 excerpt

    The input comes from a jsonlines file in which every citation excerpt
    is its own json line (the flattened form of the original s2 data)
    Args:
        ex: citation excerpt blob

    Returns:
        Citation object
    """
    # Pull every field out first, in the order the Citation needs them.
    raw_text = ex['string']
    citing_id = ex['citingPaperId']
    cited_id = ex['citedPaperId']
    section = ex['sectionName']
    label = ex['label']
    excerpt_idx = ex['excerpt_index']
    # cleaned copy of the text: everything matched by regex_find_citation
    # (the citation markers) is removed
    text_without_markers = regex_find_citation.sub('', raw_text)

    # Paper titles, years, author ids and the cite marker offset are not
    # passed to Citation for this format.
    return Citation(
        text=raw_text,
        citing_paper_id=citing_id,
        cited_paper_id=cited_id,
        extended_context=None,  # Not available for s2 data
        section_number=None,  # Not available for s2 data
        section_title=section,
        intent=label,
        sents_before=None,  # not available for s2 data
        sents_after=None,  # not available for s2 data
        citation_excerpt_index=excerpt_idx,
        cleaned_cite_text=text_without_markers
    )
Ejemplo n.º 2
0
    def _read(self, file_path):
        """Yields one instance for every json line in ``file_path``.

        Every json object must provide the keys ``text``, ``section_name``,
        ``citing_paper_id`` and ``cited_paper_id``. No intent is read from the
        file; ``None`` is always passed on as the intent.

        Args:
            file_path: path of the jsonlines file to read

        Yields:
            the result of ``self.text_to_instance`` for each line
        """
        # Use the reader as a context manager so the underlying file is closed
        # when the generator is exhausted or closed, instead of being leaked.
        with jsonlines.open(file_path) as reader:
            for obj in reader:
                citation_text = obj['text']

                if self._clean_citation:
                    # remove everything matched by regex_find_citation
                    citation_text = regex_find_citation.sub("", citation_text)

                citation_intent = None  # this reader never supplies an intent
                section_name = obj['section_name']
                citing_paper_id = obj['citing_paper_id']
                cited_paper_id = obj['cited_paper_id']

                yield self.text_to_instance(citation_text=citation_text,
                                            intent=citation_intent,
                                            citing_paper_id=citing_paper_id,
                                            cited_paper_id=cited_paper_id,
                                            section_name=section_name)
Ejemplo n.º 3
0
def _s2_paper_year(ex, paper_key):
    """Returns the ``year`` of ``ex[paper_key]``, or -1 when the paper or its year is missing."""
    try:
        return ex[paper_key]['year']
    except KeyError:
        return -1


def _s2_author_ids(ex, paper_key):
    """Returns the first id of every author of ``ex[paper_key]``.

    authors is like: [{'name': 'S Pandav', 'ids': ['2098534'], ...}]
    An author whose ``ids`` list is empty contributes 'n/a'. If the paper, its
    ``authors`` entry, or an author's ``ids`` key is missing, the result is an
    empty list.
    """
    try:
        return [author['ids'][0] if author['ids'] else 'n/a'
                for author in ex[paper_key]['authors']]
    except KeyError:
        return []


def read_s2_jsonline(ex, evaluate_mode=False, clean_citation=True, multilabel=False):
    """ reads a json lines object (citation blob) into a list of Citation objects
    This is a separate function to be used in the predictor
     Args:
        ex: input Example; must have a ``context`` list of excerpts, and
            ``citingPaper`` / ``citedPaper`` entries with ``id`` and ``title``
            whenever at least one excerpt is kept
        evaluate_mode: If we are evaluating only consider annotated excerpts
            (excerpts that have an ``intents`` key)
        clean_citation: If true, everything matched by ``regex_find_citation``
            (citation markers) is removed from the excerpt text
        multilabel: If true, keep every annotated intent and prefix the ones
            whose score is not positive with ``NEGATIVE_CLASS_PREFIX``;
            otherwise keep only the intents with a positive score

     Returns:
        list with one Citation per kept excerpt, in input order; ``intent`` is
        None for excerpts that have no ``intents`` key

     Raises:
        KeyError: if ``context`` is missing, if a kept excerpt lacks ``string``
            or ``sectionName``, or if a paper lacks ``id`` or ``title``
    """
    # Paper-level metadata is shared by every excerpt of this example.
    citing_paper_year = _s2_paper_year(ex, 'citingPaper')
    cited_paper_year = _s2_paper_year(ex, 'citedPaper')
    citing_author_ids = _s2_author_ids(ex, 'citingPaper')
    cited_author_ids = _s2_author_ids(ex, 'citedPaper')

    citations = []
    for excerpt_index, excerpt_obj in enumerate(ex['context']):
        annotated = 'intents' in excerpt_obj
        if evaluate_mode and not annotated:
            # only consider excerpts that are annotated
            continue

        try:
            offsets = [excerpt_obj['citeStart'], excerpt_obj['citeEnd']]
        except KeyError:  # context does not have citeStart or citeEnd
            offsets = [-1, -1]

        citation_text = excerpt_obj['string']
        if clean_citation:
            # remove citation markers (e.g., things like [1,4], (Peters, et al 2018), etc)
            citation_text = regex_find_citation.sub("", citation_text)
        section_name = excerpt_obj['sectionName']

        if not annotated:
            intents = None
        elif multilabel:
            # keep all labels; mark the non-positive ones with the negative prefix
            intents = [e['intent'] if e['score'] > 0.0
                       else NEGATIVE_CLASS_PREFIX + e['intent'] for e in excerpt_obj['intents']]
        else:
            intents = [e['intent'] for e in excerpt_obj['intents'] if e['score'] > 0.0]

        # The paper id/title lookups stay inside the loop on purpose: an example
        # with no kept excerpts must not fail on missing paper metadata.
        citations.append(Citation(
            text=citation_text,
            citing_paper_id=ex['citingPaper']['id'],
            cited_paper_id=ex['citedPaper']['id'],
            citing_paper_title=ex['citingPaper']['title'],
            cited_paper_title=ex['citedPaper']['title'],
            citing_paper_year=citing_paper_year,
            cited_paper_year=cited_paper_year,
            citing_author_ids=citing_author_ids,
            cited_author_ids=cited_author_ids,
            extended_context=None,  # Not available for s2 data
            section_number=None,  # Not available for s2 data
            section_title=section_name,
            intent=intents,
            cite_marker_offset=offsets,  # Not useful here
            sents_before=None,  # not available for s2 data
            sents_after=None,  # not available for s2 data
            citation_excerpt_index=excerpt_index,
            cleaned_cite_text=citation_text
        ))
    return citations
Ejemplo n.º 4
0
    def read(self):
        """ Reads the input data and yields a citation object

        ``self.data_path`` is read as json lines (one citation blob per line);
        all lines are parsed before the first citation is yielded. For every
        blob, one Citation is yielded per excerpt in its ``context`` list.

        Instance attributes read:
            data_path: path of the json lines file
            evaluate_mode: if true, excerpts without an ``intents`` key are
                skipped and only counted
            clean_citation: if true, everything matched by
                ``regex_find_citation`` is removed from the excerpt text
            multilabel: if true, all intents are kept and the ones whose score
                is not positive get ``NEGATIVE_CLASS_PREFIX`` prepended;
                otherwise only intents with a positive score are kept

        Yields:
            Citation objects, in file order and excerpt order

        Raises:
            KeyError: if a blob lacks ``context`` or the paper ``id`` / ``title``,
                or a processed excerpt lacks ``string``, ``sectionName`` or
                ``intents``
        """
        # Parse the whole file up front, but inside ``with`` so the file handle
        # is closed right away instead of being left to the garbage collector.
        with open(self.data_path) as data_file:
            data = [json.loads(line) for line in data_file]
        num_returned_citations = 0
        num_not_annotated = 0
        for ex in data:
            # missing paper or year -> -1
            try:
                citing_paper_year = ex['citingPaper']['year']
            except KeyError:
                citing_paper_year = -1
            try:
                cited_paper_year = ex['citedPaper']['year']
            except KeyError:
                cited_paper_year = -1

            # authors is like: [{'name': 'S Pandav', 'ids': ['2098534'], ...}]
            try:
                citing_author_ids = [author['ids'][0] if author['ids'] else 'n/a'
                                     for author in ex['citingPaper']['authors']]
            except KeyError:  # authors do not exist in the context:
                citing_author_ids = []
            try:
                cited_author_ids = [author['ids'][0] if author['ids'] else 'n/a'
                                    for author in ex['citedPaper']['authors']]
            except KeyError:
                cited_author_ids = []

            for excerpt_index, excerpt_obj in enumerate(ex['context']):
                # only consider excerpts that are annotated
                if self.evaluate_mode and 'intents' not in excerpt_obj:
                    num_not_annotated += 1
                    continue

                try:
                    offsets = [excerpt_obj['citeStart'], excerpt_obj['citeEnd']]
                except KeyError:  # context does not have citeStart or citeEnd
                    offsets = [-1, -1]

                if self.clean_citation:
                    # remove citation markers (e.g., things like [1,4], (Peters, et al 2018), etc)
                    citation_text = regex_find_citation.sub("", excerpt_obj['string'])
                else:
                    citation_text = excerpt_obj['string']
                section_name = excerpt_obj['sectionName']

                # NOTE: outside evaluate mode an excerpt without 'intents'
                # raises KeyError here.
                # in case of multilabel add all possible labels and their negative prefix
                if self.multilabel:
                    intents = [e['intent'] if e['score'] > 0.0
                               else NEGATIVE_CLASS_PREFIX + e['intent'] for e in excerpt_obj['intents']]
                else:
                    intents = [e['intent'] for e in excerpt_obj['intents'] if e['score'] > 0.0]

                citation = Citation(
                    text=citation_text,
                    citing_paper_id=ex['citingPaper']['id'],
                    cited_paper_id=ex['citedPaper']['id'],
                    citing_paper_title=ex['citingPaper']['title'],
                    cited_paper_title=ex['citedPaper']['title'],
                    citing_paper_year=citing_paper_year,
                    cited_paper_year=cited_paper_year,
                    citing_author_ids=citing_author_ids,
                    cited_author_ids=cited_author_ids,
                    extended_context=None,  # Not available for s2 data
                    section_number=None,  # Not available for s2 data
                    section_title=section_name,
                    intent=intents,
                    cite_marker_offset=offsets,  # Not useful here
                    sents_before=None,  # not available for s2 data
                    sents_after=None,  # not available for s2 data
                    citation_excerpt_index=excerpt_index,
                    cleaned_cite_text=citation_text
                )
                num_returned_citations += 1
                yield citation

        # Only reached once the generator has been fully consumed.
        logger.info(f'Total annotated citation texts returned: {num_returned_citations}; '
                    f'not annotated {num_not_annotated}')