Example #1
    def resolve(self, citations, document=None):
        citation = {}
        pubmed_id = utopia.citation.pick_from(citations,
                                              'identifiers[pubmed]',
                                              None,
                                              record_in=citation)
        if pubmed_id is None:
            doi = utopia.citation.pick_from(citations,
                                            'identifiers[doi]',
                                            None,
                                            record_in=citation)
            if doi is not None:
                pubmed_id = utopia.tools.pubmed.identify(doi, 'doi')
                if pubmed_id is not None:
                    citation['identifiers'] = {'pubmed': pubmed_id}
            if pubmed_id is None:
                title = utopia.citation.pick_from(citations,
                                                  'title',
                                                  None,
                                                  record_in=citation)
                if title is not None:
                    title = title.strip(' .')
                    pubmed_results = utopia.tools.pubmed.search(title)
                    pubmed_title = pubmed_results.get('title', '').strip(' .')
                    if len(pubmed_title) > 0:
                        matched = False
                        pubmed_pmid = pubmed_results.get('identifiers',
                                                         {}).get('pubmed')
                        # Fuzzy match: compare titles ignoring punctuation and case
                        norm_title = re.sub(r'[^\w]+', ' ', title).strip().lower()
                        norm_pubmed_title = re.sub(r'[^\w]+', ' ', pubmed_title).strip().lower()
                        if norm_title == norm_pubmed_title:
                            matched = True
                        elif document is not None:
                            # Accept the pubmed title over the scraped title, if present in the document
                            matches = document.findInContext(
                                '', pubmed_title, '')  # Fuzzy match
                            if len(matches) > 0:
                                matched = True
                                pubmed_title = matches[0].text()
                        if matched:
                            citation.update(pubmed_results)
                            citation['title'] = pubmed_title

        return citation
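
Taken together, this resolver tries three routes to a PubMed ID: an identifier already present in the citations, a DOI-to-PubMed lookup, and finally a PubMed title search whose result is only trusted if the titles fuzzily match. Below is a minimal standalone sketch of the same fallback chain; the lookup parameters are hypothetical stand-ins for utopia.tools.pubmed.identify() and utopia.tools.pubmed.search(), not part of the Utopia API.

import re

def normalise(title):
    # Collapse punctuation and case so near-identical titles compare equal
    return re.sub(r'[^\w]+', ' ', title).strip().lower()

def resolve_pubmed_id(citation, lookup_by_doi, search_by_title):
    # citation: dict with optional 'pubmed', 'doi' and 'title' keys (illustrative shape)
    if citation.get('pubmed'):
        return citation['pubmed']
    if citation.get('doi'):
        pubmed_id = lookup_by_doi(citation['doi'])
        if pubmed_id is not None:
            return pubmed_id
    if citation.get('title'):
        result = search_by_title(citation['title'].strip(' .'))
        if normalise(result.get('title', '')) == normalise(citation['title']):
            return result.get('identifiers', {}).get('pubmed')
    return None

# Example invocation with stubbed lookups
print(resolve_pubmed_id({'title': 'Some Article.'},
                        lambda doi: None,
                        lambda t: {'title': 'Some article',
                                   'identifiers': {'pubmed': '12345'}}))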
Example #2
    def on_activate_event(self, document):
        text = document.text().encode('utf8')
        text_hash = hashlib.md5(text).hexdigest()

        url = 'http://beta.sciencewise.info/api/utopia'
        payload = urllib.urlencode({ 'text': text, 'chksum': text_hash })
        response = urllib2.urlopen(url, payload, timeout=8).read()
        results = json.loads(response)
        annotations = []

        for result in results:
            before = result.get('context', {}).get('before', '')
            term = result.get('value', '')
            after = result.get('context', {}).get('after', '')
            link = result.get('link')
            definitions = []
            for definition in result.get('definitions', []):
                definitions.append((definition.get('url'), definition.get('title')))

            if len(term) > 0 and len(before) + len(term) + len(after) > 0 and link is not None:
                matches = document.findInContext(before, term, after)
                if len(matches) > 0:
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'ScienceWISE'
                    annotation['property:webpageUrl'] = link
                    annotation['property:term'] = term
                    annotation['property:name'] = 'Definitions of {0}'.format(term)
                    annotation['property:description'] = 'ScienceWISE ontology definitions'
                    annotation['property:sourceDatabase'] = 'sciencewise'
                    annotation['property:sourceDescription'] = '<p><a href="http://sciencewise.info/">ScienceWISE</a> provides physicists with article annotation and scientific bookmarking.</p>'
                    for url, title in definitions:
                        annotation.insertProperty('property:definitions', '{0} {1}'.format(url, title))
                    for match in matches:
                        annotation.addExtent(match)
                    annotations.append(annotation)

        if len(annotations) > 0:
            document.addAnnotations(annotations)
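
Note that urllib2.urlopen() performs a POST here because a data payload is supplied. On Python 3 the same call moves to urllib.request and the payload must be bytes; a rough sketch of the equivalent request (the ScienceWISE endpoint is copied from the example above and may no longer be live):

import hashlib
import json
from urllib.parse import urlencode
from urllib.request import urlopen

text = 'full document text goes here'
payload = urlencode({
    'text': text,
    'chksum': hashlib.md5(text.encode('utf8')).hexdigest(),
}).encode('ascii')  # a bytes payload makes urlopen issue a POST

with urlopen('http://beta.sciencewise.info/api/utopia', payload, timeout=8) as response:
    results = json.loads(response.read().decode('utf8'))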
Example #4
    def on_ready_event(self, document):
        '''Fetch information from the Lazarus service'''

        permission = self.get_config('permission', False)
        if permission:
            # If an outline already exists, don't make a new one
            needs_outline = True
            for annotation in document.annotations():
                if annotation.get('concept') == 'OutlineItem':
                    needs_outline = False
                    break

            # The Lazarus server needs to know what this document is
            document_id = utopia.tools.utils.metadata(document,
                                                      'identifiers[utopia]')
            this_doi = utopia.tools.utils.metadata(document,
                                                   'identifiers[doi]')
            if this_doi is not None:
                this_doi = u'doi:' + this_doi

            # Speak to server
            params = {'fingerprint': document.fingerprints()}
            url = '{0}?{1}'.format(laz_docUrl,
                                   urllib.urlencode(params, doseq=True))
            response = urllib2.urlopen(url, timeout=60)
            if response.getcode() == 204:
                request = urllib2.Request(
                    url,
                    data=document.data(),
                    headers={'Content-Type': 'application/pdf'})
                response = urllib2.urlopen(request, timeout=60)
            #response = open('/Users/dave/Desktop/ananiadou_tibtech06.pdf-response.xml', 'r')

            # Create Metadata link annotation
            link = document.newAccList('metadata', 50)
            link['property:sourceDatabase'] = 'lazarus'
            link['property:sourceTitle'] = 'Lazarus'
            link['property:sourceDescription'] = self.sourceDescription
            link['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                'images/lazarus-prefs-logo.png', 'image/png')

            headers = []
            pos = []
            refs = []
            annotations = []
            concepts = {}
            hits = []
            expression_annotations = []
            for kAnnotation in kend.converter.XML.parse(
                    response, kend.model.Document):
                #print kend.converter.XML.serialise(kAnnotation)[0]
                try:
                    annotations.append(
                        utopia.tools.converters.Annotation.kend2spineapi(
                            kAnnotation, document))
                except:
                    pass
            annotations.sort(key=lambda a: int(a.get('structure:order', 0)))
            for sAnnotation in annotations:
                if sAnnotation['concept'] == 'structure_element':
                    role, level = self.getHeaderRole(sAnnotation)
                    if role is not None and needs_outline:
                        while len(pos) < level:
                            pos.append(0)
                        while len(pos) > level:
                            pos.pop()
                        pos[-1] += 1

                        outline = u'.'.join([unicode(i) for i in pos])
                        anchor_name = '#lazarus.outline.{0}'.format(outline)

                        anchor = spineapi.Annotation()
                        anchor['concept'] = 'Anchor'
                        anchor['property:anchor'] = anchor_name
                        anchor.addExtents(sAnnotation.extents())
                        anchor.addAreas(sAnnotation.areas())
                        document.addAnnotation(anchor)

                        header = spineapi.Annotation()
                        header['concept'] = 'OutlineItem'
                        header['property:outlinePosition'] = outline
                        header['property:outlineTitle'] = u' '.join(
                            [e.text() for e in sAnnotation.extents()])
                        header['property:destinationAnchorName'] = anchor_name
                        document.addAnnotation(header)

                        print((u'    ' * level + outline + u' ' +
                               header['property:outlineTitle']).encode('utf8'))
                    elif 'bibitem' in sAnnotation.getAllProperties(
                            'structure:role'):
                        #refs.append(sAnnotation)
                        pass
                elif sAnnotation['concept'] == 'Citation':
                    # Hack to fix a mistake in authors property name
                    if 'property:author' in sAnnotation and 'property:authors' not in sAnnotation:
                        sAnnotation['property:authors'] = sAnnotation.getAllProperties('property:author')
                    refs.append(sAnnotation)
                elif sAnnotation['concept'] == 'LazarusConcept':
                    concept_id = sAnnotation.get('property:identifier')
                    if concept_id is not None:
                        sAnnotation['id'] = str(uuid.uuid4())
                        concepts[concept_id] = sAnnotation
                        document.addAnnotation(sAnnotation, 'Lazarus Concept')
                elif sAnnotation['concept'] == 'LazarusConceptHit':
                    hits.append(sAnnotation)
                elif sAnnotation['concept'] == 'LazarusSentenceExpression':
                    expression_annotations.append(sAnnotation)
                else:
                    document.addAnnotation(sAnnotation)

            refs = sorted(refs,
                          key=lambda ref: int(ref.get('property:order', '0')))

            for ref in refs:
                # Create Bibliography annotations
                #citation = {'unstructured': u' '.join([e.text() for e in ref.extents()])}
                #annotation = utopia.tools.utils.citation_to_annotation(citation)
                #annotation['property:order'] = ref.get('structure:order')
                #annotation.addExtents(ref.extents())
                #annotation.addAreas(ref.areas())
                #document.addAnnotation(annotation, link['scratch'])
                document.addAnnotation(ref, link['scratch'])

            # Now link hits to concepts
            for i, hit in enumerate(hits):
                concept_id = hit.get('property:identifier')
                concept = concepts.get(concept_id)
                if concept is not None:
                    concept_uuid = concept.get('id')
                    hit['property:concept_id'] = concept_uuid

                    identifier = concept.get('property:identifier')
                    name = concept.get('property:name', '???')
                    sources = concept.get('property:externalSources',
                                          'json:[]')
                    if sources.startswith('json:'):
                        sources = json.loads(sources[5:])
                    if 'property:stdInchiKey' in concept:
                        sources.append({
                            'database': ' InchiKey',
                            'identifier': concept['property:stdInchiKey']
                        })
                    if 'property:canonicalSmiles' in concept:
                        sources.append({
                            'database': ' SMILES',
                            'identifier': concept['property:canonicalSmiles']
                        })
                    kind = concept.get('property:kind')
                    kind = self.dbs.get(kind, {}).get('title', kind)
                    links = {}
                    for source in sources:
                        uri = source.get('uri')
                        if 'primary' in source.get('relationship', []):
                            links.setdefault('definition', [])
                            links['definition'].append(u'''
                                <a href="{uri}" title="{uri}">{database}</a>
                            '''.format(**source))
                        elif uri is None:
                            if source.get('database') in (' InchiKey',
                                                          ' SMILES'):
                                links.setdefault('main', [])
                                links['main'].append(u'''
                                    <tr><td>{database}:</td><td>{identifier}</td></tr>
                                '''.format(**source))
                        else:
                            identifier = source.get('identifier')
                            links_category = 'xref'
                            if 'seeAlso' in source.get('relationship',
                                                       []) or uri is None:
                                links_category = 'seeAlso'
                            links.setdefault(links_category, [])
                            if identifier is not None:
                                links[links_category].append(u'''
                                    <a href="{uri}" title="{uri}">{name}...</a> ({identifier})
                                '''.format(**source))
                            else:
                                links[links_category].append(u'''
                                    <a href="{uri}" title="{uri}">{name}...</a>
                                '''.format(**source))

                    style = u'''
                        <style>
                          .lazarus-table tbody {
                            border: none;
                          }
                          .lazarus-table td {
                            vertical-align: top;
                          }
                          .lazarus-table td:first-of-type {
                            text-align: right;
                            font-weight: bold;
                            white-space: nowrap;
                          }
                          .lazarus-table td:not(:first-of-type) {
                            word-break: break-all;
                          }
                          .lazarus-table tr td {
                            padding-top: 0ex;
                            padding-bottom: 0ex;
                          }
                          .lazarus-table tbody:not(:first-of-type) tr:first-of-type td {
                            padding-top: 1ex;
                          }
                        </style>
                    '''
                    html = u'''
                        <table class="lazarus-table">
                          <tr><td>Name:</td><td>{name}</td></tr>
                    '''.format(**{'name': name})
                    categories = {
                        'xref': 'Related:',
                        'seeAlso': 'See also:',
                        'definition': 'Defined in:'
                    }
                    for links_category in ('main', 'xref', 'seeAlso',
                                           'definition'):
                        links_title = categories.get(links_category)
                        these_links = sorted(
                            list(set(links.get(links_category, []))))
                        if len(these_links) > 0:
                            html += '<tbody>'
                            if links_category != 'main':
                                html += u'<tr><td>{0}</td><td>'.format(
                                    links_title)
                                html += u'<br>'.join(these_links)
                                html += '</td></tr>'
                            else:
                                html += ''.join(these_links)
                            html += '</tbody>'
                    #pprint('------------------------')
                    html += u'''
                        </table>
                    '''
                    #print(html)

                    hasLinks = len(
                        links.get('xref', []) + links.get('seeAlso', [])) > 0

                    ann = spineapi.Annotation()
                    ann['concept'] = 'Collated'
                    ann['property:name'] = u'{0}'.format(name)
                    ann['property:description'] = 'Lazarus Concept'
                    ann['session:semanticTerm'] = name
                    ann['property:html'] = [style, html]
                    ann['property:sourceDescription'] = self.sourceDescription
                    ann['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                        'images/lazarus-prefs-logo.png', 'image/png')
                    ann['session:overlay'] = 'hyperlink'
                    ann['session:color'] = '#880000'
                    count = 0
                    if 'property:hitFragments' in hit:
                        hitFragments = hit.getAllProperties(
                            'property:hitFragments') or []
                        #pprint(hitFragments)
                        for hitFragment in hitFragments:
                            pre, _, rest = hitFragment.partition('{!')
                            match, _, post = rest.partition('!}')
                            #pprint((pre, match, post))
                            matches = document.findInContext(pre,
                                                             match,
                                                             post,
                                                             fuzzy=True)
                            count += len(matches)
                            ann.addExtents(matches)
                    if hasLinks and count > 0:
                        document.addAnnotation(ann)

            style = u'''
                <style>
                    .lazarus-expression .box {
                        background-color: #FFF0E8;
                        border-color: #EEE0D8;
                    }
                    .lazarus-related {
                        padding-left: 42px;
                        background-image: url(%s);
                        background-repeat: no-repeat;
                        background-position: top left;
                        background-size: 37px 48px;
                        min-height: 53px;
                    }
                    .lazarus-related + .lazarus-related {
                        margin-top: 5px;
                        border-top: 1px dotted #aaa;
                        padding-top: 5px;
                        background-position-y: 5px;
                        min-height: 58px;
                    }
                    .lazarus-sentence {
                        padding-left: 0.5em;
                        color: black;
                    }
                    .lazarus-sentence.negative {
                        border-left: solid 5px #bb0000;
                    }
                    .lazarus-sentence.positive {
                        border-left: solid 5px #008800;
                    }
                    .lazarus-sentence.negative a {
                        color: #bb0000;
                    }
                    .lazarus-sentence.positive a {
                        color: #008800;
                    }
                </style>
            ''' % utopia.get_plugin_data_as_url('images/pdf-page-icon.png',
                                                'image/png')

            expressions = []
            for sAnnotation in expression_annotations:
                exp = sAnnotation.get('property:expressions', 'json:{}')
                if exp.startswith('json:'):
                    exp = json.loads(exp[5:])
                context = sAnnotation.get('property:context')
                if context is not None:
                    if exp.get('negative', False):
                        exp['posneg'] = 'negative'
                    else:
                        exp['posneg'] = 'positive'

                    pprint(context)
                    pprint(exp)

                    matched_context = exp.get('context')
                    matches = []
                    if matched_context is not None:
                        matches = document.search(
                            re.sub(r'\s+', ' ', matched_context))
                        if len(matches) > 0:
                            anchor_id = str(uuid.uuid4())[1:-1]
                            anchor = spineapi.Annotation()
                            anchor['concept'] = 'Anchor'
                            anchor['property:anchor'] = anchor_id
                            anchor.addExtents(matches)
                            document.addAnnotation(anchor)

                            exp.update({
                                'anchor_id': anchor_id,
                                'sentence': context
                            })
                            expressions.append(exp)

            js = u'''
                <script>
                    $(document).on('DOMNodeInserted', function(e) {
                        var element = e.target;
                        $(element).filter('a[target="tab"]').add('a[target="tab"]', element).each(function () {
                            var fragment = $(this).closest('.-papyro-internal-citation').data('citation')['userdef']['first_fragment'];
                            $(this).attr('target', 'pdf; show=highlight; text=[' + encodeURIComponent(fragment) + ']');
                        });
                    });

                    $(function () {
                        var lazarus = {
                            expressions: %s,
                            fingerprints: %s,
                            relUrl: %s
                        };

                        var more_expressions_link = $('#lazarus-expression > p.more').hide();
                        var more_expressions_spinner = $('#lazarus-expression > div.spinner');

                        Spinners.create(more_expressions_spinner);
                        Spinners.play(more_expressions_spinner);

                        var exp_divs = [];
                        var identifiers = [];
                        for (var e = 0; e < lazarus.expressions.length; e++) {
                            var expression = lazarus.expressions[e];
                            var exp_div = $('<div class="box"></div>');
                            exp_div.data('expression', expression);
                            exp_div.hide();
                            exp_divs.push(exp_div);
                            identifiers.push(expression.identifiers);
                        }
                        var params = {
                            fingerprint: lazarus.fingerprints
                        };
                        var url = lazarus.relUrl + '?' + $.param(params, true);  // true = 'traditional' serialization
                        $.ajax({
                            url: url,
                            type: 'POST',
                            dataType: 'json',
                            data: JSON.stringify(identifiers),
                            contentType: "application/json",
                            error: function (xhr, ajaxOptions, thrownError) {
                                console.log(xhr.statusText);
                                console.log(xhr.responseText);
                                console.log(xhr.status);
                                console.log(thrownError);

                                // FIXME do something here
                                Spinners.remove(more_expressions_spinner);
                            },
                            success: function (related) {
                                // Sort related according to the number of articles found
                                related.results.sort(function (l, r) {
                                    var lv = Object.keys(l.related).length;
                                    var rv = Object.keys(r.related).length;
                                    return (lv > rv) ? -1 : (lv < rv) ? 1 : 0;
                                });
                                $.each(related.results, function (idx, result) {
                                    var exp_div = exp_divs[idx];
                                    var expression = exp_div.data('expression');
                                    expression.related = result.related;
                                    delete expression.related[%s];

                                    split = expression.sentence.split(expression.context);
                                    pre = split[0];
                                    pre = pre.replace(/(\w)$/, '$1 ');
                                    pre = pre.replace(/^\s*/, '');
                                    match = expression.context;
                                    post = split[1];
                                    post = post.replace(/^(\w)/, ' $1');
                                    post = post.replace(/\s*$/, '');
                                    expression.pre = pre;
                                    expression.match = match;
                                    expression.post = post;

                                    // Create expression element
                                    exp_div.append('<p class="lazarus-sentence ' + expression.posneg + '">&ldquo;' + expression.pre + '<a target="pdf; show=select; anchor=' + expression.anchor_id + '"><strong>' + expression.match + '</strong></a>' + expression.post + '&rdquo;</p>');
                                    exp_div.data('expression', expression);

                                    $('#lazarus-expression > .content').append(exp_div);

                                    if (Object.keys(expression.related).length > 0) {
                                        var related_div = $('<div class="expandable" title="Related expressions elsewhere"></div>');
                                        var related_div_content = $('<div></div>').appendTo(related_div);
                                        function on_expand() {
                                            related_div.off('papyro:expandable:expand', on_expand);
                                            $.each(expression.related, function (idx, obj) {
                                                fragments = [];
                                                $.each(obj, function (id, obj) {
                                                    fragments.push(obj.context);
                                                });
                                                related_div_content.append($('<div class="lazarus-related unprocessed"></div>').append('<p><strong>&ldquo;&hellip;'+fragments+'&hellip;&rdquo;</strong></p>').hide().data('citation', {identifiers:{doi:idx},userdef:{first_fragment:fragments[0]}}));
                                                // .append(utopia.citation.render({identifiers:{doi:idx},first_fragment:fragments[0]}, true, true))
                                            });
                                            expression.related.length = 0; // empty for future

                                            if ($('.lazarus-related.unprocessed', exp_div).length > 0) {
                                                var more = $('<p class="more right"><a class="more">More related articles...</a></p>');
                                                related_div_content.append(more);
                                                function show_five_related(e) {
                                                    e.preventDefault();

                                                    $('.lazarus-related.unprocessed', exp_div).slice(0, 5).each(function (idx, obj) {
                                                        var citation = $(obj).data('citation');
                                                        $(obj).append(utopia.citation.render(citation, true, true));
                                                        $(obj).show().removeClass('unprocessed');
                                                    });
                                                    if ($('.lazarus-related.unprocessed', exp_div).length == 0) {
                                                        more.remove();
                                                    }
                                                }
                                                more.on('click', show_five_related).click();
                                            }
                                        }
                                        related_div.on('papyro:expandable:expand', on_expand);
                                        exp_div.append(related_div);
                                        utopia.processNewContent(related_div);
                                    }
                                });

                                Spinners.remove(more_expressions_spinner);
                                more_expressions_link.show();
                                $('a.more', more_expressions_link).click();
                            }
                        });

                        function append_five(e) {
                            e.preventDefault();

                            // Show the next five
                            $('#lazarus-expression > .content').children().filter(':hidden').slice(0,5).show();

                            // Hide the 'more' link if everything is now visible
                            if ($('#lazarus-expression > .content').children().filter(':hidden').length == 0) {
                                more_expressions_link.hide();
                            }
                        }

                        // Hook up 'more' link
                        $('#lazarus-expression > p.more > a.more').on('click', append_five).click();
                    });
                </script>
            ''' % (json.dumps(expressions), json.dumps(
                document.fingerprints()), json.dumps(laz_docRelUrl),
                   json.dumps(this_doi))
            #print(js.encode('utf8'))

            html = u'''
                <div id="lazarus-expression"><div class="content"></div><div class="spinner"></div><p class="more"><a class="more">More expressions...</a></p></div>
            '''

            if len(expressions) > 0:
                ann = spineapi.Annotation()
                ann['concept'] = 'Collated'
                ann['property:name'] = 'Lazarus Expressions'
                ann['property:description'] = u'Summarizing expression(s)'
                ann['property:html'] = [js, style, html]
                ann['property:sourceDescription'] = self.sourceDescription
                ann['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                    'images/lazarus-prefs-logo.png', 'image/png')
                document.addAnnotation(ann)

        else:  # no permission
            noprompt = self.get_config('noprompt', False)
            if not noprompt:
                annotation = spineapi.Annotation()
                annotation['concept'] = 'Collated'
                params = {
                    'uuid': self.uuid(),
                }
                annotation['property:html'] = utopia.get_plugin_data(
                    'tpl/denied.html').format(**params)
                annotation['property:name'] = 'Lazarus'
                annotation['property:description'] = 'Lazarus functionality is turned off'
                annotation['property:sourceDescription'] = self.sourceDescription
                annotation['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                    'images/lazarus-prefs-logo.png', 'image/png')
                annotation['session:default'] = '1'
                document.addAnnotation(annotation)
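
The outline numbering in the function above is driven by a small stack: pos is grown or shrunk until its depth matches the header's level, then its last counter is incremented, yielding dotted positions like 2.1.3. A standalone sketch of just that algorithm (the header levels are invented for illustration):

def outline_positions(levels):
    # levels: nesting depth of each header in document order
    pos = []
    for level in levels:
        while len(pos) < level:
            pos.append(0)  # descend: open a new counter at this depth
        while len(pos) > level:
            pos.pop()      # ascend: discard deeper counters
        pos[-1] += 1
        yield '.'.join(str(i) for i in pos)

print(list(outline_positions([1, 2, 2, 1, 2, 3])))
# -> ['1', '1.1', '1.2', '2', '2.1', '2.1.1']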
    def on_ready_event(self, document):
        volume, page = None, None

        # Only send if the DOI has a Portland prefix
        doi = utopialib.utils.metadata(document, 'identifiers[doi]')
        if doi is not None and doi[:7] in registrants:
            crossref_unixref = utopialib.utils.metadata(
                document, 'raw_crossref_unixref')
            if crossref_unixref is not None:
                # Parse CrossRef redirect URL
                dom = etree.fromstring(crossref_unixref.encode('utf8'))
                resource = dom.findtext(
                    'doi_record/crossref/journal/journal_article/doi_data/resource'
                )
                if resource is not None:
                    match = self.resourceRegExp.match(resource)
                    if match is not None:
                        volume, page = match.groups()

                ### FIXME What information should be shown? Portland? BJ?
                #annotation = spineapi.Annotation()
                #annotation['concept'] = 'PublisherIdentity'
                #annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png')
                #annotation['property:title'] = 'Portland Press Limited'
                #annotation['property:webpageUrl'] = 'http://www.portlandpress.com/'
                #document.addAnnotation(annotation, 'PublisherMetadata')

        # If this document was resolved, off we go to fetch the NLM
        if None not in (volume, page):
            # Make a request to the utopia ext web service
            url = 'https://utopia.cs.manchester.ac.uk/ext/portland/nlm?{0}'
            url = url.format(urllib.urlencode({
                'volume': volume,
                'page': page
            }))
            try:
                nlm = urllib2.urlopen(url, timeout=8).read()
            except:
                raise

            info = utopialib.nlm.parse(nlm)
            if info is not None and len(info) > 0:

                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True,
                                         recover=True,
                                         remove_blank_text=True,
                                         encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id'])
                              for citation in info['citations']
                              if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(
                        utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                                retmode='xml',
                                                rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath(
                            'PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            for key_name, id_name in (('doi', 'doi'),
                                                      ('pmcid', 'pmc'),
                                                      ('pii', 'pii')):
                                id = idList.findtext(
                                    'ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Create Metadata link annotation
                link = document.newAccList('metadata', 100)
                link['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                    'images/biochemj.png', 'image/png')
                link['property:sourceTitle'] = 'Portland'
                link['property:sourceDescription'] = '''
                    <p><a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p>
                    '''

                # Create Metadata annotation
                annotation = utopialib.utils.citation_to_annotation(
                    info.get('self', {}), 'DocumentMetadata')
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = utopialib.utils.citation_to_annotation(
                        citation)
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = utopialib.utils.citation_to_annotation(
                                    citation, concept='ForwardCitation')
                                if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                    citation['pdf'] = ('http://www.ploscompbiol.org/article/'
                                                       'fetchObjectAttachment.action'
                                                       '?uri=info:doi/{0}&representation=PDF'
                                                       .format(citation['doi']))
                                if 'pmcid' in citation:
                                    citation['pdf'] = ('http://www.ncbi.nlm.nih.gov/pmc/'
                                                       'articles/{0}/pdf/'.format(citation['pmcid']))
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation,
                                                       link['scratch'])
                            except:
                                raise

                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict=True)
                        #print regex
                        matches = document.search(
                            regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation['session:upload_files'] = (
                                'data:application/xml;name=data.xml;base64,%s'
                                % base64.standard_b64encode(table['xml']))
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
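
This plugin only acts on DOIs whose prefix belongs to Portland Press, then recovers the volume and page by matching self.resourceRegExp against the URL that CrossRef resolves the DOI to. That pattern is defined elsewhere in the class; a plausible sketch of the step, assuming a Biochemical Journal style URL (both the regex and the URL shape are assumptions, not taken from the plugin):

import re

# Hypothetical pattern capturing volume and first page from a publisher URL
resourceRegExp = re.compile(r'.*/bj/(\d+)/(\d+)(?:/|$)')

match = resourceRegExp.match('http://www.biochemj.org/bj/451/0001')
if match is not None:
    volume, page = match.groups()
    print((volume, page))  # -> ('451', '0001')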
    def on_ready_event(self, document):

        doi = utopialib.utils.metadata(document, 'identifiers[doi]')
        if doi is not None:
            info = {}

            # Resolve the DOI to find the publisher's website
            response = urllib2.urlopen('http://dx.doi.org/{0}'.format(doi),
                                       timeout=8)

            # Parse page to find (if there) the full text URL
            parser = etree.HTMLParser()
            html = etree.parse(response, parser)

            # Only continue if this is a highwire HTML page
            if len(html.xpath("/html/head/meta[@name='HW.identifier']")) == 0:
                return

            # Now make sure we have the full text XHTML
            citation_fulltext_html_url = html.xpath(
                "/html/head/meta[@name='citation_fulltext_html_url']/@content")
            if len(citation_fulltext_html_url) > 0:
                citation_fulltext_html_url = citation_fulltext_html_url[0]

                # Fetch that full text page (if different to the current one)
                if citation_fulltext_html_url != response.geturl():
                    response = urllib2.urlopen(citation_fulltext_html_url,
                                               timeout=8)
                    html = etree.parse(response, parser)

                #print etree.tostring(html, pretty_print=True, encoding='utf8')

                # Now parse out the bibliography
                info['citations'] = []
                info['citations_by_id'] = {}

                for bibitem in html.xpath(
                        "//div[contains(concat(' ', normalize-space(@class), ' '), ' ref-list ')]//ol[contains(concat(' ', normalize-space(@class), ' '), ' cit-list ')]/li"
                ):
                    citation = query(bibitem, {
                        'id': 'a/@id',
                        'label': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' ref-label ')]/text()",
                        'title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-article-title ')]/text()",
                        'year': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-date ')]/text()",
                        'publication-title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-jnl-abbrev ')]/text()",
                        'volume': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-vol ')]/text()",
                        'issue': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-issue ')]/text()",
                        'pagefrom': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-fpage ')]/text()",
                        'pageto': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-lpage ')]/text()",
                        'pmid': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-pmid ')]/text()",
                        'doi': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-doi ')]/text()",
                        'etree': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-metadata ')]",
                    })
                    authors = []
                    for a in bibitem.xpath(
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-auth ')]"
                    ):
                        surname = a.xpath(
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-surname ')]/text()"
                        )
                        given_names = a.xpath(
                            ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-given-names ')]/text()"
                        )
                        if len(surname) > 0 and len(given_names) > 0:
                            authors.append(u'{0}, {1}'.format(
                                surname[0], given_names[0]).strip(', '))
                    if len(authors) > 0:
                        citation['authors'] = authors
                    citation['contexts'] = []
                    citation['displayText'] = utopia.citation.format(citation)

                    info['citations'].append(citation)
                    info['citations_by_id'][citation['id']] = citation
                    #print citation

                #######################################################################################
                # Parse in-text citations if present

                min_length = 10
                max_length = 20
                for paragraph in html.xpath(
                        "//div[contains(concat(' ', normalize-space(@class), ' '), ' article ')]//p"
                ):
                    text_stack = [paragraph.text or '']
                    xref_stack = [None]
                    for elem in paragraph:
                        if len(elem.xpath('self::a[@class="xref-bibr"]')) > 0:
                            text_stack.append(
                                etree.tostring(elem,
                                               method='text',
                                               encoding=unicode,
                                               with_tail=False))
                            text_stack.append(elem.tail or '')
                            xref = info['citations_by_id'].get(
                                elem.get('href', '')[1:])
                            if xref is not None:
                                xref_stack += [[xref], None]
                            else:
                                xref_stack += [[], None]
                        elif isinstance(elem, etree._Entity):
                            points = entities.get(elem.text[1:-1])
                            if points is not None:
                                text_stack[-1] += ''.join(
                                    (unichr(p) for p in points))
                            else:
                                text_stack[-1] += etree.tostring(
                                    elem, encoding=unicode)
                        else:
                            if elem.get('position') == 'float':
                                text_stack[-1] += elem.tail or ''
                            else:
                                text_stack[-1] += etree.tostring(
                                    elem, method='text', encoding=unicode)
                    # Find and collapse ranges in the text
                    for i in xrange(len(xref_stack) - 3, 1, -2):
                        text = text_stack[i].strip()
                        #print len(text), (text in u'-\u2010\u2011\u2012\u2013\u2014\u2015'), u''.join(text_stack[i-1:i+2]).encode('utf8')
                        # if this text is a dash, we need to coalesce the text fragments
                        if len(text) == 1 and text in u'-\u2010\u2011\u2012\u2013\u2014\u2015':
                            text_stack[i - 1:i + 2] = [
                                u''.join(text_stack[i - 1:i + 2])
                            ]
                            xref_stack[i - 1:i + 2] = [
                                xref_stack[i - 1] + xref_stack[i + 1]
                            ]
                    #for text in text_stack:
                    #    print text.encode('utf8')
                    # Then make sure we resolve the implied citations
                    for i in xrange(1, len(xref_stack), 2):
                        # Get actual cross references
                        xrefs = xref_stack[i]

                        # Expand cross references
                        try:
                            if len(xrefs) == 2:
                                labelfrom = int(xrefs[0].get('label'))
                                labelto = int(xrefs[1].get('label'))
                                candidates = {}
                                midlabels = [
                                    unicode(midlabel) for midlabel in xrange(
                                        labelfrom + 1, labelto)
                                ]
                                for candidate in info['citations']:
                                    if candidate.get('label') in midlabels:
                                        candidates[int(candidate['label'])] = candidate
                                # Insert the implied citations in label order
                                xrefs[1:-1] = [candidates[k] for k in sorted(candidates)]
                        except:
                            raise
                    # Find and collapse lists in the text
                    for i in xrange(len(xref_stack) - 3, 1, -2):
                        text = text_stack[i].strip()
                        # if this text is a comma, we need to coalesce the text fragments
                        if len(text) == 1 and text == ',':
                            text_stack[i - 1:i + 2] = [
                                u''.join(text_stack[i - 1:i + 2])
                            ]
                            xref_stack[i - 1:i + 2] = [
                                xref_stack[i - 1] + xref_stack[i + 1]
                            ]
                    # Expand citations to include brackets (on both sides)
                    for i in xrange(len(xref_stack) - 2, 0, -2):
                        before = text_stack[i - 1].strip()[-1:]
                        text = text_stack[i].strip()
                        after = text_stack[i + 1].strip()[:1]
                        # if this text is a comma, we need to coalesce the text fragments
                        #print before.encode('utf'), after.encode('utf')
                        if (len(before) > 0 and before in '({[' and
                                len(after) > 0 and after in ')}]'):
                            text_stack[i - 1] = re.sub(r'[({[](\s*)$', r'\1',
                                                       text_stack[i - 1])
                            text_stack[i + 1] = re.sub(r'^(\s*)[)}\]]', r'\1',
                                                       text_stack[i + 1])
                            text_stack[i] = before + text_stack[i] + after
                    #print repr(text_stack)
                    for i in xrange(1, len(xref_stack), 2):
                        # Get context
                        before = u' '.join(text_stack[:i]).strip()
                        label = text_stack[i].strip()
                        after = u' '.join(text_stack[i + 1:]).strip()
                        # Strip out extraneous brackets
                        # Hack to differentiate single / multiple citations, as
                        # multiple numbers tend not to have spaces between them
                        if len(xref_stack[i]) > 1:
                            label = re.sub(
                                ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,])\s?[({[]?',
                                r'\1', label)
                        else:
                            label = re.sub(
                                ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,]\s?)[({[]?',
                                r'\1', label)
                        # Normalise context
                        before = re.sub(r'\s+', ' ',
                                        before)[-max_length:].strip()
                        label = re.sub(r'\s+', ' ', label)
                        after = re.sub(r'\s+', ' ', after)[:max_length].strip()
                        #print (before.encode('utf8'), label.encode('utf8'), after.encode('utf8'))
                        if len(before + after) > min_length:
                            for xref in xref_stack[i]:
                                xref['contexts'].append((before, label, after))
                        #print xref_stack[i]

                #######################################################################################
                # Parse tables if present

                info['tables'] = {}
                for table_url in html.xpath(
                        "//div[contains(concat(' ', normalize-space(@class), ' '), ' table-inline ')]/div[contains(concat(' ', normalize-space(@class), ' '), ' callout ')]//li[1]/a/@href"
                ):
                    table_url = urlparse.urljoin(citation_fulltext_html_url,
                                                 table_url)
                    #print table_url
                    response = urllib2.urlopen(table_url, timeout=8)
                    table_html = etree.parse(response, parser)
                    for table_expansion in table_html.xpath(
                            "//div[contains(concat(' ', normalize-space(@class), ' '), ' table-expansion ')]"
                    ):
                        id = table_expansion.get('id')
                        table = {}
                        table['xml'] = table_expansion.xpath('.//table[1]')[0]
                        table['caption_raw'] = table_expansion.xpath(
                            ".//span[contains(concat(' ', normalize-space(@class), ' '), ' caption-title ')][1]"
                        )[0]
                        if 'caption' not in table and 'caption_raw' in table:
                            table['caption'] = table['caption_raw']
                        if 'caption' in table:
                            table['caption'] = re.sub(
                                r'\s+', ' ',
                                etree.tostring(table['caption'],
                                               method='text',
                                               encoding=unicode).strip())
                        if 'xml' in table:
                            table['xml'] = etree.tostring(table['xml'],
                                                          encoding='utf8')
                        info['tables'][id] = table

                        #print table

            #print info
            if info is not None and len(info) > 0:
                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True,
                                         recover=True,
                                         remove_blank_text=True,
                                         encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id'])
                              for citation in info['citations']
                              if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(
                        utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                                retmode='xml',
                                                rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath(
                            'PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Generate sensible titles / descriptions / icons?
                journalTitle = info.get('publication-title', '')
                journalTitleSuffix = ''
                publisher = info.get('publisher', 'the publisher')
                if len(journalTitle) > 0:
                    journalTitleSuffix = ' ({0})'.format(journalTitle)

                # Create Metadata link annotation
                link = document.newAccList('metadata', 90)
                link['property:sourceIcon'] = ''
                link['property:sourceTitle'] = publisher
                link['property:sourceDescription'] = '''
                    <p>This information was provided by {0}{1}.</p>
                    '''.format(publisher, journalTitleSuffix)

                # Create Metadata annotation
                annotation = utopialib.utils.citation_to_annotation(
                    info.get('self', {}), 'DocumentMetadata')
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = utopialib.utils.citation_to_annotation(
                        citation)
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        #print (pre, label, post)
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = utopialib.utils.citation_to_annotation(
                                    citation, concept='ForwardCitation')
                                if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                    citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri=info:doi/{0}&representation=PDF'.format(citation['doi'])
                                if 'pmcid' in citation:
                                    citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation, link['scratch'])
                            except:
                                raise

                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict=True)
                        #print regex

                        # convert oasis tables
                        ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                        xml = etree.fromstring(table['xml'])
                        if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                            for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                                columns = {}
                                for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                    columns[colspec.get('colname')] = int(colspec.get('colnum'))
                                for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                    isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                    for row in section.xpath('./oasis:row', namespaces=ns):
                                        for entry in row.xpath('./oasis:entry', namespaces=ns):
                                            colname = entry.get('colname')
                                            colst = entry.get('namest')
                                            colend = entry.get('nameend')
                                            if colst is not None and colend is not None:
                                                colspan = columns[colend] - columns[colst] + 1
                                            else:
                                                colspan = 1
                                            if colspan > 1:
                                                entry.set('colspan', unicode(colspan))
                                            morerows = entry.get('morerows')
                                            if morerows is not None:
                                                rowspan = int(morerows) + 1
                                            else:
                                                rowspan = 1
                                            if rowspan > 1:
                                                entry.set('rowspan', unicode(rowspan))
                                            entry.tag = 'td'
                                        row.tag = 'tr'
                                    if isHead:
                                        section.tag = 'thead'
                                    else:
                                        section.tag = 'tbody'
                                    xml.append(section)
                                xml.tag = 'table'
                                #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                                table['xml'] = etree.tostring(xml, encoding='utf8')

                        matches = document.search(
                            regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id

    def on_ready_event(self, document):
        # See if we have any publishers' NLM hosted for this DOI
        doi = common.utils.metadata(document, 'doi')
        #print '----- DOI', doi
        if doi is not None:
            info = None
            try:
                url = 'https://utopia.cs.manchester.ac.uk/ext/hosted/nlm?'
                url += urllib.urlencode({'doi': doi.encode('utf8')})
                nlm = urllib2.urlopen(url, timeout=8).read()
                info = common.nlm.parse(nlm)
            except (urllib2.URLError, socket.timeout):
                # info will remain None
                pass

            #print info
            if info is not None and len(info) > 0:

                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Generate sensible titles / descriptions / icons?
                journalTitle = info.get('publication-title', '')
                journalTitleSuffix = ''
                publisher = info.get('publisher', 'the publisher')
                if len(journalTitle) > 0:
                    journalTitleSuffix = ' ({0})'.format(journalTitle)

                # Create Metadata link annotation
                link = document.newAccList('metadata', 100)
                link['property:sourceIcon'] = ''
                link['property:sourceTitle'] = publisher
                link['property:sourceDescription'] = '''
                    <p>This information was provided by {0}{1}.</p>
                    '''.format(publisher, journalTitleSuffix)

                # Publisher identity
                if doi[:8] in ('10.1104/', '10.1105/'):
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'PublisherIdentity'
                    logo = utopia.get_plugin_data_as_url('images/aspb_logo.png', 'image/png')
                    webpageUrl = 'http://www.aspb.org/'
                    title = publisher
                    #print '====', publisher, '---', journalTitle, '---', webpageUrl
                    if doi.startswith('10.1104/'):
                        logo = utopia.get_plugin_data_as_url('images/pp_logo.png', 'image/png')
                        title = journalTitle
                        webpageUrl = 'http://www.plantphysiol.org/'
                    elif doi.startswith('10.1105/'):
                        logo = utopia.get_plugin_data_as_url('images/tpc_logo.png', 'image/png')
                        title = journalTitle
                        webpageUrl = 'http://www.plantcell.org/'

                    annotation['property:logo'] = logo
                    annotation['property:title'] = title
                    annotation['property:webpageUrl'] = webpageUrl
                    document.addAnnotation(annotation, 'PublisherMetadata')

                    link['property:sourceIcon'] = logo
                    link['property:sourceTitle'] = title

                # Create Metadata annotation
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentMetadata'
                for k in self.keys:
                    v = info.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'DocumentReference'
                    for k in self.keys:
                        v = citation.get(k)
                        if v is not None:
                            annotation['property:{0}'.format(k)] = v
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = spineapi.Annotation()
                                annotation['concept'] = 'ForwardCitation'
                                annotation['property:state'] = 'found'
                                if 'title' in citation:
                                    annotation['property:title'] = citation['title']
                                if 'id' in citation:
                                    annotation['property:bibid'] = citation['id']
                                if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                    citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                                if 'pmcid' in citation:
                                    citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                                for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                                    if k in citation:
                                        annotation['property:{0}'.format(k)] = citation[k]
                                #print annotation.get('property:label'), annotation.get('property:pdf')
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation, link['scratch'])
                                #print citation
                            except:
                                raise # FIXME

                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict=True)
                        #print regex

                        # convert oasis tables
                        ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                        xml = etree.fromstring(table['xml'])
                        if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                            for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                                columns = {}
                                for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                    columns[colspec.get('colname')] = int(colspec.get('colnum'))
                                for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                    isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                    for row in section.xpath('./oasis:row', namespaces=ns):
                                        for entry in row.xpath('./oasis:entry', namespaces=ns):
                                            colname = entry.get('colname')
                                            colst = entry.get('namest')
                                            colend = entry.get('nameend')
                                            if colst is not None and colend is not None:
                                                colspan = columns[colend] - columns[colst] + 1
                                            else:
                                                colspan = 1
                                            if colspan > 1:
                                                entry.set('colspan', unicode(colspan))
                                            morerows = entry.get('morerows')
                                            if morerows is not None:
                                                rowspan = int(morerows) + 1
                                            else:
                                                rowspan = 1
                                            if rowspan > 1:
                                                entry.set('rowspan', unicode(rowspan))
                                            entry.tag = 'td'
                                        row.tag = 'tr'
                                    if isHead:
                                        section.tag = 'thead'
                                    else:
                                        section.tag = 'tbody'
                                    xml.append(section)
                                xml.tag = 'table'
                                #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                                table['xml'] = etree.tostring(xml, encoding='utf8')

                        matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
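
The "Enrich citation information with identifiers from PMC" block above is repeated almost verbatim in every plugin in this collection. A minimal sketch of how it could be factored into one shared helper — the helper name is hypothetical, and only the utopialib.eutils.efetch call and the info dict layout are taken from the listings:

    from lxml import etree
    import utopialib.eutils

    def enrich_citations_from_pmc(info):
        '''Fill in doi/pmcid/pii for every citation that already has a pmid,
        using one efetch round trip to PubMed. Sketch only; assumes the
        info dict produced by the NLM parser used above.'''
        pmids = dict((citation['pmid'], citation['id'])
                     for citation in info['citations']
                     if 'pmid' in citation and 'id' in citation)
        if len(pmids) == 0:
            return
        parser = etree.XMLParser(ns_clean=True, recover=True,
                                 remove_blank_text=True, encoding='utf8')
        pubmed_abstracts = etree.fromstring(
            utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                    retmode='xml', rettype='abstract'), parser)
        for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
            pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
            if pmid in pmids:
                citation = info['citations_by_id'][pmids[pmid]]
                for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                    id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                    if key_name not in citation and id is not None:
                        citation[key_name] = id

Each on_ready_event above could then call enrich_citations_from_pmc(info) in place of the inlined block.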
Example #8
    def on_ready_event(self, document):
        info = utopialib.nlm.parse(
            utopialib.utils.metadata(document, 'raw_pmc_nlm'))
        if info is not None and len(info) > 0:

            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True,
                                     recover=True,
                                     remove_blank_text=True,
                                     encoding='utf8')
            pmids = dict(((citation['pmid'], citation['id'])
                          for citation in info['citations']
                          if 'pmid' in citation and 'id' in citation))
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(
                    utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                            retmode='xml',
                                            rettype='abstract'), parser)
                for idList in pubmed_abstracts.xpath(
                        'PubmedArticle/PubmedData/ArticleIdList'):
                    #print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info['citations_by_id'][pmids[pmid]]
                        for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id

            # Create Metadata link annotation
            link = document.newAccList('metadata', 50)
            link['property:sourceDatabase'] = 'pmc'
            link['property:sourceTitle'] = 'PubMed Central'
            link['property:sourceDescription'] = '<p><a href="http://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a> is the U.S. National Institutes of Health (NIH) digital archive of biomedical and life sciences journal literature.</p>'

            # Create Metadata annotation
            annotation = utopialib.utils.citation_to_annotation(
                info.get('self', {}), 'DocumentMetadata')
            document.addAnnotation(annotation, link['scratch'])

            # Create Bibliography annotations
            for citation in info.get('citations', []):
                annotation = utopialib.utils.citation_to_annotation(citation)
                document.addAnnotation(annotation, link['scratch'])

            # Citations
            for citation in info['citations']:
                # Find cross refs
                for pre, label, post in citation.get('contexts', []):
                    matches = document.findInContext(pre, label, post)
                    #print matches
                    if len(matches) > 0:
                        try:
                            annotation = utopialib.utils.citation_to_annotation(
                                citation, concept='ForwardCitation')
                            if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri=info:doi/{0}&representation=PDF'.format(citation['doi'])
                            if 'pmcid' in citation:
                                citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link['scratch'])
                        except:
                            raise

            # Tables
            for id, table in info.get('tables', {}).iteritems():
                if 'caption' in table and 'xml' in table:
                    regex = fuzz(table['caption'], strict=True)
                    matches = document.search(
                        regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation['concept'] = 'Table'
                        annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link['scratch'])
                    else:
                        print '*********** failed to match table:', id

    def on_ready_event(self, document):

        # Only send if the DOI has a Springer prefix
        doi = utopialib.utils.metadata(document, 'identifiers[doi]')
        if doi is not None and doi[:7] in registrants:

            annotation = spineapi.Annotation()
            annotation['concept'] = 'PublisherIdentity'
            # Disabled: this turns out not to be reliable
            if False and doi.startswith('10.1186/'):
                annotation['property:logo'] = utopia.get_plugin_data_as_url(
                    'images/gigascience_logo.png', 'image/png')
                annotation['property:title'] = 'Giga Science'
                annotation['property:webpageUrl'] = 'http://www.gigasciencejournal.com/'
            else:
                annotation['property:logo'] = utopia.get_plugin_data_as_url(
                    'images/logo.png', 'image/png')
                annotation['property:title'] = 'Springer'
                annotation['property:webpageUrl'] = 'http://www.springer.com/'
            document.addAnnotation(annotation, 'PublisherMetadata')

            # Make a request to the utopia ext web service
            url = 'https://utopia.cs.manchester.ac.uk/ext/springer/nlm?{0}'
            url = url.format(urllib.urlencode({'doi': doi}))
            try:
                nlm = urllib2.urlopen(url, timeout=8).read()
            except (urllib2.URLError, socket.timeout):
                return

            info = utopialib.nlm.parse(nlm)
            if info is not None and len(info) > 0:

                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True,
                                         recover=True,
                                         remove_blank_text=True,
                                         encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id'])
                              for citation in info['citations']
                              if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(
                        utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                                retmode='xml',
                                                rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath(
                            'PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Create Metadata link annotation
                link = document.newAccList('metadata', 100)
                link['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                    'images/annotation_icon.png', 'image/png')
                link['property:sourceTitle'] = 'Springer'
                link['property:sourceDescription'] = '''
                    <p><a href="http://www.springer.com/">Springer</a> publishing company.</p>
                    '''

                # Create Metadata annotation
                annotation = utopialib.utils.citation_to_annotation(
                    info.get('self', {}), 'DocumentMetadata')
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = utopialib.utils.citation_to_annotation(
                        citation)
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = utopialib.utils.citation_to_annotation(
                                    citation, concept='ForwardCitation')
                                if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                    citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri=info:doi/{0}&representation=PDF'.format(citation['doi'])
                                if 'pmcid' in citation:
                                    citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation, link['scratch'])
                            except:
                                raise

                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict=True)
                        #print regex
                        matches = document.search(
                            regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
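
Every example ends with the same caption-matching step: fuzz() turns the table caption into a tolerant regular expression, the document is searched for exactly one hit, and the table XML is attached as a base64 data: URI. A sketch of that step as a standalone function — the function name is hypothetical, and fuzz() and the spineapi calls are assumed to behave as in the listings above:

    import base64
    import spineapi

    def annotate_table(document, link, table_id, table):
        '''Anchor one parsed table to its caption and attach its XML.'''
        regex = fuzz(table['caption'], strict=True)  # fuzz() as used above
        matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
        if len(matches) == 1:
            annotation = spineapi.Annotation()
            annotation['concept'] = 'Table'
            annotation['session:upload_files'] = (
                'data:application/xml;name=data.xml;base64,%s'
                % base64.standard_b64encode(table['xml']))
            annotation['session:volatile'] = '1'
            annotation.addExtent(matches[0])
            document.addAnnotation(annotation, link['scratch'])
        else:
            print '*********** failed to match table:', table_id

Requiring exactly one match is deliberate: zero matches means the caption was not found, while two or more means the regex is too loose to anchor the table unambiguously.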
Example #10
    def on_ready_event(self, document):
        doi = utopia.tools.utils.metadata(document, 'identifiers[doi]')
        if doi is not None:
            match = self.splitRegEx.match(doi)
            if match is not None:
                articleNumber = match.group('number')
                annotation = spineapi.Annotation()
                annotation['concept'] = 'PublisherIdentity'
                annotation['property:logo'] = utopia.get_plugin_data_as_url(
                    'images/logo.png', 'image/png')
                annotation['property:title'] = 'eLife'
                annotation['property:webpageUrl'] = 'http://www.elifesciences.org/'
                document.addAnnotation(annotation, 'PublisherMetadata')

                # Turn all the DOIs that are sub-DOIs of this document into links
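                # e.g. for DOI 10.7554/eLife.00123 this matches 10.7554/eLife.00123.001 etc.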
                regex = r'{0}\.\d+'.format(re.escape(doi))
                for match in document.search(regex, spineapi.RegExp):
                    url = 'http://dx.doi.org/{0}'.format(match.text())
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'Hyperlink'
                    annotation['property:webpageUrl'] = url
                    annotation['session:volatile'] = '1'
                    annotation.addExtent(match)
                    document.addAnnotation(annotation)

                # Try to get the NLM directly from eLife
                url = 'http://elife.elifesciences.org/elife-source-xml/10.7554/eLife.{0}'
                url = url.format(articleNumber)
                try:
                    nlm = urllib2.urlopen(url, timeout=8).read()
                except (urllib2.URLError, socket.timeout):
                    return

                info = utopia.tools.nlm.parse(nlm)
                if info is not None and len(info) > 0:

                    # Enrich citation information with identifiers from PMC
                    parser = etree.XMLParser(ns_clean=True,
                                             recover=True,
                                             remove_blank_text=True,
                                             encoding='utf8')
                    pmids = dict(((citation['pmid'], citation['id'])
                                  for citation in info['citations']
                                  if 'pmid' in citation and 'id' in citation))
                    if len(pmids) > 0:
                        pubmed_abstracts = etree.fromstring(
                            utopia.tools.eutils.efetch(
                                id=','.join(pmids.keys()),
                                retmode='xml',
                                rettype='abstract'), parser)
                        for idList in pubmed_abstracts.xpath(
                                'PubmedArticle/PubmedData/ArticleIdList'):
                            #print etree.tostring(idList)
                            pmid = idList.findtext(
                                'ArticleId[@IdType="pubmed"]')
                            if pmid in pmids:
                                citation = info['citations_by_id'][pmids[pmid]]
                                for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                    id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                    if key_name not in citation and id is not None:
                                        citation[key_name] = id
                                        #print 'KEY', key_name, id

                    # Create Metadata link annotation
                    link = document.newAccList('metadata', 100)
                    link['property:sourceIcon'] = utopia.get_plugin_data_as_url(
                        'images/annotation_icon.png', 'image/png')
                    link['property:sourceTitle'] = 'eLife'
                    link['property:sourceDescription'] = '''
                        <p>The <a href="http://www.elifesciences.org/">eLife</a> open access publishing platform.</p>
                        '''

                    # Create Metadata annotation
                    annotation = utopia.tools.utils.citation_to_annotation(
                        info.get('self', {}), 'DocumentMetadata')
                    document.addAnnotation(annotation, link['scratch'])

                    # Create Bibliography annotations
                    for citation in info.get('citations', []):
                        annotation = utopia.tools.utils.citation_to_annotation(
                            citation)
                        document.addAnnotation(annotation, link['scratch'])

                    #######################################################################################
                    # Apply parsed data to document

                    # Citations
                    for citation in info['citations']:
                        # Find cross refs
                        for pre, label, post in citation.get('contexts', []):
                            matches = document.findInContext(pre, label, post)
                            #print matches
                            if len(matches) > 0:
                                try:
                                    annotation = utopia.tools.utils.citation_to_annotation(
                                        citation, concept='ForwardCitation')
                                    if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                        citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri=info:doi/{0}&representation=PDF'.format(citation['doi'])
                                    if 'pmcid' in citation:
                                        citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                                    for match in matches:
                                        annotation.addExtent(match)
                                    document.addAnnotation(annotation, link['scratch'])
                                except:
                                    raise

                    for id, table in info.get('tables', {}).iteritems():
                        if 'caption' in table and 'xml' in table:
                            regex = fuzz(table['caption'], strict=True)
                            #print regex
                            matches = document.search(
                                regex, spineapi.RegExp + spineapi.IgnoreCase)
                            if len(matches) == 1:
                                annotation = spineapi.Annotation()
                                annotation['concept'] = 'Table'
                                annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                                annotation['session:volatile'] = '1'
                                annotation.addExtent(matches[0])
                                document.addAnnotation(annotation,
                                                       link['scratch'])
                            else:
                                print '*********** failed to match table:', id

    def on_ready_event(self, document):
        volume, page = None, None

        # Only send if the DOI has a Portland prefix
        doi = common.utils.metadata(document, 'doi')
        if doi is not None and doi[:7] in registrants:
            crossref_unixref = common.utils.metadata(document, 'raw_crossref_unixref')
            if crossref_unixref is not None:
                # Parse CrossRef redirect URL
                dom = etree.fromstring(crossref_unixref.encode('utf8'))
                resource = dom.findtext('doi_record/crossref/journal/journal_article/doi_data/resource')
                if resource is not None:
                    match = self.resourceRegExp.match(resource)
                    if match is not None:
                        volume, page = match.groups()

                ### FIXME What information should be shown? Portland? BJ?
                #annotation = spineapi.Annotation()
                #annotation['concept'] = 'PublisherIdentity'
                #annotation['property:logo'] = utopia.get_plugin_data_as_url('images/logo.png', 'image/png')
                #annotation['property:title'] = 'Portland Press Limited'
                #annotation['property:webpageUrl'] = 'http://www.portlandpress.com/'
                #document.addAnnotation(annotation, 'PublisherMetadata')

        # If this document was resolved, off we go to fetch the NLM
        if None not in (volume, page):
            # Make a request to the utopia ext web service
            url = 'https://utopia.cs.manchester.ac.uk/ext/portland/nlm?{0}'
            url = url.format(urllib.urlencode({'volume': volume, 'page': page}))
            try:
                nlm = urllib2.urlopen(url, timeout=8).read()
            except (urllib2.URLError, socket.timeout):
                return

            info = common.nlm.parse(nlm)
            if info is not None and len(info) > 0:

                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Create Metadata link annotation
                link = document.newAccList('metadata', 100)
                link['property:sourceIcon'] = utopia.get_plugin_data_as_url('images/biochemj.png', 'image/png')
                link['property:sourceTitle'] = 'Portland'
                link['property:sourceDescription'] = '''
                    <p><a href="http://www.portlandpress.com/">Portland Press Limited</a>.</p>
                    '''

                # Create Metadata annotation
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentMetadata'
                for k in self.keys:
                    v = info.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'DocumentReference'
                    for k in self.keys:
                        v = citation.get(k)
                        if v is not None:
                            annotation['property:{0}'.format(k)] = v
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = spineapi.Annotation()
                                annotation['concept'] = 'ForwardCitation'
                                annotation['property:state'] = 'found'
                                if 'title' in citation:
                                    annotation['property:title'] = citation['title']
                                if 'id' in citation:
                                    annotation['property:bibid'] = citation['id']
                                if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                    citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                                if 'pmcid' in citation:
                                    citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                                for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                                    if k in citation:
                                        annotation['property:{0}'.format(k)] = citation[k]
                                #print annotation.get('property:label'), annotation.get('property:pdf')
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation, link['scratch'])
                                #print citation
                            except:
                                raise # FIXME

                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict=True)
                        #print regex
                        matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id
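
The OASIS exchange-table conversion that appears inline above (and again in Example #12 below) is also a self-contained transformation. Sketched as one function — the name is hypothetical, and it assumes the lxml element parsed from table['xml'] as in the listings:

    from lxml import etree

    OASIS_NS = 'http://docs.oasis-open.org/ns/oasis-exchange/table'

    def oasis_to_html(xml):
        '''Rewrite an OASIS exchange table element in place into plain HTML
        (table/thead/tbody/tr/td with colspan/rowspan attributes).'''
        ns = {'oasis': OASIS_NS}
        if xml.tag != '{{{0}}}table'.format(OASIS_NS):
            return xml  # not an OASIS table; leave untouched
        for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
            columns = {}
            for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                columns[colspec.get('colname')] = int(colspec.get('colnum'))
            for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                isHead = (section.tag == '{{{0}}}thead'.format(OASIS_NS))
                for row in section.xpath('./oasis:row', namespaces=ns):
                    for entry in row.xpath('./oasis:entry', namespaces=ns):
                        # namest/nameend name the first and last spanned columns
                        colst, colend = entry.get('namest'), entry.get('nameend')
                        if colst is not None and colend is not None:
                            colspan = columns[colend] - columns[colst] + 1
                            if colspan > 1:
                                entry.set('colspan', unicode(colspan))
                        # morerows counts the extra rows an entry spans
                        morerows = entry.get('morerows')
                        if morerows is not None and int(morerows) > 0:
                            entry.set('rowspan', unicode(int(morerows) + 1))
                        entry.tag = 'td'
                    row.tag = 'tr'
                section.tag = 'thead' if isHead else 'tbody'
                xml.append(section)  # hoist the section out of its tgroup
            xml.tag = 'table'
        return xml

The plugins above could then reduce the inline block to table['xml'] = etree.tostring(oasis_to_html(etree.fromstring(table['xml'])), encoding='utf8').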
Example #12
    def on_ready_event(self, document):
        # See if we have any publishers' NLM hosted for this DOI
        doi = utopialib.utils.metadata(document, 'identifiers[doi]')
        #print '----- DOI', doi
        if doi is not None:
            info = None
            try:
                url = 'https://utopia.cs.manchester.ac.uk/ext/hosted/nlm?'
                url += urllib.urlencode({'doi': doi.encode('utf8')})
                nlm = urllib2.urlopen(url, timeout=8).read()
                info = utopialib.nlm.parse(nlm)
            except (urllib2.URLError, socket.timeout):
                # info will remain None
                pass

            #print info
            if info is not None and len(info) > 0:

                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True,
                                         recover=True,
                                         remove_blank_text=True,
                                         encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id'])
                              for citation in info['citations']
                              if 'pmid' in citation and 'id' in citation))
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(
                        utopialib.eutils.efetch(id=','.join(pmids.keys()),
                                                retmode='xml',
                                                rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath(
                            'PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Generate sensible titles / descriptions / icons?
                journalTitle = info.get('publication-title', '')
                journalTitleSuffix = ''
                publisher = info.get('publisher', 'the publisher')
                if len(journalTitle) > 0:
                    journalTitleSuffix = ' ({0})'.format(journalTitle)

                # Create Metadata link annotation
                link = document.newAccList('metadata', 100)
                link['property:sourceIcon'] = ''
                link['property:sourceTitle'] = publisher
                link['property:sourceDescription'] = '''
                    <p>This information was provided by {0}{1}.</p>
                    '''.format(publisher, journalTitleSuffix)

                # Publisher identity
                if doi[:8] in ('10.1104/', '10.1105/'):
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'PublisherIdentity'
                    logo = utopia.get_plugin_data_as_url(
                        'images/aspb_logo.png', 'image/png')
                    webpageUrl = 'http://www.aspb.org/'
                    title = publisher
                    #print '====', publisher, '---', journalTitle, '---', webpageUrl
                    if doi.startswith('10.1104/'):
                        logo = utopia.get_plugin_data_as_url(
                            'images/pp_logo.png', 'image/png')
                        title = journalTitle
                        webpageUrl = 'http://www.plantphysiol.org/'
                    elif doi.startswith('10.1105/'):
                        logo = utopia.get_plugin_data_as_url(
                            'images/tpc_logo.png', 'image/png')
                        title = journalTitle
                        webpageUrl = 'http://www.plantcell.org/'

                    annotation['property:logo'] = logo
                    annotation['property:title'] = title
                    annotation['property:webpageUrl'] = webpageUrl
                    document.addAnnotation(annotation, 'PublisherMetadata')

                    link['property:sourceIcon'] = logo
                    link['property:sourceTitle'] = title

                # Create Metadata annotation
                annotation = utopialib.utils.citation_to_annotation(
                    info.get('self', {}), 'DocumentMetadata')
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = utopialib.utils.citation_to_annotation(
                        citation)
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = utopialib.utils.citation_to_annotation(
                                    citation, concept='ForwardCitation')
                                if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                    citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri=info:doi/{0}&representation=PDF'.format(citation['doi'])
                                if 'pmcid' in citation:
                                    citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation, link['scratch'])
                            except:
                                raise

                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict=True)
                        #print regex

                        # convert oasis tables
                        ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
                        xml = etree.fromstring(table['xml'])
                        if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                            for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                                columns = {}
                                for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                    columns[colspec.get('colname')] = int(colspec.get('colnum'))
                                for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                    isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                    for row in section.xpath('./oasis:row', namespaces=ns):
                                        for entry in row.xpath('./oasis:entry', namespaces=ns):
                                            colname = entry.get('colname')
                                            colst = entry.get('namest')
                                            colend = entry.get('nameend')
                                            if colst is not None and colend is not None:
                                                colspan = columns[colend] - columns[colst] + 1
                                            else:
                                                colspan = 1
                                            if colspan > 1:
                                                entry.set('colspan', unicode(colspan))
                                            morerows = entry.get('morerows')
                                            if morerows is not None:
                                                rowspan = int(morerows) + 1
                                            else:
                                                rowspan = 1
                                            if rowspan > 1:
                                                entry.set('rowspan', unicode(rowspan))
                                            entry.tag = 'td'
                                        row.tag = 'tr'
                                    if isHead:
                                        section.tag = 'thead'
                                    else:
                                        section.tag = 'tbody'
                                    xml.append(section)
                                xml.tag = 'table'
                                #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                                table['xml'] = etree.tostring(xml, encoding='utf8')

                        matches = document.search(
                            regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation[
                                'session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(
                                    table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id

    def on_ready_event(self, document):

        doi = common.utils.metadata(document, 'doi')
        if doi is not None:
            info = {}

            # Resolve the DOI to find the publisher's website
            try:
                response = urllib2.urlopen('http://dx.doi.org/{0}'.format(doi), timeout=8)
            except (urllib2.URLError, socket.timeout):
                return

            # Parse page to find (if there) the full text URL
            parser = etree.HTMLParser()
            html = etree.parse(response, parser)

            # Only continue if this is a highwire HTML page
            if len(html.xpath("/html/head/meta[@name='HW.identifier']")) == 0:
                return

            # Now make sure we have the full text XHTML
            citation_fulltext_html_url = html.xpath("/html/head/meta[@name='citation_fulltext_html_url']/@content")
            if len(citation_fulltext_html_url) > 0:
                citation_fulltext_html_url = citation_fulltext_html_url[0]

                # Fetch that full text page (if different to the current one)
                if citation_fulltext_html_url != response.geturl():
                    response = urllib2.urlopen(citation_fulltext_html_url, timeout=8)
                    html = etree.parse(response, parser)

                #print etree.tostring(html, pretty_print=True, encoding='utf8')

                # Now parse out the bibliography
                info['citations'] = []
                info['citations_by_id'] = {}

                for bibitem in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' ref-list ')]//ol[contains(concat(' ', normalize-space(@class), ' '), ' cit-list ')]/li"):
                    citation = query(bibitem, {
                        'id': 'a/@id',
                        'label': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' ref-label ')]/text()",
                        'title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-article-title ')]/text()",
                        'year': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-date ')]/text()",
                        'publication-title': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-jnl-abbrev ')]/text()",
                        'volume': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-vol ')]/text()",
                        'issue': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-issue ')]/text()",
                        'pagefrom': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-fpage ')]/text()",
                        'pageto': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-lpage ')]/text()",
                        'pmid': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-pmid ')]/text()",
                        'doi': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-pub-id-doi ')]/text()",
                        'etree': ".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-metadata ')]",
                    })
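                    # query() is a helper defined elsewhere in this plugin; it
                    # presumably evaluates each XPath against bibitem and keeps the
                    # first match (if any) under the corresponding key.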
                    authors = []
                    for a in bibitem.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-auth ')]"):
                        surname = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-surname ')]/text()")
                        given_names = a.xpath(".//*[contains(concat(' ', normalize-space(@class), ' '), ' cit-name-given-names ')]/text()")
                        if len(surname) > 0 and len(given_names) > 0:
                            authors.append(u'{0}, {1}'.format(surname[0], given_names[0]).strip(', '))
                    if len(authors) > 0:
                        citation['authors'] = authors
                    citation['contexts'] = []
                    citation['displayText'] = common.utils.format_citation(citation)

                    info['citations'].append(citation)
                    info['citations_by_id'][citation['id']] = citation
                    #print citation


                #######################################################################################
                # Parse in-text citations if present

                min_length = 10
                max_length = 20
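                # A context shorter than min_length is too ambiguous to find
                # reliably in the PDF; the before/after snippets are trimmed to
                # max_length characters during normalisation further down.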
                for paragraph in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' article ')]//p"):
                    text_stack = [paragraph.text or '']
                    xref_stack = [None]
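                    # text_stack and xref_stack are kept in lock step: even indices
                    # hold plain text, odd indices hold the text of an inline
                    # citation link alongside its resolved citation(s).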
                    for elem in paragraph:
                        if len(elem.xpath('self::a[@class="xref-bibr"]')) > 0:
                            text_stack.append(etree.tostring(elem, method='text', encoding=unicode, with_tail=False))
                            text_stack.append(elem.tail or '')
                            xref = info['citations_by_id'].get(elem.get('href', '')[1:])
                            if xref is not None:
                                xref_stack += [[xref], None]
                            else:
                                xref_stack += [[], None]
                        elif isinstance(elem, etree._Entity):
                            points = entities.get(elem.text[1:-1])
                            if points is not None:
                                text_stack[-1] += ''.join((unichr(p) for p in points))
                            else:
                                text_stack[-1] += etree.tostring(elem, encoding=unicode)
                        else:
                            if elem.get('position') == 'float':
                                text_stack[-1] += elem.tail or ''
                            else:
                                text_stack[-1] += etree.tostring(elem, method='text', encoding=unicode)
                    # Find and collapse ranges in the text
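                    # e.g. the fragments '3', '-', '7' (citation, dash, citation)
                    # coalesce into a single '3-7' fragment that carries both
                    # endpoint citations.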
                    for i in xrange(len(xref_stack) - 3, 1, -2):
                        text = text_stack[i].strip()
                        #print len(text), (text in u'-\u2010\u2011\u2012\u2013\u2014\u2015'), u''.join(text_stack[i-1:i+2]).encode('utf8')
                        # if this text is a dash, we need to coalesce the text fragments
                        if len(text) == 1 and text in u'-\u2010\u2011\u2012\u2013\u2014\u2015':
                            text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])]
                            xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]]
                    #for text in text_stack:
                    #    print text.encode('utf8')
                    # Then make sure we resolve the implied citations
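                    # A collapsed range like '3-7' still only carries the citations
                    # for its endpoints; the labels in between (4, 5, 6) are looked
                    # up and spliced into the list below.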
                    for i in xrange(1, len(xref_stack), 2):
                        # Get actual cross references
                        xrefs = xref_stack[i]

                        # Expand cross references
                        try:
                            if len(xrefs) == 2:
                                labelfrom = int(xrefs[0].get('label'))
                                labelto = int(xrefs[1].get('label'))
                                candidates = {}
                                midlabels = [unicode(midlabel) for midlabel in xrange(labelfrom+1, labelto)]
                                for candidate in info['citations']:
                                    if candidate.get('label') in midlabels:
                                        candidates[int(candidate.get('label'))] = candidate
                                # dict values come back in arbitrary order, so
                                # splice the intermediate citations in label order
                                xrefs[1:-1] = [candidates[key] for key in sorted(candidates)]
                        except:
                            raise
                    # Find and collapse lists in the text
                    for i in xrange(len(xref_stack) - 3, 1, -2):
                        text = text_stack[i].strip()
                        # if this text is a comma, we need to coalesce the text fragments
                        if len(text) == 1 and text == ',':
                            text_stack[i-1:i+2] = [u''.join(text_stack[i-1:i+2])]
                            xref_stack[i-1:i+2] = [xref_stack[i-1] + xref_stack[i+1]]
                    # Expand citations to include brackets (on both sides)
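                    # e.g. for '... shown (3) earlier ...' the parentheses are
                    # stripped from the neighbouring fragments and folded into the
                    # label, so the whole '(3)' becomes the annotated extent.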
                    for i in xrange(len(xref_stack) - 2, 0, -2):
                        before = text_stack[i-1].strip()[-1:]
                        text = text_stack[i].strip()
                        after = text_stack[i+1].strip()[:1]
                        # if this text is a comma, we need to coalesce the text fragments
                        #print before.encode('utf'), after.encode('utf')
                        if len(before) > 0 and before in '({[' and len(after) > 0 and after in ')}]':
                            text_stack[i-1] = re.sub(r'[({[](\s*)$', r'\1', text_stack[i-1])
                            text_stack[i+1] = re.sub(r'^(\s*)[)}\]]', r'\1', text_stack[i+1])
                            text_stack[i] = before + text_stack[i] + after
                    #print repr(text_stack)
                    for i in xrange(1, len(xref_stack), 2):
                        # Get context
                        before = u' '.join(text_stack[:i]).strip()
                        label = text_stack[i].strip()
                        after = u' '.join(text_stack[i+1:]).strip()
                        # Strip out extraneous brackets
                        if len(xref_stack[i]) > 1: # Hack to differentiate single / multiple citations
                                                   # as multiple numbers tend not to have spaces between them
                            label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,])\s?[({[]?', r'\1', label)
                        else:
                            label = re.sub(ur'[)}\]]?([-\u2010\u2011\u2012\u2013\u2014\u2015,]\s?)[({[]?', r'\1', label)
                        # Normalise context
                        before = re.sub(r'\s+', ' ', before)[-max_length:].strip()
                        label = re.sub(r'\s+', ' ', label)
                        after = re.sub(r'\s+', ' ', after)[:max_length].strip()
                        #print (before.encode('utf8'), label.encode('utf8'), after.encode('utf8'))
                        if len(before + after) > min_length:
                            for xref in xref_stack[i]:
                                xref['contexts'].append((before, label, after))
                        #print xref_stack[i]

                #######################################################################################
                # Parse tables if present

                info['tables'] = {}
                for table_url in html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-inline ')]/div[contains(concat(' ', normalize-space(@class), ' '), ' callout ')]//li[1]/a/@href"):
                    table_url = urlparse.urljoin(citation_fulltext_html_url, table_url)
                    #print table_url
                    response = urllib2.urlopen(table_url, timeout=8)
                    table_html = etree.parse(response, parser)
                    for table_expansion in table_html.xpath("//div[contains(concat(' ', normalize-space(@class), ' '), ' table-expansion ')]"):
                        id = table_expansion.get('id')
                        table = {}
                        table['xml'] = table_expansion.xpath('.//table[1]')[0]
                        table['caption_raw'] = table_expansion.xpath(".//span[contains(concat(' ', normalize-space(@class), ' '), ' caption-title ')][1]")[0]
                        if 'caption' not in table and 'caption_raw' in table:
                            table['caption'] = table['caption_raw']
                        if 'caption' in table:
                            table['caption'] = re.sub(r'\s+', ' ', etree.tostring(table['caption'], method='text', encoding=unicode).strip())
                        if 'xml' in table: table['xml'] = etree.tostring(table['xml'], encoding='utf8')
                        info['tables'][id] = table

                        #print table


            #print info
            if info is not None and len(info) > 0:
                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding='utf8')
                pmids = dict(((citation['pmid'], citation['id']) for citation in info['citations'] if 'pmid' in citation and 'id' in citation))
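                # pmids maps each cited PMID back to its bibliography id so the
                # batched efetch response below can be matched up with the right
                # citation.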
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(common.eutils.efetch(id=','.join(pmids.keys()), retmode='xml', rettype='abstract'), parser)
                    for idList in pubmed_abstracts.xpath('PubmedArticle/PubmedData/ArticleIdList'):
                        #print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info['citations_by_id'][pmids[pmid]]
                            for key_name, id_name in (('doi', 'doi'), ('pmcid', 'pmc'), ('pii', 'pii')):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    #print 'KEY', key_name, id

                # Generate sensible titles / descriptions / icons?
                journalTitle = info.get('publication-title', '')
                journalTitleSuffix = ''
                publisher = info.get('publisher', 'the publisher')
                if len(journalTitle) > 0:
                    journalTitleSuffix = ' ({0})'.format(journalTitle)

                # Create Metadata link annotation
                link = document.newAccList('metadata', 90)
                link['property:sourceIcon'] = ''
                link['property:sourceTitle'] = publisher
                link['property:sourceDescription'] = '''
                    <p>This information was provided by {0}{1}.</p>
                    '''.format(publisher, journalTitleSuffix)

                # Create Metadata annotation
                annotation = spineapi.Annotation()
                annotation['concept'] = 'DocumentMetadata'
                for k in self.keys:
                    v = info.get(k)
                    if v is not None:
                        annotation['property:{0}'.format(k)] = v
                document.addAnnotation(annotation, link['scratch'])

                # Create Bibliography annotations
                for citation in info.get('citations', []):
                    annotation = spineapi.Annotation()
                    annotation['concept'] = 'DocumentReference'
                    for k in self.keys:
                        v = citation.get(k)
                        if v is not None:
                            annotation['property:{0}'.format(k)] = v
                    document.addAnnotation(annotation, link['scratch'])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info['citations']:
                    # Find cross refs
                    for pre, label, post in citation.get('contexts', []):
                        #print (pre, label, post)
                        matches = document.findInContext(pre, label, post)
                        #print matches
                        if len(matches) > 0:
                            try:
                                annotation = spineapi.Annotation()
                                annotation['concept'] = 'ForwardCitation'
                                annotation['property:state'] = 'found'
                                if 'title' in citation:
                                    annotation['property:title'] = citation['title']
                                if 'id' in citation:
                                    annotation['property:bibid'] = citation['id']
                                if 'doi' in citation and citation['doi'].startswith('10.1371/'):
                                    citation['pdf'] = 'http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF'.format('info:doi/{0}'.format(citation['doi']))
                                if 'pmcid' in citation:
                                    citation['pdf'] = 'http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/'.format(citation['pmcid'])
                                for k in ('displayText', 'label', 'pdf', 'pmid', 'pmc', 'pii', 'doi', 'first_author_surname', 'year', 'journal', 'volume', 'page_from'):
                                    if k in citation:
                                        annotation['property:{0}'.format(k)] = citation[k]
                                #print annotation.get('property:label'), annotation.get('property:pdf')
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation, link['scratch'])
                                #print citation
                            except:
                                raise  # FIXME: handle failures more gracefully

                for id, table in info.get('tables', {}).iteritems():
                    if 'caption' in table and 'xml' in table:
                        regex = fuzz(table['caption'], strict=True)
                        #print regex

                        # convert oasis tables
                        ns = {'oasis': 'http://docs.oasis-open.org/ns/oasis-exchange/table'}
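                        # OASIS exchange tables express geometry through attributes
                        # rather than HTML colspan/rowspan: namest/nameend name the
                        # first and last columns an entry spans (colspan =
                        # colnum(nameend) - colnum(namest) + 1) and morerows counts
                        # extra rows (rowspan = morerows + 1). The loop below
                        # rewrites the tree in place into plain HTML table markup.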
                        xml = etree.fromstring(table['xml'])
                        if xml.tag == '{{{0}}}table'.format(ns['oasis']):
                            for tgroup in xml.xpath('//oasis:tgroup', namespaces=ns):
                                columns = {}
                                for colspec in tgroup.xpath('./oasis:colspec', namespaces=ns):
                                    columns[colspec.get('colname')] = int(colspec.get('colnum'))
                                for section in tgroup.xpath('./oasis:thead|./oasis:tbody', namespaces=ns):
                                    isHead = (section.tag == '{{{0}}}thead'.format(ns['oasis']))
                                    for row in section.xpath('./oasis:row', namespaces=ns):
                                        for entry in row.xpath('./oasis:entry', namespaces=ns):
                                            colname = entry.get('colname')
                                            colst = entry.get('namest')
                                            colend = entry.get('nameend')
                                            if colst is not None and colend is not None:
                                                colspan = columns[colend] - columns[colst] + 1
                                            else:
                                                colspan = 1
                                            if colspan > 1:
                                                entry.set('colspan', unicode(colspan))
                                            morerows = entry.get('morerows')
                                            if morerows is not None:
                                                rowspan = int(morerows) + 1
                                            else:
                                                rowspan = 1
                                            if rowspan > 1:
                                                entry.set('rowspan', unicode(rowspan))
                                            entry.tag = 'td'
                                        row.tag = 'tr'
                                    if isHead:
                                        section.tag = 'thead'
                                    else:
                                        section.tag = 'tbody'
                                    xml.append(section)
                                xml.tag = 'table'
                                #print etree.tostring(xml, pretty_print=True, encoding='utf8')
                                table['xml'] = etree.tostring(xml, encoding='utf8')

                        matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation['concept'] = 'Table'
                            annotation['session:upload_files'] = 'data:application/xml;name=data.xml;base64,%s' % base64.standard_b64encode(table['xml'])
                            annotation['session:volatile'] = '1'
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link['scratch'])
                        else:
                            print '*********** failed to match table:', id

    def on_ready_event(self, document):
        info = common.nlm.parse(common.utils.metadata(document, "raw_pmc_nlm"))
        if info is not None and len(info) > 0:

            # Enrich citation information with identifiers from PMC
            parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding="utf8")
            pmids = dict(
                (
                    (citation["pmid"], citation["id"])
                    for citation in info["citations"]
                    if "pmid" in citation and "id" in citation
                )
            )
            if len(pmids) > 0:
                pubmed_abstracts = etree.fromstring(
                    common.eutils.efetch(id=",".join(pmids.keys()), retmode="xml", rettype="abstract"), parser
                )
                for idList in pubmed_abstracts.xpath("PubmedArticle/PubmedData/ArticleIdList"):
                    # print etree.tostring(idList)
                    pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                    if pmid in pmids:
                        citation = info["citations_by_id"][pmids[pmid]]
                        for key_name, id_name in (("doi", "doi"), ("pmcid", "pmc"), ("pii", "pii")):
                            id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                            if key_name not in citation and id is not None:
                                citation[key_name] = id

            # Create Metadata link annotation
            link = document.newAccList("metadata", 50)
            link["property:sourceDatabase"] = "pmc"
            link["property:sourceTitle"] = "PubMed Central"
            link[
                "property:sourceDescription"
            ] = '<p><a href="http://www.ncbi.nlm.nih.gov/pmc/">PubMed Central</a> is the U.S. National Institutes of Health (NIH) digital archive of biomedical and life sciences journal literature.</p>'

            # Create Metadata annotation
            annotation = spineapi.Annotation()
            annotation["concept"] = "DocumentMetadata"
            for k in self.keys:
                v = info.get(k)
                if v is not None:
                    annotation["property:{0}".format(k)] = v
            document.addAnnotation(annotation, link["scratch"])

            # Create Bibliography annotations
            for citation in info.get("citations", []):
                annotation = spineapi.Annotation()
                annotation["concept"] = "DocumentReference"
                for k in self.keys:
                    v = citation.get(k)
                    if v is not None:
                        annotation["property:{0}".format(k)] = v
                document.addAnnotation(annotation, link["scratch"])

            # Citations
            for citation in info["citations"]:
                # Find cross refs
                for pre, label, post in citation.get("contexts", []):
                    matches = document.findInContext(pre, label, post)
                    # print matches
                    if len(matches) > 0:
                        try:
                            annotation = spineapi.Annotation()
                            annotation["concept"] = "ForwardCitation"
                            annotation["property:state"] = "found"
                            if "title" in citation:
                                annotation["property:title"] = citation["title"]
                            if "id" in citation:
                                annotation["property:bibid"] = citation["id"]
                            if "doi" in citation and citation["doi"].startswith("10.1371/"):
                                citation[
                                    "pdf"
                                ] = "http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF".format(
                                    "info:doi/{0}".format(citation["doi"])
                                )
                            if "pmcid" in citation:
                                citation["pdf"] = "http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/".format(
                                    citation["pmcid"]
                                )
                            # print citation
                            for k in self.keys + ("authors", "pdf", "first_author_surname"):
                                if k in citation:
                                    annotation["property:{0}".format(k)] = citation[k]
                            # print annotation.get('property:label'), annotation.get('property:pdf')
                            for match in matches:
                                annotation.addExtent(match)
                            document.addAnnotation(annotation, link["scratch"])
                            # print citation
                        except:
                            raise  # FIXME: handle failures more gracefully

            # Tables
            for id, table in info.get("tables", {}).iteritems():
                if "caption" in table and "xml" in table:
                    regex = fuzz(table["caption"], strict=True)
                    matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                    if len(matches) == 1:
                        annotation = spineapi.Annotation()
                        annotation["concept"] = "Table"
                        annotation[
                            "session:upload_files"
                        ] = "data:application/xml;name=data.xml;base64,%s" % base64.standard_b64encode(table["xml"])
                        annotation.addExtent(matches[0])
                        document.addAnnotation(annotation, link["scratch"])
                    else:
                        print "*********** failed to match table:", id

    def on_ready_event(self, document):
        doi = common.utils.metadata(document, "doi", "")
        match = self.splitRegEx.match(doi)
        if match is not None:
            articleNumber = match.group("number")
            annotation = spineapi.Annotation()
            annotation["concept"] = "PublisherIdentity"
            annotation["property:logo"] = utopia.get_plugin_data_as_url("images/logo.png", "image/png")
            annotation["property:title"] = "eLife"
            annotation["property:webpageUrl"] = "http://www.elifesciences.org/"
            document.addAnnotation(annotation, "PublisherMetadata")

            # Turn all the DOIs that are sub-DOIs of this document into links
            regex = r"{0}\.\d+".format(re.escape(doi))
            for match in document.search(regex, spineapi.RegExp):
                url = "http://dx.doi.org/{0}".format(match.text())
                annotation = spineapi.Annotation()
                annotation["concept"] = "Hyperlink"
                annotation["property:webpageUrl"] = url
                annotation["session:volatile"] = "1"
                annotation.addExtent(match)
                document.addAnnotation(annotation)

            # Try to get the NLM directly from eLife
            url = "http://elife.elifesciences.org/elife-source-xml/10.7554/eLife.{0}"
            url = url.format(articleNumber)
            try:
                nlm = urllib2.urlopen(url, timeout=8).read()
            except (urllib2.URLError, socket.timeout):
                return

            info = common.nlm.parse(nlm)
            if info is not None and len(info) > 0:

                # Enrich citation information with identifiers from PMC
                parser = etree.XMLParser(ns_clean=True, recover=True, remove_blank_text=True, encoding="utf8")
                pmids = dict(
                    (
                        (citation["pmid"], citation["id"])
                        for citation in info["citations"]
                        if "pmid" in citation and "id" in citation
                    )
                )
                if len(pmids) > 0:
                    pubmed_abstracts = etree.fromstring(
                        common.eutils.efetch(id=",".join(pmids.keys()), retmode="xml", rettype="abstract"), parser
                    )
                    for idList in pubmed_abstracts.xpath("PubmedArticle/PubmedData/ArticleIdList"):
                        # print etree.tostring(idList)
                        pmid = idList.findtext('ArticleId[@IdType="pubmed"]')
                        if pmid in pmids:
                            citation = info["citations_by_id"][pmids[pmid]]
                            for key_name, id_name in (("doi", "doi"), ("pmcid", "pmc"), ("pii", "pii")):
                                id = idList.findtext('ArticleId[@IdType="{0}"]'.format(id_name))
                                if key_name not in citation and id is not None:
                                    citation[key_name] = id
                                    # print 'KEY', key_name, id

                # Create Metadata link annotation
                link = document.newAccList("metadata", 100)
                link["property:sourceIcon"] = utopia.get_plugin_data_as_url("images/annotation_icon.png", "image/png")
                link["property:sourceTitle"] = "eLife"
                link[
                    "property:sourceDescription"
                ] = """
                    <p>The <a href="http://www.elifesciences.org/">eLife</a> open access publishing platform.</p>
                    """

                # Create Metadata annotation
                annotation = spineapi.Annotation()
                annotation["concept"] = "DocumentMetadata"
                for k in self.keys:
                    v = info.get(k)
                    if v is not None:
                        annotation["property:{0}".format(k)] = v
                document.addAnnotation(annotation, link["scratch"])

                # Create Bibliography annotations
                for citation in info.get("citations", []):
                    annotation = spineapi.Annotation()
                    annotation["concept"] = "DocumentReference"
                    for k in self.keys:
                        v = citation.get(k)
                        if v is not None:
                            annotation["property:{0}".format(k)] = v
                    document.addAnnotation(annotation, link["scratch"])

                #######################################################################################
                # Apply parsed data to document

                # Citations
                for citation in info["citations"]:
                    # Find cross refs
                    for pre, label, post in citation.get("contexts", []):
                        matches = document.findInContext(pre, label, post)
                        # print matches
                        if len(matches) > 0:
                            try:
                                annotation = spineapi.Annotation()
                                annotation["concept"] = "ForwardCitation"
                                annotation["property:state"] = "found"
                                if "title" in citation:
                                    annotation["property:title"] = citation["title"]
                                if "id" in citation:
                                    annotation["property:bibid"] = citation["id"]
                                if "doi" in citation and citation["doi"].startswith("10.1371/"):
                                    citation[
                                        "pdf"
                                    ] = "http://www.ploscompbiol.org/article/fetchObjectAttachment.action?uri={0}&representation=PDF".format(
                                        "info:doi/{0}".format(citation["doi"])
                                    )
                                if "pmcid" in citation:
                                    citation["pdf"] = "http://www.ncbi.nlm.nih.gov/pmc/articles/{0}/pdf/".format(
                                        citation["pmcid"]
                                    )
                                for k in (
                                    "displayText",
                                    "label",
                                    "pdf",
                                    "pmid",
                                    "pmc",
                                    "pii",
                                    "doi",
                                    "first_author_surname",
                                    "year",
                                    "journal",
                                    "volume",
                                    "page_from",
                                ):
                                    if k in citation:
                                        annotation["property:{0}".format(k)] = citation[k]
                                # print annotation.get('property:label'), annotation.get('property:pdf')
                                for match in matches:
                                    annotation.addExtent(match)
                                document.addAnnotation(annotation, link["scratch"])
                                # print citation
                            except:
                                raise  # FIXME: handle failures more gracefully

                for id, table in info.get("tables", {}).iteritems():
                    if "caption" in table and "xml" in table:
                        regex = fuzz(table["caption"], strict=True)
                        # print regex
                        matches = document.search(regex, spineapi.RegExp + spineapi.IgnoreCase)
                        if len(matches) == 1:
                            annotation = spineapi.Annotation()
                            annotation["concept"] = "Table"
                            annotation[
                                "session:upload_files"
                            ] = "data:application/xml;name=data.xml;base64,%s" % base64.standard_b64encode(table["xml"])
                            annotation["session:volatile"] = "1"
                            annotation.addExtent(matches[0])
                            document.addAnnotation(annotation, link["scratch"])
                        else:
                            print "*********** failed to match table:", id

    def on_load_event(self, document):

        # Keep track of errors so that we can inform the user
        def add_error(component, method, category=None, message=None, exception=None):
            if exception is not None:
                if isinstance(exception, urllib2.URLError) and isinstance(exception.reason, socket.timeout):
                    exception = exception.reason

                if isinstance(exception, socket.timeout):
                    category = "timeout"
                    message = "The server did not respond"
                elif isinstance(exception, urllib2.HTTPError):
                    category = "server"
                    message = unicode(getattr(exception, "reason", "The server did not respond as expected"))
                elif isinstance(exception, urllib2.URLError):
                    category = "connection"
                    message = unicode(getattr(exception, "reason", "The server could not be found"))
            error = spineapi.Annotation()
            error["concept"] = "Error"
            error["property:component"] = component
            error["property:method"] = method
            error["property:category"] = category
            if message is not None:
                error["property:message"] = message
            document.addAnnotation(error, "errors.metadata")

        def add_success(component, method):
            error = spineapi.Annotation()
            error["concept"] = "Success"
            error["property:component"] = component
            error["property:method"] = method
            error["property:category"] = "success"
            document.addAnnotation(error, "errors.metadata")

        metadata = {"scraped": {}, "arxiv": {}, "pubmed": {}, "pmc": {}, "crossref": {}, "utopia": {}}

        authors = []
        publication = None
        volume = None
        issue = None
        year = None
        pages = None

        #################################################################################
        # Scrape DOI and title

        doi = common.doi.scrape(document)
        metadata["scraped"]["doi"] = doi
        print "scraper: doi:", (doi and doi.encode("utf8"))
        title = common.title.scrape(document)
        metadata["scraped"]["title"] = title
        print "scraper: title:", (title and title.encode("utf8"))

        #################################################################################
        # Scrape arXiv ID

        arxivid = common.arxiv.scrape(document)
        if arxivid is not None:
            metadata["scraped"]["arxivid"] = arxivid
            try:
                arxiv_results = common.arxiv.resolve(arxivid)
                if arxiv_results is not None:
                    arxiv_results.update({":whence": "arxiv", ":weight": 10})
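                    # ':whence' records the provenance of these fields; ':weight'
                    # appears to set their precedence when several sources supply
                    # the same field.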
                    common.utils.store_metadata(document, **arxiv_results)
            except Exception as e:
                add_error("ArXiv", "resolve", exception=e)
                traceback.print_exc()
            else:
                add_success("ArXiv", "resolve")

        #################################################################################
        # Fold in the CrossRef data

        issn = common.utils.metadata(document, "issn")
        if title is not None or doi is not None:
            if doi is None:
                try:
                    xref_results = common.crossref.search(title)
                    if len(xref_results) == 1:
                        xref_title = xref_results[0].get("title")
                        if xref_title is not None:
                            print "crossref: resolved title:", xref_title.encode("utf8")
                            # Accept the crossref title if present in the document (do magic dash pattern thing)
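                            # The two substitutions below escape everything except
                            # dash-like characters, then replace each run of dashes
                            # with a \p{Pd}{n} class so the search tolerates
                            # whichever Unicode dash variant the PDF happens to use.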
                            xref_title = re.sub(
                                ur"[^-\u002D\u007E\u00AD\u058A\u05BE\u1400\u1806\u2010-\u2015\u2053\u207B\u208B\u2212\u2E17\u2E3A\u2E3B\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",
                                lambda x: re.escape(x.group(0)),
                                xref_title,
                            )
                            xref_title = re.sub(
                                ur"[\u002D\u007E\u00AD\u058A\u05BE\u1400\u1806\u2010-\u2015\u2053\u207B\u208B\u2212\u2E17\u2E3A\u2E3B\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D-]+",
                                lambda x: r"\p{{Pd}}{{{0}}}".format(len(x.group(0))),
                                xref_title,
                            )
                            # print 'crossref: resolved title pattern:', xref_title.encode('utf8')
                            matches = document.search(xref_title, spineapi.RegExp + spineapi.IgnoreCase)
                            if len(matches) > 0:
                                doi = xref_results[0].get("doi")
                                print "crossref: accepting resolved doi"
                except Exception as e:
                    add_error("CrossRef", "search", exception=e)
                    traceback.print_exc()
                else:
                    add_success("CrossRef", "search")
            if doi is not None:
                # What is this DOI's article's title according to crossref?
                try:
                    xref_results = common.crossref.resolve(doi)
                    xref_results.update({":whence": "crossref", ":weight": 20})
                    xref_title = xref_results.get("title", "")
                    if len(xref_title) > 0:
                        print "crossref: resolved title:", xref_title.encode("utf8")
                        if (
                            title is not None
                            and re.sub(r"[^\w]+", " ", title).strip() == re.sub(r"[^\w]+", " ", xref_title).strip()
                        ):  # Fuzzy match
                            print "crossref: titles match precisely"
                            common.utils.store_metadata(document, **xref_results)
                        else:
                            # Accept the crossref title over the scraped title, if present in the document
                            matches = document.findInContext("", xref_title, "")  # Fuzzy match
                            if len(matches) > 0:
                                common.utils.store_metadata(document, **xref_results)
                                title = xref_title
                                print "crossref: overriding scraped title with crossref title"
                            else:
                                print "crossref: ignoring resolved metadata"
                                # FIXME should we discard the DOI at this point?
                except Exception as e:
                    add_error("CrossRef", "resolve", exception=e)
                    traceback.print_exc()
                else:
                    add_success("CrossRef", "resolve")

        ###########################################################################################
        # Fold in the PubMed data
        pii = common.utils.metadata(document, "pii")
        pmid = common.utils.metadata(document, "pmid")
        pmcid = common.utils.metadata(document, "pmcid")
        if pmid is None and doi is not None:  # resolve on DOI
            try:
                pmid = common.pubmed.resolve(doi, "doi")
            except Exception as e:
                add_error("PubMed", "resolve", exception=e)
                traceback.print_exc()
            else:
                add_success("PubMed", "resolve")
        if pmid is None and title is not None:  # resolve on title
            try:
                pubmed_results = common.pubmed.search(title)
                pubmed_title = pubmed_results.get("title", "").strip(" .")
                if len(pubmed_title) > 0:
                    print "pubmed: resolved title:", pubmed_title.encode("utf8")
                    pubmed_pmid = pubmed_results.get("pmid")
                    print "pubmed: resolved pmid:", pubmed_pmid
                    if (
                        re.sub(r"[^\w]+", " ", title).strip() == re.sub(r"[^\w]+", " ", pubmed_title).strip()
                    ):  # Fuzzy match
                        print "pubmed: titles match precisely"
                        title = pubmed_title
                        pmid = pubmed_pmid
                    else:
                        # Accept the pubmed title over the scraped title, if present in the document
                        matches = document.findInContext("", pubmed_title, "")  # Fuzzy match
                        if len(matches) > 0:
                            title = matches[0].text()
                            pmid = pubmed_pmid
                            print "pubmed: overriding scraped title with pubmed title"
                        else:
                            print "pubmed: ignoring resolved title"
            except Exception as e:
                add_error("PubMed", "search", exception=e)
                traceback.print_exc()
            else:
                add_success("PubMed", "search")
        if pmid is not None:
            try:
                nlm = common.pubmed.fetch(pmid)
                if nlm is not None:
                    xml = etree.fromstring(nlm)

                    pubmed_authors = []
                    for author in xml.findall("PubmedArticle/MedlineCitation/Article/AuthorList/Author"):
                        name = u""
                        lastName = author.findtext("LastName")
                        forename = author.findtext("ForeName")
                        if lastName is not None:
                            name = lastName + u", "
                        if forename is not None:
                            name += forename
                        if len(name) > 0:
                            pubmed_authors.append(name)
                    if len(pubmed_authors) == 0:
                        pubmed_authors = None

                    pubmed_pmid = xml.findtext("PubmedArticle/MedlineCitation/PMID")

                    common.utils.store_metadata(
                        document,
                        **{
                            ":whence": "pubmed",
                            ":weight": 10,
                            "raw_pubmed_nlm": nlm,
                            "authors": pubmed_authors,
                            "pmid": pubmed_pmid,
                            "title": xml.findtext("PubmedArticle/MedlineCitation/Article[1]/ArticleTitle"),
                            "issn": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/ISSN[1]"),
                            "doi": xml.findtext('PubmedArticle/PubmedData/ArticleIdList/ArticleId[@IdType="doi"]'),
                            "pmcid": xml.findtext('PubmedArticle/PubmedData/ArticleIdList/ArticleId[@IdType="pmc"]'),
                            "pii": xml.findtext('PubmedArticle/PubmedData/ArticleIdList/ArticleId[@IdType="pii"]'),
                            "publication-title": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/Title"),
                            "volume": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Volume"),
                            "issue": xml.findtext("PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/Issue"),
                            "year": xml.findtext(
                                "PubmedArticle/MedlineCitation/Article/Journal/JournalIssue/PubDate/Year"
                            ),
                            "pages": xml.findtext("PubmedArticle/MedlineCitation/Article[1]/Pagination/MedlinePgn"),
                            "abstract": xml.findtext("PubmedArticle/MedlineCitation/Article[1]/Abstract/AbstractText"),
                        }
                    )
                    pmid = pubmed_pmid or pmid

                    # FIXME I'm sure the above should be in common.pubmed
            except Exception as e:
                add_error("PubMed", "fetch", exception=e)
                traceback.print_exc()
            else:
                add_success("PubMed", "fetch")

        ###########################################################################################
        # Fold in the PubMedCentral data
        if pmcid is None and doi is not None:  # resolve on DOI
            try:
                pmcid = common.pmc.resolve(doi, "doi")
            except Exception as e:
                add_error("PubMed Central", "resolve", exception=e)
                traceback.print_exc()
            else:
                add_success("PubMed Central", "resolve")
        if pmcid is None and pmid is not None:  # resolve on PubMed ID
            try:
                pmcid = common.pmc.resolve(pmid, "pmid")
            except Exception as e:
                add_error("PubMed Central", "resolve", exception=e)
                traceback.print_exc()
            else:
                add_success("PubMed Central", "resolve")
        if pmcid is not None:
            common.utils.store_metadata(document, **{":whence": "pmc", ":weight": 10, "pmcid": pmcid})
            try:
                nlm = common.pmc.fetch(pmcid)
                if nlm is not None:
                    common.utils.store_metadata(document, **{":whence": "pmc", ":weight": 10, "raw_pmc_nlm": nlm})
            except Exception as e:
                add_error("PubMed Central", "fetch", exception=e)
                traceback.print_exc()
            else:
                add_success("PubMed Central", "fetch")

        ###########################################################################################

        scraped = metadata["scraped"]
        scraped.update({":whence": "document", ":weight": 5})
        common.utils.store_metadata(document, **scraped)
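        # The scraped values carry the lowest :weight (5, versus 10-20 for the
        # resolved sources above), presumably so that resolved metadata wins
        # wherever the sources overlap.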