Python kill_html Examples, papers.utils.kill_html Python Examples

Example #1

0

Show file

File: romeo.py Project: jilljenn/dissemin

def get_or_create_publisher(romeo_xml_description):
    """
    Retrieves from the model, or creates into the model,
    the publisher corresponding to the <publisher> description
    from RoMEO.

    The part of the function visible here extracts the publisher id,
    name and optional alias from the XML element.

    :param romeo_xml_description: the ``<publisher>`` XML element
    :raises MetadataSourceException: when the element has no ``id``
        attribute or no usable ``<name>`` child
    """
    xml = romeo_xml_description

    # NOTE(review): `request` is not defined inside this function; confirm
    # that it exists at module level, otherwise building the error messages
    # below fails with a NameError.
    try:
        romeo_id = xml.attrib['id']
    except KeyError:
        raise MetadataSourceException('RoMEO did not provide a publisher id.\n'+
                'URL was: '+request)

    try:
        raw_name = xml.findall('./name')[0].text.strip()
        name = fromstring(kill_html(sanitize_html(raw_name))).text
    except (KeyError, IndexError, AttributeError):
        raise MetadataSourceException('RoMEO did not provide the publisher\'s name.\n'+
                'URL was: '+request)

    # The alias is optional: a missing <alias> element leaves it set to None.
    alias = None
    try:
        alias = nstrip(xml.findall('./alias')[0].text)
        if alias:
            alias = fromstring(kill_html(sanitize_html(alias))).text
    except (KeyError, IndexError):
        # Both exception types have to be listed in a tuple: the former
        # `except KeyError, IndexError` form only caught KeyError.
        pass

Example #2

0

Show file

File: romeo.py Project: jilljenn/dissemin

def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO.

    :param search_terms: a dictionary whose keys must all belong to
        ``allowed_fields`` (``issn``, ``jtitle``). It is copied, so the
        caller's dictionary is never modified.
    :param matching_mode: when it is ``'exact'`` and RoMEO returns no
        journal, the lookup is retried once with ``'contains'``.
    :returns: the journal found by ``find_journal_in_model`` if there is
        one, ``None`` when RoMEO has no matching journal
    :raises ValueError: when a key of ``search_terms`` is not allowed
    :raises MetadataSourceException: when RoMEO returns a journal
        without any ``<jtitle>``
    """
    allowed_fields = ['issn', 'jtitle']
    # Work on a private copy so that the caller's dictionary is left untouched
    search_terms = search_terms.copy()
    # Make the title HTML-safe before searching for it in the database or in the API
    if 'title' in search_terms:
        search_terms['title'] = kill_html(search_terms['title'])
    # Snapshot taken before diacritics are removed, reused for the retry below
    original_search_terms = search_terms.copy()

    # Check the arguments
    if not all(key in allowed_fields for key in search_terms):
        raise ValueError('The search terms have to belong to '+str(allowed_fields)+
                ' but the dictionary I got is '+str(search_terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in search_terms:
        search_terms[key] = remove_diacritics(search_terms[key])

    # First check we don't have it already
    journal = find_journal_in_model(search_terms)
    if journal:
        return journal

    # Perform the query
    root = perform_romeo_query(search_terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))
    if not journals:
        # Retry with a less restrictive matching type
        if matching_mode == 'exact':
            return fetch_journal(original_search_terms, 'contains')
        return None
    if len(journals) > 1:
        print("Warning, "+str(len(journals))+" journals match the RoMEO request, "+
                "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        # NOTE(review): `request` is not defined inside this function; confirm
        # that it exists at module level, otherwise this raise fails with a
        # NameError instead of the intended exception.
        raise MetadataSourceException('RoMEO returned a journal without title.\n'+
                'URL was: '+request)
    if len(names) > 1:
        print("Warning, "+str(len(names))+" names provided for one journal, "+
                "defaulting to the first one")
    name = kill_html(names[0].text)

    # The ISSN is optional: a missing <issn> element leaves it set to None.
    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        # Both exception types have to be listed in a tuple: the former
        # `except KeyError, IndexError` form only caught KeyError.
        pass

Example #3

0

Show file

File: protocol.py Project: Phyks/dissemin

    def get_form_initial_data(self):
        """
        Extend the parent's initial form data with the paper's abstract.

        The abstract is stripped of HTML with ``kill_html`` and is only
        added when the paper has a non-empty one.
        """
        initial = super(OSFProtocol, self).get_form_initial_data()

        abstract = self.paper.abstract
        if abstract:
            initial['abstract'] = kill_html(abstract)

        return initial

Example #4

0

Show file

File: protocol.py Project: Phyks/dissemin

    def get_form_initial_data(self):
        """
        Extend the parent's initial form data with user, abstract, topic
        and depositing-author information.

        When the paper has no abstract, a non-blocking metadata
        consolidation is requested instead. The predicted topic is dropped
        again when it is ``'OTHER'``.
        """
        initial = super(HALProtocol, self).get_form_initial_data()

        user = self.user
        paper = self.paper

        initial['first_name'] = user.first_name
        initial['last_name'] = user.last_name

        # Abstract: use it if present, otherwise ask for it in the background
        if not paper.abstract:
            paper.consolidate_metadata(wait=False)
        else:
            initial['abstract'] = kill_html(paper.abstract)

        # Topic: predicted from the abstract when available, else from the title
        topic_source = initial['abstract'] if 'abstract' in initial else paper.title
        initial['topic'] = self.predict_topic(topic_source)
        if initial['topic'] == 'OTHER':
            initial.pop('topic')

        # Depositing author: only looked up when both name parts are known
        first_name, last_name = user.first_name, user.last_name
        closest_author_idx = None
        if first_name and last_name:
            closest_author_idx = most_similar_author(
                (first_name, last_name), paper.author_name_pairs())
        initial['depositing_author'] = closest_author_idx

        return initial

Example #5

0

Show file

File: protocol.py Project: Phyks/dissemin

 def get_form_initial_data(self):
     """
     Extend the parent's initial form data with the default license
     and the paper's abstract.

     When the paper has no abstract, a non-blocking metadata
     consolidation is requested instead.
     """
     initial = super(ZenodoProtocol, self).get_form_initial_data()
     initial['license'] = ZENODO_DEFAULT_LICENSE_CHOICE

     paper = self.paper
     if not paper.abstract:
         paper.consolidate_metadata(wait=False)
     else:
         initial['abstract'] = kill_html(paper.abstract)
     return initial

Example #6

0

Show file

File: protocol.py Project: Lysxia/dissemin

 def get_form(self):
     """
     Return a ``HALForm`` pre-filled with the paper id and, when the
     paper has one, its abstract stripped of HTML.

     Without an abstract, a non-blocking metadata consolidation is
     requested instead.
     """
     paper = self.paper
     initial = {'paper_id': paper.id}
     if not paper.abstract:
         paper.consolidate_metadata(wait=False)
     else:
         initial['abstract'] = kill_html(paper.abstract)
     return HALForm(initial=initial)

Example #7

0

Show file

File: protocol.py Project: jilljenn/dissemin

 def get_form(self):
     """
     Return a ``ZenodoForm`` pre-filled with the license, the paper id
     and, when the paper has one, its abstract stripped of HTML.

     Without an abstract, a non-blocking metadata consolidation is
     requested instead.
     """
     paper = self.paper
     initial = {"license": "other-open", "paper_id": paper.id}
     if not paper.abstract:
         paper.consolidate_metadata(wait=False)
     else:
         initial["abstract"] = kill_html(paper.abstract)
     return ZenodoForm(initial=initial)

Example #8

0

Show file

File: protocol.py Project: Lysxia/dissemin

 def get_form(self):
     """
     Return a ``ZenodoForm`` whose initial data holds the license, the
     paper id and, if available, the HTML-free abstract of the paper.

     A paper without abstract triggers a non-blocking metadata
     consolidation instead.
     """
     paper = self.paper
     initial = {'license': 'other-open', 'paper_id': paper.id}
     if not paper.abstract:
         paper.consolidate_metadata(wait=False)
     else:
         initial['abstract'] = kill_html(paper.abstract)
     return ZenodoForm(initial=initial)

Example #9

0

Show file

File: ajax.py Project: Lysxia/dissemin

def waitForConsolidatedField(request):
    """
    Wait for the metadata consolidation of a paper and report one field.

    The paper is looked up from the ``id`` GET parameter; the ``field``
    GET parameter selects the field to return (only ``'abstract'`` is
    supported).

    :returns: an ``HttpResponseForbidden`` for an invalid paper id,
        otherwise a dict with a ``success`` flag and either the field
        ``value`` or an error ``message``
    """
    try:
        paper_id = int(request.GET["id"])
        paper = Paper.objects.get(pk=paper_id)
    except (KeyError, ValueError, Paper.DoesNotExist):
        return HttpResponseForbidden('Invalid paper id', content_type='text/plain')

    requested_field = request.GET.get('field')

    # The consolidation is awaited before the field name is validated
    paper.consolidate_metadata(wait=True)

    if requested_field != 'abstract':
        return {'success':False,'message':'Invalid field'}

    abstract = paper.abstract
    cleaned = kill_html(abstract)
    # Only abstracts longer than 64 characters count as a success
    is_long_enough = len(abstract) > 64
    return {'success':is_long_enough,'value':cleaned}

Example #10

0

Show file

File: ajax.py Project: Phyks/dissemin

def waitForConsolidatedField(request):
    """
    Wait for the metadata consolidation of a paper and report one field.

    The paper is looked up from the ``id`` GET parameter; the ``field``
    GET parameter selects the field to return (only ``'abstract'`` is
    supported). A ``TimeoutError`` during consolidation is ignored.

    :returns: ``(dict, 404)`` for an invalid paper id, ``(dict, 401)``
        for an unsupported field, otherwise a dict holding the
        ``success`` flag and the field ``value``
    """
    try:
        paper_id = int(request.GET["id"])
        paper = Paper.objects.get(pk=paper_id)
    except (KeyError, ValueError, Paper.DoesNotExist):
        return {'success': False, 'message': 'Invalid paper id'}, 404

    requested_field = request.GET.get('field')

    try:
        paper.consolidate_metadata(wait=True)
    except TimeoutError:
        # Zotero instance is down / slow / failing, consolidation failed.
        # Not a big deal: we carry on with whatever the paper has.
        pass

    if requested_field != 'abstract':
        return {'success': False, 'message': 'Invalid field'}, 401

    abstract = paper.abstract
    cleaned = kill_html(abstract)
    # Only abstracts longer than 64 characters count as a success
    is_long_enough = len(abstract) > 64
    return {'success': is_long_enough, 'value': cleaned}

Example #11

0

Show file

File: test_utils.py Project: Phyks/dissemin

 def test_kill_html(self):
     """Tags are removed and the trailing whitespace is stripped."""
     markup = 'My title<sub>is</sub><a href="http://dissem.in"><sup>nice</sup>    </a>'
     expected = 'My titleisnice'
     self.assertEqual(kill_html(markup), expected)

Example #12

0

Show file

    def get_or_create_publisher(self, romeo_xml_description):
        """
        Retrieves from the model, or creates into the model,
        the publisher corresponding to the <publisher> description
        from RoMEO.

        If the data from RoMEO is more fresh than what we have
        in cache, we update our model.

        :param romeo_xml_description: the ``<publisher>`` XML element
        :returns: the existing ``Publisher`` when it is at least as recent
            as the RoMEO data, otherwise the created or updated one
        :raises MetadataSourceException: when the element lacks an ``id``
            attribute, a name, or one of the preprint / postprint /
            pdf archiving policies
        """
        xml = romeo_xml_description
        # The id is mandatory
        romeo_id = None
        try:
            romeo_id = xml.attrib['id']
        except KeyError:
            raise MetadataSourceException('RoMEO did not provide a publisher id.')

        # The parent id is optional
        romeo_parent_id = None
        try:
            romeo_parent_id = xml.attrib['parentid']
        except KeyError:
            pass

        # The name is mandatory; AttributeError covers an empty <name/>
        # element, whose text is None
        name = None
        try:
            raw_name = xml.findall('./name')[0].text.strip()
            name = fromstring(kill_html(sanitize_html(raw_name))).text
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the publisher\'s name.')

        # The alias is optional
        alias = None
        try:
            alias = nstrip(xml.findall('./alias')[0].text)
            if alias:
                alias = fromstring(kill_html(sanitize_html(alias))).text
        except (KeyError, IndexError):
            pass

        last_update = self._get_romeo_date(xml, './dateupdated')

        # Check if we already have it.
        # Sadly the romeo_id is not unique (as publishers imported from doaj
        # all get the same id, so we have to use the name too).
        # NOTE(review): re.match only anchors at the start of the string, so
        # any id that begins with a digit takes the first branch - confirm
        # that this is intended.
        matches = None
        if re.match(r'\d+', romeo_id): # numeric ids are unambiguous
            matches = Publisher.objects.filter(romeo_id=romeo_id)
        elif alias:
            matches = Publisher.objects.filter(
                romeo_id=romeo_id, name__iexact=name, alias__iexact=alias)
        else:
            matches = Publisher.objects.filter(
                romeo_id=romeo_id, name__iexact=name, alias__isnull=True)
        if matches:
            first_match = matches[0]
            # Our copy is at least as recent as RoMEO's: nothing to update.
            # NOTE(review): this comparison assumes last_update is not None -
            # confirm what _get_romeo_date returns when the date is missing.
            if first_match.last_updated is not None and first_match.last_updated >= last_update:
                return matches[0]

        # Otherwise, create it
        # (or refresh the first match, see below). The home URL is optional.
        url = None
        try:
            url = nstrip(xml.findall('./homeurl')[0].text)
        except (KeyError, IndexError):
            pass

        # The three archiving policies are mandatory
        preprint = None
        try:
            preprint = xml.findall('./preprints/prearchiving')[0].text.strip()
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the preprint policy.')

        postprint = None
        try:
            postprint = xml.findall('./postprints/postarchiving')[0].text.strip()
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the postprint policy.')

        pdfversion = None
        try:
            pdfversion = xml.findall('./pdfversion/pdfarchiving')[0].text.strip()
        except (KeyError, IndexError, AttributeError):
            raise MetadataSourceException(
                'RoMEO did not provide the pdf archiving policy.')

        # Compute OA status of the publisher
        # (placeholder value, replaced once the conditions have been saved)
        status = 'UNK'

        # Reuse the first match when there is one, so that it gets updated
        if not matches:
            publisher = Publisher()
        else:
            publisher = matches[0]

        publisher.name = name
        publisher.alias = alias
        publisher.url = url
        publisher.preprint = preprint
        publisher.postprint = postprint
        publisher.pdfversion = pdfversion
        publisher.romeo_id = romeo_id
        publisher.romeo_parent_id = romeo_parent_id
        publisher.oa_status = status
        publisher.last_updated = last_update
        publisher.save()

        # An updated publisher gets its related records rebuilt from scratch
        if matches:
            publisher.publishercopyrightlink_set.all().delete()
            publisher.publisherrestrictiondetail_set.all().delete()
            publisher.publishercondition_set.all().delete()

        # Add the conditions, restrictions, and copyright
        for restriction in xml.findall('./preprints/prerestrictions/prerestriction'):
            self.add_restriction(restriction, 'preprint', publisher)

        for restriction in xml.findall('./postprints/postrestrictions/postrestriction'):
            self.add_restriction(restriction, 'postprint', publisher)

        for restriction in xml.findall('./pdfversion/pdfrestrictions/pdfrestriction'):
            self.add_restriction(restriction, 'pdfversion', publisher)

        # Conditions without text are skipped
        for condition in xml.findall('./conditions/condition'):
            if condition.text:
                c = PublisherCondition(publisher=publisher,
                                       text=condition.text.strip())
                c.save()

        # Update the publisher status
        publisher.oa_status = publisher.classify_oa_status()
        publisher.save(update_fields=['oa_status'])

        # TODO: if the OA status has changed, then we should update the journals and papers accordingly with the
        # adequate task

        # A copyright link is only stored when it has both a text and a URL;
        # the URL is truncated to 1024 characters
        for link in xml.findall('./copyrightlinks/copyrightlink'):
            text = None
            url = None
            texts = link.findall('./copyrightlinktext')
            if texts:
                text = nstrip(texts[0].text)
            urls = link.findall('./copyrightlinkurl')
            if urls:
                url = nstrip(urls[0].text)
            if url and text:
                cplink = PublisherCopyrightLink(
                    text=text, url=url[:1024], publisher=publisher)
                cplink.save()

        return publisher

Example #13

0

Show file

def create_paper_plain_fingerprint(title, authors, year):
    """
    Build a normalized, human-readable summary of a bibliographic
    reference. The result is meant to be hashed afterwards to obtain
    the actual, constant-length fingerprint.

    :param title: the title of the paper
    :param authors: the list of author names, represented
        as (first_name, last_name) pairs
    :param year: the year of publication of the paper

    >>> create_paper_plain_fingerprint(' It  cleans whitespace And Case\\n',[('John','Doe')], 2015)
    u'it-cleans-whitespace-and-case/doe'
    >>> create_paper_plain_fingerprint('HTML tags are <emph>removed</emph>',[('John','Doe')], 2015)
    u'html-tags-are-removed/doe'
    >>> create_paper_plain_fingerprint('Les accents sont supprimés', [('John','Doe')],2015)
    u'les-accents-sont-supprimes/doe'
    >>> create_paper_plain_fingerprint('Long titles are unambiguous enough to be unique by themselves, no need for authors', [('John','Doe')], 2015)
    u'long-titles-are-unambiguous-enough-to-be-unique-by-themselves-no-need-for-authors'
    >>> create_paper_plain_fingerprint('Ambiguity', [('John','Doe')], 2014)
    u'ambiguity-2014/doe'
    """
    # Normalize the title: no HTML, no diacritics, lowercase, dashes as separators
    normalized = remove_diacritics(kill_html(title)).lower()
    normalized = stripped_chars.sub('', normalized).strip()
    normalized = re.sub('[ -]+', '-', normalized)

    # A long title is unambiguous enough on its own
    if len(normalized) > 50:
        return normalized

    fingerprint = normalized

    # A one-word title ("Preface", "Introduction", ...) also gets the year
    if '-' not in normalized:
        fingerprint += '-' + str(year)

    name_parts = []
    for author in authors:
        if not author:
            continue
        _, last_name = (remove_diacritics(author[0]),
                        remove_diacritics(author[1]))

        # Keep the capitalized words of the last name and those following
        # a dash, which drops small words such as "van", "der", "de"
        words, separators = split_name_words(last_name)
        significant = [
            word for idx, word in enumerate(words)
            if word[0].isupper() or (idx > 0 and separators[idx - 1] == '-')
        ]

        # If no word qualified, fall back on all the words
        if not significant:
            significant = words

        name_parts.append('-'.join(map(ulower, significant)))

    # Authors are appended in sorted order, each one prefixed by a slash
    return fingerprint + ''.join('/' + part for part in sorted(name_parts))

Example #14

0

Show file

File: protocol.py Project: jilljenn/dissemin

    def createMetadata(self, form):
        """
        Build the metadata payload describing ``self.paper`` for a deposit.

        :param form: an object exposing ``cleaned_data``, from which the
            ``abstract`` and ``license`` entries are read
        :returns: a dict of the form ``{"metadata": {...}}``
        """
        metadata = {}
        oairecords = self.paper.sorted_oai_records
        publications = self.paper.publication_set.all()

        # Document type
        dt = swordDocumentType(self.paper)
        metadata['upload_type'] = dt[0]
        if dt[0] == 'publication':
            metadata['publication_type'] = dt[1]

        # Publication date
        metadata['publication_date'] = self.paper.pubdate.isoformat()

        # Title
        metadata['title'] = self.paper.title

        # Creators
        def formatAuthor(author):
            res = {'name': author.name.last + ', ' + author.name.first}
            if author.researcher and author.researcher.orcid:
                res['orcid'] = author.researcher.orcid
            # TODO: affiliation
            return res

        # Built as a real list: on Python 3 a bare map() would store a lazy
        # iterator in the metadata instead of the list of creators.
        metadata['creators'] = [
            formatAuthor(author) for author in self.paper.authors]

        # Abstract
        # If we are currently fetching the abstract, wait for the task to complete
        if self.paper.task:
            self.paper.consolidate_metadata(wait=True)
        # The abstract typed in the form wins over the one stored on the paper
        abstract = form.cleaned_data['abstract'] or kill_html(
            self.paper.abstract)

        metadata['description'] = abstract

        # Access right: TODO

        # License
        metadata['license'] = form.cleaned_data['license']

        # Embargo date: TODO

        # DOI
        # Every publication overwrites the DOI until one with a publication
        # date is found; that one also provides the journal details.
        for publi in publications:
            metadata['doi'] = publi.doi
            if publi.pubdate:
                metadata['publication_date'] = publi.pubdate.isoformat()
                if publi.journal:
                    metadata['journal_title'] = publi.journal.title
                else:
                    metadata['journal_title'] = publi.title
                if publi.volume:
                    metadata['journal_volume'] = publi.volume
                if publi.issue:
                    metadata['journal_issue'] = publi.issue
                if publi.pages:
                    metadata['journal_pages'] = publi.pages
                if publi.container:
                    metadata['conference_title'] = publi.container
                break

        # Keywords TODO (this involves having separated keywords in OAI records.)

        # Notes TODO
        # metadata['notes'] = 'Uploaded by dissem.in on behalf of ' ...

        # Related identifiers
        # A list comprehension is required here: the result of map() has no
        # append() method on Python 3.
        idents = [
            {
                'relation': 'isAlternateIdentifier',
                'identifier': r.splash_url
            } for r in oairecords]
        for publi in publications:
            if publi.journal and publi.journal.issn:
                idents.append({
                    'relation': 'isPartOf',
                    'identifier': publi.journal.issn
                })
        # NOTE(review): `idents` is never stored in `metadata` - confirm
        # whether the related identifiers are meant to be part of the payload.

        data = {"metadata": metadata}
        return data

Example #15

0

Show file

File: protocol.py Project: NikolaJankovic/dissemin

    def createMetadata(self, form):
        """
        Build the metadata payload describing ``self.paper`` for a deposit.

        :param form: an object exposing ``cleaned_data``, from which the
            ``abstract`` and ``license`` entries are read
        :returns: a dict of the form ``{"metadata": {...}}``
        """
        metadata = {}
        oairecords = self.paper.sorted_oai_records
        publications = self.paper.publications

        # Document type
        dt = swordDocumentType(self.paper)
        metadata['upload_type'] = dt[0]
        if dt[0] == 'publication':
            metadata['publication_type'] = dt[1]

        # Publication date
        metadata['publication_date'] = self.paper.pubdate.isoformat()

        # Title
        metadata['title'] = self.paper.title

        # Creators
        def formatAuthor(author):
            res = {'name': author.name.last + ', ' + author.name.first}
            if author.researcher and author.researcher.orcid:
                res['orcid'] = author.researcher.orcid
            # TODO: affiliation
            return res

        # Built as a real list: on Python 3 a bare map() would store a lazy
        # iterator in the metadata instead of the list of creators.
        metadata['creators'] = [
            formatAuthor(author) for author in self.paper.authors]

        # Abstract
        # The abstract typed in the form wins over the one stored on the paper
        abstract = form.cleaned_data['abstract'] or kill_html(
            self.paper.abstract)

        metadata['description'] = abstract

        # Access right: TODO

        # License
        metadata['license'] = form.cleaned_data['license']

        # Embargo date: TODO

        # DOI
        # Every publication overwrites the DOI until one with a publication
        # date is found; that one also provides the journal details.
        for publi in publications:
            metadata['doi'] = publi.doi
            if publi.pubdate:
                metadata['publication_date'] = publi.pubdate.isoformat()
                if publi.journal:
                    metadata['journal_title'] = publi.journal.title
                else:
                    metadata['journal_title'] = publi.journal_title
                if publi.volume:
                    metadata['journal_volume'] = publi.volume
                if publi.issue:
                    metadata['journal_issue'] = publi.issue
                if publi.pages:
                    metadata['journal_pages'] = publi.pages
                if publi.container:
                    metadata['conference_title'] = publi.container
                break

        # Related identifiers
        # A list comprehension is required here: the result of map() has no
        # append() method on Python 3.
        idents = [
            {
                'relation': 'isAlternateIdentifier',
                'identifier': r.splash_url
            } for r in oairecords]
        for publi in publications:
            if publi.journal and publi.journal.issn:
                idents.append({
                    'relation': 'isPartOf',
                    'identifier': publi.journal.issn
                })
        # NOTE(review): `idents` is never stored in `metadata` - confirm
        # whether the related identifiers are meant to be part of the payload.

        data = {"metadata": metadata}
        return data

Example #16

0

Show file

File: protocol.py Project: Phyks/dissemin

    def createMetadata(self, form):
        """
        Assemble the metadata payload describing ``self.paper``.

        :param form: an object exposing ``cleaned_data``, from which the
            ``abstract`` and ``license`` entries are read
        :returns: a dict of the form ``{"metadata": {...}}``
        """
        metadata = {}
        paper = self.paper
        records = paper.sorted_oai_records
        publications = paper.publications

        # Document type (the sub-type only applies to publications)
        doc_type = swordDocumentType(paper)
        metadata['upload_type'] = doc_type[0]
        if doc_type[0] == 'publication':
            metadata['publication_type'] = doc_type[1]

        # Publication date and title
        metadata['publication_date'] = paper.pubdate.isoformat()
        metadata['title'] = paper.title

        # Creators
        def describe_creator(author):
            entry = {'name': author.name.last+', '+author.name.first}
            if author.researcher and author.researcher.orcid:
                entry['orcid'] = author.researcher.orcid
            # TODO: affiliation
            return entry
        metadata['creators'] = [describe_creator(a) for a in paper.authors]

        # Abstract: the one typed in the form wins over the paper's own
        description = form.cleaned_data['abstract']
        if not description:
            description = kill_html(paper.abstract)
        metadata['description'] = description

        # Access right: TODO

        # License
        metadata['license'] = form.cleaned_data['license']

        # Embargo date: TODO

        # DOI: every publication overwrites it until one with a publication
        # date is found; that one also provides the journal details
        for publi in publications:
            metadata['doi'] = publi.doi
            if not publi.pubdate:
                continue
            metadata['publication_date'] = publi.pubdate.isoformat()
            metadata['journal_title'] = (
                publi.journal.title if publi.journal else publi.journal_title)
            optional_fields = (
                ('journal_volume', publi.volume),
                ('journal_issue', publi.issue),
                ('journal_pages', publi.pages),
                ('conference_title', publi.container),
            )
            for key, value in optional_fields:
                if value:
                    metadata[key] = value
            break

        # Related identifiers
        idents = []
        for record in records:
            idents.append({
                'relation': 'isAlternateIdentifier',
                'identifier': record.splash_url
            })
        for publi in publications:
            if publi.journal and publi.journal.issn:
                idents.append(
                    {'relation': 'isPartOf', 'identifier': publi.journal.issn})

        return {"metadata": metadata}

Example #17

0

Show file

File: fingerprint.py Project: Phyks/dissemin

def create_paper_plain_fingerprint(title, authors, year):
    """
    Build a normalized, human-readable summary of a bibliographic
    reference. The result is meant to be hashed afterwards to obtain
    the actual, constant-length fingerprint.

    :param title: the title of the paper
    :param authors: the list of author names, represented
        as (first_name, last_name) pairs
    :param year: the year of publication of the paper

    >>> create_paper_plain_fingerprint(' It  cleans whitespace And Case\\n',[('John','Doe')], 2015)
    'it-cleans-whitespace-and-case/doe'
    >>> create_paper_plain_fingerprint('HTML tags are <emph>removed</emph>',[('John','Doe')], 2015)
    'html-tags-are-removed/doe'
    >>> create_paper_plain_fingerprint('Les accents sont supprimés', [('John','Doe')],2015)
    'les-accents-sont-supprimes/doe'
    >>> create_paper_plain_fingerprint('Long titles are unambiguous enough to be unique by themselves, no need for authors', [('John','Doe')], 2015)
    'long-titles-are-unambiguous-enough-to-be-unique-by-themselves-no-need-for-authors'
    >>> create_paper_plain_fingerprint('Ambiguity', [('John','Doe')], 2014)
    'ambiguity-2014/doe'
    """
    # Normalize the title: no HTML, no diacritics, lowercase, dashes as separators
    cleaned = kill_html(title)
    cleaned = remove_diacritics(cleaned).lower()
    cleaned = stripped_chars.sub('', cleaned).strip()
    slug = re.sub('[ -]+', '-', cleaned)

    # A long title is unambiguous enough on its own
    if len(slug) > 50:
        return slug

    pieces = [slug]

    # A one-word title ("Preface", "Introduction", ...) also gets the year
    if '-' not in slug:
        pieces.append('-'+str(year))

    author_fingerprints = []
    for author in authors:
        if not author:
            continue
        _, last_name = (remove_diacritics(author[0]),
                        remove_diacritics(author[1]))

        # Keep the capitalized words of the last name and those following
        # a dash, which drops small words such as "van", "der", "de"
        words, separators = split_name_words(last_name)
        kept = []
        for position, word in enumerate(words):
            follows_dash = position > 0 and separators[position-1] == '-'
            if word[0].isupper() or follows_dash:
                kept.append(word)

        # If no word qualified, fall back on all the words
        if not kept:
            kept = words

        author_fingerprints.append('-'.join(ulower(word) for word in kept))

    # Authors are appended in sorted order, each one prefixed by a slash
    pieces.extend('/'+fp for fp in sorted(author_fingerprints))
    return ''.join(pieces)

Example #18

0

Show file

File: protocol.py Project: Lysxia/dissemin

    def createMetadata(self, form):
        """
        Build the metadata payload describing ``self.paper`` for a deposit.

        :param form: an object exposing ``cleaned_data``, from which the
            ``abstract`` and ``license`` entries are read
        :returns: a dict of the form ``{"metadata": {...}}``
        """
        metadata = {}
        oairecords = self.paper.sorted_oai_records
        publications = self.paper.publications

        # Document type
        dt = swordDocumentType(self.paper)
        metadata['upload_type'] = dt[0]
        if dt[0] == 'publication':
            metadata['publication_type'] = dt[1]

        # Publication date
        metadata['publication_date'] = self.paper.pubdate.isoformat()

        # Title
        metadata['title'] = self.paper.title

        # Creators
        def formatAuthor(author):
            res = {'name':author.name.last+', '+author.name.first}
            if author.researcher and author.researcher.orcid:
                res['orcid'] = author.researcher.orcid
            # TODO: affiliation
            return res
        # Built as a real list: on Python 3 a bare map() would store a lazy
        # iterator in the metadata instead of the list of creators.
        metadata['creators'] = [
            formatAuthor(author) for author in self.paper.authors]

        # Abstract
        # If we are currently fetching the abstract, wait for the task to complete
        if self.paper.task:
            self.paper.consolidate_metadata(wait=True)
        # The abstract typed in the form wins over the one stored on the paper
        abstract = form.cleaned_data['abstract'] or kill_html(self.paper.abstract)

        metadata['description'] = abstract

        # Access right: TODO

        # License
        metadata['license'] = form.cleaned_data['license']

        # Embargo date: TODO

        # DOI
        # Every publication overwrites the DOI until one with a publication
        # date is found; that one also provides the journal details.
        for publi in publications:
            metadata['doi'] = publi.doi
            if publi.pubdate:
                metadata['publication_date'] = publi.pubdate.isoformat()
                if publi.journal:
                    metadata['journal_title'] = publi.journal.title
                else:
                    metadata['journal_title'] = publi.title
                if publi.volume:
                    metadata['journal_volume'] = publi.volume
                if publi.issue:
                    metadata['journal_issue'] = publi.issue
                if publi.pages:
                    metadata['journal_pages'] = publi.pages
                if publi.container:
                    metadata['conference_title'] = publi.container
                break

        # Keywords TODO (this involves having separated keywords in OAI records.)

        # Notes TODO
        # metadata['notes'] = 'Uploaded by dissem.in on behalf of ' ...

        # Related identifiers
        # A list comprehension is required here: the result of map() has no
        # append() method on Python 3.
        idents = [
            {'relation':'isAlternateIdentifier','identifier':r.splash_url}
            for r in oairecords]
        for publi in publications:
            if publi.journal and publi.journal.issn:
                idents.append({'relation':'isPartOf','identifier':publi.journal.issn})
        # NOTE(review): `idents` is never stored in `metadata` - confirm
        # whether the related identifiers are meant to be part of the payload.

        data = {"metadata": metadata}
        return data

Example #19

0

Show file

File: romeo.py Project: tarsbase/dissemin

def fetch_journal(search_terms, matching_mode='exact'):
    """
    Fetch the journal data from RoMEO.

    :param search_terms: a dictionary whose keys must all belong to
        ``allowed_fields`` (``issn``, ``jtitle``). It is copied, so the
        caller's dictionary is never modified.
    :param matching_mode: any value other than ``'exact'`` is sent to
        RoMEO as the ``qtype`` term
    :returns: the journal found by ``find_journal_in_model`` if there is
        one, otherwise a newly saved ``Journal``; ``None`` when a search
        term is longer than 256 characters or when RoMEO returns no
        journal or no publisher
    :raises ValueError: when a key of ``search_terms`` is not allowed
    :raises MetadataSourceException: when RoMEO returns a journal
        without any ``<jtitle>``
    """
    allowed_fields = ['issn', 'jtitle']
    terms = search_terms.copy()
    # Make the title HTML-safe before searching for it in the database or in
    # the API
    if 'title' in terms:
        terms['title'] = kill_html(terms['title'])

    # Check the arguments
    if not all(key in allowed_fields for key in terms):
        raise ValueError('The search terms have to belong to ' +
                         str(allowed_fields) + ' but the dictionary I got is ' +
                         str(terms))

    # Remove diacritics (because it has to be sent in ASCII to ROMEO)
    for key in terms:
        terms[key] = remove_diacritics(terms[key])
        # Overly long terms are not looked up at all
        if len(terms[key]) > 256:
            return None

    # First check we don't have it already
    journal = find_journal_in_model(terms)
    if journal:
        return journal

    # Perform the query
    if matching_mode != 'exact':
        terms['qtype'] = matching_mode
    root = perform_romeo_query(terms)

    # Find the matching journals (if any)
    journals = list(root.findall('./journals/journal'))

    if not journals:
        return None
    elif len(journals) > 1:
        print("Warning, " + str(len(journals)) +
              " journals match the RoMEO request, " +
              "defaulting to the first one")
        # TODO different behaviour: get the ISSN and try again.
    journal = journals[0]

    names = list(journal.findall('./jtitle'))
    if not names:
        # str() is used rather than the Python-2-only unicode() builtin, so
        # that the intended exception is raised on Python 3 as well
        raise MetadataSourceException(
            'RoMEO returned a journal without title.\n' + 'Terms were: ' +
            str(terms))
    if len(names) > 1:
        print("Warning, " + str(len(names)) +
              " names provided for one journal, " +
              "defaulting to the first one")
    name = kill_html(names[0].text)

    # The ISSN is optional: a missing <issn> element leaves it set to None
    issn = None
    try:
        issn = nstrip(journal.findall('./issn')[0].text)
    except (KeyError, IndexError):
        pass

    # Now we may have additional info, so it's worth trying again in the model
    model_journal = find_journal_in_model({'issn': issn, 'jtitle': name})
    if model_journal:
        return model_journal

    # Otherwise we need to find the publisher
    publishers = root.findall('./publishers/publisher')
    if not publishers:
        return None
    # TODO here we shouldn't default to the first one but look it up using the
    # <romeopub>
    publisher_desc = publishers[0]

    publisher = get_or_create_publisher(publisher_desc)

    result = Journal(title=name, issn=issn, publisher=publisher)
    result.save()
    return result