Example #1
 def get_normalised_fulltext(self):
     if self.data.get("index", {}).get("fulltext") is not None:
         return self.data["index"]["fulltext"]
     fulltexts = self.bibjson().get_urls(constants.LINK_TYPE_FULLTEXT)
     if len(fulltexts) == 0:
         return None
     try:
         return normalise.normalise_url(fulltexts[0])
     except ValueError:
         # can't be normalised, so we just return the url as-is
         return fulltexts[0]
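The try/except pattern above (normalise if possible, otherwise fall back to the raw value) recurs throughout the examples below. A minimal self-contained sketch of that pattern, using a hypothetical stand-in normaliser rather than the project's real normalise module:

from urllib.parse import urlparse

def normalise_url(url):
    # hypothetical stand-in for the project's normalise.normalise_url:
    # trim whitespace, drop the scheme, and raise ValueError when there is no host
    # (simplified: it ignores params, query strings and fragments)
    parsed = urlparse(url.strip())
    if not parsed.netloc:
        raise ValueError("could not normalise url: " + url)
    return "//" + parsed.netloc + parsed.path

def normalised_or_raw(url):
    # try the normal form first; if the url can't be normalised, return it as-is
    try:
        return normalise_url(url)
    except ValueError:
        return url

print(normalised_or_raw("  http://example.com/fulltext "))  # //example.com/fulltext
print(normalised_or_raw("not a url"))                       # not a url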
Example #2
File: article.py Project: DOAJ/doaj
 def get_normalised_fulltext(self):
     if self.data.get("index", {}).get("fulltext") is not None:
         return self.data["index"]["fulltext"]
     fulltexts = self.bibjson().get_urls(constants.LINK_TYPE_FULLTEXT)
     if len(fulltexts) == 0:
         return None
     try:
         return normalise.normalise_url(fulltexts[0])
     except ValueError:
         # can't be normalised, so we just return the url as-is
         return fulltexts[0]
Example #3
    def test_01_normalise_url(self, name, kwargs):

        url_arg = kwargs.get("url")
        scheme_arg = kwargs.get("scheme")
        whitespace_arg = kwargs.get("whitespace")

        raises_arg = kwargs.get("raises")
        raises = EXCEPTIONS.get(raises_arg)

        ###############################################
        ## set up

        canonicalUrl = None
        if url_arg != "none":
            canonicalUrl = "//example.com/path;p=1?query=one&two=three#frag"

        url = canonicalUrl
        if scheme_arg == "none" and url is not None:
            url = url[2:]
        if scheme_arg not in ["-", "invalid", "none", "//"]:
            url = scheme_arg + ":" + url
        elif scheme_arg == "invalid":
            url = "somerubbish:" + url
        elif scheme_arg == "unknown":
            url = "unknown:" + url

        if whitespace_arg == "yes":
            url = "   " + url + "\t\n"

        ###########################################################
        # Execution

        if raises is not None:
            with self.assertRaises(raises):
                norm = normalise_url(url)
        else:
            norm = normalise_url(url)
            assert norm == canonicalUrl
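The parameter matrix that drives this test is not shown in the excerpt; one hypothetical kwargs set for the happy path, using the same keys the test reads above, would look like this:

example_kwargs = {
    "url": "yes",         # build the canonical URL rather than passing None
    "scheme": "http",     # prepend "http:" to the scheme-relative form
    "whitespace": "yes",  # wrap the URL in leading/trailing whitespace
    "raises": None,       # no exception expected; the test then asserts the
                          # result equals canonicalUrl
}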
Example #4
    def duplicates(cls,
                   publisher_record_id=None,
                   doi=None,
                   fulltexts=None,
                   title=None,
                   volume=None,
                   number=None,
                   start=None,
                   should_match=None,
                   size=10):
        # some input sanitisation
        urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) else []

        # make sure that we're dealing with the normal form of the identifiers
        norm_urls = []
        for url in urls:
            try:
                norm = normalise.normalise_url(url)
                norm_urls.append(norm)
            except ValueError:
                # use the non-normal form
                norm_urls.append(url)
        urls = norm_urls

        try:
            doi = normalise.normalise_doi(doi)
        except ValueError:
            # leave the doi as it is
            pass

        q = DuplicateArticleQuery(publisher_record_id=publisher_record_id,
                                  doi=doi,
                                  urls=urls,
                                  title=title,
                                  volume=volume,
                                  number=number,
                                  start=start,
                                  should_match=should_match,
                                  size=size)

        res = cls.query(q=q.query())
        return [
            cls(**hit.get("_source"))
            for hit in res.get("hits", {}).get("hits", [])
        ]
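The one-line conditional expression at the top of duplicates() accepts either a single fulltext URL or a list of them; a standalone illustration of that coercion (the helper name is hypothetical, not the project's):

def coerce_fulltexts(fulltexts):
    # a list passes through, a single string is wrapped, anything else becomes []
    if isinstance(fulltexts, list):
        return fulltexts
    if isinstance(fulltexts, str):
        return [fulltexts]
    return []

print(coerce_fulltexts("http://example.com/1"))        # ['http://example.com/1']
print(coerce_fulltexts(["http://example.com/1"]))      # ['http://example.com/1']
print(coerce_fulltexts(None))                          # []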
Example #5
    def _generate_index(self):
        # the index fields we are going to generate
        issns = []
        subjects = []
        schema_subjects = []
        schema_codes = []
        classification = []
        langs = []
        country = None
        licenses = []
        publisher = []
        classification_paths = []
        unpunctitle = None
        asciiunpunctitle = None
        doi = None
        fulltext = None

        # the places we're going to get those fields from
        cbib = self.bibjson()
        jindex = self.data.get('index', {})
        hist = self.history()

        # get the issns out of the current bibjson
        issns += cbib.get_identifiers(cbib.P_ISSN)
        issns += cbib.get_identifiers(cbib.E_ISSN)

        # get the issn from the journal bibjson
        if isinstance(cbib.journal_issns, list):
            issns += cbib.journal_issns

        # de-duplicate the issns
        issns = list(set(issns))

        # now get the issns out of the historic records
        for date, hbib in hist:
            issns += hbib.get_identifiers(hbib.P_ISSN)
            issns += hbib.get_identifiers(hbib.E_ISSN)

        # get the subjects and concatenate them with their schemes from the current bibjson
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            subjects.append(term)
            schema_subjects.append(scheme + ":" + term)
            classification.append(term)
            if "code" in subs:
                schema_codes.append(scheme + ":" + subs.get("code"))

        # copy the languages
        from portality import datasets  # delayed import, as it loads some stuff from file
        if len(cbib.journal_language) > 0:
            langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

        # copy the country
        if jindex.get('country'):
            country = jindex.get('country')
        elif cbib.journal_country:
            country = xwalk.get_country_name(cbib.journal_country)

        # get the title of the license
        lic = cbib.get_journal_license()
        if lic is not None:
            licenses.append(lic.get("title"))

        # copy the publisher/provider
        if cbib.publisher:
            publisher.append(cbib.publisher)

        # deduplicate the lists
        issns = list(set(issns))
        subjects = list(set(subjects))
        schema_subjects = list(set(schema_subjects))
        classification = list(set(classification))
        licenses = list(set(licenses))
        publisher = list(set(publisher))
        langs = list(set(langs))
        schema_codes = list(set(schema_codes))

        # work out what the date of publication is
        date = cbib.get_publication_date()

        # calculate the classification paths
        from portality.lcc import lcc  # inline import since this hits the database
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            if scheme == "LCC":
                path = lcc.pathify(term)
                if path is not None:
                    classification_paths.append(path)

        # normalise the classification paths, so we only store the longest ones
        classification_paths = lcc.longest(classification_paths)

        # create an unpunctitle
        if cbib.title is not None:
            throwlist = string.punctuation + '\n\t'
            unpunctitle = "".join(c for c in cbib.title
                                  if c not in throwlist).strip()
            try:
                asciiunpunctitle = unidecode(unpunctitle)
            except:
                asciiunpunctitle = unpunctitle

        # determine if the seal is applied
        has_seal = "Yes" if self.has_seal() else "No"

        # create a normalised version of the DOI for deduplication
        source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
        try:
            doi = normalise.normalise_doi(source_doi)
        except ValueError as e:
            # if we can't normalise the DOI, just store it as-is.
            doi = source_doi

        # create a normalised version of the fulltext URL for deduplication
        fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) > 0:
            source_fulltext = fulltexts[0]
            try:
                fulltext = normalise.normalise_url(source_fulltext)
            except ValueError as e:
                # if we can't normalise the fulltext store it as-is
                fulltext = source_fulltext

        # build the index part of the object
        self.data["index"] = {}
        if len(issns) > 0:
            self.data["index"]["issn"] = issns
        if date != "":
            self.data["index"]["date"] = date
            self.data["index"][
                "date_toc_fv_month"] = date  # Duplicated so we can have year/month facets in fv2
        if len(subjects) > 0:
            self.data["index"]["subject"] = subjects
        if len(schema_subjects) > 0:
            self.data["index"]["schema_subject"] = schema_subjects
        if len(classification) > 0:
            self.data["index"]["classification"] = classification
        if len(publisher) > 0:
            self.data["index"]["publisher"] = publisher
        if len(licenses) > 0:
            self.data["index"]["license"] = licenses
        if len(langs) > 0:
            self.data["index"]["language"] = langs
        if country is not None:
            self.data["index"]["country"] = country
        if len(schema_codes) > 0:
            self.data["index"]["schema_code"] = schema_codes
        if len(classification_paths) > 0:
            self.data["index"]["classification_paths"] = classification_paths
        if unpunctitle is not None:
            self.data["index"]["unpunctitle"] = unpunctitle
        if asciiunpunctitle is not None:
            self.data["index"]["asciiunpunctitle"] = unpunctitle
        if has_seal:
            self.data["index"]["has_seal"] = has_seal
        if doi is not None:
            self.data["index"]["doi"] = doi
        if fulltext is not None:
            self.data["index"]["fulltext"] = fulltext
Example #6
    def duplicates(cls,
                   issns=None,
                   publisher_record_id=None,
                   doi=None,
                   fulltexts=None,
                   title=None,
                   volume=None,
                   number=None,
                   start=None,
                   should_match=None,
                   size=10):
        # some input sanitisation
        issns = issns if isinstance(issns, list) else []
        urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) else []

        # make sure that we're dealing with the normal form of the identifiers
        norm_urls = []
        for url in urls:
            try:
                norm = normalise.normalise_url(url)
                norm_urls.append(norm)
            except ValueError:
                # use the non-normal form
                norm_urls.append(url)
        urls = norm_urls

        try:
            doi = normalise.normalise_doi(doi)
        except ValueError:
            # leave the doi as it is
            pass

        # in order to make sure we don't send too many terms to the ES query, break the issn list down into chunks
        terms_limit = app.config.get("ES_TERMS_LIMIT", 1024)
        issn_groups = []
        lower = 0
        upper = terms_limit
        while lower < len(issns):
            issn_groups.append(issns[lower:upper])
            lower = upper
            upper = lower + terms_limit

        if issns is not None and len(issns) > 0:
            duplicate_articles = []
            for g in issn_groups:
                q = DuplicateArticleQuery(
                    issns=g,
                    publisher_record_id=publisher_record_id,
                    doi=doi,
                    urls=urls,
                    title=title,
                    volume=volume,
                    number=number,
                    start=start,
                    should_match=should_match,
                    size=size)
                # print json.dumps(q.query())

                res = cls.query(q=q.query())
                duplicate_articles += [
                    cls(**hit.get("_source"))
                    for hit in res.get("hits", {}).get("hits", [])
                ]

            return duplicate_articles
        else:
            q = DuplicateArticleQuery(publisher_record_id=publisher_record_id,
                                      doi=doi,
                                      urls=urls,
                                      title=title,
                                      volume=volume,
                                      number=number,
                                      start=start,
                                      should_match=should_match,
                                      size=size)
            # print json.dumps(q.query())

            res = cls.query(q=q.query())
            return [
                cls(**hit.get("_source"))
                for hit in res.get("hits", {}).get("hits", [])
            ]
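The while loop near the top of duplicates() above chunks the ISSN list so that no single terms query exceeds ES_TERMS_LIMIT; the same grouping with a small, made-up limit:

issns = ["1111-1111", "2222-2222", "3333-3333", "4444-4444", "5555-5555"]
terms_limit = 3  # illustrative; the real value comes from app.config.get("ES_TERMS_LIMIT", 1024)
issn_groups = [issns[i:i + terms_limit] for i in range(0, len(issns), terms_limit)]
# issn_groups == [['1111-1111', '2222-2222', '3333-3333'], ['4444-4444', '5555-5555']]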
Example #7
def _read_match_set(reader, next_row):
    n_matches = -1
    match_set = MatchSet()
    while True:
        if next_row is not None:
            row = deepcopy(next_row)
            next_row = None
        else:
            try:
                row = next(reader)
            except StopIteration:
                return match_set, None

        if row is None:
            return match_set, None

        a_id = row[0]
        root = match_set.root
        if root is not None and a_id != root["id"]:
            return match_set, row

        a_created = row[1]
        try:
            a_doi = normalise.normalise_doi(row[2])
        except:
            a_doi = row[2]
        try:
            a_ft = normalise.normalise_url(row[3])
        except:
            a_ft = row[3]
        a_owner = row[4]
        a_issns = row[5]
        a_in_doaj = row[6] == "True"

        if n_matches == -1:
            # first row of the set: record how many matches to expect
            n_matches = int(row[7])

        match_type = row[8]

        b_id = row[9]
        b_created = row[10]
        try:
            b_doi = normalise.normalise_doi(row[11])
        except:
            b_doi = row[11]
        try:
            b_ft = normalise.normalise_url(row[12])
        except:
            b_ft = row[12]
        b_owner = row[13]
        b_issns = row[14]
        b_in_doaj = row[15] == "True"

        title_match = row[17] == "True"

        if root is None:
            match_set.add_root(a_id, a_created, a_doi, a_ft, a_owner, a_issns,
                               a_in_doaj, title_match)

        match_set.add_match(b_id, b_created, b_doi, b_ft, b_owner, b_issns,
                            b_in_doaj, title_match, match_type)

    # a catch to make sure that everything is ok with the match set detection
    assert n_matches + 1 == len(match_set.matches)
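A minimal, hypothetical driver for _read_match_set, assuming the rows come from a csv.reader over a duplicates report with the column layout indexed above (the report path and the header row are assumptions, not part of the excerpt):

import csv

def iter_match_sets(csv_path):
    # yield one match set per group of rows sharing the same root article id
    with open(csv_path, newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # assumed header row; remove if the report has none
        next_row = None
        while True:
            match_set, next_row = _read_match_set(reader, next_row)
            yield match_set
            if next_row is None:
                break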
Example #8
File: article.py Project: DOAJ/doaj
    def _generate_index(self):
        # the index fields we are going to generate
        issns = []
        subjects = []
        schema_subjects = []
        schema_codes = []
        classification = []
        langs = []
        country = None
        licenses = []
        publisher = []
        classification_paths = []
        unpunctitle = None
        asciiunpunctitle = None
        doi = None
        fulltext = None

        # the places we're going to get those fields from
        cbib = self.bibjson()
        jindex = self.data.get('index', {})
        hist = self.history()

        # get the issns out of the current bibjson
        issns += cbib.get_identifiers(cbib.P_ISSN)
        issns += cbib.get_identifiers(cbib.E_ISSN)

        # get the issn from the journal bibjson
        if isinstance(cbib.journal_issns, list):
            issns += cbib.journal_issns

        # de-duplicate the issns
        issns = list(set(issns))

        # now get the issns out of the historic records
        for date, hbib in hist:
            issns += hbib.get_identifiers(hbib.P_ISSN)
            issns += hbib.get_identifiers(hbib.E_ISSN)

        # get the subjects and concatenate them with their schemes from the current bibjson
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            subjects.append(term)
            schema_subjects.append(scheme + ":" + term)
            classification.append(term)
            if "code" in subs:
                schema_codes.append(scheme + ":" + subs.get("code"))

        # copy the languages
        from portality import datasets  # delayed import, as it loads some stuff from file
        if len(cbib.journal_language) > 0:
            langs = [datasets.name_for_lang(l) for l in cbib.journal_language]

        # copy the country
        if jindex.get('country'):
            country = jindex.get('country')
        elif cbib.journal_country:
            country = xwalk.get_country_name(cbib.journal_country)

        # get the title of the license
        lic = cbib.get_journal_license()
        if lic is not None:
            licenses.append(lic.get("title"))

        # copy the publisher/provider
        if cbib.publisher:
            publisher.append(cbib.publisher)

        # deduplicate the lists
        issns = list(set(issns))
        subjects = list(set(subjects))
        schema_subjects = list(set(schema_subjects))
        classification = list(set(classification))
        licenses = list(set(licenses))
        publisher = list(set(publisher))
        langs = list(set(langs))
        schema_codes = list(set(schema_codes))

        # work out what the date of publication is
        date = cbib.get_publication_date()

        # calculate the classification paths
        from portality.lcc import lcc  # inline import since this hits the database
        for subs in cbib.subjects():
            scheme = subs.get("scheme")
            term = subs.get("term")
            if scheme == "LCC":
                path = lcc.pathify(term)
                if path is not None:
                    classification_paths.append(path)

        # normalise the classification paths, so we only store the longest ones
        classification_paths = lcc.longest(classification_paths)

        # create an unpunctitle
        if cbib.title is not None:
            throwlist = string.punctuation + '\n\t'
            unpunctitle = "".join(c for c in cbib.title if c not in throwlist).strip()
            try:
                asciiunpunctitle = unidecode(unpunctitle)
            except:
                asciiunpunctitle = unpunctitle

        # determine if the seal is applied
        has_seal = "Yes" if self.has_seal() else "No"

        # create a normalised version of the DOI for deduplication
        source_doi = cbib.get_one_identifier(constants.IDENT_TYPE_DOI)
        try:
            doi = normalise.normalise_doi(source_doi)
        except ValueError as e:
            # if we can't normalise the DOI, just store it as-is.
            doi = source_doi


        # create a normalised version of the fulltext URL for deduplication
        fulltexts = cbib.get_urls(constants.LINK_TYPE_FULLTEXT)
        if len(fulltexts) > 0:
            source_fulltext = fulltexts[0]
            try:
                fulltext = normalise.normalise_url(source_fulltext)
            except ValueError as e:
                # if we can't normalise the fulltext store it as-is
                fulltext = source_fulltext

        # build the index part of the object
        self.data["index"] = {}
        if len(issns) > 0:
            self.data["index"]["issn"] = issns
        if date != "":
            self.data["index"]["date"] = date
            self.data["index"]["date_toc_fv_month"] = date        # Duplicated so we can have year/month facets in fv2
        if len(subjects) > 0:
            self.data["index"]["subject"] = subjects
        if len(schema_subjects) > 0:
            self.data["index"]["schema_subject"] = schema_subjects
        if len(classification) > 0:
            self.data["index"]["classification"] = classification
        if len(publisher) > 0:
            self.data["index"]["publisher"] = publisher
        if len(licenses) > 0:
            self.data["index"]["license"] = licenses
        if len(langs) > 0:
            self.data["index"]["language"] = langs
        if country is not None:
            self.data["index"]["country"] = country
        if len(schema_codes) > 0:
            self.data["index"]["schema_code"] = schema_codes
        if len(classification_paths) > 0:
            self.data["index"]["classification_paths"] = classification_paths
        if unpunctitle is not None:
            self.data["index"]["unpunctitle"] = unpunctitle
        if asciiunpunctitle is not None:
            self.data["index"]["asciiunpunctitle"] = unpunctitle
        if has_seal:
            self.data["index"]["has_seal"] = has_seal
        if doi is not None:
            self.data["index"]["doi"] = doi
        if fulltext is not None:
            self.data["index"]["fulltext"] = fulltext
Example #9
File: article.py Project: DOAJ/doaj
    def duplicates(cls, issns=None, publisher_record_id=None, doi=None, fulltexts=None, title=None, volume=None, number=None, start=None, should_match=None, size=10):
        # some input sanitisation
        issns = issns if isinstance(issns, list) else []
        urls = fulltexts if isinstance(fulltexts, list) else [fulltexts] if isinstance(fulltexts, str) else []

        # make sure that we're dealing with the normal form of the identifiers
        norm_urls = []
        for url in urls:
            try:
                norm = normalise.normalise_url(url)
                norm_urls.append(norm)
            except ValueError:
                # use the non-normal form
                norm_urls.append(url)
        urls = norm_urls

        try:
            doi = normalise.normalise_doi(doi)
        except ValueError:
            # leave the doi as it is
            pass

        # in order to make sure we don't send too many terms to the ES query, break the issn list down into chunks
        terms_limit = app.config.get("ES_TERMS_LIMIT", 1024)
        issn_groups = []
        lower = 0
        upper = terms_limit
        while lower < len(issns):
            issn_groups.append(issns[lower:upper])
            lower = upper
            upper = lower + terms_limit

        if issns is not None and len(issns) > 0:
            duplicate_articles = []
            for g in issn_groups:
                q = DuplicateArticleQuery(issns=g,
                                            publisher_record_id=publisher_record_id,
                                            doi=doi,
                                            urls=urls,
                                            title=title,
                                            volume=volume,
                                            number=number,
                                            start=start,
                                            should_match=should_match,
                                            size=size)
                # print json.dumps(q.query())

                res = cls.query(q=q.query())
                duplicate_articles += [cls(**hit.get("_source")) for hit in res.get("hits", {}).get("hits", [])]

            return duplicate_articles
        else:
            q = DuplicateArticleQuery(publisher_record_id=publisher_record_id,
                                        doi=doi,
                                        urls=urls,
                                        title=title,
                                        volume=volume,
                                        number=number,
                                        start=start,
                                        should_match=should_match,
                                        size=size)
            # print json.dumps(q.query())

            res = cls.query(q=q.query())
            return [cls(**hit.get("_source")) for hit in res.get("hits", {}).get("hits", [])]
Example #10
def _read_match_set(reader, next_row):
    n_matches = -1
    match_set = MatchSet()
    while True:
        if next_row is not None:
            row = deepcopy(next_row)
            next_row = None
        else:
            try:
                row = next(reader)
            except StopIteration:
                return match_set, None

        if row is None:
            return match_set, None

        a_id = row[0]
        root = match_set.root
        if root is not None and a_id != root["id"]:
            return match_set, row

        a_created = row[1]
        try:
            a_doi = normalise.normalise_doi(row[2])
        except:
            a_doi = row[2]
        try:
            a_ft = normalise.normalise_url(row[3])
        except:
            a_ft = row[3]
        a_owner = row[4]
        a_issns = row[5]
        a_in_doaj = row[6] == "True"

        if n_matches == -1:
            # first row of the set: record how many matches to expect
            n_matches = int(row[7])

        match_type = row[8]

        b_id = row[9]
        b_created = row[10]
        try:
            b_doi = normalise.normalise_doi(row[11])
        except:
            b_doi = row[11]
        try:
            b_ft = normalise.normalise_url(row[12])
        except:
            b_ft = row[12]
        b_owner = row[13]
        b_issns = row[14]
        b_in_doaj = row[15] == "True"

        title_match = row[17] == "True"

        if root is None:
            match_set.add_root(a_id, a_created, a_doi, a_ft, a_owner, a_issns, a_in_doaj, title_match)

        match_set.add_match(b_id, b_created, b_doi, b_ft, b_owner, b_issns, b_in_doaj, title_match, match_type)

    # a catch to make sure that everything is ok with the match set detection
    assert n_matches + 1 == len(match_set.matches)