Example #1
    def __init__(self, date=None, orcid=None, issn=None, doi=None, **params):
        if date is None:
            date = CSVManager(store_new=False)
        if orcid is None:
            orcid = CSVManager(store_new=False)
        if issn is None:
            issn = CSVManager(store_new=False)
        if doi is None:
            doi = CSVManager(store_new=False)

        for key in params:
            setattr(self, key, params[key])

        self.issn = issn
        self.date = date
        self.orcid = orcid
        if hasattr(self, 'use_api_service'):
            self.dm = DOIManager(doi, self.use_api_service)
        else:
            self.dm = DOIManager(doi)
        self.im = ISSNManager()
        self.om = ORCIDManager()

        self.headers = {
            "User-Agent":
            "ResourceFinder / OpenCitations Indexes "
            "(http://opencitations.net; mailto:[email protected])"
        }
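
A minimal usage sketch of the constructor above, assuming a concrete subclass such as the CrossrefResourceFinder shown later in execute_workflow: any extra keyword argument (e.g. use_api_service) travels through **params, becomes an attribute, and determines how the internal DOIManager is built.

valid_doi = CSVManager(store_new=False)  # in-memory cache of DOIs known to be valid
finder = CrossrefResourceFinder(doi=valid_doi, use_api_service=True)
# Since use_api_service is now set as an attribute, finder.dm is a DOIManager
# that may query the external API when a DOI is not found in the cache.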
class CrowdsourcedCitationSource(CSVFileCitationSource):
    def __init__(self, src, local_name=""):
        self.doi = DOIManager()
        super(CrowdsourcedCitationSource, self).__init__(src, local_name)

    def get_next_citation_data(self):
        row = self._get_next_in_file()

        while row is not None:
            citing = self.doi.normalise(row.get("citing_id"))
            cited = self.doi.normalise(row.get("cited_id"))

            if citing is not None and cited is not None:
                created = row.get("citing_publication_date")
                if not created:
                    created = None

                cited_pub_date = row.get("cited_publication_date")
                if not cited_pub_date:
                    timespan = None
                else:
                    c = Citation(None, None, created, None, cited_pub_date,
                                 None, None, None, None, "", None, None, None,
                                 None, None)
                    timespan = c.duration

                self.update_status_file()
                return citing, cited, created, timespan, None, None

            self.update_status_file()
            row = self._get_next_in_file()

        remove(self.status_file)
    def test_doi_normalise(self):
        dm = DOIManager()
        self.assertEqual(
            self.valid_doi_1,
            dm.normalise(self.valid_doi_1.upper().replace("10.", "doi: 10. ")))
        self.assertEqual(
            self.valid_doi_1,
            dm.normalise(self.valid_doi_1.upper().replace("10.", "doi:10.")))
        self.assertEqual(
            self.valid_doi_1,
            dm.normalise(self.valid_doi_1.upper().replace(
                "10.", "https://doi.org/10.")))
Example #4
def execute_workflow(idbaseurl, baseurl, python, pclass, input, doi_file,
                     date_file, orcid_file, issn_file, orcid, lookup, data,
                     prefix, agent, source, service, verbose, no_api):
    # Create the support files for handling information about bibliographic resources
    valid_doi, id_date, id_orcid, id_issn = create_csv(doi_file, date_file,
                                                       orcid_file, issn_file)

    doi_manager = DOIManager(valid_doi, use_api_service=not no_api)
    crossref_rf = CrossrefResourceFinder(date=id_date,
                                         orcid=id_orcid,
                                         issn=id_issn,
                                         doi=valid_doi,
                                         use_api_service=not no_api)
    datacite_rf = DataCiteResourceFinder(date=id_date,
                                         orcid=id_orcid,
                                         issn=id_issn,
                                         doi=valid_doi,
                                         use_api_service=not no_api)
    orcid_rf = ORCIDResourceFinder(
        date=id_date,
        orcid=id_orcid,
        issn=id_issn,
        doi=valid_doi,
        use_api_service=True if orcid is not None and not no_api else False,
        key=orcid)

    rf_handler = ResourceFinderHandler([crossref_rf, datacite_rf, orcid_rf])
    return extract_citations(idbaseurl, baseurl, python, pclass, input, lookup,
                             data, prefix, agent, source, service, verbose,
                             doi_manager, rf_handler)
class CrossrefCitationSource(DirCitationSource):
    def __init__(self, src, local_name=""):
        self.last_ref = -1
        self.doi = DOIManager()
        super(CrossrefCitationSource, self).__init__(src, local_name)

    def load(self, file_path):
        result = []
        with open(file_path, encoding="utf8") as f:
            j = load(f)
            if "items" in j:
                result.extend(j["items"])
        return result, len(result)

    def select_file(self, file_path):
        return file_path.endswith(".json")

    def get_next_citation_data(self):
        row = self._get_next_in_file()
        while row is not None:
            citing = self.doi.normalise(row.get("DOI"))
            if citing is not None and "reference" in row:
                for idx, ref in enumerate(row["reference"]):
                    if idx > self.last_ref:
                        self.last_ref = idx
                        cited = self.doi.normalise(ref.get("DOI"))
                        if cited is not None:
                            self.last_row -= 1
                            # self.update_status_file() is not needed here: in Crossref,
                            # the status row is updated only once all the references of a
                            # given row have been processed (here, a row is an article,
                            # not a citation).
                            return citing, cited, None, None, None, None

            self.update_status_file()
            row = self._get_next_in_file()
            self.last_ref = -1

        remove(self.status_file)
Example #6
class CrossrefCitationSource(DirCitationSource):
    def __init__(self, src, local_name=""):
        self.last_ref = -1
        self.doi = DOIManager()
        super(CrossrefCitationSource, self).__init__(src, local_name)

    def load(self, file_path):
        result = []
        with open(file_path) as f:
            j = load(f)
            if "items" in j:
                result.extend(j["items"])
        return result, len(result)

    def select_file(self, file_path):
        return file_path.endswith(".json")

    def get_next_citation_data(self):
        row = self._get_next_in_file()
        while row is not None:
            citing = self.doi.normalise(row.get("DOI"))
            if citing is not None and "reference" in row:
                for idx, ref in enumerate(row["reference"]):
                    if idx > self.last_ref:
                        self.last_ref = idx
                        cited = self.doi.normalise(ref.get("DOI"))
                        if cited is not None:
                            self.last_row -= 1
                            self.update_status_file()
                            return citing, cited, None, None, None, None

            self.update_status_file()
            row = self._get_next_in_file()
            self.last_ref = -1

        remove(self.status_file)
    def test_doi_is_valid(self):
        dm_nofile = DOIManager()
        self.assertTrue(dm_nofile.is_valid(self.valid_doi_1))
        self.assertTrue(dm_nofile.is_valid(self.valid_doi_2))
        self.assertFalse(dm_nofile.is_valid(self.invalid_doi_1))
        self.assertFalse(dm_nofile.is_valid(self.invalid_doi_2))

        valid_doi = CSVManager(self.valid_doi_path)
        dm_file = DOIManager(valid_doi=valid_doi, use_api_service=False)
        self.assertTrue(dm_file.is_valid(self.valid_doi_1))
        self.assertFalse(dm_file.is_valid(self.invalid_doi_1))

        dm_nofile_noapi = DOIManager(use_api_service=False)
        self.assertFalse(dm_nofile_noapi.is_valid(self.valid_doi_1))
        self.assertFalse(dm_nofile_noapi.is_valid(self.invalid_doi_1))
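
For reference, a minimal sketch of how the file-backed configuration tested above can be prepared; set_valid (used in the Crossref processing example below) is assumed to record the DOI as valid in the CSVManager cache, and the file name and DOI are purely illustrative.

valid_doi = CSVManager("valid_doi.csv")  # illustrative path
dm_file = DOIManager(valid_doi=valid_doi, use_api_service=False)
dm_file.set_valid("10.1000/abc")  # hypothetical DOI, now cached as valid
assert dm_file.is_valid("10.1000/abc")  # answered from the cache, no API call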
Example #8
def process(input_dir, output_dir):
    if not exists(output_dir):
        makedirs(output_dir)

    citing_doi_with_no_date = set()
    valid_doi = CSVManager(output_dir + sep + "valid_doi.csv")
    id_date = CSVManager(output_dir + sep + "id_date.csv")
    id_issn = CSVManager(output_dir + sep + "id_issn.csv")
    id_orcid = CSVManager(output_dir + sep + "id_orcid.csv")

    doi_manager = DOIManager(valid_doi)
    issn_manager = ISSNManager()
    orcid_manager = ORCIDManager()

    all_files, opener = get_all_files(input_dir)
    len_all_files = len(all_files)

    # Read all the JSON files in the Crossref dump to create the main support information for all the indexes
    print("\n\n# Add valid DOIs from Crossref metadata")
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            try:
                data = load(f)
            # When a tar.gz or zip file is used, the opener returns a byte stream. Thus,
            # it must be decoded as UTF-8 before being loaded as JSON.
            except TypeError:
                utf8reader = codecs.getreader("utf-8")
                data = load(utf8reader(f))

            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj:
                        citing_doi = doi_manager.normalise(obj["DOI"], True)
                        doi_manager.set_valid(citing_doi)

                        if id_date.get_value(citing_doi) is None:
                            citing_date = Citation.check_date(build_pubdate(obj))
                            if citing_date is not None:
                                id_date.add_value(citing_doi, citing_date)
                                if citing_doi in citing_doi_with_no_date:
                                    citing_doi_with_no_date.remove(citing_doi)
                            else:
                                citing_doi_with_no_date.add(citing_doi)

                        if id_issn.get_value(citing_doi) is None:
                            if "type" in obj:
                                cur_type = obj["type"]
                                if cur_type is not None and "journal" in cur_type and "ISSN" in obj:
                                    cur_issn = obj["ISSN"]
                                    if cur_issn is not None:
                                        for issn in [issn_manager.normalise(issn) for issn in cur_issn]:
                                            if issn is not None:
                                                id_issn.add_value(citing_doi, issn)

                        if id_orcid.get_value(citing_doi) is None:
                            if "author" in obj:
                                cur_author = obj['author']
                                if cur_author is not None:
                                    for author in cur_author:
                                        if "ORCID" in author:
                                            orcid = orcid_manager.normalise(author["ORCID"])
                                            if orcid is not None:
                                                id_orcid.add_value(citing_doi, orcid)

    # Do it again to update the dates of the cited DOIs, if these are valid
    print("\n\n# Check cited DOIs from Crossref reference field")
    doi_date = {}
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            try:
                data = load(f)
            # As in the first pass, archive openers may return a byte stream that
            # must be decoded as UTF-8 before being loaded as JSON.
            except TypeError:
                utf8reader = codecs.getreader("utf-8")
                data = load(utf8reader(f))
            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj and "reference" in obj:
                        for ref in obj['reference']:
                            if "DOI" in ref:
                                cited_doi = doi_manager.normalise(ref["DOI"], True)
                                if doi_manager.is_valid(cited_doi) and id_date.get_value(cited_doi) is None:
                                    if cited_doi not in doi_date:
                                        doi_date[cited_doi] = []
                                    cited_date = Citation.check_date(build_pubdate(ref))
                                    if cited_date is not None:
                                        doi_date[cited_doi].append(cited_date)
                                        if cited_doi in citing_doi_with_no_date:
                                            citing_doi_with_no_date.remove(cited_doi)

    # Assign each DOI the date that is most frequently reported across the various references.
    # If two or more distinct dates are equally frequent, select the oldest one.
    for doi in doi_date:
        count = Counter(doi_date[doi])
        if len(count):
            top_value = count.most_common(1)[0][1]
            selected_dates = []
            for date in count:
                if count[date] == top_value:
                    selected_dates.append(date)
            best_date = sorted(selected_dates)[0]
            id_date.add_value(doi, best_date)
        else:
            id_date.add_value(doi, "")

    # Add empty dates for the remaining DOIs
    for doi in citing_doi_with_no_date:
        id_date.add_value(doi, "")
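
To illustrate the tie-breaking rule applied above (the most frequently reported date wins and, in case of a tie, the oldest date is chosen), here is a self-contained sketch with hypothetical dates:

from collections import Counter

dates = ["2016-03", "2015-11", "2016-03", "2015-11", "2014"]  # hypothetical dates collected for one cited DOI
count = Counter(dates)
top_value = count.most_common(1)[0][1]  # highest frequency (2 here)
selected_dates = [d for d, c in count.items() if c == top_value]  # "2016-03" and "2015-11" tie
best_date = sorted(selected_dates)[0]  # lexicographic sort of ISO-like dates puts the oldest first
print(best_date)  # -> "2015-11"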
    def __init__(self, src, local_name=""):
        self.last_ref = -1
        self.doi = DOIManager()
        super(CrossrefCitationSource, self).__init__(src, local_name)
Example #10
    header = None  # CSV header, kept only from the first file read
    result = {}
    threshold = 10000
    existing_ocis = set()

    all_files = []
    if isdir(args.input_file):
        for cur_dir, cur_subdir, cur_files in walk(args.input_file):
            for cur_file in [f for f in cur_files if f.endswith(".csv")]:
                all_files.append(cur_dir + sep + cur_file)
    else:
        all_files.append(args.input_file)

    dois = None
    if args.doi_file is not None and exists(args.doi_file):
        with open(args.doi_file, encoding="utf8") as f:
            dm = DOIManager()
            dois = set()
            csv_reader = reader(f)
            for row in csv_reader:
                doi = dm.normalise(row[0])
                if doi:
                    dois.add(doi)

    for cur_file in all_files:
        with open(cur_file, encoding="utf8") as f:
            csv_content = ""
            for idx, line in enumerate(f.readlines()):
                if header is None:
                    header = line
                    csv_content = header
                else:
    def __init__(self, src, local_name=""):
        self.doi = DOIManager()
        super(CrowdsourcedCitationSource, self).__init__(src, local_name)