Python strdiff 예제들, basic.strdiff Python 예제들

예제 #1

0

파일 보기

파일: endnotexml.py 프로젝트: OAPDF/oapdftools

    def finddoi(self, num, prefix='', issn=''):
        title = self.gettitle(num)
        doi = DOI(self.getdoi(num))
        if (not prefix):
            prefix = doi.split('/', 1)[0] if doi else ""
        volume = self.getvolume(num)
        journal = self.getjournalfull(num)
        year = self.getyear(num)
        pages = self.getpages(num)
        self.cr = CRrecord()
        try:
            # The origin doi maybe true. Find in crossref
            if (doi and self.cr.getfromdoi(doi, fullparse=False)
                    and self.cr.doi):
                # Further check title
                if (strdiff(doi,self.cr.doi)>=0.85 and \
                strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
                    return doi
                if (volume and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and volume == self.cr.volume):
                        return doi
                if (year and pages):
                    ops = pages.split('-')
                    crps = self.cr.pages.split('-')
                    if (len(ops) > 0 and len(crps) > 0 and ops[0] == crps[0]
                            and year == self.cr.year):
                        return doi
                print "Origin DOI:", doi, "may be true but record strange..Try title"

            keyword = title + " " + journal + " " + year + " " + pages + " " + volume
            if (self.cr.getfromtitledoi(keyword,
                                        doi,
                                        year=year,
                                        limit=10,
                                        fullparse=False,
                                        prefix=prefix)):
                if (doi):
                    if (prefix == self.cr.doi.split('/')[0]
                            and strdiff(doi, self.cr.doi) >= 0.85):
                        return self.cr.doi
                    else:
                        print "Error for origin doi: " + doi + "; found: " + self.cr.doi
                        return ""
                return self.cr.doi
            if (doi):
                if (strdiff(doi, self.cr.doi) >= 0.85):
                    return self.cr.doi
                else:
                    print "Error2 for origin doi: " + doi + "; found: " + self.cr.doi
                    return ""
            else:
                return ""
        except Exception as e:
            print "Error when find doi..", e, "\nRetry..."
            return self.finddoi(num, prefix=prefix, issn=issn)

예제 #2

0

파일 보기

파일: endnotexml.py 프로젝트: OAPDF/oapdftools

	def finddoi(self,num,prefix='',issn=''):
		title=self.gettitle(num)
		doi=DOI(self.getdoi(num))
		if (not prefix):
			prefix = doi.split('/',1)[0] if doi else ""
		volume= self.getvolume(num)
		journal=self.getjournalfull(num)
		year=self.getyear(num) 
		pages=self.getpages(num)
		self.cr=CRrecord()
		try:
			# The origin doi maybe true. Find in crossref
			if ( doi and self.cr.getfromdoi(doi,fullparse=False) and self.cr.doi):
				# Further check title
				if (strdiff(doi,self.cr.doi)>=0.85 and \
				strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75):
					return doi
				if( volume and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and volume==self.cr.volume):
						return doi
				if( year and pages ):
					ops=pages.split('-')
					crps=self.cr.pages.split('-')
					if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and year==self.cr.year):
						return doi
				print "Origin DOI:",doi,"may be true but record strange..Try title"

			keyword=title+" "+journal+" "+year+" "+pages+" "+volume
			if (self.cr.getfromtitledoi(keyword,doi,year=year,limit=10,fullparse=False,prefix=prefix)):
				if (doi):
					if( prefix == self.cr.doi.split('/')[0] and strdiff(doi,self.cr.doi)>=0.85):
						return self.cr.doi
					else:
						print "Error for origin doi: "+doi+"; found: "+self.cr.doi
						return ""
				return self.cr.doi
			if (doi):
				if( strdiff(doi,self.cr.doi)>=0.85):
					return self.cr.doi
				else:
					print "Error2 for origin doi: "+doi+"; found: "+self.cr.doi
					return ""
			else:
				return ""
		except Exception as e:
			print "Error when find doi..",e,"\nRetry..."
			return self.finddoi(num,prefix=prefix,issn=issn)

예제 #3

0

파일 보기

파일: crrecord.py 프로젝트: OAPDF/oapdftools

    def getfromtitledoi(
        self,
        title,
        doi,
        year="",
        volume="",
        issue="",
        pages="",
        limit=3,
        offset=0,
        cutoff=0.1,
        fullparse=True,
        ignorecheminfo=True,
        prefix="",
        issn="",
    ):
        """Get information from journal title and doi, better with year, volume, issue, pages information"""
        # Over max records try
        if offset > limit:
            return False
            # Cancel ISSN check because unreliable

            # search url
        if issn and len(issn.strip()) is 9:
            url = (
                "http://api.crossref.org/journals/"
                + issn
                + "/works?query="
                + normalizeString(title)
                + "&rows=1&offset="
                + str(offset)
            )
        elif prefix:
            url = (
                "http://api.crossref.org/prefixes/"
                + prefix
                + "/works?query="
                + normalizeString(title)
                + "&rows=1&offset="
                + str(offset)
            )
        else:
            url = "http://api.crossref.org/works?query=" + normalizeString(title) + "&rows=1&offset=" + str(offset)
        if year:
            # some time year maybe +- 1
            url += "&filter=from-pub-date:" + str(int(year) - 1) + "-06,until-pub-date:" + str(int(year) + 1) + "-06"
            # print url

            # search crossref
        r = requests.get(url, timeout=timeout_setting)
        if r.status_code is 200:
            try:
                for currentrecord in range(len(r.json()["message"]["items"])):
                    data = r.json()["message"]["items"][currentrecord]
                    # should better then cutoff
                    if float(data["score"]) > cutoff:
                        self.title = data.get("title", [""])[0]
                        self.year = str(data["issued"]["date-parts"][0][0])
                        self.volume = data.get("volume", "")
                        self.issue = data.get("issue", "")
                        self.pages = data.get("page", "")
                        self.doi = data.get("DOI", "")
                        if fullparse:
                            self.journals = data.get("container-title", [""])
                            self.issns = data.get("ISSN", [""])
                            if len(self.journals) >= 1:
                                self.journal = self.journals[0]
                            else:
                                self.journal = ""
                            if len(self.issns) >= 1:
                                self.issn = self.issns[0]
                            else:
                                self.issn = ""
                            self.authors = self._getauthor(data.get("author", []))
                            self.urls = [data.get("URL", "")]

                        if doi.strip():
                            if strdiff(doi.strip(), self.doi) >= 0.85:
                                return True
                                # else blank

                                # check whether fitting to giving parameters
                        if year and year.strip() != self.year.strip():
                            # possible +- 1year
                            if not (abs(int(year) - int(self.year)) is 1 and volume.strip() == self.volume.strip()):
                                continue
                        if volume and volume.strip() != self.volume.strip():
                            continue
                        if pages and pages.strip().split("-")[0] != self.pages.strip().split("-")[0]:
                            continue
                        if ignorecheminfo and data.get("container-title", [""])[0].lower() == "cheminform":
                            continue
                        return True
                        # Low score, more try.
                    else:
                        continue
                return False
            except:
                print "Something error for finding " + title.encode("utf-8")
                return False
        else:
            print "Journal title can't be found: " + title.encode("utf-8")
            return False

예제 #4

0

파일 보기

    def getfromtitledoi(self,title,doi, year="",volume="",issue="",pages="", \
     limit=3, offset=0, cutoff=0.1, fullparse=True,ignorecheminfo=True,prefix="",issn=""):
        '''Get information from journal title and doi, better with year, volume, issue, pages information'''
        # Over max records try
        if (offset > limit):
            return False
        # Cancel ISSN check because unreliable

        # search url
        if (issn and len(issn.strip()) is 9):
            url = "http://api.crossref.org/journals/" + issn + "/works?query=" + normalizeString(
                title) + "&rows=1&offset=" + str(offset)
        elif (prefix):
            url = "http://api.crossref.org/prefixes/" + prefix + "/works?query=" + normalizeString(
                title) + "&rows=1&offset=" + str(offset)
        else:
            url = "http://api.crossref.org/works?query=" + normalizeString(
                title) + "&rows=1&offset=" + str(offset)
        if (year):
            #some time year maybe +- 1
            url += "&filter=from-pub-date:" + str(int(
                year) - 1) + "-06,until-pub-date:" + str(int(year) + 1) + "-06"
        #print url

        # search crossref
        r = requests.get(url, timeout=timeout_setting)
        if (r.status_code is 200):
            try:
                for currentrecord in range(len(r.json()['message']['items'])):
                    data = r.json()['message']['items'][currentrecord]
                    # should better then cutoff
                    if (float(data['score']) > cutoff):
                        self.title = data.get('title', [''])[0]
                        self.year = str(data['issued']['date-parts'][0][0])
                        self.volume = data.get('volume', '')
                        self.issue = data.get('issue', '')
                        self.pages = data.get('page', '')
                        self.doi = data.get('DOI', '')
                        if (fullparse):
                            self.journals = data.get('container-title', [''])
                            self.issns = data.get('ISSN', [''])
                            if (len(self.journals) >= 1):
                                self.journal = self.journals[0]
                            else:
                                self.journal = ""
                            if (len(self.issns) >= 1):
                                self.issn = self.issns[0]
                            else:
                                self.issn = ""
                            self.authors = self._getauthor(
                                data.get('author', []))
                            self.urls = [data.get('URL', '')]

                        if (doi.strip()):
                            if (strdiff(doi.strip(), self.doi) >= 0.85):
                                return True
                        #else blank

                        # check whether fitting to giving parameters
                        if (year and year.strip() != self.year.strip()):
                            # possible +- 1year
                            if not (abs(int(year) - int(self.year)) is 1
                                    and volume.strip() == self.volume.strip()):
                                continue
                        if (volume and volume.strip() != self.volume.strip()):
                            continue
                        if (pages and pages.strip().split('-')[0] !=
                                self.pages.strip().split('-')[0]):
                            continue
                        if (ignorecheminfo
                                and data.get('container-title',
                                             [''])[0].lower() == "cheminform"):
                            continue
                        return True
                    # Low score, more try.
                    else:
                        continue
                return False
            except:
                print "Something error for finding " + title.encode('utf-8')
                return False
        else:
            print "Journal title can't be found: " + title.encode('utf-8')
            return False