import os
import re
import time
import unicodedata
import urllib2
import cookielib
from HTMLParser import HTMLParser

# The parser classes (genericParser, parser101007, parser101590) and the
# attributes reached through `self` are defined elsewhere in the project.


def procurarPais(self, dataDoi, nomeDePais, urlDOI):
    # Normalize the country name: lowercase, unescape HTML entities and fold
    # accented characters to ASCII, so it matches the normalized page text.
    nomeDePais = nomeDePais.lower()
    nomeDePais = HTMLParser().unescape(nomeDePais.decode("utf8", "ignore"))
    nomeDePais = unicodedata.normalize('NFKD', unicode(nomeDePais)).encode('ascii', 'ignore')
    if len(nomeDePais) <= 0:
        return False

    if len(dataDoi) == 2:
        # dataDoi = [cleaned HTML, 6-field parser configuration]; fields 4 and 5
        # hold the regex prefix/suffix to place around the country name.
        doihtml = dataDoi[0]
        doihtml = doihtml.encode('utf8', 'replace')
        doihtml = doihtml.lower()
        doihtml = doihtml.replace('\r\n', '')
        doihtml = doihtml.replace('\t', '')
        doihtml = doihtml.replace('\n', '')
        prefixo = dataDoi[1][4]
        posfixo = dataDoi[1][5]
        idDoi = dataDoi[1][0]
        if re.search((prefixo or '') + re.escape(nomeDePais) + (posfixo or ''), doihtml):
            print "generic case"
            # print idDoi + " " + (prefixo or '') + " " + (posfixo or '')
            return True
        else:
            return False
    elif len(dataDoi) == 1:
        # dataDoi = [plain text]; look for the country name closing a
        # comma-separated affiliation line.
        doihtml = dataDoi[0]
        doihtml = doihtml.encode('utf8', 'replace')
        doihtml = doihtml.lower()
        prefixo = r",.*,\s*"
        if re.search(prefixo + re.escape(nomeDePais) + r"\s*\n", doihtml):
            return True
        if re.search(prefixo + re.escape(nomeDePais) + r"\W*\n", doihtml):
            return True
        return False
    else:
        return False
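# Usage sketch (illustrative only, with hypothetical values): `relatorio`
# stands for an instance of the enclosing class. With a 1-element dataDoi the
# country is matched at the end of a comma-separated affiliation line, so:
#
#   doihtml = u"J. Silva, Universidade de Sao Paulo, Brazil\n"
#   encontrado = relatorio.procurarPais([doihtml], "Brazil", "http://dx.doi.org/10.1590/exemplo")
#   # encontrado == True: r",.*,\s*" + "brazil" + r"\s*\n" matches the line above
#
# With a 2-element dataDoi, the prefix/suffix come from fields 4 and 5 of the
# parser configuration instead of the fixed pattern.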
def obterDadosAtravesDeDOI(self, urlDOI):
    print '\nProcessando DOI: ' + urlDOI

    txdata = None
    txheaders = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0) Gecko/20100101 Firefox/4.0',
        'Accept-Language': 'en-us,en;q=0.5',
        'Accept-Encoding': 'deflate',
        'Keep-Alive': '115',
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
    }

    # Derive the cache filename from the DOI URL.
    doiNumber = urlDOI
    doiNumber = doiNumber.replace('http://dx.doi.org/', '')
    doiNumber = doiNumber.replace('/', '-')
    doiPath = self.grupo.diretorioDoi + '/' + doiNumber

    if os.path.isfile(doiPath):
        # Cache hit: the stored copy was already unescaped and ASCII-folded.
        arquivoX = open(doiPath)
        rawDOIhtml = arquivoX.read()
        arquivoX.close()
        print "- Utilizando DOI armazenado no cache: " + doiPath
    else:
        # Download the web page associated with the DOI. Only one attempt is
        # made here; raise the bound on `tentativa` to enable retries.
        tentativa = 1
        while tentativa <= 1:
            try:
                req = urllib2.Request(urlDOI, txdata, txheaders)
                cj = cookielib.CookieJar()
                opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
                response = opener.open(req)
                rawDOIhtml = response.read()
                print "- Baixando publicacao com DOI: " + urlDOI

                # Unescape HTML entities and fold the text to plain ASCII.
                rawDOIhtml = HTMLParser().unescape(rawDOIhtml.decode("utf8", "ignore"))
                rawDOIhtml = unicodedata.normalize('NFKD', unicode(rawDOIhtml)).encode('ascii', 'ignore')

                if self.grupo.diretorioDoi != '':
                    print "- Armazenando DOI no cache: " + doiPath
                    arquivo = open(doiPath, "w")
                    arquivo.write(rawDOIhtml)
                    arquivo.close()
                break
            except:
                print '[AVISO] Tentativa ' + str(tentativa) + ': DOI não está disponível na internet: ', urlDOI
                time.sleep(10)
                rawDOIhtml = None
                tentativa += 1
                continue

    dataDoi = []
    if rawDOIhtml is not None:
        parserData = self.procurarParser(urlDOI)
        if parserData is not None:
            if len(parserData) == 6:
                # Generic case: parser configuration loaded from the XML file.
                print "**caso -- " + parserData[0]
                caso = genericParser(parserData)
                try:
                    caso.feed(rawDOIhtml)
                except:
                    caso.data = ""
                doihtml = str(caso.data)
                dataDoi.append(doihtml)
                dataDoi.append(parserData)
            elif urlDOI.find("10.1134") > -1:
                print "**caso - 10.1134"
                casoUrl = parser101007()
                try:
                    casoUrl.feed(rawDOIhtml)
                except:
                    casoUrl.data = ""
                doihtml = str(casoUrl.data)
                parserData = ["10.1134", '', '', '', r'authoraddress=.*\+', r'.*&contentid']
                dataDoi.append(doihtml)
                dataDoi.append(parserData)
            # The 10.1007 and 10.1021 cases were disabled in this version:
            # elif urlDOI.find("10.1007") > -1:
            #     print "**caso - 10.1007"
            #     casoUrl = parser101007()
            #     casoUrl.feed(rawDOIhtml)
            #     doihtml = str(casoUrl.data)
            #     parserData = ["10.1007", '', '', '', r'authoraddress=.*\+', r'.*&contentid']
            #     dataDoi.append(doihtml)
            #     dataDoi.append(parserData)
            # elif urlDOI.find("10.1021") > -1:
            #     print "**caso -- 10.1021"
            #     caso = parser101021()
            #     caso.feed(rawDOIhtml)
            #     doihtml = str(caso.data)
            #     parserData = ["10.1021", '', '', '', r',.*,\s*', r'[\s*|,|;|-|\.|\'|\"]']
            #     dataDoi.append(doihtml)
            #     dataDoi.append(parserData)
            elif urlDOI.find("10.1590") > -1:
                print "**caso -- 10.1590"
                caso = parser101590()
                try:
                    caso.feed(rawDOIhtml)
                except:
                    caso.data = ""
                doihtml = str(caso.data)
                parserData = ["10.1590", '', '', '', r',.*,\s*', r'[\s*|,|;|-|\.|\'|\"]']
                dataDoi.append(doihtml)
                dataDoi.append(parserData)
        else:
            # Default case: no parser configuration in the XML; keep plain text.
            print "**caso DEFAULT não esta no xml"
            doihtml = self.html2texto(rawDOIhtml)
            dataDoi.append(doihtml)
    else:
        dataDoi = []
    return dataDoi
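# End-to-end sketch (illustrative only): fetch the landing page of a DOI, then
# test it for a country affiliation. `relatorio` is a hypothetical instance of
# the enclosing class with `grupo.diretorioDoi` pointing at a cache directory;
# the DOI URL is a placeholder.
#
#   urlDOI = 'http://dx.doi.org/10.1590/exemplo'
#   dataDoi = relatorio.obterDadosAtravesDeDOI(urlDOI)
#   if dataDoi and relatorio.procurarPais(dataDoi, 'Brazil', urlDOI):
#       print 'Pais encontrado na publicacao: ' + urlDOI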