Python Normalize Beispiele, rrslib.extractors.normalize.Normalize Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: xmlimport.py Projekt: Nela-xkaspa35/Rozsireni-portalu-vyzkumnych-projektu

 def prearrange(self, obj):
     """
     Preprocess the attributes of ``obj`` in memory, without any database
     access: compute title_normalized, look up the mime type of a file,
     compose missing person names and normalize location fields.
     @return True if object was successfully preprocessed
     @return False if the title is missing or its normalization yields None
     """
     table = obj._table_name

     # add normalized title (only when iterating obj yields that attribute)
     if "title_normalized" in list(obj):
         # the normalizer is the Normalize member named after the table
         normalizer = getattr(Normalize, table)
         # I'm still not sure if it is implemented yet
         assert callable(normalizer)
         title = obj["title"]
         if title is None:
             return False
         obj["title_normalized"] = normalizer(title)
         if obj["title_normalized"] is None:
             return False

     if table == "file":
         # add the mime type
         # if "type" not in obj:  TODO really??
         filename = obj["filename"]
         obj["type"] = self._mime.start([filename])[filename]
     elif table == "person":
         if obj["full_name"] is None:
             # join whichever name parts are present, in natural order
             parts = []
             for key in ("first_name", "middle_name", "last_name"):
                 if obj[key] is not None:
                     parts.append(obj[key])
             obj["full_name"] = " ".join(parts)
         if obj["full_name_ascii"] is None:
             obj["full_name_ascii"] = Normalize.to_ascii(obj["full_name"])
     elif table == "location":
         # city and country pass an extra True to Normalize.location,
         # address does not
         for field, extra in (("city", (True,)), ("country", (True,)), ("address", ())):
             if field not in obj:
                 continue
             normalized = Normalize.location(obj[field], *extra)
             if normalized is None:
                 # a failed normalization resets the attribute itself
                 setattr(obj, field, None)
             else:
                 obj[field] = normalized
     elif table == "publication_section":
         self._bind_publ_section_to_text(obj)
     return True

Beispiel #2

0

Datei anzeigen

Datei: researchrFeed.py Projekt: KNOT-GIT/ResearchR

	def __init__(self, config, importer_kwargs):
		"""
		Prepare the feeder state and create its helper objects.

		@param config: Accepted for the caller's sake; not read here.
		@param importer_kwargs: Settings stored and handed to RRSXMLImporter.
		"""
		# settings for the importer
		self.importer_kwargs = importer_kwargs

		# data obtained from the api, and the RRSPublication object that
		# is filled and then imported into the db
		self.rPublication = self.publication = None

		# sleeper range
		self.LimitMin, self.LimitMax = 0.1, 0.2

		# object for building sql queries
		self.q = FluentSQLQuery()

		# researchr API
		self.researchrClass = ResearchrClass()

		# top layer, used for loading objects by id
		self.rrsdb = RRSDatabase()

		# normalizer
		self.norm = Normalize()

		# importer
		self.importer = RRSXMLImporter(self.importer_kwargs)

Beispiel #3

0

Datei anzeigen

 def prearrange(self, obj):
     """
     Fill in derived attributes of obj (title_normalized, mime type of a
     file, person names, normalized location fields). The database is not
     touched; only the attributes of the object are modified.
     @return True if object was successfully preprocessed
     @return False if obj has no title or the title normalizes to None
     """
     # add normalized title
     if any(attr == 'title_normalized' for attr in obj):
         # the Normalize member carrying the table's name does the work
         normalize_title = getattr(Normalize, obj._table_name)
         # I'm still not sure if it is implemented yet
         assert callable(normalize_title)
         if obj['title'] is None:
             return False
         normalized_title = normalize_title(obj['title'])
         obj['title_normalized'] = normalized_title
         if obj['title_normalized'] is None:
             return False

     kind = obj._table_name
     if kind == "publication_section":
         self._bind_publ_section_to_text(obj)
     elif kind == "file":
         # add the mime type
         #if "type" not in obj:  TODO really??
         name = obj['filename']
         detected = self._mime.start([name])
         obj['type'] = detected[name]
     elif kind == "person":
         if obj['full_name'] is None:
             # compose the full name from the parts that are set
             present = [obj[key]
                        for key in ('first_name', 'middle_name', 'last_name')
                        if obj[key] is not None]
             obj['full_name'] = " ".join(present)
         if obj['full_name_ascii'] is None:
             obj['full_name_ascii'] = Normalize.to_ascii(obj['full_name'])
     elif kind == "location":
         # a value that fails to normalize resets the attribute to None
         if 'city' in obj:
             city = Normalize.location(obj['city'], True)
             if city is not None:
                 obj['city'] = city
             else:
                 obj.city = None
         if 'country' in obj:
             country = Normalize.location(obj['country'], True)
             if country is not None:
                 obj['country'] = country
             else:
                 obj.country = None
         if 'address' in obj:
             addr = Normalize.location(obj['address'])
             if addr is not None:
                 obj['address'] = addr
             else:
                 obj.address = None
     return True

Beispiel #4

0

Datei anzeigen

Datei: htmltools.py Projekt: Nela-xkaspa35/Rozsireni-portalu-vyzkumnych-projektu

 def clean_html(self, elemtree):
     """
     Cleans HTML page in format lxml.etree._ElementTree. This method decodes
     HTML entities and translates national characters into normal form.
     Every <br> tag is replaced by a single space.
     Warning! This method creates new ElementTree instead of the old one!

     @param elemtree: the page to clean
     @return a new ElementTree built from the cleaned markup
     """
     ed = HtmlEntityDecoder()
     html = tostring(elemtree)
     html = ed.decode_htmlentities(html)
     html = Normalize.translate_national(html)
     # Raw string: "\/" in a plain literal is an invalid escape sequence.
     # \b stops the pattern from matching other tags whose name merely
     # starts with "br".
     html = re.sub(r"<[bB][rR]\b[^>]*>", " ", html)
     return ElementTree(fromstring(html))

Beispiel #5

0

Datei anzeigen

Datei: htmltools.py Projekt: lucidvoci/ResearchProjectPortal

 def clean_html(self, elemtree):
     """
     Cleans HTML page in format lxml.etree._ElementTree. This method decodes
     HTML entities, translates national characters into normal form and
     turns each <br> tag into one space.
     Warning! This method creates new ElementTree instead of the old one!

     @param elemtree: the page to clean
     @return a new ElementTree built from the cleaned markup
     """
     ed = HtmlEntityDecoder()
     html = tostring(elemtree)
     html = ed.decode_htmlentities(html)
     html = Normalize.translate_national(html)
     # The pattern is a raw string because "\/" is not a valid escape in a
     # normal literal; \b limits the match to the br tag itself.
     html = re.sub(r"<[bB][rR]\b[^>]*>", " ", html)
     return ElementTree(fromstring(html))

Beispiel #6

0

Datei anzeigen

Datei: researchrFeed.py Projekt: KNOT-GIT/ResearchR

class ResearchrPublicationFeeder:
	def __init__(self, config, importer_kwargs):
		"""
		Initialise the feeder: empty publication slots, the sleep bounds and
		the helper objects used by the Fill* methods.

		@param config: Accepted but not read by this constructor.
		@param importer_kwargs: Settings kept and passed to RRSXMLImporter.
		"""
		# settings for the importer
		self.importer_kwargs = importer_kwargs

		# data obtained from the api
		self.rPublication = None
		# RRSPublication object that is filled and then imported into the db
		self.publication = None

		# sleeper range, used as the bounds of the pause after an API call
		self.LimitMin, self.LimitMax = 0.1, 0.2

		# object for building sql queries
		self.q = FluentSQLQuery()

		# researchr API
		self.researchrClass = ResearchrClass()

		# top layer, used for loading objects by id
		self.rrsdb = RRSDatabase()

		# normalizer
		self.norm = Normalize()

		# importer
		self.importer = RRSXMLImporter(self.importer_kwargs)

	def __FillType(self):
		"""
		Transform rPublication.type to publication.type
		"""
		_id = self.__GetId("publication_type", "type=", self.rPublication.publication_type)
		if (_id != None):
			self.publication["type"] = self.rrsdb.load("publication_type", _id)
	

	def __FillSeries(self):
		"""
		Add rPublication.series to publication_series table
		"""
		if (self.rPublication.series != None and self.rPublication.series != ""):
			_id = None
			while (_id == None):
				_id = self.__GetId("publication_series", "title=", self.rPublication.series)
				if (_id == None):
					series = RRSPublication_series(title=self.rPublication.series)
					#importer = RRSXMLImporter(self.importer_kwargs)
					self.importer.import_model(series)
					continue
			self.publication["series"] = self.rrsdb.load("publication_series", _id)
			

	def __GetId(self, _from, where, _is):
		"""
		Try to find ID in table and return it
		
		@type  _from: string
		@param _from: Name of table.
		@type  where: string
		@param where: Name of column.
		@type  _is: string
		@param _is: What it is equal.
		@rtype:   int
		@return:  Id of selected entry.
		"""
		self.q.select("id").from_table(_from)
		self.q.where(where, _is)
		self.q()
		data = self.q.fetch_one()
		#print(self.q.sql())
		self.q.cleanup()
		if data != None:
			return data[0]
		return None
	
	def __FillPublisher(self):
		"""
		Add rPublication.publisher to organization table
		"""
	 	if (self.rPublication.publisher != None and self.rPublication.publisher != ""):
			_id = None
			normalized_title = self.norm.organization(self.rPublication.publisher)
			while (_id == None):
				_id = self.__GetId("organization", "title_normalized=", normalized_title)
				if (_id == None):
					organization = RRSOrganization(title=self.rPublication.publisher, 
						title_normalized=normalized_title)
					#importer = RRSXMLImporter(self.importer_kwargs)
					self.importer.import_model(organization)
					continue
				self.publication["publisher"] = self.rrsdb.load("organization", _id)

	def __FillAuthors(self, authorData, isEditor):
		"""
       		FillAuthor Add (if there are not) person to db and
       		contain them with actual publication. Foreach
		rPublication.authors, take only person's url and fullname.
		
		@type  authorData: list
		@param authorData: List of authors data (person, alias)
		@type  isEditor: bool
		@param isEditor: True if authors are editors of this publication.
		"""
		if (len(authorData) != 0):
			rank = 0
			for author in authorData:
				if 'author' in author:
					rFullname = author["person"]["fullname"]
					rUrl = author["person"]["url"]
				else:
					rFullname = author["alias"]["name"]
					rUrl = author["alias"]["url"]
				personUrl = RRSRelationshipPersonUrl()
				rank += 1
				self.__FillUrl(personUrl, rUrl)
				self.__FillPerson(personUrl, rFullname, rank, isEditor)

	def __FillUrl(self, personUrl, rUrl):
		"""
		This function add url to db bind url to person 

		@type  personUrl: RRSRelationshipPersonUrl
		@param personUrl: Relationship object to add url into it.
		@type  rUrl: string
		@param isEditor: rPublication.(person/alias) url, url of author/editor.
		"""
		_id = None
		while (_id == None):
			_id = self.__GetId("url", "link=", rUrl)
			if (_id == None):	
				url = RRSUrl(link=rUrl)
				url["type"] = self.rrsdb.load("url_type", "1")
				#importer = RRSXMLImporter(self.importer_kwargs)
				self.importer.import_model(url)	
				continue
			url = self.rrsdb.load("url", _id)
			personUrl.set_entity(url)
			#print( personUrl)

	def __FillPerson(self, personUrl, rFullname, rank, isEditor):
		"""
		Bind the person named rFullname to the current publication, adding
		the person to the db first when no row with that full name exists.

		A missing person is imported once (with personUrl, the split name
		parts and an ASCII form of the full name) and looked up again. If the
		row still cannot be found, a warning is logged and the publication is
		left without this person (the previous code retried forever here).

		@type  personUrl: RRSRelationshipPersonUrl
		@param personUrl: Relationship object to bind to person["url"].
		@type  rFullname: string
		@param rFullname: Fullname of author.
		@type  rank: int
		@param rank: Rank of author, first author get 1, second 2 and so on.
		@type  isEditor: bool
		@param isEditor: True if person is editor of this publication.
		"""
		_id = self.__GetId("person", "full_name=", rFullname)
		if _id is None:
			person = RRSPerson()
			person["full_name"] = rFullname
			person["url"] = personUrl
			self.__SetPersonNames(person, rFullname)
			# NFKD splits accented letters so the ASCII encode keeps the base letter
			person["full_name_ascii"] = unicodedata.normalize('NFKD', rFullname).encode('ascii', 'ignore')
			self.importer.import_model(person)
			_id = self.__GetId("person", "full_name=", rFullname)
			if _id is None:
				logging.warning('person not found after import - %s' % (rFullname,))
				return
		publicationPerson = RRSRelationshipPersonPublication(author_rank=rank, editor=isEditor)
		publicationPerson.set_entity(self.rrsdb.load("person", _id))
		self.publication['person'] = publicationPerson

	def __SetPersonNames(self, person, rFullname):
		"""
		This function try fill first name, middle name, last name of person.

		@type  person: RRSPerson
		@param person: Object of author of publication.
		@type  rFullname: string
		@param rFullname: Fullname of author.
		"""
		splitName = rFullname.split()
		if (len(splitName) == 3):
			person["first_name"] = splitName[0]
			person["middle_name"] = splitName[1]
			person["last_name"] = splitName[2]
		elif (len(splitName) == 2):
			person["first_name"] = splitName[0]
			person["last_name"] = splitName[1]

	def FillPublication(self, key):
		"""
		This function call all private function with prefix Fill,
		this function load data to rPublication structure and then
		assign data from rPublication to publication(RRSPublication).

		Optional fields (year, month, volume, number, abstract, doi, pages)
		are copied only when they are present and non-empty. Errors raised
		while importing the publication are printed and logged as warnings
		instead of being propagated.

		@type  key: string
		@param key: Key of the publication.
		"""
		self.__FillRPublication(key)
		self.publication = RRSPublication()
		rp = self.rPublication
		self.__FillAuthors(rp.authors, False)
		self.__FillAuthors(rp.editors, True)
		self.__FillPublisher()
		self.__FillType()
		self.__FillSeries()
		self.publication["title"] = rp.title
		self.publication["title_normalized"] = self.norm.publication(rp.title)

		if (rp.year is not None and rp.year != ""):
			self.publication["year"] = int(rp.year) # "2000" -> 2000

		if (rp.month is not None and rp.month != ""):
			# the first three letters are parsed as an abbreviated month name
			self.publication["month"] = int(strptime(rp.month[:3],'%b').tm_mon)

		if (rp.volume is not None and rp.volume != "" and rp.volume.isdigit()):
			self.publication["volume"] = int(rp.volume)

		# The digit test has to look at number itself; it used to test volume,
		# which failed for a missing volume or a non-numeric number.
		if (rp.number is not None and rp.number != "" and rp.number.isdigit()):
			self.publication["number"] = int(rp.number)

		if (rp.abstract is not None and rp.abstract != ""):
			self.publication["abstract"] = rp.abstract

		doi_prefix = 'http://dx.doi.org/'
		if (rp.doi is not None and doi_prefix in rp.doi):
			# str.strip() removes a set of characters from both ends and could
			# eat into the DOI; keep exactly what follows the resolver prefix.
			self.publication["doi"] = rp.doi.split(doi_prefix, 1)[1]

		if (rp.firstpage is not None and rp.lastpage is not None and
			rp.firstpage != "" and rp.lastpage != ""):
			self.publication["pages"] = str(rp.firstpage) + " - " + str(rp.lastpage)

		self.publication["language"] = self.rrsdb.load('language', 1)
		self.publication.set("researchr_key", rp.key, strict=False)
		try:
			self.importer.import_model(self.publication)
		except RRSDatabaseEntityError as e:
			print('RRSDatabaseEntityError - %s, %s' % (rp.key, str(e)))
			logging.warning('RRSDatabaseEntityError - %s, %s' % (rp.key, str(e)))
		except DatabaseError as e:
			print('DatabaseError - %s, %s' % (rp.key, str(e)))
			logging.warning('DatabaseError - %s, %s' % (rp.key, str(e)))
		except TypeError as e:
			print('TypeError - %s, %s' % (rp.key, str(e)))
			logging.warning('TypeError - %s, %s' % (rp.key, str(e)))
		except Exception:
			# Exception, not a bare except: KeyboardInterrupt and SystemExit
			# must still be able to stop the feed.
			print('Unexpected error - %s, %s' % (rp.key, sys.exc_info()[0]))
			logging.warning('Unexpected error - %s, %s' % (rp.key, sys.exc_info()[0]))

	def __FillRPublication(self, key):
		"""
		Fill rPublication object.

		Creates a fresh RPublication, fetches the publication data for key
		from the researchr API, sleeps for a random time between LimitMin and
		LimitMax, and copies every known field of the answer onto
		rPublication. The API field 'type' is stored as publication_type;
		fields that are not known are ignored.

		@type  key: string
		@param key: Name of publication.
		"""
		self.rPublication = RPublication()
		publicationData = self.researchrClass.getPublication(key)
		time.sleep(random.uniform(self.LimitMin, self.LimitMax))
		# API fields stored under an attribute of the same name
		sameName = (
			'abstract', 'address', 'authors', 'booktitle', 'conference',
			'conferenceYear', 'doi', 'editors', 'firstpage', 'key',
			'issuenumber', 'journal', 'lastpage', 'month', 'note', 'number',
			'organization', 'publisher', 'series', 'title', 'url', 'volume',
			'volumenumber', 'year',
		)
		# the loop variable is named field so that it does not shadow key
		for field, value in publicationData.items():
			if field == 'type':
				self.rPublication.publication_type = value
			elif field in sameName:
				setattr(self.rPublication, field, value)