예제 #1
0
def parse_reuters(directory, output_dir):
    """Flatten a category/article directory tree into ``reuters.xml``.

    ``directory`` is listed for category sub-directories, and every entry of
    each category is opened as one article. Within an article, blank lines
    are skipped; the first non-blank line becomes the ``<title>``, the second
    has any ``<AUTHOR>``/``</AUTHOR>`` tags removed and is passed through
    ``clean``, and all remaining lines are appended unchanged. The collected
    text is run through ``clean_total``; ``get_keywords`` of that result plus
    the category name form the ``<classifier>`` content.

    Args:
        directory: Path containing one sub-directory per category.
        output_dir: Directory that receives ``reuters.xml``; created
            (including missing parents) when it does not exist.

    NOTE(review): the emitted layout is preserved byte-for-byte from the
    previous implementation, including two things that look unintended --
    ``<title>`` is written before the opening ``<head>`` tag, and
    ``</nitf>`` is written after every article although ``<nitf>`` is opened
    only once. Confirm with consumers of the file before changing either.
    """
    if not os.path.exists(output_dir):
        # makedirs rather than mkdir so a nested output path also works.
        os.makedirs(output_dir)

    # Constant per-article fragments. ``head_open`` used to be assigned only
    # while parsing a title line, which raised NameError for an article
    # without any non-blank line.
    head_open = '<head>\n'
    bodyhead = "<body>\n <body.content>\n <block class='lead_paragraph'>\n"
    bodyfooter = "</block>\n </body.content>\n </body>"
    footer = "\n</nitf>"
    keyhead = "<classifier>"
    keyfooter = "</classifier>"

    with open(os.path.join(output_dir, 'reuters.xml'), 'w') as fileout:

        def emit(value):
            # Writes the value followed by a newline: the same output as the
            # Python 2 ``print >> fileout, value`` statement, but valid on
            # Python 3 as well.
            fileout.write("%s\n" % (value,))

        categories = os.listdir(directory)
        emit("<?xml version='1.0' encoding='UTF-8'?>\n")
        emit("<nitf>")

        index = 0
        for cat in categories:
            category_path = os.path.join(directory, cat)
            for article in os.listdir(category_path):
                index += 1
                titlestring = ""
                docdatastring = ""
                body_parts = []
                seen = 0  # number of non-blank lines consumed so far

                with open(os.path.join(category_path, article)) as contents:
                    for content in contents:
                        if content.isspace():
                            continue
                        if seen == 0:
                            # First non-blank line: title and document id.
                            titlestring = '<title>' + clean(content) + '</title>\n'
                            docdatastring = '<doc-id id-string="%s"/>' % (10000000 + index)
                        elif seen == 1:
                            # Second non-blank line: treated as the author
                            # line; it is cleaned whether or not the tags
                            # are actually present.
                            if '<AUTHOR>' in content:
                                content = content.replace('<AUTHOR>', '').replace('</AUTHOR>', '')
                            body_parts.append(clean(content))
                        else:
                            body_parts.append(content)
                        seen += 1

                cleaned_text = clean_total("".join(body_parts))
                keys = get_keywords(cleaned_text)

                emit(titlestring)
                emit(head_open)
                emit('<docdata>')
                emit(docdatastring)
                emit('<identified-content>\n')
                emit(keyhead)
                emit(" ".join(keys + [cat]))
                emit(keyfooter)
                emit('</identified-content>\n')
                emit('</docdata>\n')
                emit('</head>\n')
                emit(bodyhead)
                emit(clean(cleaned_text))
                emit(bodyfooter)
                emit(footer)
예제 #2
0
 def identify(self, data):
     """Return a ``{"brand": ..., "model": ...}`` dict for the device in ``data``.

     The user-agent string is obtained from ``utils.get_device_ua(data)``.
     Windows and Macintosh are matched case-sensitively; iPad, iPhone and
     Android are matched case-insensitively. For Android, the value returned
     by ``utils.get_keywords`` is upper-cased and looked up first in
     ``self.precise`` and then in ``self.pattern``.

     Returns:
         The matching brand/model dict; a dict with empty strings when
         ``utils.get_keywords`` returns ``None`` for an Android user agent;
         ``None`` when nothing matches.
     """
     device = utils.get_device_ua(data)
     if "Windows" in device:
         return {"brand": "Windows", "model": "Windows"}
     if "Macintosh" in device:
         return {"brand": "Apple", "model": "MacBook"}

     device_lower = device.lower()

     # Sample user agents:
     #   (iPhone; CPU iPhone OS 10_3_2 like Mac OS X)
     #   (iPad; CPU iPhone OS 10_3_2 like Mac OS X)
     # The iPad form also contains "iPhone", so iPad has to be tested first;
     # otherwise such iPads are reported as iPhones.
     if "like mac os" in device_lower:
         if "ipad" in device_lower:
             return {"brand": "Apple", "model": "iPad"}
         if "iphone" in device_lower:
             return {"brand": "Apple", "model": "iPhone"}

     if "android" in device_lower:
         device_ua = utils.get_keywords(device)
         if device_ua is None:
             return {"brand": "", "model": ""}
         device_ua = device_ua.upper()
         # Exact table takes precedence over the pattern table.
         if device_ua in self.precise:
             return self.precise[device_ua]
         if device_ua in self.pattern:
             return self.pattern[device_ua]

     # Unknown device (including Android keys found in neither table).
     return None