Esempio n. 1
0
def decant( corpus, routine, settings, ref_completion=1.0 ):
	"""Run the pattern ("PT") analysis over the NEW documents of a corpus.

	Steps performed by the code below:

	* fetch the corpus' Analysis of type "PT", creating it with status "CRE"
	  when it does not exist, and attach it to `routine`;
	* set the analysis status to "PN";
	* for every Document of the corpus whose status is 'NEW': set it to 'IN',
	  record it on the analysis, report progress through `log_routine`,
	  convert it with `textify` and run `distill` on the result.

	A document whose conversion fails gets status 'ERR' (and the analysis
	status becomes "ERR"); a document whose distill step fails gets status
	'ERR'. In both cases processing continues with the next document.

	Parameters:
	corpus -- object exposing `id` and `name`, used to select its documents
	routine -- object passed to `log_routine`; the analysis is added to
	           `routine.analysis` and the routine is saved
	settings -- object exposing `MEDIA_ROOT`, forwarded to `textify`
	ref_completion -- factor applied to the processed / total ratio that is
	                  reported as completion (default 1.0)
	"""
	from sven.anta.models import Analysis, Routine, Segment, Segment_Concept, Document, Document_Segment, Document_Tag, Tag, Concept
	from sven.anta.utils import textify

	logger.info( "starting pattern analysis on corpus: '%s' [%s]", corpus.name, corpus.id )
	log_routine( routine, entry="[info] starting pattern analysis on corpus: %s" % corpus.id )

	# current analysis, if any; any other lookup error simply propagates
	try:
		analysis = Analysis.objects.get(corpus=corpus, type="PT")
	except Analysis.DoesNotExist:
		analysis = Analysis( corpus=corpus, type="PT", start_date=datetime.now(), status="CRE" )
		analysis.save()

	routine.analysis.add( analysis )
	routine.save()

	# total count: every document of the corpus, whatever its status
	total_count = Document.objects.filter(corpus=corpus).count()

	# only documents still flagged as NEW are analysed
	documents = Document.objects.filter(corpus__id=corpus.id, status='NEW')

	logger.info( "['%s':%s] documents to be analyzed: %s", corpus.name, corpus.id, documents.count() )

	# pending status for current analysis
	analysis.status = "PN"
	analysis.save()

	# stopword list per document language; unknown languages get no stopwords
	stopwords_by_language = { "NL": NL_STOPWORDS, "EN": EN_STOPWORDS }

	i = 0

	# cycle through documents
	for d in documents:
		i += 1
		d.status = 'IN'
		d.save()

		# update analysis with current document
		analysis.document = d
		analysis.save()

		# float() keeps the ratio fractional even when the caller passes an
		# integer ref_completion (Python 2 "/" truncates on two ints)
		log_routine( routine, completion=float( ref_completion ) * i / total_count )

		logger.info( "%s / %s trying to convert document '%s' [%s], mimetype %s", i, total_count, d.title, d.id, d.mime_type )

		try:
			textified = textify( d, settings.MEDIA_ROOT )
			failure = None
		except Exception as e:
			textified = False
			failure = e

		# any falsy result (False, None, empty string) is a failed conversion:
		# there is nothing to call .replace() on or to hand over to distill
		if not textified:
			analysis.status = "ERR"
			d.status = 'ERR'
			d.save()
			analysis.save()
			if failure is None:
				logger.error( "%s / %s FAILED converting document '%s' [%s], mimetype %s", i, total_count, d.title, d.id, d.mime_type )
			else:
				logger.error( "%s / %s FAILED converting document '%s' [%s], mimetype %s with Exception: %s", i, total_count, d.title, d.id, d.mime_type, failure )
			continue

		textified = textified.replace("%20"," ")
		# analysis.document already points to d (set above)
		analysis.completion = 0.0
		analysis.save()

		# load stopwords for document d language
		stopwords = stopwords_by_language.get( d.language, [] )

		logger.info( "%s / %s NP extraction started over document '%s' [%s], language: '%s', file: '%s'", i, total_count, d.title, d.id, d.language, textified )

		# start distill analysis, exclude given stopwords
		try:
			distilled = distill( filename=textified, language=d.language.lower(), stopwords=stopwords )
		except Exception as e:
			d.status = 'ERR'
			d.save()
			# logger.exception records the traceback, logger.error the context
			logger.exception( "EXCEPTION distill document" )
			logger.error( "%s / %s FAILED distill document '%s' [%s], mimetype %s with Exception: %s", i, total_count, d.title, d.id, d.mime_type, e )
			continue
Esempio n. 2
0
	# Excerpt of a per-document loop: the enclosing function and the setup of
	# documents, i, analysis, routine, corpus, total_count, ref_completion and
	# settings are outside this fragment.
	# Each iteration flags the document, records it on the analysis, reports
	# progress and tries to convert the document with textify.
	for d in documents:
		i = i + 1
		# flag the document with status 'IN' (presumably "in progress" -- confirm
		# against the Document model)
		d.status='IN'
		d.save()

		# update analysis with current document
		analysis.document = d
		analysis.save()

		# report progress: documents handled so far over total_count, scaled by
		# ref_completion
		log_routine( routine, completion=ref_completion * i / total_count )

		# a = Analysis( document=d, )
		logger.info("[%s:%s] %s / %s trying to convert document '%s' [%s], mimetype %s" % (corpus.name, corpus.id, i, total_count, d.title, d.id, d.mime_type))

		try:
			textified =  textify( d, settings.MEDIA_ROOT )
		except Exception, e:
			# textify raised: log the traceback, mark both the analysis and the
			# document as 'ERR' and move on to the next document
			logger.exception('[%s:%s] Exception: textify function failed on document "%s"[%s]' % (corpus.name, corpus.id, d.title, d.id))
			analysis.status="ERR"
			d.status = 'ERR'
			d.save()
			analysis.save()
			continue

		# textify returning False is handled as a failure too: same 'ERR'
		# bookkeeping as above, then skip the document
		if textified == False:
			analysis.status="ERR"
			d.status = 'ERR'
			d.save()
			analysis.save()
			logger.error("%s / %s FAILED converting document '%s' [%s], mimetype %s" % ( i, total_count, d.title, d.id, d.mime_type) )
			continue
Esempio n. 3
0
def decant( corpus, routine, settings, ref_completion=1.0 ):
	from sven.anta.models import Analysis, Routine, Segment, Segment_Concept, Document, Document_Segment, Document_Tag, Tag, Concept
	from sven.anta.utils import textify
	# path = settings.MEDIA_ROOT + options.corpus
	# print NL_STOPWORDS
	
	# get document corpus
	print "[info] starting pattern analysis on corpus:",corpus.id, corpus.name
	
	log_routine( routine, entry="[info] starting pattern analysis on corpus: %s" % corpus.id )
	
	
	# current analysis, if any
	try: 
		analysis = Analysis.objects.get(corpus=corpus, type="PT")
	except Analysis.DoesNotExist:
		analysis = Analysis( corpus=corpus, type="PT", start_date=datetime.now(), status="CRE" )
		analysis.save()
	except:
		raise

	routine.analysis.add( analysis )
	routine.save()

	# total count
	total_count = Document.objects.filter(corpus=corpus).count()


	# current document
	if analysis.document is None:
		documents = Document.objects.filter(corpus__id=corpus.id)
		analysis.document = documents[0]
	else:
		documents = Document.objects.filter(corpus__id=corpus.id, id__gt=analysis.document.id)
		if documents.count() == 0:
			documents = Document.objects.filter(corpus__id=corpus.id)

	# pending status for current analysis
	analysis.status = "PN"
	analysis.save()

	i = 0

	# cycle through documents
	for d in documents:
		i = i + 1

		# update analysis with current document
		analysis.document = d
		analysis.save()

		log_routine( routine, completion=ref_completion * i / total_count )

		# a = Analysis( document=d, )
		print "[info] document mimetype:",d.mime_type

		textified =  textify( d, settings.MEDIA_ROOT )
		
		if textified == False:
			analysis.status="ERR"
			analysis.save()
			raise Exception("error in textify function")
		
		textified = textified.replace("%20"," ")
		analysis.completion = 0.0
		analysis.document = d
		analysis.save()
		
		# load storpwords for document d language
		if d.language == "NL": 
			stopwords = NL_STOPWORDS
		elif d.language == "EN":
			stopwords = EN_STOPWORDS
		else:
			stopwords = []

		print "[info] document language:",d.language
		print "[info] analysis started on doc ", d.id,"'", d.title,"'", d.language.lower(), "file:",textified
		
		#start distill anaysis, exclude given stopwors
		distilled = distill( filename=textified, language=d.language.lower(), stopwords=stopwords )
		
		analysis.completion = .1
		analysis.save()

		# append keywords as tag for the document
		for k in distilled['keywords']:
			# print k
			candidate = k[1]
			# get tag
			try:
				t = Tag.objects.get( name=candidate, type="keyword" )
			except:
				# todo lemma version of a word according to language
				t = Tag( name=candidate, type="keyword" )
				try:
					t.save()
				except:
					print "[warning] unable to save as tag:", candidate
					continue
		 	
		 	# set tag documnt relation
		 	try:
		 		td = Document_Tag( document=d, tag=t)	
		 		td.save()
		 	except:
		 		#relation exist,
		 		continue
		analysis.completion = .5
		analysis.save()
		# segment
		first = True
		for segment in distilled['segments']:
						

			if len(segment[0]) > 128:
				print "[warning] sample 'segment' will be truncated:", segment[0]
				continue
			try:
				s = Segment.objects.get( content=segment[0][:128], language=d.language)
			except:
				s = Segment( content=segment[0][:128], stemmed=re.sub("\s+", ' ', " ".join(segment[1])[:128] ), language=d.language )
				try:
					s.save()
				except:
					print "[warning] unable to save segment:", segment[0][:128]
					continue
			try:
				sd = Document_Segment.objects.get( document=d, segment=s )
			except:
				sd = Document_Segment( document=d, segment=s, tf=segment[2] )
				sd.save()
				# relationship exist
			if first:
				print "[info] sample 'segment' saved:", s.id, s.content, ", stem:", s.stemmed ,", tf:", sd.tf
				
			
			# save concept and attach
			for k in segment[1]:
				# ignore numbers				
				k = re.sub("[\d\-\.]+","", k)
				if len(k) < 2:
					continue
				try:
					c = Concept.objects.get( content=k, language=d.language)
				except:
					try:
						c = Concept( content=k, language=d.language )

						c.save()
					except Exception, e:
						print "[warning] unable to save concept: %s, exception: %s" % (k, e)
						continue
				try:
					sc = Segment_Concept.objects.get( segment=s, concept=c )
				except:
					sc = Segment_Concept( segment=s, concept=c )
					sc.save()	
					
				if first:
					print "[info] sample 'concept' saved:",c.id, c.content

			first = False
			
		
		
		print "[info] analysis ended on doc", d.id,"'", d.title,"'"