Python FCUtil.strip_html Examples

Programming Language: Python

Class/Type: FCUtil

Method/Function: strip_html

Examples at hotexamples.com: 2

Python FCUtil.strip_html - 2 examples found. These are the top-rated real-world Python examples of FCUtil.strip_html, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

cleanStr(7)

openJsonFile(2)

removeSpecChar(2)

remove_html_tags(2)

strip_html(2)

getAllAnnotationsforWork(1)

getAllWorksExcluding(1)

wordcount_to_charcount(1)

Example #1

Show file

File: translate.py Project: sethwoodworth/annotation-parse

def getStartOffsetFromAnnId(id):
	"""Find the character offset at which an annotation's quote starts.

	Looks up the annotation whose ``'id'`` equals *id* in the module-level
	``AnnJSON`` sequence, cleans its ``'quote'`` with the ``FCUtil`` helpers
	and searches for it in the text returned by
	``getContentBySectionId(obj['section_id'])`` after ``FCUtil.strip_html``.

	Strategies are tried in this order:

	1. the first word of the quote occurs exactly once in the text;
	2. the whole quote occurs exactly once;
	3. the whole quote never occurs: use the first quote word that occurs
	   exactly once and step back by the length of the quote text before it;
	4. the whole quote occurs several times: use the shortest leading run of
	   quote words (each followed by one space) that occurs exactly once;
	5. otherwise pick the occurrence of the first word whose word position is
	   closest to the annotation's ``'start_index'``.

	Args:
		id: annotation id compared against ``obj['id']``. The name shadows
			the builtin but is kept so existing callers keep working.

	Returns:
		``[offset, 1]`` for strategies 1-4,
		``[offset, match_count, len(quote)]`` for strategy 5, or ``None``
		when no annotation matches or no position could be determined
		(``'START_ERROR'`` is printed in the latter case).
	"""
	def match_starts(needle, haystack):
		# Quote text is data, not a pattern: escape it so characters such as
		# "(", "?", "." or "*" are matched literally and cannot raise re.error.
		return [m.start() for m in re.finditer(re.escape(needle), haystack)]

	for obj in AnnJSON:
		if obj['id'] != id:
			continue

		wordIndex = obj['start_index']
		Oquote = obj['quote']
		content = getContentBySectionId(obj['section_id'])
		quote = FCUtil.strip_html(FCUtil.remove_html_tags(FCUtil.cleanStr(Oquote)))
		# NOTE(review): a slash is replaced by a space here, whereas
		# getEndOffsetFromAnnId removes it without inserting a space; confirm
		# which normalisation matches the stored content.
		quote = re.sub(r'\s*/\s*', ' ', quote).strip()
		if quote.startswith("'"):
			quote = quote[1:]

		words = quote.split()
		if not words:
			# An empty quote used to raise IndexError; report it like any
			# other unresolvable annotation instead.
			print('START_ERROR')
			return None
		startQuote = words[0]

		cw = FCUtil.strip_html(FCUtil.remove_html_tags(content))
		cnw = FCUtil.strip_html(content)
		c = cw.split()

		startQuoteIndex = match_starts(startQuote, cnw)
		quoteIndex = match_starts(quote, cnw)

		if len(startQuoteIndex) == 1:
			return [startQuoteIndex[0], 1]
		if len(quoteIndex) == 1:
			return [quoteIndex[0], 1]

		if not quoteIndex:
			# Single-argument print(...) behaves the same on Python 2 and 3.
			print(words)
			for word in words:
				hits = match_starts(word, cnw)
				if len(hits) == 1:
					# Step back over the quote text that precedes this word.
					return [hits[0] - len(quote.split(word, 1)[0]), 1]
		else:
			prefix = ''
			for word in words:
				prefix += word + ' '
				hits = match_starts(prefix, cnw)
				if len(hits) == 1:
					return [hits[0], 1]

		if len(quoteIndex) > 1:
			print(quoteIndex)

		# Word positions (in the tag-free text) of words containing the first
		# quote word.
		startLoc = [idx for idx, token in enumerate(c) if startQuote in token]
		if startLoc:
			# Nearest occurrence to the recorded word index; the first one
			# wins on ties. No arbitrary distance cap is applied.
			place = min(range(len(startLoc)),
						key=lambda i: abs(startLoc[i] - wordIndex))
			# startLoc and startQuoteIndex come from differently cleaned
			# texts and may differ in length; never index past the end.
			if place < len(startQuoteIndex):
				return [startQuoteIndex[place], len(startQuoteIndex), len(quote)]

		print('START_ERROR')
		return None

	return None

Example #2

Show file

File: translate.py Project: sethwoodworth/annotation-parse

def getEndOffsetFromAnnId(id):
	"""Find the character offset just past the end of an annotation's quote.

	Looks up the annotation whose ``'id'`` equals *id* in the module-level
	``AnnJSON`` sequence, cleans its ``'quote'`` with the ``FCUtil`` helpers
	and searches for it in the text returned by
	``getContentBySectionId(obj['section_id'])`` after ``FCUtil.strip_html``.

	Strategies are tried in this order:

	1. the last word of the quote occurs exactly once in the text;
	2. the whole quote occurs exactly once;
	3. the whole quote never occurs: use the first quote word that occurs
	   exactly once and step forward over it and the quote text after it;
	4. the whole quote occurs several times: use the shortest leading run of
	   quote words (each followed by one space) that occurs exactly once and
	   add the quote length;
	5. otherwise pick the occurrence of the last word whose word position is
	   closest to the annotation's ``'end_index'``.

	Args:
		id: annotation id compared against ``obj['id']``. The name shadows
			the builtin but is kept so existing callers keep working.

	Returns:
		``[offset, match_count, len(quote)]`` where ``match_count`` is 1 for
		strategies 1-4, or ``None`` when no annotation matches or no position
		could be determined (``'END_ERROR'`` is printed in the latter case).
	"""
	def match_starts(needle, haystack):
		# Quote text is data, not a pattern: escape it so characters such as
		# "(", "?", "." or "*" are matched literally and cannot raise re.error.
		return [m.start() for m in re.finditer(re.escape(needle), haystack)]

	for obj in AnnJSON:
		if obj['id'] != id:
			continue

		wordIndex = obj['end_index']
		Oquote = obj['quote']
		quote = FCUtil.strip_html(FCUtil.remove_html_tags(FCUtil.cleanStr(Oquote)))
		# NOTE(review): the slash is removed without inserting a space here,
		# whereas getStartOffsetFromAnnId replaces it with a space; confirm
		# which normalisation matches the stored content.
		quote = re.sub(r'\s*/\s*', '', quote).strip()
		if quote.startswith("'"):
			quote = quote[1:]

		words = quote.split()
		if not words:
			# An empty quote used to raise IndexError; report it like any
			# other unresolvable annotation instead.
			print('END_ERROR')
			return None
		# Taken after the leading apostrophe is dropped so that a one-word
		# quote searches for the same token as `quote` itself.
		endQuote = words[-1]

		content = getContentBySectionId(obj['section_id'])
		cw = FCUtil.strip_html(FCUtil.remove_html_tags(content))
		cnw = FCUtil.strip_html(content)
		c = cw.split()

		endQuoteIndex = match_starts(endQuote, cnw)
		quoteIndex = match_starts(quote, cnw)

		if len(endQuoteIndex) == 1:
			return [endQuoteIndex[0] + len(endQuote), 1, len(quote)]
		if len(quoteIndex) == 1:
			return [quoteIndex[0] + len(quote), 1, len(quote)]

		if not quoteIndex:
			for word in words:
				hits = match_starts(word, cnw)
				if len(hits) == 1:
					# hits[0] is where the anchor word starts, so the quote
					# ends after the word itself plus the text following it.
					# (The word's own length was previously left out.)
					tail = quote.split(word, 1)[1]
					return [hits[0] + len(word) + len(tail), 1, len(quote)]
		else:
			prefix = ''
			for word in words:
				prefix += word + ' '
				hits = match_starts(prefix, cnw)
				if len(hits) == 1:
					return [hits[0] + len(quote), 1, len(quote)]

		# Word positions (in the tag-free text) of words containing the last
		# quote word.
		endLoc = [idx for idx, token in enumerate(c) if endQuote in token]
		if endLoc:
			# Nearest occurrence to the recorded word index; the first one
			# wins on ties. No arbitrary distance cap is applied.
			place = min(range(len(endLoc)),
						key=lambda i: abs(endLoc[i] - wordIndex))
			# endLoc and endQuoteIndex come from differently cleaned texts
			# and may differ in length; never index past the end.
			if place < len(endQuoteIndex):
				return [endQuoteIndex[place] + len(endQuote), len(endQuoteIndex), len(quote)]

		print('END_ERROR')
		return None

	return None