def crunch(worker,conv,workId):
	"""Run the conversion over every annotation of work `workId`, printing
	per-annotation progress and a no-match summary at the end.

	`worker` is currently unused here (the convert call is commented out);
	`conv` supplies the running `noMatches` counter read after the loop.
	"""
	# Renamed from `list`, which shadowed the builtin.
	annotations = FCUtil.getAllAnnotationsforWork(workId)
	total = 0
	badMatches = 0

	et = time()
	st = time()
	for num in annotations:
		# et - st is how long the previous iteration took (0.0 on the first).
		print("Annotation on: %d %0.2f%% Last one took: %.1f " % (num,(float((total+1))/float(len(annotations))*100.0),et-st))
		st = time()
		#conv.convert(num)
		total += 1
		et = time()
	badMatches += conv.noMatches

	if badMatches != 0 and total != 0:
		p = (float(badMatches)/float(total))*100.0
		print("No matches for %d : %4.2f" % (workId, p))
	print("No Matches for "+str(workId)+' : '+str(badMatches))
	print("Total for "+str(workId)+' : '+str(total))
	def __init__(self,annDic):
		"""Build a lightweight annotation object from a raw annotation dict.

		Identifying fields are copied straight across; the quote is run
		through special-character removal and BeautifulSoup to strip any
		markup before being normalised with fixQuote.
		"""
		self.dic = annDic
		# Plain field copies from the source dict.
		self.id = annDic['id']
		self.section_id = annDic['section_id']
		self.start_index = annDic['start_index']
		self.end_index = annDic['end_index']
		self.deleted = annDic['deleted_on']
		self.text = annDic['annotation']
		# Clean the quote: drop special chars, leading whitespace and a
		# leading apostrophe, then let BeautifulSoup pull out plain text.
		raw_quote = FCUtil.removeSpecChar(annDic['quote']).lstrip().lstrip('\'')
		soup = BeautifulSoup(raw_quote)
		self.quote = self.fixQuote(soup.get_text())
	def __init__(self,annDic,uWorker):
		"""Build a full annotation record, resolving the author via `uWorker`.

		Unlike the lighter constructor this also normalises the created_on
		timestamp, resolves the user name, cleans the annotation body text,
		and maps the deleted_on column onto a boolean flag.
		"""
		self.uWorker = uWorker
		self.dic = annDic
		self.id = annDic['id']
		# "YYYY-MM-DD hh:mm:ss" -> "YYYY-MM-DDThh:mm:ssZ"
		self.date = annDic['created_on'].strip().replace(' ','T')+'Z'
		self.user_id = annDic['user_id']
		self.user = uWorker.getUserName(self.user_id)
		self.section_id = annDic['section_id']
		self.start_index = annDic['start_index']
		self.end_index = annDic['end_index']

		# Strip markup and special characters from quote and body alike.
		quote_soup = BeautifulSoup(FCUtil.removeSpecChar(annDic['quote']).lstrip().lstrip('\''))
		self.quote = self.fixQuote(quote_soup.get_text())
		text_soup = BeautifulSoup(FCUtil.removeSpecChar(annDic['annotation']).strip().strip('\''))
		self.text = self.fixQuote(text_soup.get_text())

		# A literal "NULL" in deleted_on means the row is still live.
		self.deleted = "NULL" not in annDic['deleted_on']
	def get_start(self,text):
		"""Locate self.quote inside `text` and record self.start / self.end.

		Uses the word-count-derived character guess as a hint for
		diff-match-patch's fuzzy matcher, then sanity-checks the hit with
		difflib's similarity ratio. Keeps the original sentinel behaviour:
		self.start = -1 (and self.end = len(quote) - 1) when nothing matches.
		"""
		guess = FCUtil.wordcount_to_charcount(self.start_index,text)
		matcher = diff_match_patch()
		m = matcher.match_main(text,self.quote,guess)

		# BUG FIX: check the -1 "no match" sentinel BEFORE slicing —
		# text[-1:...] is a bogus slice and the ratio warning computed from
		# it was meaningless noise.
		if m == -1:
			print("matching error for: %d" % self.id)
		else:
			s = difflib.SequenceMatcher(None,self.quote,text[m:m+len(self.quote)])
			r = s.ratio()
			if r < .9:
				print("Low Ratio %.2f %d " % (r,self.id))
				print("Quote: %s" % (self.quote))
				print("Text: %s" % (text[m:m+len(self.quote)]))

		self.start = m
		self.end = m+len(self.quote)
import json
import re
import FCUtil

# Extract `content` table rows from the raw MySQL dump into content.json
# as a JSON array of {id, section_id, content} objects.
file = open('10.6.166.43.sql')   # NOTE(review): `file` shadows the py2 builtin
output = open('content.json', 'w')

firstTime = True
output.write("[")
for row in file:
	# Each relevant line starts: INSERT INTO `content` ... VALUES (id,section_id,'...');
	if row[:20] == "INSERT INTO `content":
		if not firstTime:
			output.write(',')  # comma-separate JSON array elements
		firstTime = False

		first = row.split("(", 1)[1].split(",", 2)
		dic = {}
		dic["id"] = int(first[0])
		dic["section_id"] = int(first[1])
		# Strip the trailing ");" that terminates the VALUES tuple.
		dic["content"] = FCUtil.cleanStr(first[2].rsplit(");")[0])
		output.write(json.dumps(dic))
		print(json.dumps(dic, sort_keys=True, indent=4))
output.write("]")
output.close()
# Close the input dump too (it leaked in the original).
file.close()
import json
import re
import FCUtil

# Extract `annotation_links` rows from the dump into annotation_links.json.
file = open('10.6.166.43.sql')
output = open('annotation_links.json', 'w')

output.write("[")
firstTime = True
for row in file:
	if "INSERT INTO `annotation_links`" in row:
		# Keep the text between "(" and ");", split into at most 5 fields;
		# the 5th field still holds "reason,relationship" joined by a comma.
		r = row.split("(", 1)[1].rsplit(");",1)[0].split(",",4)
		if not firstTime:
			output.write(',')
		firstTime = False
		# BUG FIX: the original rebuilt (and printed) this dict once per
		# field inside a pointless `for item in r:` loop; build it once.
		dic = {}
		dic["id"] = int(r[0])
		dic["annotation_linker_id"] = int(r[1])
		dic["linkee_type"] = FCUtil.cleanStr(r[2])
		dic["linkee_id"] = int(r[3])
		reason, relationship = r[4].rsplit(',', 1)
		dic["reason"] = FCUtil.cleanStr(reason)
		dic["relationship"] = int(relationship)
		print(json.dumps(dic, sort_keys=True, indent=4))
		output.write(json.dumps(dic))

output.write("]")
output.close()
	def __init__(self):
		# Cache the full users.json table in memory for user-id -> name lookups.
		super(UserLookup,self).__init__()
		self.users = FCUtil.openJsonFile('users.json')
# --- Beispiel #8 --- (scraped-snippet separator left over from extraction; not executable code)
 def __init__(self):
     # Initialise the parent converter, then preload the annotation fixture
     # file and the user-name lookup helper used during conversion.
     super(Converter, self).__init__()
     self.annJson = FCUtil.openJsonFile('annotations.json')
     self.uWorker = UserLookup()
def getStartOffsetFromAnnId(id):
	"""Find the character offset in the section text where the quote of
	annotation `id` starts.

	Tries, in order: a unique match on the quote's first word, a unique
	match on the whole quote, a unique match on any single word of the
	quote, a unique match on a growing prefix of the quote, and finally
	the occurrence of the first word closest to the stored word index.
	Returns a list whose first element is the offset (2 or 3 elements
	depending on the branch taken), or None on total failure.

	NOTE(review): `id` shadows the builtin — kept to preserve the
	interface; AnnJSON and getContentBySectionId come from module scope.
	"""
	
	for obj in AnnJSON:
		if obj['id'] == id:
			wordIndex = obj['start_index']
			Oquote = obj['quote']
			content = getContentBySectionId(obj['section_id'])
			# Normalise the quote: strip HTML, then collapse "/" separators
			# into single spaces.
			quote = FCUtil.strip_html(FCUtil.remove_html_tags(FCUtil.cleanStr(Oquote)))
			quote = re.sub('\s*(/)\s*',' ',quote).strip()
			# NOTE(review): endQuote (first word, taken before the
			# apostrophe strip) is never used below — looks like residue.
			endQuote = quote.split()[0]
			if	"'" in quote[:1]:
				quote = quote[1:]
			startQuote = quote.split()[0]
			
			#print "quote " + quote
			
			# cw: content with tags removed then HTML-stripped (word list);
			# cnw: content only HTML-stripped (character-offset source).
			cw = FCUtil.strip_html(FCUtil.remove_html_tags(content))
			cnw = FCUtil.strip_html(content)
			c = cw.split()
			#print cnw
			#print len(c)
			#print 'startWordINdex: '+str(wordIndex)
			
			# All occurrences of the first word / whole quote in cnw.
			# NOTE(review): only [ and ] are regex-escaped; other regex
			# metacharacters in the quote can raise or mismatch (the bare
			# except blocks below absorb such errors).
			startQuoteIndex = [m.start() for m in re.finditer(startQuote.replace('[','\[').replace(']','\]'), cnw)] 
			quoteIndex = [m.start() for m in re.finditer(quote, cnw)]
			
			#print 'StartQuoteIndex for: '+startQuote
			#print startQuoteIndex
			
			
			if len( startQuoteIndex) == 1:
				#print 'via Start Quote'
				return [startQuoteIndex[0],1]
			elif len(quoteIndex) == 1:
				#print 'via entire Quote'
				return [quoteIndex[0],1]
			elif len(quoteIndex) == 0:
				# Whole quote absent: try each word; a uniquely-occurring
				# word fixes the start by subtracting its prefix length.
				sQuote = quote.split()
				print sQuote
				for i in sQuote:
					try:
						tempA = [m.start() for m in re.finditer(i, cnw)]
						if len(tempA) == 1:
							#print 'new way to town'
							return [tempA[0]-len(quote.split(i,1)[0]),1]
					except:
						pass
			else:
				# Quote occurs more than once: grow a prefix word by word
				# until it matches uniquely.
				sQuote = quote.split()
				check = ''
				for i in sQuote:
					check +=str(str(i)+' ')
					try:
						tempA = [m.start() for m in re.finditer(check, cnw)]
						if len(tempA) == 1:
							#print 'should delete'
							return [tempA[0],1]
					except:
						pass
						
			if len(quoteIndex) > 1:  #some how get word count to 
				print quoteIndex
			
			# Fallback: word positions (in cw) containing the first word...
			startLoc = []
			for idx, k in enumerate(c):
				if 	startQuote in k:
					startLoc.append(idx)
			#print startLoc
			
			# ...then pick the one nearest the stored start word index.
			place = 0
			m = 10000
			www = 0
			for idx,k in enumerate(startLoc):
				if(m > abs(k-wordIndex)):
					m = abs(k-wordIndex)
					place = idx
			#print "startIndex: "+str(place)
			#print "StartWordIndex: "+str(wordIndex)
			#print len(cnw.rsplit(startQuote,len(startLoc)-place)[0])
			if(len(startLoc) > 0):
				# NOTE(review): `place` indexes startLoc but is applied to
				# startQuoteIndex, whose length can differ — possible
				# IndexError / wrong offset; confirm intent.
				#return len(cnw.split(startQuote,len(startLoc)-place-1)[0])
				return [startQuoteIndex[place],len(startQuoteIndex),len(quote)]
			
			print 'START_ERROR'
			return None
def getEndOffsetFromAnnId(id):
	"""Find the character offset in the section text where the quote of
	annotation `id` ends (one past its last character).

	Mirrors getStartOffsetFromAnnId but anchors on the quote's LAST word.
	Returns a list whose first element is the end offset, or None on
	total failure.

	NOTE(review): "/" separators are replaced with '' here but with ' '
	in the start-offset variant — confirm that asymmetry is intended.
	"""
	
	for obj in AnnJSON:
		if obj['id'] == id:
			wordIndex = obj['end_index']
			Oquote = obj['quote']
			quote = FCUtil.strip_html(FCUtil.remove_html_tags(FCUtil.cleanStr(Oquote)))
			quote = re.sub('\s*(/)\s*','',quote).strip()
			# Last word of the quote is the anchor for end matching.
			endQuote = quote.rsplit()[-1]
			if	"'" in quote[:1]:
				quote = quote[1:]
				
			#print "quote " + quote
			
		
			content = getContentBySectionId(obj['section_id'])
			# cw: tags removed then HTML-stripped (word list); cnw: only
			# HTML-stripped (character-offset source).
			cw = FCUtil.strip_html(FCUtil.remove_html_tags(content))
			#possible use of beautifulsoup
			cnw = FCUtil.strip_html(content)
			c = cw.split()
			#print c
			# Occurrences of the last word / whole quote; only [ and ] are
			# regex-escaped (bare excepts below absorb other metacharacters).
			endQuoteIndex = [m.start() for m in re.finditer(endQuote.replace('[','\[').replace(']','\]'), cnw)] 
			quoteIndex = [m.start() for m in re.finditer(quote, cnw)]
			
			#print 'Quote: '+quote
			#print 'Content: '+cnw
			#print 'EndQuoteIndex for: '+endQuote
			#print endQuoteIndex
			
			if len( endQuoteIndex) == 1:
				return [endQuoteIndex[0]+len(endQuote),1,len(quote)]
			elif len(quoteIndex) == 1:
				return [len(quote)+quoteIndex[0],1,len(quote)]
			elif len(quoteIndex) == 0:
				# Whole quote absent: a uniquely-occurring word fixes the
				# end via the length of the quote's remaining suffix.
				sQuote = quote.split()
				for i in sQuote:
					try:
						tempA = [m.start() for m in re.finditer(i, cnw)]
						if len(tempA) == 1:
							#print 'new way to town'
							return [tempA[0]+len(quote.split(i,1)[1]),1,len(quote)]
					except:
						pass
			else:
				# Quote occurs several times: grow a prefix until unique.
				sQuote = quote.split()
				check = ''
				for i in sQuote:
					check +=str(str(i)+' ')
					try:
						tempA = [m.start() for m in re.finditer(check, cnw)]
						if len(tempA) == 1:
							return [len(quote)+tempA[0],1,len(quote)]
					except:
						pass
			
			# Fallback: word positions containing the last word...
			endLoc = []
			for idx, k in enumerate(c):
				if 	endQuote in k:
					endLoc.append(idx)
			#print endLoc
			
			# ...then pick the one closest to the stored end word index.
			place = 0
			m = 10000
			for idx,k in enumerate(endLoc):
				if(m > abs(k-wordIndex)):
					m = abs(k-wordIndex)
					place = idx
			#print "index "+str(place)
			#print len(cnw.rsplit(endQuote,len(endLoc)-place)[0])+len(endQuote)
			#print "EndWordIndex: "+str(wordIndex)
			#print "EndIndex: "+str(place)
			if len(endLoc) >0:
				# NOTE(review): `place` indexes endLoc but is applied to
				# endQuoteIndex, whose length can differ — possible
				# IndexError / wrong offset; confirm intent.
				#return len(cnw.rsplit(endQuote,len(endLoc)-place)[0])+len(endQuote)
				return [endQuoteIndex[place]+len(endQuote),len(endQuoteIndex),len(quote)]
			
			print 'END_ERROR'
			return None
# --- Beispiel #11 --- (scraped-snippet separator left over from extraction; not executable code)
				# (This chunk starts mid-snippet: it is the tail of a
				# quote-aware field splitter for a `works` INSERT row — the
				# loop header, open() calls and the start of the if/elif
				# chain are above this chunk and not visible here.)
				holdString = item
			elif item.count("'")==0:
				# Field contains no quote characters at all, so it is a
				# complete field on its own.
				# NOTE(review): `open` shadows the builtin here.
				open = False
				holdString = item
				
			if not open:
				# Field is complete: escape embedded double quotes and
				# flush the accumulated text as one field.
				first.append(holdString.replace('"', '\\"'))
				holdString = ""
		
		#print len(first)
		#print first
		
		# Warn about any field that still contains an unescaped quote.
		for i in first:
			i = str(i).replace('"', '\\"')
			if '"' in i:
				print i
		
		# Map the positional fields onto the works JSON schema.
		dic = {}
		dic["id"] = int(first[0])
		dic["title"] = FCUtil.cleanStr(first[1])
		dic["author"] = first[2].lstrip().rstrip().rstrip('\'').lstrip('\'')
		dic["summary"] = first[3]
		dic["year"] = first[4]
		dic["page_views"] = first[5]
		dic["wordpress_url"] = first[6]
		dic["intro_essay"] =  FCUtil.cleanStr(first[7])
		# Strip the trailing ");" from the last VALUES field.
		dic["created_on"] = str(first[8]).rsplit(");")[0]
		output.write(json.dumps(dic))
		#print json.dumps(dic, sort_keys=True, indent=4)
output.write("]")
output.close()
import json
import re
import FCUtil

# NOTE(review): this snippet writes to `output` and reads `file` but never
# opens them itself — the corresponding open() lines appear to be missing
# from this chunk; confirm against the full script before running.

# Extract `users_files` rows from the dump into a JSON array.
output.write("[")
firstTime = True
for row in file:
	if "INSERT INTO `users_files`" in row:
		r = row.split("(", 1)[1].rsplit(");",1)[0].split(",")
		if not firstTime:
			output.write(',')
		firstTime = False
		# BUG FIX: the dict was rebuilt and printed once per field inside a
		# redundant `for item in r:` loop (and printed again after); do it once.
		dic = {}
		dic["id"] = int(r[0])
		dic["user_id"] = int(r[1])
		dic["work_id"] = int(r[2])
		dic["name"] = str(FCUtil.cleanStr(r[3]))
		dic["description"] = str(FCUtil.cleanStr(r[4]))
		dic["file_location"] = str(FCUtil.cleanStr(r[5]))
		dic["created_on"] = str(FCUtil.cleanStr(r[6]))
		dic["deleted_on"] = str(FCUtil.cleanStr(r[7]))
		print(json.dumps(dic, sort_keys=True, indent=4))
		output.write(json.dumps(dic))

output.write("]")
output.close()
import json
import re
import FCUtil

# Extract `annotation_link_relationships` rows from the dump into
# annotation_link_relationships.json as a JSON array of {id, title}.
file = open('10.6.166.43.sql')
output = open('annotation_link_relationships.json', 'w')

output.write("[")
firstTime = True
for row in file:
	if "INSERT INTO `annotation_link_relationships" in row:
		r = row.split("(", 1)[1].rsplit(");",1)[0].split(",")
		if not firstTime:
			output.write(',')
		firstTime = False
		# BUG FIX: the dict was rebuilt and printed once per field inside a
		# redundant `for item in r:` loop; build it exactly once.
		dic = {}
		dic["id"] = int(r[0])
		dic["title"] = FCUtil.cleanStr(r[1])
		print(json.dumps(dic, sort_keys=True, indent=4))
		output.write(json.dumps(dic))

output.write("]")
output.close()
# Close the input dump too (it leaked in the original).
file.close()
# --- Beispiel #14 --- (scraped-snippet separator left over from extraction; not executable code)
			p = (float(badMatches)/float(total))*100.0
			print "No matches for %d : %4.2f" %(workId, p)
		print "No Matches for "+str(workId)+' : '+str(badMatches)
		print "Total for "+str(workId)+' : '+str(total)


	#workNumber = 112
	#print "Work: %d\n" %workNumber
	#crunch(workNumber)

	#annNumber = 1762
	#print 'Annotation: %d\n'%annNumber
	#conv = converter()
	#conv.convert(annNumber)

	works = FCUtil.getAllWorksExcluding(['William Shakespeare'])
	#works = [61]
	totalWorks = len(works)
	annotationsbyworkid = {}
	count = 0
	
	worker = createhtml.creator()
	conv= converter(worker)
	for wid in works:
		try:
			print '\nCurrently on Work %d %00.2f%%' %(wid,((float(count)/float(totalWorks)))*100.0)
		except:
			print '\nCurrently on Work %d' &wid
		worker.makehtml(wid)
		#crunch(worker,conv,wid)
		count+=1