import json
import re
import FCUtil

file = open('10.6.166.43.sql')
output = open('annotation_links.json', 'w')


output.write("[")
firstTime = True
for row in file:
	if "INSERT INTO `annotation_links`" in row:
		# Grab the value list between "(" and ");", then split into at most
		# five pieces so commas inside the last field survive.
		r = row.split("(", 1)[1].rsplit(");", 1)[0].split(",", 4)
		if not firstTime:
			output.write(',')
		dic = {}
		dic["id"] = int(r[0])
		dic["annotation_linker_id"] = int(r[1])
		dic["linkee_type"] = FCUtil.cleanStr(r[2])
		dic["linkee_id"] = int(r[3])
		# The last piece still holds "reason, relationship"; split it once from
		# the right to separate the two columns.
		dic["reason"] = FCUtil.cleanStr(r[4].rsplit(',', 1)[0])
		dic["relationship"] = int(r[4].rsplit(',', 1)[1])
		print json.dumps(dic, sort_keys=True, indent=4)
		firstTime = False
		output.write(json.dumps(dic))

output.write("]")
			
output.close()
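# The FCUtil module imported by every script in this listing is not included
# here. The sketch below is only an assumption about what its three helpers
# do, inferred from how they are called: cleanStr() trims whitespace and the
# single quotes around SQL string literals, remove_html_tags() drops markup,
# and strip_html() unescapes HTML entities. Treat it as a stand-in FCUtil.py,
# not the real implementation.
import re
import HTMLParser


def cleanStr(s):
	# Strip surrounding whitespace and the quotes that wrap a SQL value.
	return str(s).strip().strip("'")


def remove_html_tags(s):
	# Remove anything that looks like an HTML/XML tag.
	return re.sub(r'<[^>]+>', '', s)


def strip_html(s):
	# Decode HTML entities such as &amp; or &#39;.
	return HTMLParser.HTMLParser().unescape(s)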
import json
import re
import FCUtil

file = open('10.6.166.43.sql')
output = open('content.json', 'w')

firstTime = True
output.write("[")
for row in file:
	if row.startswith("INSERT INTO `content"):
		if not firstTime:
			output.write(',')
		firstTime = False

		# Only the first two columns are numeric ids; everything after the
		# second comma is the content body, trimmed of the trailing ");".
		first = row.split("(", 1)[1].split(",", 2)
		dic = {}
		dic["id"] = int(first[0])
		dic["section_id"] = int(first[1])
		dic["content"] = FCUtil.cleanStr(first[2].rsplit(");")[0])
		output.write(json.dumps(dic))
		print json.dumps(dic, sort_keys=True, indent=4)
output.write("]")
output.close()
def getStartOffsetFromAnnId(id):
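	# Look up the annotation with this id in AnnJSON and locate its quote in
	# the tag-free text of the matching section. Returns a list whose first
	# element is the character offset where the quote starts and whose second
	# element is the number of candidate matches (the word-distance fallback
	# also appends the quote length); returns None if the quote is not found.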
	
	for obj in AnnJSON:
		if obj['id'] == id:
			wordIndex = obj['start_index']
			Oquote = obj['quote']
			content = getContentBySectionId(obj['section_id'])
			quote = FCUtil.strip_html(FCUtil.remove_html_tags(FCUtil.cleanStr(Oquote)))
			quote = re.sub(r'\s*(/)\s*', ' ', quote).strip()
			if quote.startswith("'"):
				quote = quote[1:]
			startQuote = quote.split()[0]
			
			#print "quote " + quote
			
			cw = FCUtil.strip_html(FCUtil.remove_html_tags(content))
			cnw = FCUtil.strip_html(content)
			c = cw.split()
			#print cnw
			#print len(c)
			#print 'startWordINdex: '+str(wordIndex)
			
			# Character offsets of the quote's first word, and of the whole quote,
			# in the tag-free content; re.escape guards against regex
			# metacharacters in the annotation text.
			startQuoteIndex = [m.start() for m in re.finditer(re.escape(startQuote), cnw)]
			quoteIndex = [m.start() for m in re.finditer(re.escape(quote), cnw)]
			
			#print 'StartQuoteIndex for: '+startQuote
			#print startQuoteIndex
			
			
			if len( startQuoteIndex) == 1:
				#print 'via Start Quote'
				return [startQuoteIndex[0],1]
			elif len(quoteIndex) == 1:
				#print 'via entire Quote'
				return [quoteIndex[0],1]
			elif len(quoteIndex) == 0:
				sQuote = quote.split()
				print sQuote
				for i in sQuote:
					try:
						tempA = [m.start() for m in re.finditer(i, cnw)]
						if len(tempA) == 1:
							#print 'new way to town'
							return [tempA[0]-len(quote.split(i,1)[0]),1]
					except:
						pass
			else:
				sQuote = quote.split()
				check = ''
				for i in sQuote:
					check +=str(str(i)+' ')
					try:
						tempA = [m.start() for m in re.finditer(check, cnw)]
						if len(tempA) == 1:
							#print 'should delete'
							return [tempA[0],1]
					except:
						pass
						
			if len(quoteIndex) > 1:  # ambiguous: the quote occurs more than once in the content
				print quoteIndex
			
			startLoc = []
			for idx, k in enumerate(c):
				if startQuote in k:
					startLoc.append(idx)
			#print startLoc
			
			# Pick the occurrence whose word position is closest to the stored
			# start_index.
			place = 0
			m = 10000
			for idx, k in enumerate(startLoc):
				if m > abs(k - wordIndex):
					m = abs(k - wordIndex)
					place = idx
			#print "startIndex: "+str(place)
			#print "StartWordIndex: "+str(wordIndex)
			#print len(cnw.rsplit(startQuote,len(startLoc)-place)[0])
			if len(startLoc) > 0 and place < len(startQuoteIndex):
				#return len(cnw.split(startQuote,len(startLoc)-place-1)[0])
				return [startQuoteIndex[place], len(startQuoteIndex), len(quote)]
			
			print 'START_ERROR'
			return None
def getEndOffsetFromAnnId(id):
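	# Counterpart of getStartOffsetFromAnnId(): returns the character offset
	# just past the end of the annotation's quote in the tag-free section
	# content, together with the number of candidate matches and the quote
	# length, or None when the quote cannot be located.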
	
	for obj in AnnJSON:
		if obj['id'] == id:
			wordIndex = obj['end_index']
			Oquote = obj['quote']
			quote = FCUtil.strip_html(FCUtil.remove_html_tags(FCUtil.cleanStr(Oquote)))
			quote = re.sub(r'\s*(/)\s*', '', quote).strip()
			endQuote = quote.rsplit()[-1]
			if quote.startswith("'"):
				quote = quote[1:]
				
			#print "quote " + quote
			
		
			content = getContentBySectionId(obj['section_id'])
			cw = FCUtil.strip_html(FCUtil.remove_html_tags(content))
			#possible use of beautifulsoup
			cnw = FCUtil.strip_html(content)
			c = cw.split()
			#print c
			# Character offsets of the quote's last word, and of the whole quote,
			# in the tag-free content.
			endQuoteIndex = [m.start() for m in re.finditer(re.escape(endQuote), cnw)]
			quoteIndex = [m.start() for m in re.finditer(re.escape(quote), cnw)]
			
			#print 'Quote: '+quote
			#print 'Content: '+cnw
			#print 'EndQuoteIndex for: '+endQuote
			#print endQuoteIndex
			
			if len( endQuoteIndex) == 1:
				return [endQuoteIndex[0]+len(endQuote),1,len(quote)]
			elif len(quoteIndex) == 1:
				return [len(quote)+quoteIndex[0],1,len(quote)]
			elif len(quoteIndex) == 0:
				sQuote = quote.split()
				for i in sQuote:
					try:
						tempA = [m.start() for m in re.finditer(i, cnw)]
						if len(tempA) == 1:
							#print 'new way to town'
							return [tempA[0]+len(quote.split(i,1)[1]),1,len(quote)]
					except:
						pass
			else:
				sQuote = quote.split()
				check = ''
				for i in sQuote:
					check +=str(str(i)+' ')
					try:
						tempA = [m.start() for m in re.finditer(check, cnw)]
						if len(tempA) == 1:
							return [len(quote)+tempA[0],1,len(quote)]
					except:
						pass
			
			endLoc = []
			for idx, k in enumerate(c):
				if endQuote in k:
					endLoc.append(idx)
			#print endLoc
			
			# Pick the occurrence whose word position is closest to the stored
			# end_index.
			place = 0
			m = 10000
			for idx, k in enumerate(endLoc):
				if m > abs(k - wordIndex):
					m = abs(k - wordIndex)
					place = idx
			#print "index "+str(place)
			#print len(cnw.rsplit(endQuote,len(endLoc)-place)[0])+len(endQuote)
			#print "EndWordIndex: "+str(wordIndex)
			#print "EndIndex: "+str(place)
			if len(endLoc) > 0 and place < len(endQuoteIndex):
				#return len(cnw.rsplit(endQuote,len(endLoc)-place)[0])+len(endQuote)
				return [endQuoteIndex[place] + len(endQuote), len(endQuoteIndex), len(quote)]
			
			print 'END_ERROR'
			return None
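# Neither AnnJSON nor getContentBySectionId() is defined in this excerpt, yet
# both offset helpers above rely on them as globals. A minimal sketch of that
# setup, with file names guessed from the other scripts in this listing:
import json

AnnJSON = json.load(open('annotations.json'))    # assumed: list of dicts with id, quote, section_id, start_index, end_index
ContentJSON = json.load(open('content.json'))    # written by the content script above


def getContentBySectionId(section_id):
	# Return the raw (still HTML-bearing) content for the given section id.
	for obj in ContentJSON:
		if obj['section_id'] == section_id:
			return obj['content']
	return ''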
Example #5
				holdString = item
			elif item.count("'")==0:
				open = False
				holdString = item
				
			if not open:
				first.append(holdString.replace('"', '\\"'))
				holdString = ""
		
		#print len(first)
		#print first
		
		for i in first:
			i = str(i).replace('"', '\\"')
			if '"' in i:
				print i
		
		dic = {}
		dic["id"] = int(first[0])
		dic["title"] = FCUtil.cleanStr(first[1])
		dic["author"] = first[2].lstrip().rstrip().rstrip('\'').lstrip('\'')
		dic["summary"] = first[3]
		dic["year"] = first[4]
		dic["page_views"] = first[5]
		dic["wordpress_url"] = first[6]
		dic["intro_essay"] =  FCUtil.cleanStr(first[7])
		dic["created_on"] = str(first[8]).rsplit(");")[0]
		output.write(json.dumps(dic))
		#print json.dumps(dic, sort_keys=True, indent=4)
output.write("]")
output.close()
import json
import re
import FCUtil

file = open('10.6.166.43.sql')
output = open('users_files.json', 'w')  # output path is missing in the source; the name follows the pattern of the other examples


output.write("[")
firstTime = True
for row in file:
	if "INSERT INTO `users_files`" in row:
		# A plain split(",") assumes none of the text columns contain commas.
		r = row.split("(", 1)[1].rsplit(");", 1)[0].split(",")
		if not firstTime:
			output.write(',')
		dic = {}
		dic["id"] = int(r[0])
		dic["user_id"] = int(r[1])
		dic["work_id"] = int(r[2])
		dic["name"] = FCUtil.cleanStr(r[3])
		dic["description"] = FCUtil.cleanStr(r[4])
		dic["file_location"] = FCUtil.cleanStr(r[5])
		dic["created_on"] = FCUtil.cleanStr(r[6])
		dic["deleted_on"] = FCUtil.cleanStr(r[7])
		firstTime = False
		print json.dumps(dic, sort_keys=True, indent=4)
		output.write(json.dumps(dic))

output.write("]")
			
output.close()
import json
import re
import FCUtil

file = open('10.6.166.43.sql')
output = open('annotation_link_relationships.json', 'w')


output.write("[")
firstTime = True
for row in file:
	if "INSERT INTO `annotation_link_relationships" in row:
		r = row.split("(", 1)[1].rsplit(");", 1)[0].split(",")
		if not firstTime:
			output.write(',')
		dic = {}
		dic["id"] = int(r[0])
		dic["title"] = FCUtil.cleanStr(r[1])
		print json.dumps(dic, sort_keys=True, indent=4)
		firstTime = False
		output.write(json.dumps(dic))

output.write("]")
			
output.close()
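# Quick sanity check (not part of the original scripts): each file written
# above should parse back as valid JSON once its writer has closed it.
# users_files.json is the output name assumed for the users_files script.
import json

names = ['annotation_links.json', 'content.json', 'users_files.json', 'annotation_link_relationships.json']
for name in names:
	try:
		print name, 'ok,', len(json.load(open(name))), 'rows'
	except (IOError, ValueError) as e:
		print name, 'failed:', e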