import json
import re
import FCUtil

# Convert the `annotation_links` INSERT rows of the MySQL dump into a JSON array.
file = open('10.6.166.43.sql')
output = open('annotation_links.json', 'w')

output.write("[")
firstTime = True
for row in file:
    if "INSERT INTO `annotation_links`" in row:
        # keep the value list between "(" and ");", splitting off the first four columns;
        # the last chunk still holds the reason and relationship columns
        r = row.split("(", 1)[1].rsplit(");", 1)[0].split(",", 4)
        if not firstTime:
            output.write(',')
        dic = {}
        dic["id"] = int(r[0])
        dic["annotation_linker_id"] = int(r[1])
        dic["linkee_type"] = FCUtil.cleanStr(r[2])
        dic["linkee_id"] = int(r[3])
        dic["reason"] = FCUtil.cleanStr(r[4].rsplit(',', 1)[0])
        dic["relationship"] = int(r[4].rsplit(',', 1)[1])
        print json.dumps(dic, sort_keys=True, indent=4)
        firstTime = False
        output.write(json.dumps(dic))
output.write("]")
output.close()
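# --- Note on FCUtil (an assumption, not part of the original scripts): FCUtil is this
# repo's own helper module and its source is not included here. For reading the
# converters, cleanStr is assumed to behave roughly like the stand-in below: trim
# whitespace, drop the surrounding SQL single quotes, and unescape backslash-escaped
# quotes. The real FCUtil.cleanStr may differ.
def cleanStr_standin(value):
    # NOTE: stand-in for illustration only; not the real FCUtil.cleanStr
    s = str(value).strip()
    if s.startswith("'") and s.endswith("'"):
        s = s[1:-1]
    return s.replace("\\'", "'").replace('\\"', '"')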
import json
import re
import FCUtil

# Convert the `content` INSERT rows of the MySQL dump into a JSON array.
file = open('10.6.166.43.sql')
output = open('content.json', 'w')

firstTime = True
output.write("[")
for row in file:
    if row[:20] == "INSERT INTO `content":
        if not firstTime:
            output.write(',')
        firstTime = False
        # keep the value list after "(": id, section_id, then everything up to ");"
        # is the content column itself
        first = row.split("(", 1)[1].split(",", 2)
        dic = {}
        dic["id"] = int(first[0])
        dic["section_id"] = int(first[1])
        dic["content"] = FCUtil.cleanStr(first[2].rsplit(");")[0])
        output.write(json.dumps(dic))
        print json.dumps(dic, sort_keys=True, indent=4)
output.write("]")
output.close()
# Map an annotation id to the character offset where its quote starts
# inside the stripped section content.
def getStartOffsetFromAnnId(id):
    for obj in AnnJSON:
        if obj['id'] == id:
            wordIndex = obj['start_index']
            Oquote = obj['quote']
            content = getContentBySectionId(obj['section_id'])
            quote = FCUtil.strip_html(FCUtil.remove_html_tags(FCUtil.cleanStr(Oquote)))
            quote = re.sub(r'\s*(/)\s*', ' ', quote).strip()
            if "'" in quote[:1]:
                quote = quote[1:]
            startQuote = quote.split()[0]
            cw = FCUtil.strip_html(FCUtil.remove_html_tags(content))
            cnw = FCUtil.strip_html(content)
            c = cw.split()
            startQuoteIndex = [m.start() for m in re.finditer(startQuote.replace('[', r'\[').replace(']', r'\]'), cnw)]
            quoteIndex = [m.start() for m in re.finditer(quote, cnw)]
            if len(startQuoteIndex) == 1:
                # the first word of the quote occurs exactly once
                return [startQuoteIndex[0], 1]
            elif len(quoteIndex) == 1:
                # the entire quote occurs exactly once
                return [quoteIndex[0], 1]
            elif len(quoteIndex) == 0:
                # no full-quote match: look for a single word of the quote that is unique,
                # then back up by the length of the text preceding it inside the quote
                sQuote = quote.split()
                print sQuote
                for i in sQuote:
                    try:
                        tempA = [m.start() for m in re.finditer(i, cnw)]
                        if len(tempA) == 1:
                            return [tempA[0] - len(quote.split(i, 1)[0]), 1]
                    except:
                        pass
            else:
                # several full-quote matches: grow a prefix of the quote word by word
                # until it matches only once
                sQuote = quote.split()
                check = ''
                for i in sQuote:
                    check += str(i) + ' '
                    try:
                        tempA = [m.start() for m in re.finditer(check, cnw)]
                        if len(tempA) == 1:
                            return [tempA[0], 1]
                    except:
                        pass
            if len(quoteIndex) > 1:
                # last resort: pick the occurrence of the first quote word whose word
                # position is closest to the stored start_index
                print quoteIndex
                startLoc = []
                for idx, k in enumerate(c):
                    if startQuote in k:
                        startLoc.append(idx)
                place = 0
                m = 10000
                for idx, k in enumerate(startLoc):
                    if m > abs(k - wordIndex):
                        m = abs(k - wordIndex)
                        place = idx
                if len(startLoc) > 0:
                    return [startQuoteIndex[place], len(startQuoteIndex), len(quote)]
            print 'START_ERROR'
            return None
# Map an annotation id to the character offset where its quote ends
# inside the stripped section content.
def getEndOffsetFromAnnId(id):
    for obj in AnnJSON:
        if obj['id'] == id:
            wordIndex = obj['end_index']
            Oquote = obj['quote']
            quote = FCUtil.strip_html(FCUtil.remove_html_tags(FCUtil.cleanStr(Oquote)))
            quote = re.sub(r'\s*(/)\s*', '', quote).strip()
            endQuote = quote.rsplit()[-1]
            if "'" in quote[:1]:
                quote = quote[1:]
            content = getContentBySectionId(obj['section_id'])
            cw = FCUtil.strip_html(FCUtil.remove_html_tags(content))  # possible use of BeautifulSoup instead
            cnw = FCUtil.strip_html(content)
            c = cw.split()
            endQuoteIndex = [m.start() for m in re.finditer(endQuote.replace('[', r'\[').replace(']', r'\]'), cnw)]
            quoteIndex = [m.start() for m in re.finditer(quote, cnw)]
            if len(endQuoteIndex) == 1:
                # the last word of the quote occurs exactly once
                return [endQuoteIndex[0] + len(endQuote), 1, len(quote)]
            elif len(quoteIndex) == 1:
                # the entire quote occurs exactly once
                return [len(quote) + quoteIndex[0], 1, len(quote)]
            elif len(quoteIndex) == 0:
                # no full-quote match: look for a single word of the quote that is unique,
                # then move forward by the length of the text following it inside the quote
                sQuote = quote.split()
                for i in sQuote:
                    try:
                        tempA = [m.start() for m in re.finditer(i, cnw)]
                        if len(tempA) == 1:
                            return [tempA[0] + len(quote.split(i, 1)[1]), 1, len(quote)]
                    except:
                        pass
            else:
                # several full-quote matches: grow a prefix of the quote word by word
                # until it matches only once
                sQuote = quote.split()
                check = ''
                for i in sQuote:
                    check += str(i) + ' '
                    try:
                        tempA = [m.start() for m in re.finditer(check, cnw)]
                        if len(tempA) == 1:
                            return [len(quote) + tempA[0], 1, len(quote)]
                    except:
                        pass
            # last resort: pick the occurrence of the last quote word whose word
            # position is closest to the stored end_index
            endLoc = []
            for idx, k in enumerate(c):
                if endQuote in k:
                    endLoc.append(idx)
            place = 0
            m = 10000
            for idx, k in enumerate(endLoc):
                if m > abs(k - wordIndex):
                    m = abs(k - wordIndex)
                    place = idx
            if len(endLoc) > 0:
                return [endQuoteIndex[place] + len(endQuote), len(endQuoteIndex), len(quote)]
            print 'END_ERROR'
            return None
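# --- Usage sketch (not part of the original scripts; an illustrative assumption).
# It only combines the two lookups above into a single character range per
# annotation; AnnJSON, getContentBySectionId and the annotation ids are assumed
# to be loaded elsewhere in this repo, exactly as the functions above already expect.
def getOffsetsFromAnnId(id):
    start = getStartOffsetFromAnnId(id)
    end = getEndOffsetFromAnnId(id)
    if start is None or end is None:
        return None
    # the first element of each result is the character offset into the stripped content
    return (start[0], end[0])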
# (fragment: the opening of this converter, which reads the dump row and starts the
#  quote-aware accumulation of its columns into `first`, is missing here)
                holdString = item
            elif item.count("'") == 0:
                open = False
                holdString = item
            if not open:
                first.append(holdString.replace('"', '\\"'))
                holdString = ""
        for i in first:
            i = str(i).replace('"', '\\"')
            if '"' in i:
                print i
        dic = {}
        dic["id"] = int(first[0])
        dic["title"] = FCUtil.cleanStr(first[1])
        dic["author"] = first[2].strip().strip("'")
        dic["summary"] = first[3]
        dic["year"] = first[4]
        dic["page_views"] = first[5]
        dic["wordpress_url"] = first[6]
        dic["intro_essay"] = FCUtil.cleanStr(first[7])
        dic["created_on"] = str(first[8]).rsplit(");")[0]
        output.write(json.dumps(dic))
output.write("]")
output.close()
import json
import re
import FCUtil

# Convert the `users_files` INSERT rows of the MySQL dump into a JSON array.
# (The two open() calls are not in the surviving fragment; the input dump and the
#  users_files.json output name are assumed from the pattern of the other converters.)
file = open('10.6.166.43.sql')
output = open('users_files.json', 'w')

output.write("[")
firstTime = True
for row in file:
    if "INSERT INTO `users_files`" in row:
        r = row.split("(", 1)[1].rsplit(");", 1)[0].split(",")
        if not firstTime:
            output.write(',')
        dic = {}
        dic["id"] = int(r[0])
        dic["user_id"] = int(r[1])
        dic["work_id"] = int(r[2])
        dic["name"] = FCUtil.cleanStr(r[3])
        dic["description"] = FCUtil.cleanStr(r[4])
        dic["file_location"] = FCUtil.cleanStr(r[5])
        dic["created_on"] = FCUtil.cleanStr(r[6])
        dic["deleted_on"] = FCUtil.cleanStr(r[7])
        print json.dumps(dic, sort_keys=True, indent=4)
        firstTime = False
        output.write(json.dumps(dic))
output.write("]")
output.close()
import json
import re
import FCUtil

# Convert the `annotation_link_relationships` INSERT rows of the MySQL dump into a JSON array.
file = open('10.6.166.43.sql')
output = open('annotation_link_relationships.json', 'w')

output.write("[")
firstTime = True
for row in file:
    if "INSERT INTO `annotation_link_relationships" in row:
        r = row.split("(", 1)[1].rsplit(");", 1)[0].split(",")
        if not firstTime:
            output.write(',')
        dic = {}
        dic["id"] = int(r[0])
        dic["title"] = FCUtil.cleanStr(r[1])
        print json.dumps(dic, sort_keys=True, indent=4)
        firstTime = False
        output.write(json.dumps(dic))
output.write("]")
output.close()
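# --- Design note (a sketch, not one of the original converters): the manual
# "[" / "," / "]" bookkeeping used above can also be avoided by collecting the row
# dicts in a list and serialising once with json.dump. The function name and
# parameters below are hypothetical; the table and fields just reuse
# annotation_link_relationships as an example.
import json
import FCUtil

def dump_annotation_link_relationships(sql_path, json_path):
    rows = []
    for row in open(sql_path):
        if "INSERT INTO `annotation_link_relationships" in row:
            r = row.split("(", 1)[1].rsplit(");", 1)[0].split(",")
            rows.append({"id": int(r[0]), "title": FCUtil.cleanStr(r[1])})
    with open(json_path, 'w') as out:
        json.dump(rows, out)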