def crunch(worker,conv,workId): list = FCUtil.getAllAnnotationsforWork(workId) total = 0; badMatches = 0 #print 'this is the list '+str(list) #print "Total: %d"%len(list) et = time() st = time() for num in list: print "Annotation on: %d %0.2f%% Last one took: %.1f "%(num,(float((total+1))/float(len(list))*100.0),et-st) st = time() #conv.convert(num) total += 1 et = time() badMatches += conv.noMatches if badMatches != 0 and total !=0: p = (float(badMatches)/float(total))*100.0 print "No matches for %d : %4.2f" %(workId, p) print "No Matches for "+str(workId)+' : '+str(badMatches) print "Total for "+str(workId)+' : '+str(total)
def __init__(self, annDic):
    """Build an annotation record from the raw row dict `annDic`."""
    # Plain columns come straight off the dict.
    self.dic = annDic
    self.id = annDic['id']
    self.section_id = annDic['section_id']
    self.start_index = annDic['start_index']
    self.end_index = annDic['end_index']
    self.deleted = annDic['deleted_on']
    self.text = annDic['annotation']
    # The quote needs cleanup: remove special characters, trim leading
    # whitespace and a stray leading apostrophe, strip markup, normalise.
    raw_quote = FCUtil.removeSpecChar(annDic['quote']).lstrip().lstrip('\'')
    quote_soup = BeautifulSoup(raw_quote)
    self.quote = self.fixQuote(quote_soup.get_text())
def __init__(self, annDic, uWorker):
    """Build an annotation from raw row `annDic`, resolving the author's
    display name through the `uWorker` lookup helper."""
    self.uWorker = uWorker
    self.dic = annDic
    self.id = annDic['id']
    # SQL timestamp -> "YYYY-MM-DDTHH:MM:SSZ" (assumed UTC -- TODO confirm).
    self.date = annDic['created_on'].strip().replace(' ', 'T') + 'Z'
    self.user_id = annDic['user_id']
    self.user = uWorker.getUserName(self.user_id)
    self.section_id = annDic['section_id']
    self.start_index = annDic['start_index']
    self.end_index = annDic['end_index']
    # Quote and annotation body share the same cleanup pipeline:
    # special-char removal, trim, strip stray apostrophes, de-HTML, fixQuote.
    cleaned_quote = FCUtil.removeSpecChar(annDic['quote']).lstrip().lstrip('\'')
    self.quote = self.fixQuote(BeautifulSoup(cleaned_quote).get_text())
    cleaned_text = FCUtil.removeSpecChar(annDic['annotation']).strip().strip('\'')
    self.text = self.fixQuote(BeautifulSoup(cleaned_text).get_text())
    # The dump writes the literal string "NULL" for rows never deleted.
    self.deleted = "NULL" not in annDic['deleted_on']
def get_start(self,text): guess = FCUtil.wordcount_to_charcount(self.start_index,text) #matches = s.get_matching_blocks() #match = s.find_longest_match(0,len(self.quote),0,len(text)) matcher = diff_match_patch() m = matcher.match_main(text,self.quote,guess) #print text #print self.quote s = difflib.SequenceMatcher(None,self.quote,text[m:m+len(self.quote)]) r = s.ratio() if r < .9: print "Low Ratio %.2f %d " %(r,self.id) print "Quote: %s" %(self.quote) print "Text: %s" %(text[m:m+len(self.quote)]) if m == -1: print "matching error for: %d" %self.id self.start = m self.end = m+len(self.quote)
file = open('10.6.166.43.sql') output = open('content.json', 'w') import json import re import FCUtil firstTime = True output.write("[") for row in file: if row[:20] == "INSERT INTO `content": if not firstTime: output.write (str(',')) firstTime = False first = row.split("(", 1)[1].split(",", 2) dic = {} dic["id"] = int(first[0]) dic["section_id"] = int(first[1]) dic["content"] = FCUtil.cleanStr(str(str(first[2]).rsplit(");")[0])) output.write(json.dumps(dic)) print json.dumps(dic, sort_keys=True, indent=4) output.write("]") output.close()
file = open('10.6.166.43.sql') output = open('annotation_links.json', 'w') import json import re import FCUtil output.write("[") firstTime = True for row in file: if "INSERT INTO `annotation_links`" in row: r = row.split("(", 1)[1].rsplit(");",1)[0].split(",",4) dic = {} if not firstTime: output.write (str(',')) for item in r: dic["id"] = int(r[0]) dic["annotation_linker_id"] = int(r[1]) dic["linkee_type"] = FCUtil.cleanStr(r[2]) dic["linkee_id"] = int(r[3]) dic["reason"]= FCUtil.cleanStr(r[4].rsplit(',',1)[0]) dic["relationship"] = int(r[4].rsplit(',',1)[1]) print json.dumps(dic, sort_keys=True, indent=4) firstTime = False output.write(str(json.dumps(dic))) output.write("]") output.close()
def __init__(self):
    """Load the full user table once so later name lookups are in-memory."""
    super(UserLookup,self).__init__()
    # NOTE(review): assumes users.json sits in the working directory -- confirm.
    self.users = FCUtil.openJsonFile('users.json')
def __init__(self):
    """Load all annotations plus a user-name lookup helper, once up front."""
    # Original fragmentary comment read "# Why a" -- presumably questioning
    # the need for this super() call; left in place, behaviour unknown.
    super(Converter, self).__init__()
    # NOTE(review): assumes annotations.json is in the working directory.
    self.annJson = FCUtil.openJsonFile('annotations.json')
    self.uWorker = UserLookup()
def getStartOffsetFromAnnId(id):
    """Best-effort character offset of an annotation's quote START.

    Scans the module-level AnnJSON for annotation `id`, then tries a cascade
    of matching strategies against the section's stripped content.  Returns
    [offset, match_count] or [offset, match_count, quote_len]; None (after
    printing START_ERROR) when every strategy fails.

    NOTE(review): this body was reconstructed from a collapsed one-line
    source; the nesting chosen below is a best-effort guess -- confirm
    against the original file.
    """
    for obj in AnnJSON:
        if obj['id'] == id:
            wordIndex = obj['start_index']
            Oquote = obj['quote']
            content = getContentBySectionId(obj['section_id'])
            # Normalise the quote: strip markup/special chars, turn '/'
            # (plus surrounding whitespace) into a single space.
            quote = FCUtil.strip_html(FCUtil.remove_html_tags(FCUtil.cleanStr(Oquote)))
            quote = re.sub('\s*(/)\s*',' ',quote).strip()
            # NOTE(review): endQuote is never used in this function; it looks
            # copy/pasted from getEndOffsetFromAnnId (which takes [-1]).
            endQuote = quote.split()[0]
            if "'" in quote[:1]:
                quote = quote[1:]
            startQuote = quote.split()[0]  # first word of the quote
            #print "quote " + quote
            cw = FCUtil.strip_html(FCUtil.remove_html_tags(content))
            cnw = FCUtil.strip_html(content)
            c = cw.split()
            # Offsets of every occurrence of the first word / whole quote.
            startQuoteIndex = [m.start() for m in re.finditer(startQuote.replace('[','\[').replace(']','\]'), cnw)]
            quoteIndex = [m.start() for m in re.finditer(quote, cnw)]
            if len( startQuoteIndex) == 1:
                # Unambiguous: the first word occurs exactly once.
                return [startQuoteIndex[0],1]
            elif len(quoteIndex) == 1:
                # The whole quote occurs exactly once.
                return [quoteIndex[0],1]
            elif len(quoteIndex) == 0:
                # Quote never matches verbatim: find any single word of it
                # that is unique in the content, then back the offset out by
                # the length of the quote text preceding that word.
                sQuote = quote.split()
                print sQuote
                for i in sQuote:
                    try:
                        # re.finditer can raise on words containing regex
                        # metacharacters -- swallowed deliberately.
                        tempA = [m.start() for m in re.finditer(i, cnw)]
                        if len(tempA) == 1:
                            return [tempA[0]-len(quote.split(i,1)[0]),1]
                    except:
                        pass
            else:
                # Quote matches several times: grow a word-prefix until it
                # is unique in the content.  NOTE(review): placing the try
                # inside the loop (incremental prefixes) is an assumption
                # from the collapsed source.
                sQuote = quote.split()
                check = ''
                for i in sQuote:
                    check +=str(str(i)+' ')
                    try:
                        tempA = [m.start() for m in re.finditer(check, cnw)]
                        if len(tempA) == 1:
                            return [tempA[0],1]
                    except:
                        pass
            if len(quoteIndex) > 1:
                #some how get word count to
                print quoteIndex
                # Fall back: choose the occurrence whose WORD index is
                # closest to the recorded start_index.
                startLoc = []
                for idx, k in enumerate(c):
                    if startQuote in k:
                        startLoc.append(idx)
                place = 0
                m = 10000  # sentinel "infinite" distance
                www = 0    # NOTE(review): unused
                for idx,k in enumerate(startLoc):
                    if(m > abs(k-wordIndex)):
                        m = abs(k-wordIndex)
                        place = idx
                if(len(startLoc) > 0):
                    # NOTE(review): `place` indexes startLoc but is applied to
                    # startQuoteIndex; the two lists can differ in length, so
                    # this may IndexError or pick the wrong occurrence --
                    # confirm intent (cf. the commented-out alternative).
                    #return len(cnw.split(startQuote,len(startLoc)-place-1)[0])
                    return [startQuoteIndex[place],len(startQuoteIndex),len(quote)]
    print 'START_ERROR'
    return None
def getEndOffsetFromAnnId(id):
    """Best-effort character offset of an annotation's quote END.

    Mirror of getStartOffsetFromAnnId: scans AnnJSON for annotation `id`
    and tries a cascade of match strategies.  Returns
    [offset, match_count, quote_len]; None (after printing END_ERROR) when
    all strategies fail.

    NOTE(review): body reconstructed from a collapsed one-line source; the
    nesting is a best-effort guess -- confirm against the original file.
    """
    for obj in AnnJSON:
        if obj['id'] == id:
            wordIndex = obj['end_index']
            Oquote = obj['quote']
            # Normalise the quote; unlike the start variant, '/' is removed
            # outright rather than replaced by a space.
            quote = FCUtil.strip_html(FCUtil.remove_html_tags(FCUtil.cleanStr(Oquote)))
            quote = re.sub('\s*(/)\s*','',quote).strip()
            endQuote = quote.rsplit()[-1]  # last word of the quote
            if "'" in quote[:1]:
                quote = quote[1:]
            #print "quote " + quote
            content = getContentBySectionId(obj['section_id'])
            cw = FCUtil.strip_html(FCUtil.remove_html_tags(content)) #possible use of beautifulsoup
            cnw = FCUtil.strip_html(content)
            c = cw.split()
            # Offsets of every occurrence of the last word / whole quote.
            endQuoteIndex = [m.start() for m in re.finditer(endQuote.replace('[','\[').replace(']','\]'), cnw)]
            quoteIndex = [m.start() for m in re.finditer(quote, cnw)]
            if len( endQuoteIndex) == 1:
                # Unambiguous: last word occurs exactly once; end = its
                # start offset plus its own length.
                return [endQuoteIndex[0]+len(endQuote),1,len(quote)]
            elif len(quoteIndex) == 1:
                # Whole quote occurs exactly once.
                return [len(quote)+quoteIndex[0],1,len(quote)]
            elif len(quoteIndex) == 0:
                # Quote never matches verbatim: find a unique single word and
                # extend by the quote text that follows it.
                sQuote = quote.split()
                for i in sQuote:
                    try:
                        tempA = [m.start() for m in re.finditer(i, cnw)]
                        if len(tempA) == 1:
                            return [tempA[0]+len(quote.split(i,1)[1]),1,len(quote)]
                    except:
                        pass
            else:
                # Several full matches: grow a word-prefix until unique.
                # NOTE(review): try-inside-loop placement is an assumption.
                sQuote = quote.split()
                check = ''
                for i in sQuote:
                    check +=str(str(i)+' ')
                    try:
                        tempA = [m.start() for m in re.finditer(check, cnw)]
                        if len(tempA) == 1:
                            return [len(quote)+tempA[0],1,len(quote)]
                    except:
                        pass
            # Fall back: occurrence of the last word whose WORD index is
            # closest to the recorded end_index.
            endLoc = []
            for idx, k in enumerate(c):
                if endQuote in k:
                    endLoc.append(idx)
            place = 0
            m = 10000  # sentinel "infinite" distance
            for idx,k in enumerate(endLoc):
                if(m > abs(k-wordIndex)):
                    m = abs(k-wordIndex)
                    place = idx
            if len(endLoc) >0:
                # NOTE(review): `place` indexes endLoc but is applied to
                # endQuoteIndex; lengths can differ, risking IndexError or a
                # wrong pick (cf. the commented-out alternative) -- confirm.
                #return len(cnw.rsplit(endQuote,len(endLoc)-place)[0])+len(endQuote)
                return [endQuoteIndex[place]+len(endQuote),len(endQuoteIndex),len(quote)]
    print 'END_ERROR'
    return None
# --- FRAGMENT: the opening of this parser (the dump/output file handles, the
# row loop, and the start of the quote-tracking if/elif chain) lies outside
# this chunk.  The first line below continues a branch of that chain.
# NOTE(review): every indentation level here is a best-effort reconstruction
# from a collapsed source line -- confirm nesting against the original. ---
            holdString = item
        # NOTE(review): `open` shadows the builtin -- apparently a flag that
        # tracks whether we are inside an unterminated quoted field.
        elif item.count("'")==0:
            open = False
            holdString = item
        # Field complete: stash it with inner double-quotes escaped.
        if not open:
            first.append(holdString.replace('"', '\\"'))
            holdString = ""
    #print len(first)
    #print first
    # Debug pass: report any field still containing a raw double-quote.
    # NOTE(review): rebinding `i` inside the loop does not modify `first`.
    for i in first:
        i = str(i).replace('"', '\\"')
        if '"' in i:
            print i
    # Assemble one `works` row as a JSON object.
    dic = {}
    dic["id"] = int(first[0])
    dic["title"] = FCUtil.cleanStr(first[1])
    dic["author"] = first[2].lstrip().rstrip().rstrip('\'').lstrip('\'')
    dic["summary"] = first[3]
    dic["year"] = first[4]
    dic["page_views"] = first[5]
    dic["wordpress_url"] = first[6]
    dic["intro_essay"] = FCUtil.cleanStr(first[7])
    dic["created_on"] = str(first[8]).rsplit(");")[0]
    output.write(json.dumps(dic))
    #print json.dumps(dic, sort_keys=True, indent=4)
# Close the JSON array and the output file.
output.write("]")
output.close()
import json import re import FCUtil output.write("[") firstTime = True for row in file: if "INSERT INTO `users_files`" in row: r = row.split("(", 1)[1].rsplit(");",1)[0].split(",") dic = {} if not firstTime: output.write (str(',')) for item in r: dic["id"] = int(r[0]) dic["user_id"] = int(r[1]) dic["work_id"] = int(r[2]) dic["name"] = str(FCUtil.cleanStr(r[3])) dic["description"] = str(FCUtil.cleanStr(r[4])) dic["file_location"] = str(FCUtil.cleanStr(r[5])) dic["created_on"] = str(FCUtil.cleanStr(r[6])) dic["deleted_on"] = str(FCUtil.cleanStr(r[7])) print json.dumps(dic, sort_keys=True, indent=4) firstTime = False print json.dumps(dic, sort_keys=True, indent=4) output.write(str(json.dumps(dic))) output.write("]") output.close()
file = open('10.6.166.43.sql') output = open('annotation_link_relationships.json', 'w') import json import re import FCUtil output.write("[") firstTime = True for row in file: if "INSERT INTO `annotation_link_relationships" in row: r = row.split("(", 1)[1].rsplit(");",1)[0].split(",") dic = {} if not firstTime: output.write (str(',')) for item in r: dic["id"] = int(r[0]) dic["title"] = FCUtil.cleanStr(r[1]) print json.dumps(dic, sort_keys=True, indent=4) firstTime = False output.write(str(json.dumps(dic))) output.write("]") output.close()
p = (float(badMatches)/float(total))*100.0 print "No matches for %d : %4.2f" %(workId, p) print "No Matches for "+str(workId)+' : '+str(badMatches) print "Total for "+str(workId)+' : '+str(total) #workNumber = 112 #print "Work: %d\n" %workNumber #crunch(workNumber) #annNumber = 1762 #print 'Annotation: %d\n'%annNumber #conv = converter() #conv.convert(annNumber) works = FCUtil.getAllWorksExcluding(['William Shakespeare']) #works = [61] totalWorks = len(works) annotationsbyworkid = {} count = 0 worker = createhtml.creator() conv= converter(worker) for wid in works: try: print '\nCurrently on Work %d %00.2f%%' %(wid,((float(count)/float(totalWorks)))*100.0) except: print '\nCurrently on Work %d' &wid worker.makehtml(wid) #crunch(worker,conv,wid) count+=1