def getStartOffsetFromAnnId(id):
    """Find the character offset at which an annotation's quote starts.

    The annotation whose ``'id'`` equals *id* is looked up in ``AnnJSON``.
    Its ``'quote'`` is cleaned with the ``FCUtil`` helpers, any ``/``
    (with surrounding whitespace) is replaced by a single space, and one
    leading apostrophe is dropped.  The quote is then searched for, as
    literal text, in the ``FCUtil.strip_html`` version of the section
    content, trying these strategies in order:

    1. the first word of the quote occurs exactly once;
    2. the whole quote occurs exactly once;
    3. the whole quote is absent: anchor on the first quote word that
       occurs exactly once and step back by the length of the quote text
       that precedes that word;
    4. the whole quote occurs several times: grow a prefix of the quote
       word by word (each word followed by one space) until a prefix
       occurs exactly once;
    5. fallback: among the whitespace-separated words of the tag-removed
       content that contain the first quote word, pick the one whose word
       position is closest to the annotation's ``'start_index'`` and use
       the first-word occurrence of the same rank.

    Returns:
        ``[offset, 1]`` for strategies 1-4,
        ``[offset, number_of_first_word_occurrences, len(quote)]`` for the
        fallback, or ``None`` (after printing ``START_ERROR``) when no
        strategy produced an offset or no annotation matched *id*.
    """
    def find_all(needle, haystack):
        # re.escape makes the search literal: quote text routinely contains
        # regex metacharacters such as ( ) ? . which must not be interpreted.
        return [hit.start()
                for hit in re.finditer(re.escape(needle), haystack)]

    for obj in AnnJSON:
        if obj['id'] != id:
            continue

        # assumes 'start_index' is a numeric word position -- TODO confirm
        word_index = obj['start_index']
        raw_quote = obj['quote']
        content = getContentBySectionId(obj['section_id'])

        quote = FCUtil.strip_html(
            FCUtil.remove_html_tags(FCUtil.cleanStr(raw_quote)))
        quote = re.sub(r'\s*/\s*', ' ', quote).strip()
        if quote[:1] == "'":
            quote = quote[1:]

        quote_words = quote.split()
        if not quote_words:
            # Nothing to search for; treat as "not found" instead of
            # crashing on quote_words[0].
            continue
        start_word = quote_words[0]

        content_words = FCUtil.strip_html(
            FCUtil.remove_html_tags(content)).split()
        text = FCUtil.strip_html(content)

        start_hits = find_all(start_word, text)
        quote_hits = find_all(quote, text)

        if len(start_hits) == 1:
            return [start_hits[0], 1]
        if len(quote_hits) == 1:
            return [quote_hits[0], 1]

        if not quote_hits:
            # Whole quote not present: anchor on any word that is unique
            # in the text and rewind to where the quote would begin.
            for word in quote_words:
                hits = find_all(word, text)
                if len(hits) == 1:
                    return [hits[0] - len(quote.partition(word)[0]), 1]
        else:
            # Whole quote present more than once: look for a unique
            # space-terminated prefix of the quote.
            prefix = ''
            for word in quote_words:
                prefix += word + ' '
                hits = find_all(prefix, text)
                if len(hits) == 1:
                    return [hits[0], 1]

        # Fallback: disambiguate by word position.
        candidates = [pos for pos, token in enumerate(content_words)
                      if start_word in token]
        if candidates:
            # min() keeps the first of several equally close candidates.
            place = min(range(len(candidates)),
                        key=lambda rank: abs(candidates[rank] - word_index))
            # The rank comes from the word list but is applied to the
            # character-offset list; the two can differ in length, so
            # guard the lookup instead of raising IndexError.
            if place < len(start_hits):
                return [start_hits[place], len(start_hits), len(quote)]

    print('START_ERROR')
    return None
def getEndOffsetFromAnnId(id):
    """Find the character offset just past the end of an annotation's quote.

    The annotation whose ``'id'`` equals *id* is looked up in ``AnnJSON``.
    Its ``'quote'`` is cleaned with the ``FCUtil`` helpers, any ``/``
    (with surrounding whitespace) is removed, and one leading apostrophe
    is dropped.  The quote is then searched for, as literal text, in the
    ``FCUtil.strip_html`` version of the section content, trying these
    strategies in order:

    1. the last word of the quote occurs exactly once: offset is the end
       of that occurrence;
    2. the whole quote occurs exactly once: offset is the end of it;
    3. the whole quote is absent: anchor on the first quote word that
       occurs exactly once and add the length of the quote text that
       follows that word;
    4. the whole quote occurs several times: grow a prefix of the quote
       word by word (each word followed by one space) until a prefix
       occurs exactly once, then add ``len(quote)``;
    5. fallback: among the whitespace-separated words of the tag-removed
       content that contain the last quote word, pick the one whose word
       position is closest to the annotation's ``'end_index'`` and use the
       end of the last-word occurrence of the same rank.

    Returns:
        ``[offset, 1, len(quote)]`` for strategies 1-4,
        ``[offset, number_of_last_word_occurrences, len(quote)]`` for the
        fallback, or ``None`` (after printing ``END_ERROR``) when no
        strategy produced an offset or no annotation matched *id*.
    """
    def find_all(needle, haystack):
        # re.escape makes the search literal: quote text routinely contains
        # regex metacharacters such as ( ) ? . which must not be interpreted.
        return [hit.start()
                for hit in re.finditer(re.escape(needle), haystack)]

    for obj in AnnJSON:
        if obj['id'] != id:
            continue

        # assumes 'end_index' is a numeric word position -- TODO confirm
        word_index = obj['end_index']
        raw_quote = obj['quote']

        quote = FCUtil.strip_html(
            FCUtil.remove_html_tags(FCUtil.cleanStr(raw_quote)))
        # NOTE(review): getStartOffsetFromAnnId substitutes a space here,
        # this function substitutes nothing -- confirm the difference is
        # intentional.
        quote = re.sub(r'\s*/\s*', '', quote).strip()

        words_before_trim = quote.split()
        if not words_before_trim:
            # Nothing to search for; treat as "not found" instead of
            # crashing on the last-word lookup.
            continue
        # The last word is taken before the leading apostrophe is removed.
        end_word = words_before_trim[-1]
        if quote[:1] == "'":
            quote = quote[1:]

        content = getContentBySectionId(obj['section_id'])
        content_words = FCUtil.strip_html(
            FCUtil.remove_html_tags(content)).split()
        text = FCUtil.strip_html(content)

        end_hits = find_all(end_word, text)
        quote_hits = find_all(quote, text)

        if len(end_hits) == 1:
            return [end_hits[0] + len(end_word), 1, len(quote)]
        if len(quote_hits) == 1:
            return [quote_hits[0] + len(quote), 1, len(quote)]

        if not quote_hits:
            # Whole quote not present: anchor on any word that is unique
            # in the text and advance by what follows it in the quote.
            for word in quote.split():
                hits = find_all(word, text)
                if len(hits) == 1:
                    # NOTE(review): the anchor word's own length is not
                    # added, so this lands len(word) short of the quote's
                    # end -- kept as-is, confirm whether callers rely on it.
                    return [hits[0] + len(quote.partition(word)[2]),
                            1, len(quote)]
        else:
            # Whole quote present more than once: look for a unique
            # space-terminated prefix of the quote.
            prefix = ''
            for word in quote.split():
                prefix += word + ' '
                hits = find_all(prefix, text)
                if len(hits) == 1:
                    return [hits[0] + len(quote), 1, len(quote)]

        # Fallback: disambiguate by word position.
        candidates = [pos for pos, token in enumerate(content_words)
                      if end_word in token]
        if candidates:
            # min() keeps the first of several equally close candidates.
            place = min(range(len(candidates)),
                        key=lambda rank: abs(candidates[rank] - word_index))
            # The rank comes from the word list but is applied to the
            # character-offset list; the two can differ in length, so
            # guard the lookup instead of raising IndexError.
            if place < len(end_hits):
                return [end_hits[place] + len(end_word),
                        len(end_hits), len(quote)]

    print('END_ERROR')
    return None