def fuzzymatch(string1):
    """Return the entities whose label fuzzily matches ``string1``.

    Each ``Entity`` in the session is compared with ``string1`` by running
    the PHP function ``fuzzy_match`` from ``fuzzymatch.php``.  Entities that
    score 0.5 or higher are collected; the raw PHP output is stored on each
    of them as ``entity.matchvalue``.

    NOTE: fuzzymatch.php must be in the PHP include path, e.g. /usr/lib/php/.
    TODO: put in a cron job that runs every half hour for new entries?

    :param string1: unicode text to compare against every entity label.
    :returns: list of the matching ``Entity`` objects.
    :raises ValueError: if the PHP output does not start with a number.
    """
    def php_literal(text):
        # Render text as a double-quoted PHP string literal.  Backslash must
        # be escaped first so the escapes added for '"' and '$' are not
        # themselves doubled.  Escaping after UTF-8 encoding is safe because
        # these three ASCII bytes never occur inside a multi-byte sequence.
        encoded = text.encode('utf8')
        for special in ('\\', '"', '$'):
            encoded = encoded.replace(special, '\\' + special)
        return '"' + encoded + '"'

    # The search string is the same for every entity, so quote it once.
    needle = '$string1 = utf8_decode(' + php_literal(string1) + ');'

    matches = []
    for entity in Session.query(Entity):
        php = PHP("require 'fuzzymatch.php';")
        code = (needle
                + '$string2 = utf8_decode('
                + php_literal(entity.label) + ');'
                + 'print fuzzy_match($string1, $string2, 2);')
        verdict = php.get_raw(code)
        # fuzzymatchall() parses the output of this same PHP call as
        # "strength,edits"; take the first field so both a bare number and
        # the comma-separated form are accepted.
        if float(verdict.split(',')[0]) >= .5:
            entity.matchvalue = verdict
            matches.append(entity)
    return matches
def fuzzymatch(string1):
    """Collect every entity whose label is a fuzzy match for ``string1``.

    Runs the PHP function ``fuzzy_match`` (from ``fuzzymatch.php``) once per
    ``Entity`` in the session.  An entity is kept when its score is at least
    0.5, and the raw PHP output is saved on it as ``entity.matchvalue``.

    NOTE: fuzzymatch.php must be in the PHP include path, e.g. /usr/lib/php/.
    TODO: put in a cron job that runs every half hour for new entries?

    :param string1: unicode text to compare against every entity label.
    :returns: list of the matching ``Entity`` objects.
    :raises ValueError: if the PHP output does not start with a number.
    """
    def php_literal(text):
        # Build a double-quoted PHP string literal from unicode text.
        # '\\' is handled first so later escapes are not doubled; '"' would
        # end the literal and '$' would trigger variable interpolation.
        # All three are ASCII, so escaping the UTF-8 bytes is safe.
        encoded = text.encode('utf8')
        for special in ('\\', '"', '$'):
            encoded = encoded.replace(special, '\\' + special)
        return '"' + encoded + '"'

    # Loop-invariant: the PHP assignment for the search string.
    needle = '$string1 = utf8_decode(' + php_literal(string1) + ');'

    matches = []
    for entity in Session.query(Entity):
        php = PHP("require 'fuzzymatch.php';")
        code = (needle
                + '$string2 = utf8_decode('
                + php_literal(entity.label) + ');'
                + 'print fuzzy_match($string1, $string2, 2);')
        verdict = php.get_raw(code)
        # The same PHP call is parsed as "strength,edits" in
        # fuzzymatchall(); reading only the first field keeps float() from
        # failing on that form while still accepting a bare number.
        if float(verdict.split(',')[0]) >= .5:
            entity.matchvalue = verdict
            matches.append(entity)
    return matches
def fuzzymatchtest(string1, string2):
    """Run the PHP ``fuzzy_match`` on one pair of strings.

    NOTE: fuzzymatch.php must be in the PHP include path, e.g. /usr/lib/php/.

    :param string1: first string; inserted into the PHP code as given.
    :param string2: second string; encoded to latin-1 (unencodable
        characters replaced) before being inserted into the PHP code.
    :returns: whatever ``php.get_raw`` returns for the generated code.
    """
    def php_literal(text):
        # Single-quoted PHP literals treat only backslash and the single
        # quote specially; escape the backslash first so the escape added
        # for the quote is not doubled.
        return "'" + text.replace('\\', '\\\\').replace("'", "\\'") + "'"

    php = PHP("require 'fuzzymatch.php';")
    code = "$string1 = " + php_literal(string1) + ";"
    code = code + "$string2 = " + php_literal(
        string2.encode('latin-1', 'replace')) + ";"
    code = code + "print fuzzy_match($string1, $string2, 2);"
    return php.get_raw(code)
def fuzzymatchall(SEPEntrieslist):
    """Rebuild the fuzzy-match table for the given SEP entries.

    Takes the output of addlist().  All existing ``Fuzzymatch`` rows are
    deleted first, otherwise old matches would accumulate.  Each entry's
    title is then compared, through the PHP function ``fuzzy_match`` from
    ``fuzzymatch.php``, with the label of every ``Entity`` whose ``typeID``
    is neither 2 nor 4 (journals and nodes are excluded).  The PHP output is
    read as "strength,edits"; a ``Fuzzymatch`` is attached to the entry for
    every entity whose strength is at least 0.20.

    TODO: change so that it only updates entries that do not currently have
    a fuzzy match list.

    :param SEPEntrieslist: iterable of SEP entry objects providing
        ``title``, ``sep_dir`` and ``fmatches``.
    :raises ValueError: if the PHP output does not start with a number.
    :raises IndexError: if a qualifying PHP output has no second field.
    """
    def php_literal(text):
        # Render text as a double-quoted PHP string literal.  Backslash is
        # escaped first so the escapes added for '"' and '$' are not
        # doubled.  All three are ASCII, so escaping after UTF-8 encoding
        # cannot corrupt a multi-byte character.
        encoded = text.encode('utf8')
        for special in ('\\', '"', '$'):
            encoded = encoded.replace(special, '\\' + special)
        return '"' + encoded + '"'

    # Clear out the fuzzymatch table before recomputing.
    Session.query(Fuzzymatch).delete()
    Session.flush()
    Session.commit()

    # The filtered query is the same for every entry; iterating it inside
    # the loop still runs it afresh each time.
    candidates = Session.query(Entity)
    candidates = candidates.filter(Entity.typeID != 2)
    candidates = candidates.filter(Entity.typeID != 4)

    for SEPEntry in SEPEntrieslist:
        print("working on " + SEPEntry.title.encode('utf-8') + "\n")

        # The title does not change across entities, so quote it once.
        needle = ('$string1 = utf8_decode('
                  + php_literal(SEPEntry.title) + ');')

        for entity in candidates:
            # NOTE(review): a PHP("set_include_path('/usr/lib/php/');")
            # instance used to be created here and was overwritten on the
            # next statement, so the include path was never applied.  The
            # unused instance has been dropped; fuzzymatch.php must still
            # be reachable through PHP's configured include path.
            php = PHP("require 'fuzzymatch.php';")
            code = (needle
                    + '$string2 = utf8_decode('
                    + php_literal(entity.label) + ');'
                    + 'print fuzzy_match($string1, $string2, 2);')
            fields = php.get_raw(code).split(',')
            if float(fields[0]) >= .20:
                fmatch = Fuzzymatch(entity.ID)
                fmatch.sep_dir = SEPEntry.sep_dir
                fmatch.strength = fields[0]
                fmatch.edits = fields[1]
                SEPEntry.fmatches.append(fmatch)

        # Persist this entry's matches before moving on to the next one.
        Session.flush()
        Session.commit()
def fuzzymatchall(SEPEntrieslist):
    """Recompute and store fuzzy matches for each SEP entry.

    Takes the output of addlist().  The ``Fuzzymatch`` table is emptied
    first so stale matches do not pile up.  For each entry, the title is
    scored against the label of every ``Entity`` whose ``typeID`` is not 2
    or 4 (journals and nodes are excluded) using the PHP function
    ``fuzzy_match`` from ``fuzzymatch.php``.  The PHP output is split on
    ',' into strength and edits; when the strength is at least 0.20 a
    ``Fuzzymatch`` row is appended to ``SEPEntry.fmatches``.

    TODO: change so that it only updates entries that do not currently have
    a fuzzy match list.

    :param SEPEntrieslist: iterable of SEP entry objects providing
        ``title``, ``sep_dir`` and ``fmatches``.
    :raises ValueError: if the PHP output does not start with a number.
    :raises IndexError: if a qualifying PHP output has no second field.
    """
    def php_literal(text):
        # Build a double-quoted PHP string literal from unicode text.
        # '\\' goes first so later escapes are not doubled; '"' would close
        # the literal and '$' would start variable interpolation.  These
        # are ASCII bytes, so escaping the UTF-8 form is safe.
        encoded = text.encode('utf8')
        for special in ('\\', '"', '$'):
            encoded = encoded.replace(special, '\\' + special)
        return '"' + encoded + '"'

    # Clear out the fuzzymatch table before recomputing.
    Session.query(Fuzzymatch).delete()
    Session.flush()
    Session.commit()

    # Build the filtered query once; each iteration over it below issues
    # the query again.
    candidates = Session.query(Entity)
    candidates = candidates.filter(Entity.typeID != 2)
    candidates = candidates.filter(Entity.typeID != 4)

    for SEPEntry in SEPEntrieslist:
        print("working on " + SEPEntry.title.encode('utf-8') + "\n")

        # Loop-invariant for the inner loop: the PHP assignment of the title.
        needle = ('$string1 = utf8_decode('
                  + php_literal(SEPEntry.title) + ');')

        for entity in candidates:
            # NOTE(review): the original built a
            # PHP("set_include_path('/usr/lib/php/');") instance here and
            # replaced it on the very next statement, so that include path
            # never took effect.  The unused instance is gone;
            # fuzzymatch.php must be found via PHP's configured path.
            php = PHP("require 'fuzzymatch.php';")
            code = (needle
                    + '$string2 = utf8_decode('
                    + php_literal(entity.label) + ');'
                    + 'print fuzzy_match($string1, $string2, 2);')
            fields = php.get_raw(code).split(',')
            if float(fields[0]) >= .20:
                fmatch = Fuzzymatch(entity.ID)
                fmatch.sep_dir = SEPEntry.sep_dir
                fmatch.strength = fields[0]
                fmatch.edits = fields[1]
                SEPEntry.fmatches.append(fmatch)

        # Persist this entry's matches before moving on to the next one.
        Session.flush()
        Session.commit()