def check_car_make(name):
    # Normalize: trim whitespace, lowercase, strip stray backticks, treat dots as spaces.
    name = name.strip()
    name = name.lower()
    name = name.replace("`", "")
    name = name.replace(".", " ")
    # Known makes, including truncations/misspellings seen in the data
    # (e.g. "chrylsr", "plymth", "oldsmob") so the fuzzy match catches them.
    lst = ["volvo", "chrylsr", "datsun", "chevy", "chevrolet", "ford",
           "volkswagen", "vw", "buick", "mercury", "dodge", "subaru",
           "mazda", "audi", "honda", "hyundai", "plymth", "lincoln",
           "toyota", "renault", "peugot", "nissan", "isuzu", "cadillac",
           "yamaha", "jeep", "saab", "porshe", "oldsmob", "pontac",
           "ferrari", "mitsubi", "eagle", "jaguar", "camaro"]
    if name == "-" or name == "null":
        return "other"
    # Accept anything within Levenshtein distance 2 of a known make.
    for item in lst:
        if ld(name, item) <= 2:
            return "car_make"
    return "other"
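# A hedged usage sketch (not part of the original repo): the distance-2
# threshold tolerates typos while the normalization handles case and padding.
if __name__ == "__main__":
    assert check_car_make("Toyotaa") == "car_make"    # distance 1 from "toyota"
    assert check_car_make("CHEVROLT") == "car_make"   # distance 1 from "chevrolet"
    assert check_car_make("12345") == "other"         # far from every known make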
def check_color(word, lclr):
    word = word.strip()
    word = word.lower()
    if word == "null" or word == "-":
        return "other"
    # Accept anything within Levenshtein distance 1 of a known color.
    for item in lclr:
        if ld(word, item) <= 1:
            return "color"
    return "other"
def homophone_matches(terms, targets):
    # Given two lists, return the intersection, with lenience for
    # near-homophones (Levenshtein distance <= 1).
    excluded_words = ['vs']
    targets = [t for t in targets if t not in excluded_words]
    orig_targets = targets
    # jellyfish expects unicode strings under Python 2.
    terms = [unicode(t) for t in terms]
    targets = [unicode(t) for t in targets]
    matches = []
    for i in range(len(terms)):
        for j in range(len(targets)):
            if ld(terms[i], targets[j]) <= 1:
                matches.append(orig_targets[j])
    return matches
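# A hedged usage sketch (assumed inputs, not from the original repo):
# "smyth" is within distance 1 of "smith", so it survives; "vs" is
# filtered out of the targets before any comparison.
if __name__ == "__main__":
    assert homophone_matches(['smith', 'lee'], ['smyth', 'vs', 'kim']) == ['smyth']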
def get_clusters(dic, prec):
    # Map each cluster head to [total count, number of docs, list of merged variants].
    clusters = defaultdict(lambda: [0, 0, []])
    # Consider only the most frequent fifth of entries.
    l = sorted(dic, key=dic.get, reverse=True)[:len(dic) // 5]
    merged = set()
    for i, e1 in enumerate(l):
        if e1 in merged:
            continue
        docs = dic[e1][1]
        clusters[e1][0] += dic[e1][0]
        for e2 in l[i + 1:]:
            # Fold in entries that share a first character and are within
            # Levenshtein distance `prec` of the cluster head. Tracking
            # merged entries in a set (instead of removing from `l` while
            # enumerating it) avoids skipping elements.
            if e2 not in merged and e1[0] == e2[0] and ld(e1, e2) < prec:
                clusters[e1][0] += dic[e2][0]
                docs = docs | dic[e2][1]
                clusters[e1][2].append(e2)
                merged.add(e2)
        clusters[e1][1] += len(docs)
    return clusters
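# A hedged usage sketch (assumed data, not from the original repo).
# `dic` maps a string to [frequency, set-of-doc-ids]. Only the most
# frequent fifth of keys is scanned, so the toy dict pads with one-off
# fillers to make that slice (10 // 5 == 2 keys) non-empty.
if __name__ == "__main__":
    toy = {"citibank": [120, {1, 2, 3}], "citibnk": [40, {2, 7}]}
    toy.update({w: [1, {9}] for w in
                ["alpha", "beta", "gamma", "delta", "epsilon",
                 "zeta", "eta", "theta"]})
    out = get_clusters(toy, 3)
    # "citibnk" (distance 1 from "citibank") is folded into the head:
    assert out["citibank"] == [160, 4, ["citibnk"]]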
def check_body(name):
    name = name.strip()
    name = name.lower()
    name = name.replace("&", "")
    name = name.replace("and", "")
    name = name.replace("/", " ")
    name = name.split(" ")
    # Body types plus abbreviations common in the data, e.g. "subn"
    # (suburban), "trlr" (trailer), "pkup" (pickup).
    lst = ["wagon", "truck", "sedan", "bike", "bus", "pick", "taxi",
           "van", "cycle", "cab", "motor", "door", "moped", "conv",
           "garbage", "mixer", "ambul", "passenger", "tank", "flat",
           "bed", "wheel", "limo", "vehicle", "dump", "train", "delv",
           "subn", "dsd", "util", "refg", "trlr", "pkup", "semi"]
    # Accept if any token is within Levenshtein distance 2 of a body type.
    for n in name:
        for item in lst:
            if ld(n, item) <= 2:
                return "vehicle_type"
    return "other"
def check_name(name, content):
    if name == "null" or name == "-":
        return "other"
    name = name.lower()
    name = name.replace("&", "")
    name = name.replace("and", "")
    name = name.replace("/", " ")
    name = name.replace("-", "")
    name = name.strip()
    name = name.split(",")
    # A single-character first token (a bare initial) counts as a name.
    if len(name[0]) == 1:
        return "person_name"
    # Otherwise fuzzy-match each comma-separated part against the
    # reference list of known names.
    for nm in content:
        for n in name:
            n = n.strip()
            if ld(n, nm) <= 3:
                return "person_name"
    return "other"
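# A hedged sketch (not from the original repo) of how a checker like this
# can be applied to a DataFrame column as a Spark UDF. The reference name
# list and the "owner" column are illustrative assumptions.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

known_names = ["james", "mary", "robert"]  # assumed reference list
name_type = udf(lambda v: check_name(v, known_names) if v is not None else "other",
                StringType())
# df = df.withColumn("owner_type", name_type(df["owner"]))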
from collections import defaultdict

from pyspark import SparkContext
from pyspark.sql import SparkSession
# from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, StringIndexer, NGram
# from pyspark.ml.classification import LogisticRegression
# from pyspark.ml import Pipeline
# from pyspark.mllib.evaluation import MulticlassMetrics

# spark = SparkSession.builder.master("local[*]").getOrCreate()
# Ship the zipped jellyfish package to the workers so `ld` is importable there.
sc = SparkContext("local", pyFiles=["jlf.zip"])
spark = SparkSession \
    .builder \
    .appName("Big Data Project") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
# sc = SparkContext()
# sc.addPyFile("jellyfish")

from jellyfish import levenshtein_distance as ld, soundex

print("This is starting : ", ld("anjan", "anjana"))

"""
List of functions written for semantic types:
1. check_park
2. check_agency
3. check_subject
4. check_street
5. check_address
6. check_schLvl
7. check_website
8. check_build_cls
9. check_zipcode
10. check_school_name
11. check_borough
12. check_phoneNum
13. check_color
14. check_car_make
15. check_body
16. check_name
"""
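# For reference, the soundex import collapses phonetically similar
# spellings into a single code, e.g. both "Smith" and "Smyth" map to "S530":
print("Soundex check: ", soundex("Smith"), soundex("Smyth"))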
# print "%s" % (str(bin(shortWordBits))[2:]) # print "%s" % (str(bin(compareWordsBits[i]))[2:]) # print "Difference: %d" % bitCount(shortWordBits ^ compareWordsBits[i]) counter = counter + 1 print "---------------------" print "Bit Stage: %s seconds" % t.secs print "Number of Short Words: %d" % len(banknamesShort) print "Number of Long Words: %d" % len(banknamesLong) print "Theoretical Number of Matches (short x long): %d" % (len(banknamesShort) * len(banknamesLong)) print "Number of Total Comparisons: %d" % counter matchCount =[] with Timer() as t: for shortWord, compareWordsList in compareDict.items(): compareVals = [ld(shortWord, longWord) for longWord in compareWordsList] minVal = min(compareVals) if minVal <= threshold: matches = [word for dist, word in zip(compareVals, compareWordsList) if dist == minVal] matchCount.append(len(matches)) #matchString = ", ".join(matches) #print "Potential Matches for %s: %s, distance %d" % (shortWord, matchString, minVal) #else: # print "No matches for %s" % shortWord avgMatches = float(len(matchCount))/len(compareDict.keys()) fracMatched = float(len(matchCount))/len(banknamesShort) print "---------------------" print "LD Stage: %s seconds" % t.secs print "Number of Comparisons: %d" % successCounter print "Average Number of Matches: %f" % avgMatches
# print "Difference: %d" % bitCount(shortWordBits ^ compareWordsBits[i]) counter = counter + 1 print "---------------------" print "Bit Stage: %s seconds" % t.secs print "Number of Short Words: %d" % len(banknamesShort) print "Number of Long Words: %d" % len(banknamesLong) print "Theoretical Number of Matches (short x long): %d" % ( len(banknamesShort) * len(banknamesLong)) print "Number of Total Comparisons: %d" % counter matchCount = [] with Timer() as t: for shortWord, compareWordsList in compareDict.items(): compareVals = [ ld(shortWord, longWord) for longWord in compareWordsList ] minVal = min(compareVals) if minVal <= threshold: matches = [ word for dist, word in zip(compareVals, compareWordsList) if dist == minVal ] matchCount.append(len(matches)) #matchString = ", ".join(matches) #print "Potential Matches for %s: %s, distance %d" % (shortWord, matchString, minVal) #else: # print "No matches for %s" % shortWord avgMatches = float(len(matchCount)) / len(compareDict.keys()) fracMatched = float(len(matchCount)) / len(banknamesShort)