def fuzzyset_alg(key, key_list):
    """Return up to three entries of ``key_list`` most similar to ``key``.

    Candidates are shortlisted with a ``FuzzySet`` that holds only ``key``
    (the ten best by that score survive) and are then re-ranked with
    ``fuzz.ratio``.

    :param key: the string to match against
    :param key_list: iterable of candidate strings
    :return: list of ``[candidate, fuzz_ratio]`` pairs, best first, at most
        three long; empty when no candidate passes the FuzzySet lookup
    """
    finder = FuzzySet()
    finder.add(key)

    candidates = []
    for candidate in key_list:
        try:
            # Indexing raises KeyError when the score is below the cutoff.
            matched = finder[candidate]
        except KeyError:
            continue
        # The set holds a single key, so there is one (score, key) hit;
        # only the score is kept, expressed as a percentage.
        candidates.append([candidate, matched[0][0] * 100])

    # Shortlist: ten best FuzzySet scores.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Re-rank the shortlist with fuzz.ratio and keep the top three.
    finalist = [[name, fuzz.ratio(key, name)] for name, _ in candidates[:10]]
    finalist.sort(key=lambda pair: pair[1], reverse=True)
    return finalist[:3]
def match_cnpj(self, cnpj, debug=False):
    """
    Search every CNPJ base file through the job server and return the best match

    :param cnpj: the CNPJ to look up
    :param debug: forwarded to the workers and to ``log``
    :return: the first entry of ``FuzzySet.get`` over the matches collected
             from all workers, or ``None`` when no worker found a match
    """
    best_matches = []

    start_time = time.time()

    # One job per base file; all are submitted before any result is read.
    jobs = [
        (
            cnpj_base_str,
            self.__job_server.submit(
                fuzzy_cnpj_search,
                (cnpj_base_str, cnpj, debug,),
                (log, ),
                ("from fuzzyset import FuzzySet", "time"),
            ),
        )
        for cnpj_base_str in self.cnpj_bases
    ]

    for cnpj_base_str, job in jobs:
        job_matches = job()
        print("Results %s is %s" % (cnpj_base_str, job_matches))
        # The results used to be printed and then dropped, which left
        # best_matches empty and made the final lookup fail.
        if job_matches:
            best_matches.extend(job_matches)

    elapsed_time = time.time() - start_time
    log('Parallel processes took %d seconds to finish' % elapsed_time, debug)

    if not best_matches:
        return None

    # Performing Fuzzy string match on the best results of each cnpj base file
    self.fuzzy_matcher = FuzzySet(best_matches)
    final = self.fuzzy_matcher.get(cnpj)
    return final[0] if final else None
def __init__(self, field=None, similarity=None, base=None, **kw):
    """Create an empty index backed by a ``FuzzySet``.

    ``field``, ``similarity`` and ``base`` are stored as given; any other
    keyword arguments are passed to the parent initializer.
    """
    super(FuzzyBaseIndex, self).__init__(**kw)
    self.field, self.similarity, self.base = field, similarity, base
    # Maps an indexed string to the set of ids registered under it.
    self.content = {}
    self.fuzz = FuzzySet(use_levenshtein=False, rel_sim_cutoff=1.0)
def __init__(self, choices_corpus, ngram_range=(1, 2), use_cleaner=True, preprocess_func=None):
    """
    Fit a TF-IDF model on the corpus and index its vocabulary in a FuzzySet

    :param choices_corpus: should be a list of texts
    :param ngram_range: word n-gram range given to ``TfidfVectorizer``
    :param use_cleaner: when true the corpus goes through ``self.cleaner`` first
    :param preprocess_func: is a str->str function
    """
    self.ngram_range = ngram_range
    self.use_cleaner = use_cleaner
    self.preprocess_func = preprocess_func
    self.initial_choices_corpus = choices_corpus

    corpus = choices_corpus
    if self.use_cleaner:
        corpus = self.cleaner(corpus)
    if self.preprocess_func:
        corpus = list(map(self.preprocess_func, corpus))

    vectorizer = TfidfVectorizer(
        analyzer='word',
        sublinear_tf=True,
        # strip_accents='ascii',
        lowercase=True,
        ngram_range=self.ngram_range,
        min_df=0,
    )
    self.tfidf = vectorizer.fit(corpus)
    self.initial_corpus_tf_idf = self.tfidf.transform(corpus)

    # Processed text -> its row of the TF-IDF matrix.
    self.initial_corpus_tf_idf_dict = {
        corpus[row]: self.initial_corpus_tf_idf[row]
        for row in range(len(corpus))
    }

    self.vocabulary = self.tfidf.vocabulary_.keys()
    self.fset_vocabulary = FuzzySet()
    for term in self.vocabulary:
        self.fset_vocabulary.add(term)
class FuzzyBaseIndex(object):
    """Index from exact strings to entry ids, searchable through a FuzzySet."""

    def __init__(self, field=None, similarity=None, base=None, **kw):
        """Create an empty index; extra keyword arguments go to the parent."""
        super(FuzzyBaseIndex, self).__init__(**kw)
        self.fuzz = FuzzySet(rel_sim_cutoff=1., use_levenshtein=False)
        # Maps an indexed string to the set of ids registered under it.
        self.content = {}
        self.field = field
        self.similarity = similarity
        self.base = base

    def add(self, x, i):
        """Register id ``i`` under string ``x``."""
        self.fuzz.add(x)
        self.content.setdefault(x, set()).add(i)

    def finalize(self):
        """No-op; nothing needs to be built after the last ``add``."""
        pass

    def search(self, x, top=25, debug=True):
        """Return up to ``top`` hits for ``x`` as ``(id, fuzzy_score, similarity)``.

        Hits are ordered by ``self.similarity(x, matched_string)``, highest
        first. An empty list is returned when the fuzzy lookup finds nothing.
        ``debug`` is accepted but not used.
        """
        # The lookup may yield None when nothing matches; treat it as "no hits".
        matches = self.fuzz.get(x) or []
        ret = []
        for match in matches:
            score, matched = match[0], match[1]
            # The similarity depends only on the matched string, so it is
            # computed once per match rather than once per id.
            sim = self.similarity(x, matched)
            ret.extend((i, score, sim) for i in self.content[matched])
        ret.sort(key=lambda hit: hit[2], reverse=True)
        return ret[:top]
def match_cnpj(self, cnpj, debug=False):
    """
    Search the closest valid CNPJ given an invalid one

    :param cnpj: an invalid CNPJ
    :param debug: whether you want to see debugging logs or not
    :return: the first entry of ``FuzzySet.get`` over the matches collected
             from every base file, or ``None`` when nothing matched
    """
    best_matches = []
    for cnpj_base_str in self.__cnpj_bases:
        with open(cnpj_base_str) as f:
            # temp variables
            start_time = time.time()

            # Searching
            self.__log("Searching for %s on %s" % (cnpj, cnpj_base_str), debug)
            self.__fuzzy_matcher = FuzzySet(f.read().splitlines())
            match = self.__fuzzy_matcher.get(cnpj)

            elapsed_time = time.time() - start_time
            self.__log("Best match for this file is %s and it took %d seconds" % (match, elapsed_time), debug)

        # Appending to the best matches so far
        if match is not None:
            best_matches.extend(m[1] for m in match)

    # Nothing matched in any file: report that instead of failing on None[0].
    if not best_matches:
        return None

    # Performing Fuzzy string match on the best results of each cnpj base file
    self.__fuzzy_matcher = FuzzySet(best_matches)
    final = self.__fuzzy_matcher.get(cnpj)
    return final[0] if final else None
def match_cnpj(self, cnpj, debug=False):
    """
    Search the closest valid CNPJ given an invalid one

    :param cnpj: an invalid CNPJ
    :param debug: whether you want to see debugging logs or not
    :return: the first entry of ``FuzzySet.get`` over the matches collected
             from every base file, or ``None`` when nothing matched
    """
    best_matches = []
    for cnpj_base_str in self.__cnpj_bases:
        with open(cnpj_base_str) as f:
            # temp variables
            start_time = time.time()

            # Searching
            self.__log('Searching for %s on %s' % (cnpj, cnpj_base_str), debug)
            self.__fuzzy_matcher = FuzzySet(f.read().splitlines())
            match = self.__fuzzy_matcher.get(cnpj)

            elapsed_time = time.time() - start_time
            self.__log(
                'Best match for this file is %s and it took %d seconds' % (match, elapsed_time),
                debug)

        # Appending to the best matches so far
        if match is not None:
            best_matches.extend(m[1] for m in match)

    # Nothing matched in any file: report that instead of failing on None[0].
    if not best_matches:
        return None

    # Performing Fuzzy string match on the best results of each cnpj base file
    self.__fuzzy_matcher = FuzzySet(best_matches)
    final = self.__fuzzy_matcher.get(cnpj)
    return final[0] if final else None
def __init__(self, camera):
    """Store the camera and set every detection parameter to its default."""
    self.Camera = camera

    # Fuzzy sets used to score ellipse shape and colour coverage.
    self.ellipseFuzzySet = FuzzySet()  # ([0,1,2.5],[0,1,0])
    self.colorFuzzySet = FuzzySet()  # ([0,1,1.05],[0,1,0])

    # Colour range handed to cv2.inRange on the HSV-converted frame.
    self.lowerColorBnd, self.upperColorBnd = (0, 0, 0), (180, 255, 255)

    # Background subtraction.
    self.bgs = cv2.createBackgroundSubtractorMOG2()
    self.bgsLearningRate = 0.1

    self.imageAnalyzer = ImageAnalyzer()
    self.morphologyArray = []

    # ((min width, max width), (min height, max height)) for bounding boxes.
    self.sizeBoundaries = ((0, 0), (0, 0))
    self.contourThreshold = 0.5
def run_profile(impl):
    """Fill the chosen FuzzySet implementation from ``cities.gz`` and profile it.

    ``impl == "cFuzzySet"`` selects ``cFuzzySet``; any other value selects
    ``FuzzySet``. Profile data is written to ``Profile.prof`` and the stats
    are printed sorted by time.
    """
    # The name ``f`` must not change: the profiled statement "profiler(f)"
    # resolves it through locals().
    f = cFuzzySet() if impl == "cFuzzySet" else FuzzySet()

    cities_path = os.path.join(here, '..', 'cities.gz')
    with gzip.GzipFile(cities_path) as input_file:
        for line in input_file:
            f.add(line.rstrip().decode())
    print(f)

    cProfile.runctx("profiler(f)", globals(), locals(), "Profile.prof")

    stats = pstats.Stats("Profile.prof")
    stats.strip_dirs()
    stats.sort_stats("time")
    stats.print_stats()
def get_oov_vocabulary_map(vocabulary_words_weights, wordset):
    """Map each out-of-vocabulary word to its best weighted fuzzy match.

    :param vocabulary_words_weights: dict of vocabulary word -> weight
    :param wordset: set of words; those missing from the vocabulary are mapped
    :return: dict of out-of-vocabulary word -> vocabulary word maximising
        ``fuzzy score * weight``; words with no fuzzy match are left out
    """
    known_words = vocabulary_words_weights.keys()
    oov = wordset - set(known_words)
    vocabulary_words_set = FuzzySet(sorted(known_words))

    mapping = {}
    for word in tqdm(oov):
        word_matches = vocabulary_words_set.get(word)
        if not word_matches:
            continue

        weighted = {}
        for score, vocabulary_word in word_matches:
            weighted[vocabulary_word] = score * vocabulary_words_weights[vocabulary_word]

        # Highest weighted score wins; ties go to the earliest match.
        mapping[word] = max(weighted, key=weighted.get)
    return mapping
def matchedIds(postId, threshold):
    """Find keywords of other posts that fuzzily match this post's keywords.

    :param postId: id of the post whose keywords are compared
    :param threshold: a match is kept only when its score is strictly greater
    :return: list of ``(other_post_id, score, matched_keyword)`` tuples
    :raises IndexError: when no entry from ``Model.getKeywords()`` has ``postId``
    """
    keywords = Model.getKeywords()
    postKeywords = [entry for entry in keywords if entry["id"] == postId][0]["keywords"]

    matches = []
    for keyword in keywords:
        # Skip the post itself before building its index.
        if keyword["id"] == postId:
            continue
        fs = FuzzySet(keyword['keywords'])
        for pk in postKeywords:
            # One lookup per keyword; an empty or missing result yields nothing.
            for score, val in fs.get(pk) or ():
                if score > threshold:
                    matches.append((keyword["id"], score, val))
    return matches
class LocalParallelFuzzyCnpjMatcher(BaseParallelFuzzyCnpjMatcher):
    """Fuzzy CNPJ matcher that searches the base files through a ``pp`` job server."""

    def __init__(self, cpu_count="autodetect"):
        """
        :param cpu_count: passed to ``pp.Server`` as ``ncpus``
        """
        super(LocalParallelFuzzyCnpjMatcher, self).__init__()
        self.__job_server = pp.Server(ncpus=cpu_count)

    def match_cnpj(self, cnpj, debug=False):
        """
        Search every CNPJ base file through the job server and return the best match

        :param cnpj: the CNPJ to look up
        :param debug: forwarded to the workers and to ``log``
        :return: the first entry of ``FuzzySet.get`` over the matches collected
                 from all workers, or ``None`` when no worker found a match
        """
        best_matches = []

        start_time = time.time()

        # One job per base file; all are submitted before any result is read.
        jobs = [
            (
                cnpj_base_str,
                self.__job_server.submit(
                    fuzzy_cnpj_search,
                    (cnpj_base_str, cnpj, debug,),
                    (log, ),
                    ("from fuzzyset import FuzzySet", "time"),
                ),
            )
            for cnpj_base_str in self.cnpj_bases
        ]

        for cnpj_base_str, job in jobs:
            job_matches = job()
            print("Results %s is %s" % (cnpj_base_str, job_matches))
            # The results used to be printed and then dropped, which left
            # best_matches empty and made the final lookup fail.
            if job_matches:
                best_matches.extend(job_matches)

        elapsed_time = time.time() - start_time
        log('Parallel processes took %d seconds to finish' % elapsed_time, debug)

        if not best_matches:
            return None

        # Performing Fuzzy string match on the best results of each cnpj base file
        self.fuzzy_matcher = FuzzySet(best_matches)
        final = self.fuzzy_matcher.get(cnpj)
        return final[0] if final else None
def __init__(self, dist_file=CommonConstants.INDIA_DIST_NAMES):
    """Load district names from a CSV file into a FuzzySet and a plain set.

    The first row is skipped as a header and the name is read from the second
    column. When a name contains 'rural', 'urban' or 'dehat', its last
    space-separated word is dropped before it is stored. All names are
    stored lower-cased.

    :param dist_file: path of the CSV file with the district names
    """
    super().__init__()
    self.old_names = {'bangalore':'bengaluru','gurgaon':'gurugram','calcutta':'kolkata','prayagraj':'allahabad','delhi':'delhi'}
    self.fd = FuzzySet()
    self.set = set()

    qualifiers = ('rural', 'urban', 'dehat')
    # newline='' is what the csv module asks for when reading from a file.
    with open(dist_file, newline='') as df:
        reader = csv.reader(df)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            district = row[1].lower()
            if any(word in district for word in qualifiers):
                district = ' '.join(district.split(' ')[:-1])
            self.fd.add(district)
            self.set.add(district)

    self.nlp = stanza.Pipeline(lang='en',processors='tokenize',use_gpu=False)
def __init__(self, removed_lines_dicts, added_lines_dicts):
    """Index the added lines for lookup and record the removed lines.

    :param removed_lines_dicts: iterable of dicts accepted by ``Line.from_dict``
    :param added_lines_dicts: iterable of dicts accepted by ``Line.from_dict``
    """
    self.removed_lines = []
    self.trim_text_to_array_of_added_lines = defaultdict(list)
    self.added_file_name_to_line_no_to_line = defaultdict(dict)
    self.removed_file_name_to_line_no_to_line = defaultdict(dict)
    self.added_lines_fuzzy_set = FuzzySet()

    # Added lines: grouped by trimmed text, fuzzy-indexed, and by file/line.
    for raw in added_lines_dicts:
        added = Line.from_dict(raw)
        text = added.trim_text
        self.trim_text_to_array_of_added_lines[text].append(added)
        self.added_lines_fuzzy_set.add(text)
        self.added_file_name_to_line_no_to_line[added.file][added.line_no] = added

    # Removed lines: kept in input order and by file/line.
    for raw in removed_lines_dicts:
        removed = Line.from_dict(raw)
        self.removed_lines.append(removed)
        self.removed_file_name_to_line_no_to_line[removed.file][removed.line_no] = removed
def __init__(self, field=None, tokenizer=None, similarity=None, base=None, idf_limit=0.05, **kw):
    """Create an empty token index.

    The named arguments are stored as attributes unchanged; any other
    keyword arguments are passed to the parent initializer.
    """
    super(MiniBaseIndex, self).__init__(**kw)

    # Configuration supplied by the caller.
    self.field, self.tokenizer = field, tokenizer
    self.similarity, self.base = similarity, base
    self.idf_limit = idf_limit

    # Index state, filled in later.
    self.content = {}
    self.counts = {}
    self.blacklist = set()
    self.fuzzwords = FuzzySet(use_levenshtein=False, rel_sim_cutoff=0.7)
class ListBasedPlaceExtractionService(NERService):
    """Extract a district name from text by exact lookup in a list of names."""

    def __init__(self, dist_file=CommonConstants.INDIA_DIST_NAMES):
        """Load district names from a CSV file into a FuzzySet and a plain set.

        The first row is skipped as a header and the name is read from the
        second column. When a name contains 'rural', 'urban' or 'dehat', its
        last space-separated word is dropped before it is stored. All names
        are stored lower-cased.

        :param dist_file: path of the CSV file with the district names
        """
        super().__init__()
        self.old_names = {'bangalore':'bengaluru','gurgaon':'gurugram','calcutta':'kolkata','prayagraj':'allahabad','delhi':'delhi'}
        self.fd = FuzzySet()
        self.set = set()

        qualifiers = ('rural', 'urban', 'dehat')
        # newline='' is what the csv module asks for when reading from a file.
        with open(dist_file, newline='') as df:
            reader = csv.reader(df)
            next(reader, None)  # skip the header row; tolerate an empty file
            for row in reader:
                district = row[1].lower()
                if any(word in district for word in qualifiers):
                    district = ' '.join(district.split(' ')[:-1])
                self.fd.add(district)
                self.set.add(district)

        self.nlp = stanza.Pipeline(lang='en',processors='tokenize',use_gpu=False)

    def extract_entities_from_text(self,text):
        """Return the first token of ``text`` that names a known district.

        Each lower-cased token is checked against the loaded names and then
        against ``self.old_names``, whose mapped value is returned on a hit.
        The fuzzy set in ``self.fd`` is not consulted here.

        :param text: the text to scan
        :return: the district name, or ``None`` when no token matches
        """
        doc = self.nlp(text)
        for sent in doc.sentences:
            for token in sent.tokens:
                tok_text = token.text.lower()
                if tok_text in self.set:
                    return tok_text
                if tok_text in self.old_names:
                    return self.old_names[tok_text]
        return None
def __init__(self, ngram_range=(1, 3)):
    """
    Build the TF-IDF model, the fuzzy sets and the Japanese brand table
    from the module-level ``brands`` DataFrame

    :param ngram_range: word n-gram range given to ``TfidfVectorizer``
    """
    self.ngram_range = ngram_range

    brand_names = [str(x) for x in list(brands['brnd'].dropna().unique())]
    choices_corpus = list(brand_names)

    # Every ';'-separated equivalent spelling points back to its brand.
    self.equivalents = {}
    for record in brands[['brnd', 'equivalents']].dropna().to_dict('records'):
        for eq in record['equivalents'].split(';'):
            self.equivalents[eq.strip()] = record['brnd']
    choices_corpus.extend(self.equivalents.keys())

    self.initial_choices_corpus = choices_corpus
    self.cleaned_choices_corpus = self.cleaner(choices_corpus)

    self.tfidf = TfidfVectorizer(analyzer='word',
                                 sublinear_tf=True,
                                 # strip_accents='ascii',
                                 lowercase=True,
                                 ngram_range=self.ngram_range,
                                 min_df=0).fit(self.cleaned_choices_corpus)
    # NOTE(review): the model is fitted on the cleaned corpus but the
    # uncleaned corpus is transformed — confirm this is intended.
    self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
    self.initial_corpus_tf_idf_dict = {
        text: self.initial_corpus_tf_idf[k]
        for k, text in enumerate(choices_corpus)
    }

    # Creating fuzzy set
    self.fset_brands = FuzzySet()
    for token in brand_names:
        self.fset_brands.add(token)

    self.fset_tokens = FuzzySet()
    for token in list(self.tfidf.vocabulary_):
        self.fset_tokens.add(token)

    # Prepare the japanese matching
    jp_brands = brands[['brnd', 'brnd_jp_clean']]
    # Work on an explicit copy so the column assignments below do not
    # write into a filtered slice of ``brands``.
    jp_brands = jp_brands[jp_brands.brnd_jp_clean.notnull()].copy()
    jp_brands['brnd_jp_clean'] = jp_brands['brnd_jp_clean'].apply(lambda x: unicodedata.normalize('NFKC', x.replace('・', '').replace(' ', '')))
    jp_brands['brnd_jp_size'] = jp_brands['brnd_jp_clean'].apply(len)
    # Longest Japanese names first.
    jp_brands = jp_brands.sort_values(['brnd_jp_size', 'brnd'], ascending=[False, False])
    self.jp_brands = jp_brands
def defuzz(self, method=None):
    """Defuzzify the union of ``self.consequents`` and plot it.

    The result is the mean of all ``x`` values whose membership reaches the
    maximum membership of the union.

    :param method: accepted but not used
    :return: the defuzzified value
    """
    final = FuzzySet.Union(self.consequents)

    # The maximum is computed once instead of once per element.
    peak = max(final.m)
    print(peak)

    maxima = [x for i, x in enumerate(final.x) if final.m[i] >= peak]
    val = sum(maxima) / len(maxima)

    fplot = FuzzyPlotter([final])
    fplot()
    return val
def fuzzy_cnpj_search(cnpj_base_str, cnpj, debug=False):
    """
    Fuzzy-search one CNPJ base file

    :param cnpj_base_str: path of the base file, one entry per line
    :param cnpj: the CNPJ to look up
    :param debug: forwarded to ``log``
    :return: list with the matched string of every hit; empty when none
    """
    with open(cnpj_base_str) as f:
        # temp variables
        start_time = time.time()

        # Searching
        log('Searching for %s on %s' % (cnpj, cnpj_base_str), debug)
        fuzzy_matcher = FuzzySet(f.read().splitlines())
        match = fuzzy_matcher.get(cnpj)

        elapsed_time = time.time() - start_time
        log('Best match for this file is %s and it took %d seconds' % (match, elapsed_time), debug)

    if match is None:
        return []
    # Each hit is indexed as hit[1] for the matched string.
    return [hit[1] for hit in match]
def execute(self, command):
    """Parse ``command`` and run the known function that best matches its verb.

    The governor of the first ``dobj`` dependency is looked up among
    ``self.function_names``. When nothing matches, or the best score is
    below 0.5, ``self._learn(command)`` is called instead. Otherwise the
    object phrase is looked up among ``world.objects`` and the matched
    function is called with that object's attributes.
    """
    annotated = json.loads(
        self._nlp.annotate(command, properties=self._nlp_properties))
    sentence = annotated['sentences'][0]

    dependencies = [
        (dep['governorGloss'], dep['dependentGloss'], dep['dep'])
        for dep in sentence['enhancedPlusPlusDependencies']
    ]
    verb_tuple = [dep for dep in dependencies if dep[2] == u'dobj'][0]

    functions = FuzzySet(self.function_names)
    query = functions.get(str(verb_tuple[0]))
    if not query:
        self._learn(command)
        return

    query = query[0]
    print(query)
    if query[0] < .5:
        print("learning")
        self._learn(command)
        return

    dyn = self.functions.get(query[1])
    obj = verb_tuple[1]
    # Dependents of the object other than its determiner, then the object.
    modifiers = [dep[1] for dep in dependencies
                 if dep[0] == obj and dep[2] != u'det']
    obj_adj = str(" ".join(modifiers) + " " + obj)

    objects = FuzzySet(world.objects)
    object_query = objects.get(obj_adj)
    if not object_query:
        print("no objects found!")
        return
    dyn(world.attributes.get(object_query[0][1]))
def get_nutrition_data(image_class):
    """Look up nutrition facts for ``image_class`` through the USDA NDB API.

    The search results are compared with ``image_class`` by fuzzy score and
    the report of the best-scoring item is fetched.

    :param image_class: food name to search for
    :return: dict with the keys ``serve_size``, ``kcal``, ``fat``, ``carbs``,
        ``protein``, ``sugar`` and ``sodium``, each a string with its unit
    :raises ValueError: when no search result resembles ``image_class``
    """
    # NOTE(review): the API key is hard-coded in source; it should be read
    # from configuration or the environment.
    api_key = "FLKBoKOh7C1apAA4bPL0jH4GAW6f2wS9Lw0a2iFu"

    # Passing the query through ``params`` lets requests encode it.
    r = requests.get(
        "https://api.nal.usda.gov/ndb/search/",
        params={
            "format": "json",
            "q": image_class,
            "sort": "n",
            "max": 25,
            "offset": 0,
            "api_key": api_key,
        },
    ).json()

    # The set only ever holds image_class, so it is built once.
    fs = FuzzySet()
    fs.add(image_class)

    max_dist_ratio = 0
    ndbno = None
    for item in r["list"]["item"]:
        hits = fs.get(item["name"])
        if not hits:
            continue  # this item does not resemble image_class
        ratio = hits[0][0]
        if ratio > max_dist_ratio:
            max_dist_ratio = ratio
            ndbno = item["ndbno"]
    if ndbno is None:
        raise ValueError("no search result resembles %r" % (image_class,))
    print(ndbno)

    nutrition_data = requests.get(
        "https://api.nal.usda.gov/ndb/V2/reports",
        params={"ndbno": ndbno, "type": "f", "format": "json", "api_key": api_key},
    ).json()
    nutrients = nutrition_data["foods"][0]["food"]["nutrients"]

    def first_measure(index, field="value"):
        # Read one field of the first measure of the nutrient at ``index``.
        return str(nutrients[index]["measures"][0][field])

    # NOTE(review): the nutrient positions below are taken as-is from the
    # original code — confirm them against the report format.
    nutrition_facts = {}
    nutrition_facts["serve_size"] = first_measure(0, "qty") + " ounces"
    nutrition_facts["kcal"] = first_measure(0) + " calories"
    nutrition_facts["fat"] = first_measure(2) + " grams"
    nutrition_facts["carbs"] = first_measure(3) + " grams"
    nutrition_facts["protein"] = first_measure(1) + " grams"
    nutrition_facts["sugar"] = first_measure(4) + " grams"
    nutrition_facts["sodium"] = first_measure(5) + " milligrams"
    return nutrition_facts
def _learn(self, command):
    """Register ``command`` as a new function made of known functions.

    A line of text is read from standard input and parsed into sentences.
    For each sentence the governor of its first ``dobj`` dependency is
    looked up among the previously known function names, and the steps of
    the matched function are appended to the new command. Sentences with no
    direct object, or with no matching function, are skipped.
    """
    # The fuzzy set is created before ``command`` is appended to the names.
    functions = FuzzySet(self.function_names)
    rospy.logerr("No command found! Please input commands")
    self.function_names.append(command)
    self.functions[command] = []

    commands = raw_input()
    sentences = json.loads(
        self._nlp.annotate(commands, properties=self._nlp_properties))['sentences']
    for sentence in sentences:
        dependencies = [
            (dep['governorGloss'], dep['dependentGloss'], dep['dep'])
            for dep in sentence['enhancedPlusPlusDependencies']
        ]
        direct_objects = [dep for dep in dependencies if dep[2] == u'dobj']
        if not direct_objects:
            # Without a direct object there is no verb to look up; this used
            # to abort with IndexError after the command was registered.
            continue

        query = functions.get(str(direct_objects[0][0]))
        if query:
            func = self.functions.get(query[0][1])
            if func:  # unknown name: nothing to copy
                self.functions[command].extend(func)
    rospy.logerr(self.functions)
def regress(self, test, yt):
    """Run the rule base on every row of ``test`` and score it against ``yt``.

    For each row, every rule's activation degree is the minimum of its
    antecedent memberships. The consequent set of each rule is cut at that
    degree and the cut sets are OR-ed into ``self.fout``; rules with zero
    activation are skipped, except the first rule, which always initialises
    ``self.fout``. The row's prediction is the centroid of ``self.fout``.

    :param test: iterable of rows, indexed as ``row[variable][term]``
    :param yt: actual target values, aligned with ``test``
    :return: ``(predicted, rmse)``
    """
    predicted = []
    for row in test:
        for i, rule in enumerate(self.rules):
            ante, cons = rule[0], rule[-1]
            degree = min(row[tup[0]][tup[-1]] for tup in ante)
            if degree == 0 and i != 0:
                continue

            f = self.fvar[-1].fuzzy[cons]
            # Look the membership function up by name instead of eval-ing a
            # formatted string.
            newm = getattr(f.func, f.mfunc)(f.num_x, f.prmts)
            newf = FuzzySet(f.num_x, newm)
            newf.cutpoint(degree)
            self.fout = newf if i == 0 else self.fout | newf

        # center of gravity method
        val = self.centroid(self.fout.x, self.fout.m)
        predicted.append(val)

    diff = [(a - p) ** 2 for a, p in zip(yt, predicted)]
    # NOTE(review): this is the mean squared error; no square root is taken
    # even though the value is named rmse — confirm which one callers expect.
    rmse = sum(diff) / len(diff)
    return predicted, rmse
def path(self, source, destination):
    """Return the edge labels along the shortest path between two nodes.

    An endpoint that is not a node of ``self.map`` is replaced by the result
    of ``fuzzymatch`` against the node set.

    :return: list of edge ``"label"`` values in travel order; ``None`` when
        both endpoints are equal or no path exists
    """
    from fuzzyset import FuzzySet

    if source == destination:
        return None

    nodes = self.map.nodes
    fuzz = FuzzySet(nodes)
    if source not in nodes:
        source = fuzzymatch(fuzz, source)
    if destination not in nodes:
        destination = fuzzymatch(fuzz, destination)

    try:
        route = nx.shortest_path(self.map, source, destination)
    except NetworkXNoPath:
        log.error(f"No path between {source.name} and {destination.name}.")
        return None

    # One label per consecutive pair of nodes on the route.
    edges = self.map.edges
    return [edges[start, end]["label"] for start, end in zip(route, route[1:])]
class TestFuzzyMatcher(unittest.TestCase):
    """Checks that known CNPJ queries resolve to the expected entries."""

    def setUp(self):
        """Build a FuzzySet from the lines of ../bulk/cnpjs.txt."""
        with open('../bulk/cnpjs.txt') as source:
            self.fuzzy_set = source.read().splitlines()
        self.fuzzy_matcher = FuzzySet(self.fuzzy_set)

    def test_validate(self):
        """The best match of each query must be the listed entry."""
        expectations = (
            ('06389497000195', '04389697000195'),
            ('15574828000190', '15575829000190'),
            ('15911974000144', '15922975000144'),
            ('12919223000129', '12291923000129'),
            ('557135900011', '55713579000121'),
            ('40194766000116', '49794166000116'),
        )
        for query, expected in expectations:
            self.assertEqual(self.fuzzy_matcher.get(query)[0][1], expected)

        # '49794166000116'
        print(self.fuzzy_matcher.get('40194766000116')[0][1])
class WordFixer:
    """Replaces unknown words with their closest fuzzy match in a vocabulary."""

    def __init__(self, word2vec: Word2VecKeyedVectors):
        """Index ``word2vec.vocab`` for approximate matching."""
        self.__word2vec = word2vec
        self.__approximate_matcher = FuzzySet(word2vec.vocab)
        # Remembers earlier fixes: original word -> replacement.
        self.__fixed_word_dict: Dict[str, str] = {}

    def is_word_correct(self, word: str):
        """Return True when ``word`` is contained in the word2vec model."""
        return word in self.__word2vec

    def fix(self, word: str):
        """Return the closest vocabulary word for ``word``.

        Results are remembered, so a repeated word is not looked up again.
        Raises ``Exception`` when the matcher finds no candidate.
        """
        try:
            return self.__fixed_word_dict[word]
        except KeyError:
            pass

        candidate = self.__approximate_matcher.get(word)
        if not candidate:
            raise Exception("Cannot be fixed")

        fixed_word = candidate[0][1]
        self.__fixed_word_dict[word] = fixed_word
        return fixed_word
##FuzzyPlotter(v)

### pic1
##name = 'Parametric Membership Functions'
##uod = np.arange(-10, 50, 0.01)
##terms = ['trimf, [-5,2,12]', 'trapmf, [8,14,22,28]', 'gaussmf, [30,4]']
##v = FuzzyVariable(name, uod, terms)
##v.setmf([('trimf', [-5,2,12]),
##         ('trapmf', [8,14,22,28]),
##         ('gaussmf', [30,4])])
##FuzzyPlotter(v)

### pic 2345
# Two triangular fuzzy sets over the same universe of discourse.
uod = np.arange(-10, 110, 0.1)

A = FuzzySet(uod)
A.set_mf('trimf', [0, 35, 60])

B = FuzzySet(uod)
B.set_mf('trimf', [20, 75, 95])

# Set operations on A and B; only A and E are drawn below.
C = A | B
D = A & B
E = ~A
F = ~B

plt.figure()
plt.plot(A.x, A.m, 'r')
plt.plot(E.x, E.m, 'g')
plt.title('Complementary Operation')
#plt.title('The intersection of A and B')
plt.xlabel('Universe of Discourse')
plt.ylabel('Membership Degree')
plt.legend(['μA', 'μAcomplement'])
def setUp(self):
    """Build a FuzzySet from the lines of ../bulk/cnpjs.txt."""
    with open('../bulk/cnpjs.txt') as source:
        entries = source.read().splitlines()
    self.fuzzy_set = entries
    self.fuzzy_matcher = FuzzySet(entries)
class SequentialFuzzyCnpjMatcher:
    """
    Class that performs fuzzy string matching on CNPJs sequentially.

    For a small fuzzyset this class is the easiest way to get started. However, if you are going for a large
    fuzzyset we strongly recommend using LocalParallelFuzzyCnpjMatcher instead.
    """

    def __init__(self):
        """
        Default constructor

        :return: a SequentialFuzzyCnpjMatcher instance
        """
        # 100 base files, named after multiples of 1,000,000 zero-padded to
        # at least seven digits.
        self.__cnpj_bases = [
            '../bulk/cnpjs_base_' + str(x * 1000000).zfill(7) + '.txt'
            for x in range(100)
        ]
        self.__fuzzy_matcher = None

    def match_cnpj(self, cnpj, debug=False):
        """
        Search the closest valid CNPJ given an invalid one

        :param cnpj: an invalid CNPJ
        :param debug: whether you want to see debugging logs or not
        :return: the first entry of ``FuzzySet.get`` over the matches collected
                 from every base file, or ``None`` when nothing matched
        """
        best_matches = []
        for cnpj_base_str in self.__cnpj_bases:
            with open(cnpj_base_str) as f:
                # temp variables
                start_time = time.time()

                # Searching
                self.__log('Searching for %s on %s' % (cnpj, cnpj_base_str), debug)
                self.__fuzzy_matcher = FuzzySet(f.read().splitlines())
                match = self.__fuzzy_matcher.get(cnpj)

                elapsed_time = time.time() - start_time
                self.__log(
                    'Best match for this file is %s and it took %d seconds' % (match, elapsed_time),
                    debug)

            # Appending to the best matches so far
            if match is not None:
                best_matches.extend(m[1] for m in match)

        # Nothing matched in any file: report that instead of failing on None[0].
        if not best_matches:
            return None

        # Performing Fuzzy string match on the best results of each cnpj base file
        self.__fuzzy_matcher = FuzzySet(best_matches)
        final = self.__fuzzy_matcher.get(cnpj)
        return final[0] if final else None

    def __log(self, msg, debug=False):
        """
        Prints a message to console depending on debug variable

        :param msg: a message string
        :param debug: a boolean value
        :return:
        """
        if debug:
            # Single-argument call form works as a statement on Python 2 too.
            print(msg)
class VideoAnalyzer:
    # Scans frames from a camera capture and, per frame, looks for the
    # contour that best fits an ellipse-shape test and a colour test.
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source — confirm against the original file.

    def __init__(self, camera):
        # Store the camera and set every detection parameter to its default.
        self.Camera = camera
        self.ellipseFuzzySet = FuzzySet()#([0,1,2.5],[0,1,0])
        self.colorFuzzySet = FuzzySet()#([0,1,1.05],[0,1,0])
        self.lowerColorBnd = (0, 0, 0)
        self.upperColorBnd = (180, 255, 255)
        self.bgs = cv2.createBackgroundSubtractorMOG2()
        self.bgsLearningRate = 0.1
        self.imageAnalyzer = ImageAnalyzer()
        self.morphologyArray = []
        # ((min width, max width), (min height, max height)) of accepted bounding boxes.
        self.sizeBoundaries = ((0, 0), (0, 0))
        self.contourThreshold = 0.5

    def analyze(self):
        # Process frames until the frame counter reaches the capture's frame
        # count or 'q' is pressed.
        # Returns (result, images): result holds one
        # (center, size, angle, fit score, frame index) tuple per frame whose
        # best contour reached contourThreshold; images holds a copy of every
        # frame that was read successfully.
        cnt = 0   # frames attempted so far
        fcnt = 0  # frames in which an accepted contour was found
        result = []
        images = []
        while(1):
            ret, frame = self.Camera.capture.read()
            if(ret != False):
                # Undistort the frame when calibration data is available.
                if self.Camera.calibrationMatrix is not None:
                    w, h = int(self.Camera.resolution[0]), int(self.Camera.resolution[1])
                    newCamMatrix, roi = cv2.getOptimalNewCameraMatrix(self.Camera.calibrationMatrix, self.Camera.distortCoefs, (w, h), 1, (w, h))
                    frame = cv2.undistort(frame, self.Camera.calibrationMatrix, self.Camera.distortCoefs, None, newCamMatrix)
                images.append(frame.copy())
                rectImg = frame.copy()
                # Mask of the pixels inside the configured HSV colour range.
                hsvImg = frame.copy()
                hsvImg = cv2.cvtColor(hsvImg, cv2.COLOR_BGR2HSV)
                hsvMask = cv2.inRange(hsvImg, self.lowerColorBnd, self.upperColorBnd) #(30,10,110), (60,255,255))
                # Background subtraction followed by the configured morphology steps.
                frame = self.bgs.apply(frame, learningRate = self.bgsLearningRate)
                for morph in self.morphologyArray:
                    frame = self.imageAnalyzer.morphology(frame, morph)
                # NOTE(review): three return values match the OpenCV 3.x
                # findContours signature; OpenCV 4.x returns two — confirm
                # the version in use.
                image, contours, hierarchy = cv2.findContours(frame.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
                fittingContours = []
                for i,contour in enumerate(contours):
                    x,y,w,h = cv2.boundingRect(contour)
                    # Only contours whose bounding box is within sizeBoundaries are considered.
                    if(w >= self.sizeBoundaries[0][0] and w <= self.sizeBoundaries[0][1] and h >= self.sizeBoundaries[1][0] and h <= self.sizeBoundaries[1][1]):
                        rotatedRect = cv2.fitEllipse(contour)
                        center = (rotatedRect[0][0], rotatedRect[0][1])
                        size = (rotatedRect[1][0], rotatedRect[1][1])
                        angle = (rotatedRect[2])
                        # Shape score from the ratio of the smaller to the larger ellipse size.
                        ellipseFit = None
                        if(size[0] >= size[1]):
                            ellipseFit = self.ellipseFuzzySet.fitLinear(size[1] / size[0])
                        elif(size[0] < size [1]):
                            ellipseFit = self.ellipseFuzzySet.fitLinear(size[0] / size[1])

                        if(ellipseFit is not None and ellipseFit >= self.ellipseFuzzySet.threshold):
                            # Count the pixels inside the contour: foundWhite
                            # are those set in the colour mask, foundBlack
                            # those that are not.
                            foundBlack = 0
                            foundWhite = 0
                            for j in range (x,x+w):
                                for k in range(y,y+h):
                                    dist = cv2.pointPolygonTest(contour,(j,k),False)
                                    if dist>= 0:
                                        foundBlack += 1.0
                                        if(hsvMask[k,j] == 255):
                                            foundWhite+=1.0
                                            foundBlack-=1.0
                            # Colour score from the ratio of the two counts.
                            colorFit = None
                            if(foundBlack > foundWhite):
                                colorFit = self.colorFuzzySet.fitLinear(foundWhite / foundBlack)
                            elif(foundWhite >= foundBlack and foundBlack > 0):
                                colorFit = 1 + self.colorFuzzySet.fitLinear(foundBlack / foundWhite)
                            if(colorFit is not None and colorFit >= self.colorFuzzySet.threshold):
                                # The stored score is the mean of the colour and shape scores.
                                fittingContours.append((center, size, angle,(colorFit+ellipseFit)/2, cnt))
                if len(fittingContours) > 0:
                    # Keep the candidate with the highest combined score.
                    best = fittingContours[0]
                    for i in range (0,len(fittingContours)):
                        if(fittingContours[i][3] > best[3]):
                            best = fittingContours[i]
                    if(best[3] >= self.contourThreshold):
                        # Draw a box of the fitted size around the fitted center.
                        cv2.rectangle(rectImg,(int(best[0][0]-best[1][0]/2), int(best[0][1]-best[1][1]/2)), (int(best[0][0]+best[1][0]/2), int(best[0][1]+best[1][1]/2)),(255,0,0),2)
                        result.append(best)
                        fcnt += 1
                cv2.imshow('frame', rectImg)
            else:
                print "Frame not found"
            cnt += 1
            # Stop at the capture's frame count or when 'q' is pressed.
            if (cnt == self.Camera.capture.get(cv2.CAP_PROP_FRAME_COUNT)) or cv2.waitKey(3) & 0xff == ord('q'):
                print 'found', fcnt, 'has', self.Camera.capture.get(cv2.CAP_PROP_FRAME_COUNT)
                return result, images
def _learn(self, command):
    """Prompt for replacement commands after ``command`` was not recognised.

    Builds a FuzzySet of the known function names plus ``command``, reads one
    line from standard input and splits it on '.'.
    """
    known_functions = FuzzySet(self.function_names)
    known_functions.add(command)

    print("No command found! Please input commands")
    steps = raw_input().split('.')
class SequentialFuzzyCnpjMatcher:
    """
    Class that performs fuzzy string matching on CNPJs sequentially.

    For a small fuzzyset this class is the easiest way to get started. However, if you are going for a large
    fuzzyset we strongly recommend using LocalParallelFuzzyCnpjMatcher instead.
    """

    def __init__(self):
        """
        Default constructor

        :return: a SequentialFuzzyCnpjMatcher instance
        """
        # 100 base files, named after multiples of 1,000,000 zero-padded to
        # at least seven digits.
        self.__cnpj_bases = [
            "../bulk/cnpjs_base_" + str(x * 1000000).zfill(7) + ".txt"
            for x in range(100)
        ]
        self.__fuzzy_matcher = None

    def match_cnpj(self, cnpj, debug=False):
        """
        Search the closest valid CNPJ given an invalid one

        :param cnpj: an invalid CNPJ
        :param debug: whether you want to see debugging logs or not
        :return: the first entry of ``FuzzySet.get`` over the matches collected
                 from every base file, or ``None`` when nothing matched
        """
        best_matches = []
        for cnpj_base_str in self.__cnpj_bases:
            with open(cnpj_base_str) as f:
                # temp variables
                start_time = time.time()

                # Searching
                self.__log("Searching for %s on %s" % (cnpj, cnpj_base_str), debug)
                self.__fuzzy_matcher = FuzzySet(f.read().splitlines())
                match = self.__fuzzy_matcher.get(cnpj)

                elapsed_time = time.time() - start_time
                self.__log("Best match for this file is %s and it took %d seconds" % (match, elapsed_time), debug)

            # Appending to the best matches so far
            if match is not None:
                best_matches.extend(m[1] for m in match)

        # Nothing matched in any file: report that instead of failing on None[0].
        if not best_matches:
            return None

        # Performing Fuzzy string match on the best results of each cnpj base file
        self.__fuzzy_matcher = FuzzySet(best_matches)
        final = self.__fuzzy_matcher.get(cnpj)
        return final[0] if final else None

    def __log(self, msg, debug=False):
        """
        Prints a message to console depending on debug variable

        :param msg: a message string
        :param debug: a boolean value
        :return:
        """
        if debug:
            # Single-argument call form works as a statement on Python 2 too.
            print(msg)
class MiniBaseIndex(object):
    """Token index: maps tokens to entry ids and finds tokens fuzzily."""

    def __init__(self, field=None, tokenizer=None, similarity=None, base=None, idf_limit=0.05, **kw):
        """Create an empty index; extra keyword arguments go to the parent."""
        super(MiniBaseIndex, self).__init__(**kw)
        self.content = {}  # token -> set of entry ids
        self.field = field
        self.tokenizer = tokenizer
        self.similarity = similarity
        self.base = base
        self.counts = {}  # token -> number of add() calls for it
        self.fuzzwords = FuzzySet(rel_sim_cutoff=0.7, use_levenshtein=False)
        self.blacklist = set()
        self.idf_limit = idf_limit

    def add(self, tok, i):
        """Register entry id ``i`` under token ``tok``.

        A token that is blacklisted and not yet indexed is ignored.
        """
        if tok not in self.content:
            if tok in self.blacklist:
                # Nothing is stored for such a token; falling through used
                # to raise KeyError on the lookup below.
                return
            self.content[tok] = set()
            self.counts[tok] = 0
        self.content[tok].add(i)
        self.counts[tok] += 1
        # if self.counts[tok]/len(self.base.entries) > self.idf_limit:
        #     self.blacklist.add(tok)
        #     del self.counts[tok]
        #     del self.content[tok]
        self.fuzzwords.add(tok)

    def finalize(self):
        """No-op; tokens are already added to the fuzzy set by ``add``."""
        pass

    def search(self, x, expl=5000, top=25, maxtok=250, debug=False):
        """Return up to ``top`` entries matching ``x``.

        ``x`` is tokenized and each token is expanded to similar indexed
        tokens. At most ``maxtok`` of those tokens are used to collect
        candidate entries, and collection stops once more than ``expl``
        candidates exist. Candidates are then ranked by the similarity
        between ``x`` and the entry's ``self.field`` value, with the entry's
        ``"pop"`` value as a small secondary term.

        :return: list of ``(entry_id, summed_fuzzy_score, similarity)``,
            best first
        """
        xtoks = self.tokenizer(x)

        # collect all toks
        alltoks = []
        seen = set()
        for xtok in xtoks:
            # The lookup may yield None when nothing matches; treat it as
            # "no similar token" instead of failing on iteration.
            for fuzz_score, fuzz_tok in self.fuzzwords.get(xtok) or ():
                if fuzz_tok in seen:
                    continue
                seen.add(fuzz_tok)
                alltoks.append(
                    (fuzz_score, fuzz_tok, self.similarity(xtok, fuzz_tok)))

        # Most similar tokens first; the token count adds a small term that
        # favours rarer tokens.
        alltoks = sorted(alltoks,
                         key=lambda hit: hit[2] * 100 + 1 / self.counts[hit[1]],
                         reverse=True)

        # take maxtok only
        if debug:
            print(len(alltoks), maxtok)
            for tok in alltoks:
                print(tok, self.counts[tok[1]])
        alltoks = alltoks[:maxtok]

        # Sum the fuzzy scores per entry id, stopping once there are more
        # than expl candidates.
        results = {}
        for fuzz_score, fuzz_tok, _ in alltoks:
            for _id in self.content[fuzz_tok]:
                results[_id] = results.get(_id, 0) + fuzz_score
                if len(results) > expl:
                    break
            if len(results) > expl:
                break
        if debug:
            print(len(results))

        entries = self.base.entries
        scored = [(_id, score, self.similarity(x, entries[_id][self.field]))
                  for _id, score in results.items()]

        def sortkey(hit):
            # Similarity dominates; "pop" only separates near-equal hits.
            return hit[2] * 1e2 + entries[hit[0]]["pop"] * 1e-3

        scored.sort(key=sortkey, reverse=True)
        return scored[:top]
def __init__(self, word2vec: Word2VecKeyedVectors):
    """Keep the word2vec model and index ``word2vec.vocab`` for fuzzy lookup."""
    self.__word2vec = word2vec
    self.__approximate_matcher = FuzzySet(word2vec.vocab)
    # Remembers earlier fixes: original word -> replacement.
    self.__fixed_word_dict: Dict[str, str] = {}
geoctrs = [ feat['properties']['name'] for k, feat in enumerate(geos['features']) ] # list of countries from C19 data c19ctrs = c19.columns.tolist() # list of countries from population data popsctrs = pops['Country'].tolist() # geo data is king, we need to match everything else to it # the country name becomes the key matching the tables / dictionaries # some countries in c19 do not match any country in geo data # let's print fuzzy matches fzs = FuzzySet() for c in geoctrs: fzs.add(c) #for c in c19ctrs: # if c not in geoctrs: # print(c, fzs.get(c)) # In[8]: c19notfound = [ 'Andorra', 'Antigua and Barbuda', 'Bahrain', 'Barbados', 'Cabo Verde', 'Comoros', 'Diamond Princess', 'Dominica', 'Grenada', 'Holy See', 'Liechtenstein', 'MS Zaandam', 'Maldives', 'Mauritius', 'Monaco', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Vincent and the Grenadines', 'San Marino', 'Sao Tome and Principe', 'Seychelles', 'Singapore'