Python FuzzySetの例、fuzzyset.FuzzySet Pythonの例

コード例 #1

0

ファイルを表示

def fuzzyset_alg(key, key_list):
    """Rank the entries of ``key_list`` by fuzzy similarity to ``key``.

    Two passes are made: a ``FuzzySet`` holding only ``key`` pre-filters the
    candidates and keeps the ten best, which are then re-scored with
    ``fuzz.ratio`` and sorted again.

    :param key: reference string every candidate is compared against
    :param key_list: iterable of candidate strings
    :return: up to three ``[candidate, fuzz.ratio(key, candidate)]`` lists,
        best first; an empty list when no candidate matched
    """
    finder = FuzzySet()
    finder.add(key)

    candidates = []
    for candidate in key_list:
        try:
            # Indexing raises KeyError when the candidate scores too low
            # to count as a match; such candidates are simply skipped.
            matched = finder[candidate]
        except KeyError:
            continue
        # The set holds a single key, so the first hit is the only one.
        # Each hit is (score, matched key); keep the score as a percentage.
        candidates.append([candidate, matched[0][0] * 100])

    # sort by score and keep the ten best for the second pass
    candidates.sort(key=lambda entry: entry[1], reverse=True)
    top_candidates = candidates[:10]

    # fuzzy match and sort again
    finalists = [[entry[0], fuzz.ratio(key, entry[0])]
                 for entry in top_candidates]
    finalists.sort(key=lambda entry: entry[1], reverse=True)
    return finalists[:3]

コード例 #2

0

ファイルを表示

ファイル: parallel_search.py プロジェクト: viniciusbig/fuzzy_cnpj_matcher

    def match_cnpj(self, cnpj, debug=False):
        """
        Search every CNPJ base file in parallel and return the best match.

        One ``fuzzy_cnpj_search`` job is submitted per entry of
        ``self.cnpj_bases``; the matches returned by the jobs are pooled and
        a final fuzzy lookup is run over that pool.

        :param cnpj: the CNPJ to look up
        :param debug: forwarded to the jobs and to ``log``
        :return: the first entry of the final fuzzy lookup over the pooled
            matches
        """
        best_matches = []

        # temp variables
        start_time = time.time()

        jobs = [(
                    cnpj_base_str,
                    self.__job_server.submit(
                        fuzzy_cnpj_search,
                        (cnpj_base_str, cnpj, debug,),
                        (log, ),
                        ("from fuzzyset import FuzzySet", "time")
                    )) for cnpj_base_str in self.cnpj_bases]

        for cnpj_base_str, job in jobs:
            # Calling the job object yields what fuzzy_cnpj_search returned.
            result = job()
            print("Results %s is %s" % (cnpj_base_str, result))
            # The results used to be printed and then dropped, which left
            # best_matches empty for the final lookup below.
            if result:
                best_matches.extend(result)

        elapsed_time = time.time() - start_time

        log('Parallel processes took %d seconds to finish' % elapsed_time, debug)

        # Performing Fuzzy string match on the best results of each cnpj base file
        self.fuzzy_matcher = FuzzySet(best_matches)
        return self.fuzzy_matcher.get(cnpj)[0]

コード例 #3

0

ファイルを表示

 def __init__(self, field=None, similarity=None, base=None, **kw):
     """Create an empty index.

     :param field: stored unchanged as ``self.field``
     :param similarity: stored unchanged as ``self.similarity``
     :param base: stored unchanged as ``self.base``
     :param kw: forwarded to the parent initialiser
     """
     super(FuzzyBaseIndex, self).__init__(**kw)
     # Caller-supplied configuration.
     self.field = field
     self.similarity = similarity
     self.base = base
     # Index state; both containers start out empty.
     self.content = {}
     self.fuzz = FuzzySet(rel_sim_cutoff=1., use_levenshtein=False)

コード例 #4

0

ファイルを表示

ファイル: tfidfmatcher.py プロジェクト: maker-project-1/webscrapping

    def __init__(self, choices_corpus, ngram_range=(1, 2), use_cleaner=True, preprocess_func=None):
        """
        Fit a word-level TF-IDF model on the corpus and index its vocabulary
        in a FuzzySet.

        :param choices_corpus: should be a list of texts
        :param ngram_range: n-gram range handed to ``TfidfVectorizer``
        :param use_cleaner: when true, the corpus goes through ``self.cleaner`` first
        :param preprocess_func: is a str->str function applied to every text
        """
        self.ngram_range = ngram_range
        self.use_cleaner = use_cleaner
        self.preprocess_func = preprocess_func
        self.initial_choices_corpus = choices_corpus

        # Optional clean-up steps; the untouched input stays available in
        # self.initial_choices_corpus.
        corpus = choices_corpus
        if self.use_cleaner:
            corpus = self.cleaner(corpus)
        if self.preprocess_func:
            corpus = [self.preprocess_func(text) for text in corpus]

        vectorizer = TfidfVectorizer(
            analyzer='word',
            sublinear_tf=True,  # strip_accents='ascii',
            lowercase=True,
            ngram_range=self.ngram_range,
            min_df=0,
        )
        self.tfidf = vectorizer.fit(corpus)

        # One TF-IDF row per processed text, also addressable by the text.
        self.initial_corpus_tf_idf = self.tfidf.transform(corpus)
        self.initial_corpus_tf_idf_dict = {}
        for row in range(len(corpus)):
            text = corpus[row]
            self.initial_corpus_tf_idf_dict[text] = self.initial_corpus_tf_idf[row]

        # Fuzzy index over every term of the fitted vocabulary.
        self.vocabulary = self.tfidf.vocabulary_.keys()
        self.fset_vocabulary = FuzzySet()
        for term in self.vocabulary:
            self.fset_vocabulary.add(term)

コード例 #5

0

ファイルを表示

class FuzzyBaseIndex(object):
    """Index that maps strings to ids and finds them by fuzzy lookup."""

    def __init__(self, field=None, similarity=None, base=None, **kw):
        """Create an empty index.

        :param field: stored unchanged as ``self.field``
        :param similarity: callable ``similarity(query, matched)`` used by
            :meth:`search` to score hits
        :param base: stored unchanged as ``self.base``
        :param kw: forwarded to the parent initialiser
        """
        super(FuzzyBaseIndex, self).__init__(**kw)
        self.fuzz = FuzzySet(rel_sim_cutoff=1., use_levenshtein=False)
        # Maps every indexed string to the set of ids added under it.
        self.content = {}
        self.field = field
        self.similarity = similarity
        self.base = base

    def add(self, x, i):
        """Index string ``x`` under id ``i``."""
        self.fuzz.add(x)
        self.content.setdefault(x, set()).add(i)

    def finalize(self):
        """Do nothing; this index needs no finalisation step."""
        pass

    def search(self, x, top=25, debug=True):
        """Return the best hits for ``x``.

        :param x: query string
        :param top: maximum number of hits returned
        :param debug: accepted for interface compatibility; unused here
        :return: list of ``(id, fuzzy score, similarity)`` tuples sorted by
            similarity, highest first; empty when nothing matched
        """
        # The lookup yields None when there is no match at all; treat that
        # as "no hits" instead of failing on the iteration below.
        results = self.fuzz.get(x) or []
        ret = []
        for score, matched in results:
            # The similarity depends only on the matched string, so compute
            # it once rather than once per id.
            sim = self.similarity(x, matched)
            for i in self.content[matched]:
                ret.append((i, score, sim))
        ret.sort(key=lambda hit: hit[2], reverse=True)
        return ret[:top]

コード例 #6

0

ファイルを表示

ファイル: sequential_search.py プロジェクト: viniciusbig/fuzzy_cnpj_matcher

    def match_cnpj(self, cnpj, debug=False):
        """
        Search the closest valid CNPJ given an invalid one.

        Every base file is searched in turn, the matches of all files are
        pooled, and the pool is searched one final time.

        :param cnpj: an invalid CNPJ
        :param debug: whether you want to see debugging logs or not
        :return: the first entry of the final fuzzy lookup over the pooled
            matches
        """
        pooled = []

        for base_path in self.__cnpj_bases:
            with open(base_path) as handle:
                # The timer covers loading the file and the lookup.
                started = time.time()

                self.__log("Searching for %s on %s" % (cnpj, base_path), debug)
                self.__fuzzy_matcher = FuzzySet(handle.read().splitlines())

                match = self.__fuzzy_matcher.get(cnpj)
                elapsed_time = time.time() - started

                self.__log("Best match for this file is %s and it took %d seconds" % (match, elapsed_time), debug)
                # Keep only the matched strings, not their scores.
                if match is not None:
                    pooled.extend(m[1] for m in match)

        # Final fuzzy lookup over the best results of each base file.
        self.__fuzzy_matcher = FuzzySet(pooled)
        return self.__fuzzy_matcher.get(cnpj)[0]

コード例 #7

0

ファイルを表示

ファイル: sequential_search.py プロジェクト: viniciusbig/fuzzy_cnpj_matcher

    def match_cnpj(self, cnpj, debug=False):
        """
        Search the closest valid CNPJ given an invalid one.

        Each base file is loaded into its own FuzzySet and queried; the
        matched strings of all files are then searched once more together.

        :param cnpj: an invalid CNPJ
        :param debug: whether you want to see debugging logs or not
        :return: the first entry of the final fuzzy lookup over the
            collected matches
        """
        collected = []

        for base_file in self.__cnpj_bases:
            with open(base_file) as source:
                # Timing starts before the file is read.
                t0 = time.time()

                search_msg = 'Searching for %s on %s' % (cnpj, base_file)
                self.__log(search_msg, debug)
                self.__fuzzy_matcher = FuzzySet(source.read().splitlines())

                match = self.__fuzzy_matcher.get(cnpj)
                elapsed_time = time.time() - t0

                result_msg = ('Best match for this file is %s and it took %d seconds'
                              % (match, elapsed_time))
                self.__log(result_msg, debug)
                if match is None:
                    continue
                # Index 1 of every hit is the matched string.
                for hit in match:
                    collected.append(hit[1])

        # Second pass: fuzzy match over the best results of every file.
        self.__fuzzy_matcher = FuzzySet(collected)
        return self.__fuzzy_matcher.get(cnpj)[0]

コード例 #8

0

ファイルを表示

ファイル: video_analysis.py プロジェクト: lukic-aleksandar/hawkEyeProject

 def __init__(self, camera):
     """Set up the analyzer for ``camera`` with its default settings.

     :param camera: stored unchanged as ``self.Camera``
     """
     self.Camera = camera
     self.imageAnalyzer = ImageAnalyzer()

     # Fuzzy sets start out empty.
     self.ellipseFuzzySet = FuzzySet()  # ([0,1,2.5],[0,1,0])
     self.colorFuzzySet = FuzzySet()  # ([0,1,1.05],[0,1,0])

     # Colour bounds (lower / upper).
     self.lowerColorBnd = (0, 0, 0)
     self.upperColorBnd = (180, 255, 255)

     # Background subtractor and its learning rate.
     self.bgs = cv2.createBackgroundSubtractorMOG2()
     self.bgsLearningRate = 0.1

     # Remaining settings.
     self.morphologyArray = []
     self.sizeBoundaries = ((0, 0), (0, 0))
     self.contourThreshold = 0.5

コード例 #9

0

ファイルを表示

def run_profile(impl):
    """Profile ``profiler`` on a fuzzy set filled from ``cities.gz``.

    :param impl: ``"cFuzzySet"`` selects ``cFuzzySet``; any other value
        selects ``FuzzySet``
    """
    # The local must be called ``f``: the profiled statement below refers
    # to it by name through locals().
    f = cFuzzySet() if impl == "cFuzzySet" else FuzzySet()

    cities_path = os.path.join(here, '..', 'cities.gz')
    with gzip.GzipFile(cities_path) as input_file:
        for line in input_file:
            f.add(line.rstrip().decode())
    print(f)

    # Write the profile to disk, then print it sorted by time.
    cProfile.runctx("profiler(f)", globals(), locals(), "Profile.prof")
    stats = pstats.Stats("Profile.prof")
    stats.strip_dirs().sort_stats("time").print_stats()

コード例 #10

0

ファイルを表示

ファイル: oov_embeddings.py プロジェクト: QtRoS/nodl_toxic

def get_oov_vocabulary_map(vocabulary_words_weights, wordset):
    """Map out-of-vocabulary words to their best weighted fuzzy match.

    :param vocabulary_words_weights: dict of vocabulary word -> weight
    :param wordset: set of words; those missing from the vocabulary are
        looked up
    :return: dict mapping each out-of-vocabulary word that had at least one
        fuzzy match to the vocabulary word with the highest
        ``score * weight``
    """
    known_words = vocabulary_words_weights.keys()
    oov = wordset - set(known_words)
    vocabulary_words_set = FuzzySet(sorted(known_words))

    mapping = {}
    for word in tqdm(oov):
        word_matches = vocabulary_words_set.get(word)
        if not word_matches:
            # No match at all: the word stays unmapped.
            continue
        # Weight every fuzzy score by the weight of the matched word.
        word_scores = {}
        for score, vocabulary_word in word_matches:
            word_scores[vocabulary_word] = score * vocabulary_words_weights[vocabulary_word]
        mapping[word] = max(word_scores, key=word_scores.get)
    return mapping

コード例 #11

0

ファイルを表示

 def matchedIds(postId, threshold):
     """Find keywords of other posts that fuzzily match this post's keywords.

     :param postId: id of the post whose keywords are looked up
     :param threshold: only matches scoring strictly above this are kept
     :return: list of ``(other post id, score, matched keyword)`` tuples
     :raises IndexError: when no keyword entry has id ``postId``
     """
     keywords = Model.getKeywords()
     postKeywords = [entry for entry in keywords
                     if entry["id"] == postId][0]["keywords"]
     matches = []
     for keyword in keywords:
         # Skip the post itself before building a FuzzySet for it.
         if keyword["id"] == postId:
             continue
         fs = FuzzySet(keyword['keywords'])
         for pk in postKeywords:
             # One lookup per keyword; a falsy result means no match.
             for score, val in fs.get(pk) or []:
                 if score > threshold:
                     matches.append((keyword["id"], score, val))
     return matches

コード例 #12

0

ファイルを表示

ファイル: parallel_search.py プロジェクト: viniciusbig/fuzzy_cnpj_matcher

class LocalParallelFuzzyCnpjMatcher(BaseParallelFuzzyCnpjMatcher):
    """Fuzzy CNPJ matcher that searches the base files through a ``pp`` job server."""

    def __init__(self, cpu_count="autodetect"):
        """
        :param cpu_count: passed to ``pp.Server`` as ``ncpus``
        """
        super(LocalParallelFuzzyCnpjMatcher, self).__init__()
        self.__job_server = pp.Server(ncpus=cpu_count)

    def match_cnpj(self, cnpj, debug=False):
        """
        Search every CNPJ base file in parallel and return the best match.

        One ``fuzzy_cnpj_search`` job is submitted per entry of
        ``self.cnpj_bases``; the matches returned by the jobs are pooled and
        a final fuzzy lookup is run over that pool.

        :param cnpj: the CNPJ to look up
        :param debug: forwarded to the jobs and to ``log``
        :return: the first entry of the final fuzzy lookup over the pooled
            matches
        """
        best_matches = []

        # temp variables
        start_time = time.time()

        jobs = [(
                    cnpj_base_str,
                    self.__job_server.submit(
                        fuzzy_cnpj_search,
                        (cnpj_base_str, cnpj, debug,),
                        (log, ),
                        ("from fuzzyset import FuzzySet", "time")
                    )) for cnpj_base_str in self.cnpj_bases]

        for cnpj_base_str, job in jobs:
            # Calling the job object yields what fuzzy_cnpj_search returned.
            result = job()
            print("Results %s is %s" % (cnpj_base_str, result))
            # The results used to be printed and then dropped, which left
            # best_matches empty for the final lookup below.
            if result:
                best_matches.extend(result)

        elapsed_time = time.time() - start_time

        log('Parallel processes took %d seconds to finish' % elapsed_time, debug)

        # Performing Fuzzy string match on the best results of each cnpj base file
        self.fuzzy_matcher = FuzzySet(best_matches)
        return self.fuzzy_matcher.get(cnpj)[0]

コード例 #13

0

ファイルを表示

 def __init__(self, dist_file=CommonConstants.INDIA_DIST_NAMES):
     """Load the district names from ``dist_file`` into the lookup sets.

     :param dist_file: path of a CSV file with a header row; the name is
         read from the second column of every following row
     """
     super().__init__()
     # NOTE(review): 'prayagraj' -> 'allahabad' appears to point the other
     # way than the remaining pairs - confirm this is intended.
     self.old_names = {'bangalore':'bengaluru','gurgaon':'gurugram','calcutta':'kolkata','prayagraj':'allahabad','delhi':'delhi'}
     self.fd = FuzzySet()
     self.set = set()
     with open(dist_file) as df:
         reader = csv.reader(df)
         next(reader)  # skip the header row
         for row in reader:
             district = row[1]
             lowered = district.lower()
             if any(tag in lowered for tag in ('rural', 'urban', 'dehat')):
                 # Drop the last space-separated word of such names.
                 name = ' '.join(district.split(' ')[:-1]).lower()
             else:
                 name = lowered
             self.fd.add(name)
             self.set.add(name)
     self.nlp = stanza.Pipeline(lang='en',processors='tokenize',use_gpu=False)

コード例 #14

0

ファイルを表示

    def __init__(self, removed_lines_dicts, added_lines_dicts):
        """Index the added and removed lines for later lookups.

        :param removed_lines_dicts: iterable of dicts accepted by ``Line.from_dict``
        :param added_lines_dicts: iterable of dicts accepted by ``Line.from_dict``
        """
        self.removed_lines = []
        self.trim_text_to_array_of_added_lines = defaultdict(list)
        self.added_file_name_to_line_no_to_line = defaultdict(dict)
        self.removed_file_name_to_line_no_to_line = defaultdict(dict)
        self.added_lines_fuzzy_set = FuzzySet()

        # Added lines: reachable by trimmed text (exact and fuzzy) and by
        # file name + line number.
        for added in (Line.from_dict(raw) for raw in added_lines_dicts):
            trimmed = added.trim_text
            self.trim_text_to_array_of_added_lines[trimmed].append(added)
            self.added_lines_fuzzy_set.add(trimmed)
            self.added_file_name_to_line_no_to_line[added.file][added.line_no] = added

        # Removed lines: kept in order and reachable by file name + line number.
        for removed in (Line.from_dict(raw) for raw in removed_lines_dicts):
            self.removed_lines.append(removed)
            self.removed_file_name_to_line_no_to_line[removed.file][removed.line_no] = removed

コード例 #15

0

ファイルを表示

 def __init__(self,
              field=None,
              tokenizer=None,
              similarity=None,
              base=None,
              idf_limit=0.05,
              **kw):
     """Create an empty index.

     Every named argument is stored unchanged on the instance under the
     same name; ``kw`` is forwarded to the parent initialiser.
     """
     super(MiniBaseIndex, self).__init__(**kw)
     # Caller-supplied configuration.
     self.field = field
     self.tokenizer = tokenizer
     self.similarity = similarity
     self.base = base
     self.idf_limit = idf_limit
     # Index state; every container starts out empty.
     self.content = {}
     self.counts = {}
     self.blacklist = set()
     self.fuzzwords = FuzzySet(rel_sim_cutoff=0.7, use_levenshtein=False)

コード例 #16

0

ファイルを表示

class ListBasedPlaceExtractionService(NERService):
    """Place extractor that recognises district names from a fixed list."""

    def __init__(self, dist_file=CommonConstants.INDIA_DIST_NAMES):
        """Load the district names from ``dist_file`` into the lookup sets.

        :param dist_file: path of a CSV file with a header row; the name is
            read from the second column of every following row
        """
        super().__init__()
        # NOTE(review): 'prayagraj' -> 'allahabad' appears to point the
        # other way than the remaining pairs - confirm this is intended.
        self.old_names = {'bangalore':'bengaluru','gurgaon':'gurugram','calcutta':'kolkata','prayagraj':'allahabad','delhi':'delhi'}
        self.fd = FuzzySet()
        self.set = set()
        with open(dist_file) as df:
            reader = csv.reader(df)
            next(reader)  # skip the header row
            for row in reader:
                district = row[1]
                lowered = district.lower()
                if 'rural' in lowered or 'urban' in lowered or 'dehat' in lowered:
                    # Drop the last space-separated word of such names.
                    name = ' '.join(district.split(' ')[:-1]).lower()
                else:
                    name = lowered
                self.fd.add(name)
                self.set.add(name)
        self.nlp = stanza.Pipeline(lang='en',processors='tokenize',use_gpu=False)

    def extract_entities_from_text(self,text):
        """Return the first place name found in ``text``.

        Tokens are lower-cased and checked in order: a token contained in
        the loaded name set is returned as is, a token that is a key of
        ``self.old_names`` is returned as its mapped name.

        :param text: text to scan
        :return: the place name, or ``None`` when no token matches
        """
        doc = self.nlp(text)
        for sent in doc.sentences:
            for token in sent.tokens:
                tok_text = token.text.lower()
                if tok_text in self.set:
                    return tok_text
                if tok_text in self.old_names:
                    return self.old_names[tok_text]
        return None

コード例 #17

0

ファイルを表示

    def __init__(self, ngram_range=(1, 3)):
        """
        Build the TF-IDF model, the fuzzy sets and the Japanese brand table
        from the module-level ``brands`` frame.

        :param ngram_range: n-gram range handed to ``TfidfVectorizer``
        """
        self.ngram_range = ngram_range
        brand_names = [str(x) for x in brands['brnd'].dropna().unique()]
        choices_corpus = list(brand_names)

        # Every ';'-separated equivalent points back at its brand.
        self.equivalents = {}
        for record in brands[['brnd', 'equivalents']].dropna().to_dict('records'):
            for eq in record['equivalents'].split(';'):
                self.equivalents[eq.strip()] = record['brnd']

        choices_corpus.extend(self.equivalents.keys())

        self.initial_choices_corpus = choices_corpus
        self.cleaned_choices_corpus = self.cleaner(choices_corpus)

        self.tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True,  # strip_accents='ascii',
                                     lowercase=True, ngram_range=self.ngram_range, min_df=0).fit(self.cleaned_choices_corpus)

        # NOTE(review): the model is fitted on the cleaned corpus but the
        # uncleaned one is transformed here - confirm this is intended.
        self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
        self.initial_corpus_tf_idf_dict = {}
        for k, choice in enumerate(choices_corpus):
            self.initial_corpus_tf_idf_dict[choice] = self.initial_corpus_tf_idf[k]

        # Creating fuzzy set
        self.fset_brands = FuzzySet()
        for token in brand_names:
            self.fset_brands.add(token)

        self.fset_tokens = FuzzySet()
        for token in list(self.tfidf.vocabulary_):
            self.fset_tokens.add(token)

        # Prepare the japanese matching
        jp_brands = brands[['brnd', 'brnd_jp_clean']]
        # Work on an explicit copy so the column assignments below do not
        # write into a filtered view of another frame.
        jp_brands = jp_brands[jp_brands.brnd_jp_clean.notnull()].copy()
        jp_brands['brnd_jp_clean'] = jp_brands['brnd_jp_clean'].apply(lambda x: unicodedata.normalize('NFKC', x.replace('・', '').replace(' ', '')))
        jp_brands['brnd_jp_size'] = jp_brands['brnd_jp_clean'].apply(len)
        # Longest Japanese names first.
        jp_brands = jp_brands.sort_values(['brnd_jp_size', 'brnd'], ascending=[False, False])
        self.jp_brands = jp_brands

コード例 #18

0

ファイルを表示

ファイル: fuzzyrule.py プロジェクト: arkaragi/fuzzylib

 def defuzz(self, method=None):
     """Defuzzify the union of the consequents with the mean of maxima.

     The highest membership degree is printed and the united set is
     plotted as a side effect.

     :param method: accepted but unused
     :return: mean of the ``x`` values whose membership degree reaches the
         maximum
     """
     final = FuzzySet.Union(self.consequents)
     # Compute the peak once instead of once per point.
     peak = max(final.m)
     print(peak)
     maxima = [x for x, m in zip(final.x, final.m) if m >= peak]
     val = sum(maxima) / len(maxima)
     fplot = FuzzyPlotter([final])
     fplot()
     return val

コード例 #19

0

ファイルを表示

ファイル: parallel_search.py プロジェクト: viniciusbig/fuzzy_cnpj_matcher

def fuzzy_cnpj_search(cnpj_base_str, cnpj, debug=False):
    """Fuzzy-search one CNPJ base file.

    :param cnpj_base_str: path of the base file, one CNPJ per line
    :param cnpj: the CNPJ to look up
    :param debug: forwarded to ``log``
    :return: list of the matched strings, empty when nothing matched
    """
    with open(cnpj_base_str) as base_file:
        # The timer covers loading the file and the lookup.
        started = time.time()

        log('Searching for %s on %s' % (cnpj, cnpj_base_str), debug)
        fuzzy_matcher = FuzzySet(base_file.read().splitlines())

        match = fuzzy_matcher.get(cnpj)
        elapsed_time = time.time() - started

        log('Best match for this file is %s and it took %d seconds'
            % (match, elapsed_time), debug)
        if match is None:
            return []
        # Index 1 of every hit is the matched string.
        return [hit[1] for hit in match]

コード例 #20

0

ファイルを表示

ファイル: nlp_playground.py プロジェクト: DevrathIyer/SciFair2019

 def execute(self, command):
     """Run the known function that best matches ``command``.

     The command is annotated, its first ``dobj`` dependency supplies the
     verb and the object, and the verb is looked up fuzzily among
     ``self.function_names``. When the verb is unknown, or its match
     scores below .5, ``self._learn`` is called instead.

     :param command: the command sentence
     """
     annotated = self._nlp.annotate(command, properties=self._nlp_properties)
     ReturnedJson = json.loads(annotated)['sentences'][0]
     dependencies = [
         (dependency['governorGloss'], dependency['dependentGloss'],
          dependency['dep'])
         for dependency in ReturnedJson['enhancedPlusPlusDependencies']
     ]
     verb_tuple = [x for x in dependencies if x[2] == u'dobj'][0]

     functions = FuzzySet(self.function_names)
     query = functions.get(str(verb_tuple[0]))
     if not query:
         self._learn(command)
         return

     query = query[0]
     print(query)
     if query[0] < .5:
         print("learning")
         self._learn(command)
         return

     dyn = self.functions.get(query[1])
     obj = verb_tuple[1]
     # Every dependent of the object except determiners, then the object.
     modifiers = [
         x[1] for x in dependencies if x[0] == obj and not x[2] == u'det'
     ]
     obj_adj = str(" ".join(modifiers) + " " + obj)
     objects = FuzzySet(world.objects)
     object_query = objects.get(obj_adj)
     if not object_query:
         print("no objects found!")
         return
     dyn(world.attributes.get(object_query[0][1]))

コード例 #21

0

ファイルを表示

def get_nutrition_data(image_class):
    """Look up nutrition facts for ``image_class`` in the USDA NDB API.

    The search results are compared fuzzily against ``image_class``; the
    report of the best-scoring item is fetched and condensed.

    :param image_class: name of the food to look up
    :return: dict with the keys ``serve_size``, ``kcal``, ``fat``,
        ``carbs``, ``protein``, ``sugar`` and ``sodium``, each a string
        made of the value and its unit
    :raises ValueError: when no search result matches ``image_class``
    """
    # NOTE(review): the API key is hard-coded in the source; it should come
    # from configuration or the environment instead.
    api_key = "FLKBoKOh7C1apAA4bPL0jH4GAW6f2wS9Lw0a2iFu"

    # Passing the query through ``params`` gets the food name URL-encoded.
    r = requests.get(
        "https://api.nal.usda.gov/ndb/search/",
        params=[("format", "json"), ("q", image_class), ("sort", "n"),
                ("max", 25), ("offset", 0), ("api_key", api_key)],
    ).json()

    # The set never changes, so build it once outside the loop.
    fs = FuzzySet()
    fs.add(image_class)

    max_dist_ratio = 0
    ndbno = None
    for item in r["list"]["item"]:
        match = fs.get(item["name"])
        if not match:
            # No fuzzy match for this item at all.
            continue
        ratio = match[0][0]
        if ratio > max_dist_ratio:
            max_dist_ratio = ratio
            ndbno = item["ndbno"]

    if ndbno is None:
        raise ValueError("no search result matches %r" % (image_class,))

    print(ndbno)

    nutrition_data = requests.get(
        "https://api.nal.usda.gov/ndb/V2/reports",
        params=[("ndbno", ndbno), ("type", "f"), ("format", "json"),
                ("api_key", api_key)],
    ).json()

    nutrients = nutrition_data["foods"][0]["food"]["nutrients"]

    # (result key, position in the nutrient list, measure field, unit)
    layout = (
        ("serve_size", 0, "qty", " ounces"),
        ("kcal", 0, "value", " calories"),
        ("fat", 2, "value", " grams"),
        ("carbs", 3, "value", " grams"),
        ("protein", 1, "value", " grams"),
        ("sugar", 4, "value", " grams"),
        ("sodium", 5, "value", " milligrams"),
    )
    nutrition_facts = {}
    for key, position, field, unit in layout:
        nutrition_facts[key] = str(nutrients[position]["measures"][0][field]) + unit

    return nutrition_facts

コード例 #22

0

ファイルを表示

 def _learn(self, command):
     """Teach ``command`` as a new function built from known ones.

     The user is asked for commands on standard input; for every sentence
     whose ``dobj`` verb fuzzily matches a known function name, that
     function's entries are appended to the new command.

     :param command: name under which the new function is registered
     """
     # Built before ``command`` is appended to the known names.
     functions = FuzzySet(self.function_names)
     rospy.logerr("No command found! Please input commands")
     self.function_names.append(command)
     self.functions[command] = []

     commands = raw_input()
     annotated = self._nlp.annotate(commands,
                                    properties=self._nlp_properties)
     for sentence in json.loads(annotated)['sentences']:
         dependencies = [
             (dependency['governorGloss'], dependency['dependentGloss'],
              dependency['dep'])
             for dependency in sentence['enhancedPlusPlusDependencies']
         ]
         verb_tuple = [x for x in dependencies if x[2] == u'dobj'][0]
         query = functions.get(str(verb_tuple[0]))
         if not query:
             continue
         # Best hit is first; index 1 of a hit is the matched name.
         func = self.functions.get(query[0][1])
         self.functions[command].extend(func)
     rospy.logerr(self.functions)

コード例 #23

0

ファイルを表示

ファイル: fdt.py プロジェクト: arkaragi/fuzzylib

    def regress(self, test, yt):
        """Predict an output value for every row of ``test``.

        For each row every rule is activated with the minimum of its
        antecedent degrees, the rule's consequent set is cut at that degree,
        and the cut sets are united into ``self.fout``, which is then
        defuzzified with ``self.centroid`` (centre of gravity).

        :param test: iterable of rows, indexed as ``row[a][b]`` by the
            first and last item of each antecedent tuple
        :param yt: actual values, aligned with ``test``
        :return: ``(predicted, rmse)`` - the list of predictions and the
            mean of the squared errors
        """
        predicted = []
        for row in test:
            for i, rule in enumerate(self.rules):
                ante, cons = rule[0], rule[-1]
                degree = min(row[tup[0]][tup[-1]] for tup in ante)
                # A rule that does not fire adds nothing to the union; the
                # first rule is still processed because it resets self.fout.
                if degree == 0 and i != 0:
                    continue
                f = self.fvar[-1].fuzzy[cons]
                # Look the membership function up by name; this replaces an
                # eval() of a formatted string.
                newm = getattr(f.func, f.mfunc)(f.num_x, f.prmts)
                newf = FuzzySet(f.num_x, newm)
                newf.cutpoint(degree)
                self.fout = newf if i == 0 else self.fout | newf
            # center of gravity method
            val = self.centroid(self.fout.x, self.fout.m)
            predicted.append(val)
        diff = [(a - p) ** 2 for a, p in zip(yt, predicted)]
        # NOTE(review): despite the name this is the mean squared error (no
        # square root is taken); left unchanged so callers see the same value.
        rmse = sum(diff) / len(diff)
        return predicted, rmse

コード例 #24

0

ファイルを表示

 def path(self, source, destination):
     """Return the edge labels along the shortest path between two nodes.

     An endpoint that is not a node of ``self.map`` is first replaced by
     whatever ``fuzzymatch`` returns for it.

     :param source: start node
     :param destination: end node
     :return: list of the ``"label"`` of every edge on the path; ``None``
         when both endpoints are equal or no path exists
     """
     from fuzzyset import FuzzySet
     if source == destination:
         return None
     nodes = self.map.nodes
     # Build the fuzzy set only when an endpoint really needs matching.
     if source not in nodes or destination not in nodes:
         fuzz = FuzzySet(nodes)
         if source not in nodes:
             source = fuzzymatch(fuzz, source)
         if destination not in nodes:
             destination = fuzzymatch(fuzz, destination)
     try:
         path = nx.shortest_path(self.map, source, destination)
     except NetworkXNoPath:
         log.error(f"No path between {source.name} and {destination.name}.")
         return None
     # Walk consecutive node pairs without shadowing the builtin ``iter``.
     return [self.map.edges[start, end]["label"]
             for start, end in zip(path, path[1:])]

コード例 #25

0

ファイルを表示

ファイル: test_fuzzysetlib_matcher.py プロジェクト: viniciusbig/fuzzy_cnpj_matcher

class TestFuzzyMatcher(unittest.TestCase):
    """Checks the best FuzzySet match for a handful of CNPJs."""

    def setUp(self):
        """Load the CNPJ list and build the matcher from it."""
        with open('../bulk/cnpjs.txt') as f:
            self.fuzzy_set = f.read().splitlines()
        self.fuzzy_matcher = FuzzySet(self.fuzzy_set)

    def test_validate(self):
        """The best match of every query must be the expected CNPJ."""
        # (query, expected best match)
        cases = [
            ('06389497000195', '04389697000195'),
            ('15574828000190', '15575829000190'),
            ('15911974000144', '15922975000144'),
            ('12919223000129', '12291923000129'),
            ('557135900011', '55713579000121'),
            ('40194766000116', '49794166000116'),
        ]
        for query, expected in cases:
            self.assertEqual(self.fuzzy_matcher.get(query)[0][1], expected)

        #'49794166000116'
        print(self.fuzzy_matcher.get('40194766000116')[0][1])

コード例 #26

0

ファイルを表示

class WordFixer:
    """Replaces unknown words by their closest match in a word2vec vocabulary."""

    def __init__(self, word2vec: Word2VecKeyedVectors):
        """
        :param word2vec: keyed vectors whose ``vocab`` feeds the fuzzy matcher
        """
        self.__word2vec = word2vec
        # Cache of corrections already made: word -> fixed word.
        self.__fixed_word_dict: Dict[str, str] = dict()
        self.__approximate_matcher = FuzzySet(word2vec.vocab)

    def is_word_correct(self, word: str):
        """Return ``True`` when ``word`` is contained in the word2vec model."""
        return word in self.__word2vec

    def fix(self, word: str):
        """Return the closest known word for ``word``.

        Corrections are cached, so a word is looked up only once.

        :raises Exception: when the matcher finds no candidate
        """
        cached = self.__fixed_word_dict.get(word)
        if word in self.__fixed_word_dict:
            return cached

        candidate = self.__approximate_matcher.get(word)
        if not candidate:
            raise Exception("Cannot be fixed")

        # Best candidate is first; index 1 of a candidate is the word.
        fixed_word = candidate[0][1]
        self.__fixed_word_dict[word] = fixed_word
        return fixed_word

コード例 #27

0

ファイルを表示

ファイル: plotter.py プロジェクト: arkaragi/fuzzylib

##FuzzyPlotter(v)

### pic1
##name = 'Parametric Membership Functions'
##uod = np.arange(-10, 50, 0.01)
##terms = ['trimf, [-5,2,12]', 'trapmf, [8,14,22,28]', 'gaussmf, [30,4]']
##v = FuzzyVariable(name, uod, terms)
##v.setmf([('trimf', [-5,2,12]),
##         ('trapmf', [8,14,22,28]),
##         ('gaussmf', [30,4])])
##FuzzyPlotter(v)


### pic 2345
# Two triangular fuzzy sets over a shared universe of discourse.
uod = np.arange(-10, 110, 0.1)

A = FuzzySet(uod)
A.set_mf('trimf', [0, 35, 60])

B = FuzzySet(uod)
B.set_mf('trimf', [20, 75, 95])

# Union, intersection and the two complements.
C, D = A | B, A & B
E, F = ~A, ~B

# Plot A (red) against its complement E (green).
plt.figure()
plt.plot(A.x, A.m, 'r')
plt.plot(E.x, E.m, 'g')

plt.title('Complementary Operation')
#plt.title('The intersection of A and B')
plt.xlabel('Universe of Discourse')
plt.ylabel('Membership Degree')
plt.legend(['μA', 'μAcomplement'])

コード例 #28

0

ファイルを表示

ファイル: test_fuzzysetlib_matcher.py プロジェクト: PauloMigAlmeida/fuzzy_cnpj_matcher

 def setUp(self):
     """Load the CNPJ list and build the matcher from it."""
     with open('../bulk/cnpjs.txt') as cnpj_file:
         lines = cnpj_file.read().splitlines()
     self.fuzzy_set = lines
     self.fuzzy_matcher = FuzzySet(lines)

コード例 #29

0

ファイルを表示

ファイル: sequential_search.py プロジェクト: viniciusbig/fuzzy_cnpj_matcher

class SequentialFuzzyCnpjMatcher:
    """
    Performs fuzzy string matching on CNPJs, one base file after another.

    For a small fuzzyset this class is the easiest way to get started. For
    a large fuzzyset, LocalParallelFuzzyCnpjMatcher is strongly recommended
    instead.
    """
    def __init__(self):
        """
        Default constructor
        :return: a SequentialFuzzyCnpjMatcher instance
        """
        # One base file per million, numbered with the zero-padded offset.
        self.__cnpj_bases = [
            '../bulk/cnpjs_base_' + str(x * 1000000).zfill(7) + '.txt'
            for x in xrange(0, 100)
        ]
        self.__fuzzy_matcher = None

    def match_cnpj(self, cnpj, debug=False):
        """
        Search the closest valid CNPJ given an invalid one.

        Every base file is searched in turn, the matches of all files are
        pooled, and the pool is searched one final time.

        :param cnpj: an invalid CNPJ
        :param debug: whether you want to see debugging logs or not
        :return: the first entry of the final fuzzy lookup over the pooled
            matches
        """
        pooled = []

        for base_path in self.__cnpj_bases:
            with open(base_path) as handle:
                # The timer covers loading the file and the lookup.
                started = time.time()

                self.__log('Searching for %s on %s' % (cnpj, base_path),
                           debug)
                self.__fuzzy_matcher = FuzzySet(handle.read().splitlines())

                match = self.__fuzzy_matcher.get(cnpj)
                elapsed_time = time.time() - started

                self.__log(
                    'Best match for this file is %s and it took %d seconds' %
                    (match, elapsed_time), debug)
                # Keep only the matched strings, not their scores.
                if match is not None:
                    pooled.extend(m[1] for m in match)

        # Final fuzzy lookup over the best results of each base file.
        self.__fuzzy_matcher = FuzzySet(pooled)
        return self.__fuzzy_matcher.get(cnpj)[0]

    def __log(self, msg, debug=False):
        """
        Prints a message to console depending on debug variable
        :param msg: a message string
        :param debug: a boolean value
        :return:
        """
        if not debug:
            return
        print(msg)

コード例 #30

0

ファイルを表示

ファイル: video_analysis.py プロジェクト: lukic-aleksandar/hawkEyeProject

class VideoAnalyzer:
    """
    Walks through the frames of ``camera.capture`` and, for every frame, picks
    the contour that best combines an ellipse-shape score with a colour score.

    Tunable attributes (``__init__`` only sets neutral defaults):
    ``lowerColorBnd`` / ``upperColorBnd`` (HSV range handed to ``cv2.inRange``),
    ``bgsLearningRate``, ``morphologyArray`` (items passed one by one to
    ``ImageAnalyzer.morphology``), ``sizeBoundaries`` (``((minW, maxW),
    (minH, maxH))`` for a contour's bounding box) and ``contourThreshold``
    (minimum combined score for the per-frame winner to be reported).
    """

    def __init__(self, camera):
        """
        :param camera: object whose ``capture``, ``resolution``,
            ``calibrationMatrix`` and ``distortCoefs`` attributes are read by
            ``analyze``
        """
        self.Camera = camera
        self.ellipseFuzzySet = FuzzySet()#([0,1,2.5],[0,1,0])
        self.colorFuzzySet = FuzzySet()#([0,1,1.05],[0,1,0])
        self.lowerColorBnd = (0, 0, 0)
        self.upperColorBnd = (180, 255, 255)
        self.bgs = cv2.createBackgroundSubtractorMOG2()
        self.bgsLearningRate = 0.1
        self.imageAnalyzer = ImageAnalyzer()
        self.morphologyArray = []
        self.sizeBoundaries = ((0, 0), (0, 0))
        self.contourThreshold = 0.5

    def _scoreContour(self, contour, hsvMask, frameIndex):
        """
        Score one contour against the size, ellipse and colour criteria.

        :param contour: a contour as produced by ``cv2.findContours``
        :param hsvMask: ``cv2.inRange`` mask of the same frame (255 = in range)
        :param frameIndex: value stored as the last element of the result
        :return: ``(center, size, angle, score, frameIndex)`` or None when the
            contour fails any of the filters
        """
        x, y, w, h = cv2.boundingRect(contour)
        bounds = self.sizeBoundaries
        if not (w >= bounds[0][0] and w <= bounds[0][1]
                and h >= bounds[1][0] and h <= bounds[1][1]):
            return None

        # cv2.fitEllipse rejects inputs with fewer than five points
        if len(contour) < 5:
            return None

        rotatedRect = cv2.fitEllipse(contour)
        center = (rotatedRect[0][0], rotatedRect[0][1])
        size = (rotatedRect[1][0], rotatedRect[1][1])
        angle = rotatedRect[2]

        # Ratio of the shorter to the longer ellipse axis
        ellipseFit = None
        if size[0] >= size[1]:
            # A degenerate ellipse (both axes 0) would divide by zero
            if size[0] > 0:
                ellipseFit = self.ellipseFuzzySet.fitLinear(size[1] / size[0])
        elif size[0] < size[1]:
            ellipseFit = self.ellipseFuzzySet.fitLinear(size[0] / size[1])

        if ellipseFit is None or not (ellipseFit >= self.ellipseFuzzySet.threshold):
            return None

        # Count the bounding-box pixels lying inside/on the contour, split by
        # whether the colour mask is set (white) or not (black)
        foundBlack = 0.0
        foundWhite = 0.0
        for j in range(x, x + w):
            for k in range(y, y + h):
                if cv2.pointPolygonTest(contour, (j, k), False) >= 0:
                    if hsvMask[k, j] == 255:
                        foundWhite += 1.0
                    else:
                        foundBlack += 1.0

        colorFit = None
        if foundBlack > foundWhite:
            colorFit = self.colorFuzzySet.fitLinear(foundWhite / foundBlack)
        elif foundWhite >= foundBlack and foundBlack > 0:
            # NOTE(review): a contour with only white pixels (foundBlack == 0)
            # gets no colour score and is dropped - confirm this is intended
            colorFit = 1 + self.colorFuzzySet.fitLinear(foundBlack / foundWhite)

        if colorFit is None or not (colorFit >= self.colorFuzzySet.threshold):
            return None

        return (center, size, angle, (colorFit + ellipseFit) / 2, frameIndex)

    def analyze(self):
        """
        Process the capture frame by frame, showing each frame in a window.

        The loop ends when the number of read attempts equals the capture's
        ``CAP_PROP_FRAME_COUNT`` or when 'q' is pressed.

        :return: ``(result, images)`` where ``result`` holds, for every frame
            with an accepted detection, the tuple ``(center, size, angle,
            score, frame index)``, and ``images`` holds a copy of every frame
            that was read (after undistortion when a calibration matrix is set)
        """
        cnt = 0    # read attempts so far, also used as the frame index
        fcnt = 0   # frames in which a detection was accepted
        result = []
        images = []

        while(1):

            ret, frame = self.Camera.capture.read()

            if(ret != False):

                if self.Camera.calibrationMatrix is not None:
                    w,  h = int(self.Camera.resolution[0]), int(self.Camera.resolution[1])
                    newCamMatrix, _ = cv2.getOptimalNewCameraMatrix(self.Camera.calibrationMatrix, self.Camera.distortCoefs, (w, h), 1, (w, h))
                    frame = cv2.undistort(frame, self.Camera.calibrationMatrix, self.Camera.distortCoefs, None, newCamMatrix)

                images.append(frame.copy())

                rectImg = frame.copy()

                hsvImg = cv2.cvtColor(frame.copy(), cv2.COLOR_BGR2HSV)
                hsvMask = cv2.inRange(hsvImg, self.lowerColorBnd, self.upperColorBnd)  #(30,10,110), (60,255,255))

                frame = self.bgs.apply(frame, learningRate = self.bgsLearningRate)

                for morph in self.morphologyArray:
                    frame = self.imageAnalyzer.morphology(frame, morph)

                # findContours returns (image, contours, hierarchy) on OpenCV 3
                # but (contours, hierarchy) on 2.x/4.x; contours is always
                # the second-to-last element
                contours = cv2.findContours(frame.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

                fittingContours = []
                for contour in contours:
                    candidate = self._scoreContour(contour, hsvMask, cnt)
                    if candidate is not None:
                        fittingContours.append(candidate)

                if fittingContours:
                    # Highest combined score wins; the first one is kept on ties
                    best = max(fittingContours, key=lambda c: c[3])
                    if(best[3] >= self.contourThreshold):
                        cv2.rectangle(rectImg,(int(best[0][0]-best[1][0]/2), int(best[0][1]-best[1][1]/2)),
                                     (int(best[0][0]+best[1][0]/2), int(best[0][1]+best[1][1]/2)),(255,0,0),2)

                        result.append(best)
                        fcnt += 1

                cv2.imshow('frame', rectImg)
            else:
                print("Frame not found")

            cnt += 1
            frameCount = self.Camera.capture.get(cv2.CAP_PROP_FRAME_COUNT)
            if (cnt == frameCount) or cv2.waitKey(3) & 0xff == ord('q'):
                # %-formatting keeps the old "found N has M" text on Python 2 and 3
                print('found %s has %s' % (fcnt, frameCount))
                return result, images

コード例 #31

0

ファイルを表示

ファイル: nlp_playground.py プロジェクト: DevrathIyer/SciFair2019

 def _learn(self, command):
     """
     Prompt the user for commands after `command` could not be resolved.

     Builds a FuzzySet from ``self.function_names`` plus `command`, prints a
     prompt, reads one line from standard input and splits it on '.'.
     Only this much of the function is visible in the excerpt.
     """
     known_commands = FuzzySet(self.function_names)
     known_commands.add(command)
     print("No command found! Please input commands")
     # One console line may carry several commands separated by dots
     commandArray = raw_input().split('.')

コード例 #32

0

ファイルを表示

ファイル: sequential_search.py プロジェクト: viniciusbig/fuzzy_cnpj_matcher

class SequentialFuzzyCnpjMatcher:
    """
    Class that performs fuzzy string matching on CNPJs sequentially. For small
    fuzzyset this class is the easiest way to get started. However if you going
    for a large fuzzyset we strongly recommend using LocalParallelFuzzyCnpjMatcher
     instead.
    """

    def __init__(self):
        """
        Default constructor

        Prepares the list of the 100 CNPJ base file paths
        (``../bulk/cnpjs_base_0000000.txt``, ``..._1000000.txt``, ...).
        :return: a SequentialFuzzyCnpjMatcher instance
        """
        # range() instead of xrange() so the class also loads on Python 3
        self.__cnpj_bases = [
            "../bulk/cnpjs_base_" + str(x * 1000000).zfill(7) + ".txt"
            for x in range(100)
        ]

        self.__fuzzy_matcher = None

    def match_cnpj(self, cnpj, debug=False):
        """
        Search the closest valid CNPJ given a invalid one

        Each CNPJ base file is loaded into its own FuzzySet and queried. The
        matched strings from every file are pooled into one last FuzzySet,
        which is queried again to pick the overall result.

        :param cnpj: a invalid CNPJ
        :param debug: whether you want to see debugging logs or not
        :return: the first entry returned by the final lookup, or None when
            no base file produced a candidate (the previous code raised a
            TypeError by subscripting None in that situation)
        """
        best_matches = []

        for cnpj_base_str in self.__cnpj_bases:
            with open(cnpj_base_str) as f:
                # The timer covers reading the file, building the set and the lookup
                start_time = time.time()

                # Searching
                self.__log("Searching for %s on %s" % (cnpj, cnpj_base_str), debug)
                self.__fuzzy_matcher = FuzzySet(f.read().splitlines())

                match = self.__fuzzy_matcher.get(cnpj)
                elapsed_time = time.time() - start_time

                self.__log("Best match for this file is %s and it took %d seconds" % (match, elapsed_time), debug)
                # get() yields None when nothing matched; otherwise keep the
                # second element of every returned entry as a candidate
                if match is not None:
                    best_matches.extend(m[1] for m in match)

        # Nothing matched in any file: there is nothing to rank
        if not best_matches:
            return None

        # Performing Fuzzy string match on the best results of each cnpj base file
        self.__fuzzy_matcher = FuzzySet(best_matches)
        final_match = self.__fuzzy_matcher.get(cnpj)
        return final_match[0] if final_match else None

    def __log(self, msg, debug=False):
        """
        Prints a message to console depending on debug variable
        :param msg: a message string
        :param debug: a boolean value; nothing is printed when it is falsy
        :return: None
        """
        if debug:
            # Parenthesised single argument: same output on Python 2, and
            # valid syntax on Python 3 (the bare print statement is not)
            print(msg)

コード例 #33

0

ファイルを表示

class MiniBaseIndex(object):
    """
    Small inverted index from tokens to entry ids, with fuzzy token lookup.

    ``content`` maps a token to the set of ids it was added with, ``counts``
    maps a token to how many times it was added, and ``fuzzwords`` is the
    FuzzySet used by ``search`` to expand query tokens to indexed tokens.
    """

    def __init__(self,
                 field=None,
                 tokenizer=None,
                 similarity=None,
                 base=None,
                 idf_limit=0.05,
                 **kw):
        """
        :param field: key of the entry value compared with the query in ``search``
        :param tokenizer: callable turning a query string into tokens
        :param similarity: callable ``(a, b) -> number`` used for ranking
        :param base: object whose ``entries[id]`` mappings hold ``field`` and ``"pop"``
        :param idf_limit: stored on the instance; only referenced by the
            disabled blacklisting code in ``add``
        :param kw: forwarded to the next ``__init__`` in the MRO
        """
        super(MiniBaseIndex, self).__init__(**kw)
        self.content = {}
        self.field = field
        self.tokenizer = tokenizer
        self.similarity = similarity
        self.base = base
        self.counts = {}
        self.fuzzwords = FuzzySet(rel_sim_cutoff=0.7, use_levenshtein=False)
        self.blacklist = set()
        self.idf_limit = idf_limit

    def add(self, tok, i):
        """
        Register that token `tok` occurs in entry `i`.

        A token that is blacklisted and not yet indexed is ignored (this
        path used to raise KeyError because no id set was created for it).
        """
        if tok not in self.content:
            if tok in self.blacklist:
                return
            self.content[tok] = set()
            self.counts[tok] = 0
        self.content[tok].add(i)
        self.counts[tok] += 1
        # if self.counts[tok]/len(self.base.entries) > self.idf_limit:
        #     self.blacklist.add(tok)
        #     del self.counts[tok]
        #     del self.content[tok]
        self.fuzzwords.add(tok)

    def finalize(self):
        """Currently a no-op: tokens are already fed to ``fuzzwords`` in ``add``."""
        for tok in self.content:
            pass
            # self.fuzzwords.add(tok)

    def search(self, x, expl=5000, top=25, maxtok=250, debug=False):
        """
        Return the best-ranked entries for query string `x`.

        :param x: query string, split with ``self.tokenizer``
        :param expl: stop collecting candidate ids once more than this many
            have been gathered
        :param top: maximum number of results returned
        :param maxtok: maximum number of fuzzy-matched tokens that are expanded
        :param debug: print intermediate sizes and tokens when true
        :return: list of ``(entry id, accumulated fuzzy score, similarity)``
            tuples, sorted by similarity and then by the entry's ``"pop"`` value
        """
        tokenizer = self.tokenizer
        xtoks = tokenizer(x)
        # maxtok = maxtok * len(xtoks)
        results = {}
        # collect all toks
        alltoks = []
        alltoks_set = set()
        for xtok in xtoks:
            # get() gives None for a token without any fuzzy match; treat that
            # as "no candidates" instead of failing on iteration
            for xtok_fuzz_score, xtok_fuzz_tok in self.fuzzwords.get(xtok) or ():
                if xtok_fuzz_tok in alltoks_set:
                    continue
                # similarity is only needed for tokens that are actually kept
                xtok_fuzz_sim = self.similarity(xtok, xtok_fuzz_tok)
                alltoks.append(
                    (xtok_fuzz_score, xtok_fuzz_tok, xtok_fuzz_sim))
                alltoks_set.add(xtok_fuzz_tok)
        # sort together by fuzziness; the 1.0 keeps the inverse-frequency
        # tie-breaker a true division on Python 2 as well
        alltoks = sorted(
            alltoks,
            key=lambda entry: entry[2] * 100 + 1.0 / self.counts[entry[1]],
            reverse=True)
        # take maxtok only
        if debug:
            print(len(alltoks), maxtok)
            for tok in alltoks:
                print(tok, self.counts[tok[1]])
        alltoks = alltoks[:maxtok]
        # sort by inverse frequency
        # alltoks = sorted(alltoks, key=lambda x: self.counts[x[1]])
        for xtok_fuzz_score, xtok_fuzz_tok, xtok_fuzz_sim in alltoks:
            for _id in self.content[xtok_fuzz_tok]:
                results[_id] = results.get(_id, 0) + xtok_fuzz_score
                if len(results) > expl:
                    break
            if len(results) > expl:
                break
        if debug:
            print(len(results))
        results = [(res[0], res[1],
                    self.similarity(x, self.base.entries[res[0]][self.field]))
                   for res in results.items()]

        def sortkey(item):
            # Similarity dominates; the entry's "pop" value breaks ties
            entid = item[0]
            pop = self.base.entries[entid]["pop"]
            sim = item[2]
            return sim * 1e2 + pop * 1e-3

        results = sorted(results, key=sortkey, reverse=True)
        results = results[:top]
        return results

コード例 #34

0

ファイルを表示

    def __init__(self, word2vec: Word2VecKeyedVectors):
        """
        Keep the word2vec model, start with an empty string-to-string
        dictionary, and build a FuzzySet over ``word2vec.vocab``.

        :param word2vec: keyed vectors whose ``vocab`` feeds the fuzzy matcher
        """
        self.__word2vec = word2vec
        self.__fixed_word_dict: Dict[str, str] = {}

        # Fuzzy lookup structure built from the model's vocabulary
        self.__approximate_matcher = FuzzySet(word2vec.vocab)

コード例 #35

0

ファイルを表示

# country names taken from the 'properties' of every geo feature
geoctrs = [feat['properties']['name'] for feat in geos['features']]

# country names used as columns of the C19 table
c19ctrs = c19.columns.tolist()

# country names listed in the population table
popsctrs = pops['Country'].tolist()

# The geo data is the reference: every other table has to be matched to it,
# and the country name is the key shared by the tables / dictionaries.

# Some C19 countries have no exact counterpart in the geo data, so collect
# the geo names in a FuzzySet to look up near matches.
fzs = FuzzySet()
for c in geoctrs:
    fzs.add(c)

#for c in c19ctrs:
#    if c not in geoctrs:
#        print(c, fzs.get(c))

# In[8]:

c19notfound = [
    'Andorra', 'Antigua and Barbuda', 'Bahrain', 'Barbados', 'Cabo Verde',
    'Comoros', 'Diamond Princess', 'Dominica', 'Grenada', 'Holy See',
    'Liechtenstein', 'MS Zaandam', 'Maldives', 'Mauritius', 'Monaco',
    'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Vincent and the Grenadines',
    'San Marino', 'Sao Tome and Principe', 'Seychelles', 'Singapore'