Example #1
from fuzzyset import FuzzySet
from fuzzywuzzy import fuzz

def fuzzyset_alg(key, key_list):
    finder = FuzzySet()
    finder.add(key)
    candidates = []
    for i in key_list:
        try:
            # FuzzySet raises KeyError when no match clears its score cutoff
            score, _ = finder[i][0]
            # [0] the candidate key, [1] match percentage
            candidates.append([i, score * 100])
        except KeyError:
            continue
    # sort by score, best first
    candidates.sort(key=lambda x: x[1], reverse=True)

    # take the top 10 and re-rank them with a Levenshtein ratio
    top_candi = candidates[:10]
    finalist = [[c[0], fuzz.ratio(key, c[0])] for c in top_candi]
    finalist.sort(key=lambda x: x[1], reverse=True)
    # slicing an empty list simply returns [], so no length check is needed
    return finalist[:3]
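
A minimal usage sketch for the helper above (the candidate list is made up):

print(fuzzyset_alg("colour", ["color", "colors", "couleur", "collar"]))
# -> up to three [candidate, Levenshtein ratio] pairs, best first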
Example #3
    def __init__(self, choices_corpus, ngram_range=(1, 2), use_cleaner=True, preprocess_func=None):
        """
        :param choices_corpus: should be a list of texts
        :param preprocess_func: is a str->str function
        """
        self.ngram_range = ngram_range
        self.use_cleaner = use_cleaner
        self.preprocess_func = preprocess_func

        self.initial_choices_corpus = choices_corpus
        if self.use_cleaner:
            choices_corpus = self.cleaner(choices_corpus)
        if self.preprocess_func:
            choices_corpus = [self.preprocess_func(k) for k in choices_corpus]

        self.tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True,  # strip_accents='ascii',
                                     lowercase=True, ngram_range=self.ngram_range, min_df=0).fit(choices_corpus)

        self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
        self.initial_corpus_tf_idf_dict = {}
        for k, choice in enumerate(choices_corpus):
            self.initial_corpus_tf_idf_dict[choice] = self.initial_corpus_tf_idf[k]
        self.vocabulary = self.tfidf.vocabulary_.keys()
        self.fset_vocabulary = FuzzySet()
        for token in self.vocabulary:
            self.fset_vocabulary.add(token)
Example #5
class FuzzyBaseIndex(object):
    def __init__(self, field=None, similarity=None, base=None, **kw):
        super(FuzzyBaseIndex, self).__init__(**kw)
        self.fuzz = FuzzySet(rel_sim_cutoff=1., use_levenshtein=False)
        self.content = {}
        self.field = field
        self.similarity = similarity
        self.base = base

    def add(self, x, i):
        self.fuzz.add(x)
        if x not in self.content:
            self.content[x] = set()
        self.content[x].add(i)

    def finalize(self):
        pass

    def search(self, x, top=25, debug=True):
        # get() returns None when nothing clears the cutoff
        results = self.fuzz.get(x) or []
        ret = []
        for r in results:
            for i in self.content[r[1]]:
                sim = self.similarity(x, r[1])
                ret.append((i, r[0], sim))
        ret = sorted(ret, key=lambda t: t[2], reverse=True)
        ret = ret[:top]
        return ret
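
A minimal usage sketch for the index; the `similarity` callable here is an assumption (any str, str -> float scorer works):

from difflib import SequenceMatcher

idx = FuzzyBaseIndex(field="name",
                     similarity=lambda a, b: SequenceMatcher(None, a, b).ratio())
idx.add("acme corp", 1)
idx.add("acme corporation", 2)
print(idx.search("acme corp"))  # [(doc_id, fuzzyset_score, similarity), ...]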
Example #9
import cProfile
import gzip
import os
import pstats

from cfuzzyset import cFuzzySet
from fuzzyset import FuzzySet

def run_profile(impl):
    # `here` and `profiler` are defined elsewhere in the surrounding benchmark script
    if impl == "cFuzzySet":
        f = cFuzzySet()
    else:
        f = FuzzySet()
    with gzip.GzipFile(os.path.join(here, '..', 'cities.gz')) as input_file:
        for line in input_file:
            f.add(line.rstrip().decode())
    print(f)
    cProfile.runctx("profiler(f)", globals(), locals(), "Profile.prof")

    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()
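
A sketch of a typical entry point for this benchmark (assuming `here` and `profiler` are defined in the surrounding script, as noted above):

if __name__ == "__main__":
    run_profile("cFuzzySet")  # profile the C extension first
    run_profile("FuzzySet")   # then the pure-Python implementation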
Example #10
from fuzzyset import FuzzySet
from tqdm import tqdm

def get_oov_vocabulary_map(vocabulary_words_weights, wordset):
    oov = wordset - set(vocabulary_words_weights.keys())
    vocabulary_words_set = FuzzySet(sorted(vocabulary_words_weights.keys()))
    mapping = {}
    for word in tqdm(oov):
        word_matches = vocabulary_words_set.get(word)
        if not word_matches:  # get() returns None when nothing clears the cutoff
            continue
        word_scores = {vocabulary_word: score * vocabulary_words_weights[vocabulary_word]
                       for score, vocabulary_word in word_matches}
        vocabulary_words_scored = sorted(word_scores.keys(),
                                         key=lambda vocabulary_word: -word_scores[vocabulary_word])
        mapping[word] = vocabulary_words_scored[0]
    return mapping
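
A toy invocation with made-up weights:

weights = {"apple": 1.0, "banana": 0.5}
print(get_oov_vocabulary_map(weights, {"appel", "bananna", "apple"}))
# e.g. {'appel': 'apple', 'bananna': 'banana'}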
Example #11
    def matchedIds(postId, threshold):
        keywords = Model.getKeywords()
        postKeywords = list(filter(lambda x: x["id"] == postId,
                                   keywords))[0]["keywords"]
        matches = []
        for keyword in keywords:
            fs = FuzzySet(keyword['keywords'])
            for pk in postKeywords:
                if postId != keyword["id"]:
                    m = fs.get(pk)
                    if m:
                        # reuse the result instead of querying the set twice
                        for score, val in m:
                            if score > threshold:
                                matches.append((keyword["id"], score, val))
        return matches
class LocalParallelFuzzyCnpjMatcher(BaseParallelFuzzyCnpjMatcher):

    def __init__(self, cpu_count="autodetect"):
        super(LocalParallelFuzzyCnpjMatcher, self).__init__()
        self.__job_server = pp.Server(ncpus=cpu_count)

    def match_cnpj(self, cnpj, debug=False):
        best_matches = []

        # temp variables
        start_time = time.time()

        jobs = [(
                    cnpj_base_str,
                    self.__job_server.submit(
                        fuzzy_cnpj_search,
                        (cnpj_base_str, cnpj, debug,),
                        (log, ),
                        ("from fuzzyset import FuzzySet", "time")
                    )) for cnpj_base_str in self.cnpj_bases]

        for cnpj_base_str, job in jobs:
            result = job()
            print("Results", cnpj_base_str, "is", result)
            # collect each job's matches; otherwise best_matches stays empty below
            best_matches.extend(result)

        elapsed_time = time.time() - start_time

        log('Parallel processes took %d seconds to finish' % elapsed_time, debug)

        # Performing Fuzzy string match on the best results of each cnpj base file
        self.fuzzy_matcher = FuzzySet(best_matches)
        return self.fuzzy_matcher.get(cnpj)[0]
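
A usage sketch, assuming Parallel Python (`pp`) is installed and the base class's `cnpj_bases` list points at existing bulk files:

matcher = LocalParallelFuzzyCnpjMatcher(cpu_count=4)
print(matcher.match_cnpj("06389497000195", debug=True))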
Example #14
    def __init__(self, removed_lines_dicts, added_lines_dicts):
        self.removed_lines = []
        self.trim_text_to_array_of_added_lines = defaultdict(list)
        self.added_file_name_to_line_no_to_line = defaultdict(dict)
        self.removed_file_name_to_line_no_to_line = defaultdict(dict)
        self.added_lines_fuzzy_set = FuzzySet()

        for added_line_dict in added_lines_dicts:
            line = Line.from_dict(added_line_dict)
            self.trim_text_to_array_of_added_lines[line.trim_text].append(line)
            self.added_lines_fuzzy_set.add(line.trim_text)
            self.added_file_name_to_line_no_to_line[line.file][line.line_no] = line

        for removed_line_dict in removed_lines_dicts:
            line = Line.from_dict(removed_line_dict)
            self.removed_lines.append(line)
            self.removed_file_name_to_line_no_to_line[line.file][line.line_no] = line
Example #16
class ListBasedPlaceExtractionService(NERService):

    def __init__(self, dist_file=CommonConstants.INDIA_DIST_NAMES):
        super().__init__()
        self.old_names = {'bangalore': 'bengaluru', 'gurgaon': 'gurugram', 'calcutta': 'kolkata', 'prayagraj': 'allahabad', 'delhi': 'delhi'}
        self.fd = FuzzySet()
        self.set = set()
        with open(dist_file) as df:
            reader = csv.reader(df)
            header = next(reader)
            for row in reader:
                if 'rural' in row[1].lower() or 'urban' in row[1].lower() or 'dehat' in row[1].lower():
                    alternate = ' '.join(row[1].split(' ')[:-1]).lower()
                    self.fd.add(alternate)
                    self.set.add(alternate)
                    continue
                self.fd.add(row[1].lower())
                self.set.add(row[1].lower())
        self.nlp = stanza.Pipeline(lang='en', processors='tokenize', use_gpu=False)

    def extract_entities_from_text(self, text):
        doc = self.nlp(text)
        closest_match = (0, None)
        '''
        for token in doc.ents:
            tok_text = token.text.lower()
            closest_dist = self.fd.get(tok_text)
            if closest_dist and len(closest_dist):
                closest = closest_dist[0]
                if closest[0] > closest_match[0]:
                    closest_match = closest
        if closest_match[0] > 0.5:
            return closest_match[1]
        '''
        for sent in doc.sentences:
            for token in sent.tokens:
                tok_text = token.text.lower()
                if tok_text in self.set:
                    return tok_text
                try:
                    return self.old_names[tok_text]
                except KeyError:
                    continue
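
A usage sketch, assuming the district-name CSV and the stanza English tokenizer models are available locally:

svc = ListBasedPlaceExtractionService()
print(svc.extract_entities_from_text("Floods reported near Gurgaon today"))
# -> 'gurugram', resolved through the old_names mapping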
Example #17
    def __init__(self, ngram_range=(1, 3)):
        """
        :param choices_corpus: should be a list of texts
        :param preprocess_func: is a str->str function
        """
        self.ngram_range = ngram_range
        choices_corpus = [str(x) for x in list(brands['brnd'].dropna().unique())]

        records = brands[['brnd', 'equivalents']].dropna().to_dict('records')
        self.equivalents = {}
        for el in records:
            for eq in el['equivalents'].split(';'):
                self.equivalents[eq.strip()] = el['brnd']

        choices_corpus.extend(self.equivalents.keys())

        self.initial_choices_corpus = choices_corpus
        self.cleaned_choices_corpus = self.cleaner(choices_corpus)

        self.tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True,  # strip_accents='ascii',
                                     lowercase=True, ngram_range=self.ngram_range, min_df=0).fit(self.cleaned_choices_corpus)

        self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
        self.initial_corpus_tf_idf_dict = {}
        for k, choice in enumerate(choices_corpus):
            self.initial_corpus_tf_idf_dict[choice] = self.initial_corpus_tf_idf[k]

        # Creating fuzzy set
        self.fset_brands = FuzzySet()
        for token in [str(x) for x in list(brands['brnd'].dropna().unique())]:
            self.fset_brands.add(token)

        self.fset_tokens = FuzzySet()
        for token in list(self.tfidf.vocabulary_):
            self.fset_tokens.add(token)

        # Prepare the japanese matching
        jp_brands = brands[['brnd', 'brnd_jp_clean']]
        jp_brands = jp_brands[jp_brands.brnd_jp_clean.notnull()]
        jp_brands['brnd_jp_clean'] = jp_brands['brnd_jp_clean'].apply(lambda x: unicodedata.normalize('NFKC', x.replace('・', '').replace(' ', '')))
        jp_brands['brnd_jp_size'] = jp_brands['brnd_jp_clean'].apply(lambda x: len(x))
        jp_brands.sort_values(['brnd_jp_size', 'brnd'], ascending=[False, False], inplace=True)
        self.jp_brands = jp_brands
Example #18
    def defuzz(self, method=None):
        # mean-of-maxima defuzzification over the union of all consequents
        final = FuzzySet.Union(self.consequents)
        peak = max(final.m)  # compute the maximum once instead of on every iteration
        print(peak)
        temp = [final.x[i] for i in range(len(final.x)) if final.m[i] >= peak]
        val = sum(temp) / len(temp)
        fplot = FuzzyPlotter([final])
        fplot()
        return val
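
Note that this example, the `regress` method under Example #23, and the membership-function plotting script further down use a different FuzzySet from the string-matching examples: a fuzzy-logic membership class with `.x`/`.m` arrays, set operators (`|`, `&`, `~`), and a companion FuzzyPlotter, not the n-gram `fuzzyset` matching package.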
def fuzzy_cnpj_search(cnpj_base_str, cnpj, debug=False):
    best_matches = []
    with open(cnpj_base_str) as f:
        # temp variables
        start_time = time.time()

        # Searching
        log('Searching for %s on %s' % (cnpj, cnpj_base_str), debug)
        fuzzy_matcher = FuzzySet(f.read().splitlines())

        match = fuzzy_matcher.get(cnpj)
        elapsed_time = time.time() - start_time

        log('Best match for this file is %s and it took %d seconds'
            % (match, elapsed_time), debug)
        # Appending to the best matches so far
        if match is not None:
            for m in match:
                best_matches.append(m[1])
        return best_matches
 def execute(self, command):
     ReturnedJson = json.loads(
         self._nlp.annotate(
             command, properties=self._nlp_properties))['sentences'][0]
     dependencies = []
     for dependency in ReturnedJson['enhancedPlusPlusDependencies']:
         dependencies.append(
             (dependency['governorGloss'], dependency['dependentGloss'],
              dependency['dep']))
     verb_tuple = [x for x in dependencies if x[2] == u'dobj'][0]
     functions = FuzzySet(self.function_names)
     query = functions.get(str(verb_tuple[0]))
     if query:
         query = query[0]
         print(query)
         if query[0] < .5:
             print("learning")
             self._learn(command)
         else:
             dyn = self.functions.get(query[1])
             obj = verb_tuple[1]
             obj_adj = str(" ".join([
                 x[1]
                 for x in dependencies if x[0] == obj and not x[2] == u'det'
             ]) + " " + obj)
             objects = FuzzySet(world.objects)
             object_query = objects.get(obj_adj)
             if object_query:
                 dyn(world.attributes.get(object_query[0][1]))
             else:
                 print("no objects found!")
     else:
         self._learn(command)
Example #21
def get_nutrition_data(image_class):
    url = "https://api.nal.usda.gov/ndb/search/?format=json&q=" + image_class + "&sort=n&max=25&offset=0&api_key=FLKBoKOh7C1apAA4bPL0jH4GAW6f2wS9Lw0a2iFu"
    r = requests.get(url).json()

    max_dist_ratio = 0
    ndbno = None
    for item in r["list"]["item"]:
        # the set only ever holds image_class, so each item is scored against it
        fs = FuzzySet()
        fs.add(image_class)
        match = fs.get(item["name"])  # get() returns None when nothing clears the cutoff
        ratio = match[0][0] if match else 0

        if ratio > max_dist_ratio:
            max_dist_ratio = ratio
            ndbno = item["ndbno"]

    print(ndbno)

    nutrition_url = "https://api.nal.usda.gov/ndb/V2/reports?ndbno=" + str(ndbno) + "&type=f&format=json&api_key=FLKBoKOh7C1apAA4bPL0jH4GAW6f2wS9Lw0a2iFu"
    nutrition_data = requests.get(nutrition_url).json()

    nutrition_facts = {}
    nutrients = nutrition_data["foods"][0]["food"]["nutrients"]

    nutrition_facts["serve_size"] = str(
        nutrients[0]["measures"][0]["qty"]) + " ounces"
    nutrition_facts["kcal"] = str(
        nutrients[0]["measures"][0]["value"]) + " calories"
    nutrition_facts["fat"] = str(
        nutrients[2]["measures"][0]["value"]) + " grams"
    nutrition_facts["carbs"] = str(
        nutrients[3]["measures"][0]["value"]) + " grams"
    nutrition_facts["protein"] = str(
        nutrients[1]["measures"][0]["value"]) + " grams"
    nutrition_facts["sugar"] = str(
        nutrients[4]["measures"][0]["value"]) + " grams"
    nutrition_facts["sodium"] = str(
        nutrients[5]["measures"][0]["value"]) + " milligrams"

    return nutrition_facts
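
The loop above rebuilds a one-element FuzzySet for every item just to score a single pair. A roughly equivalent sketch (same assumed `fuzzyset` API) indexes all item names once and queries a single time:

fs = FuzzySet(item["name"] for item in r["list"]["item"])
best = fs.get(image_class)  # [(score, name), ...] or None
if best:
    best_name = best[0][1]
    ndbno = next(item["ndbno"] for item in r["list"]["item"]
                 if item["name"] == best_name)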
Example #22
 def _learn(self, command):
     functions = FuzzySet(self.function_names)
     rospy.logerr("No command found! Please input commands")
     self.function_names.append(command)
     self.functions[command] = []
     commands = raw_input()
     sentences = json.loads(
         self._nlp.annotate(commands,
                            properties=self._nlp_properties))['sentences']
     for sentence in sentences:
         dependencies = []
         for dependency in sentence['enhancedPlusPlusDependencies']:
             dependencies.append(
                 (dependency['governorGloss'], dependency['dependentGloss'],
                  dependency['dep']))
         verb_tuple = [x for x in dependencies if x[2] == u'dobj'][0]
         query = functions.get(str(verb_tuple[0]))
         if query:
             query = query[0]
             func = self.functions.get(query[1])
             self.functions[command].extend(func)
     rospy.logerr(self.functions)
Example #23
    def regress(self, test, yt):
        predicted = []
        for k, row in enumerate(test):
            act_deg = []
            fparts = []
            for i, rule in enumerate(self.rules):
                ante, cons = rule[0], rule[-1]
                temp = []
                for tup in ante:
                    t = row[tup[0]][tup[-1]]
                    temp.append(t)
                act_deg.append(min(temp))
                if act_deg[i] == 0:
                    if i == 0:
                        f = self.fvar[-1].fuzzy[cons]
                        s = 'f.func.{}(f.num_x, f.prmts)'.format(f.mfunc)
                        newm = eval(s)
                        newf = FuzzySet(f.num_x, newm)
                        newf.cutpoint(act_deg[i])
                        self.fout = newf
                    continue
                else:
                    f = self.fvar[-1].fuzzy[cons]
                    s = 'f.func.{}(f.num_x, f.prmts)'.format(f.mfunc)
                    newm = eval(s)
                    newf = FuzzySet(f.num_x, newm)
                    newf.cutpoint(act_deg[i])
                    if i == 0:
                        self.fout = newf
                    else:
                        self.fout = self.fout | newf
            # mean of max method
##            dummy = self.fout.x[np.where(self.fout.m >= max(self.fout.m))[0]]
##            val = sum(dummy)/len(dummy)
            # center of gravity method
            val = self.centroid(self.fout.x, self.fout.m)
            predicted.append(val)
##            print('## Actual value: {}'.format(yt[k]))
##            print('## Regressed value: {}'.format(val))
##            FuzzyPlotter(self.fout) 
        diff = [(a - p) ** 2 for a, p in zip(yt, predicted)]
        # take the square root so the name rmse matches what is computed
        rmse = (sum(diff) / len(diff)) ** 0.5
        return predicted, rmse
Example #24
    def path(self, source, destination):
        from fuzzyset import FuzzySet
        if source == destination:
            return
        fuzz = FuzzySet(self.map.nodes)
        if source not in self.map.nodes:
            source = fuzzymatch(fuzz, source)
        if destination not in self.map.nodes:
            destination = fuzzymatch(fuzz, destination)
        try:
            path = nx.shortest_path(self.map, source, destination)
        except NetworkXNoPath:
            log.error(f"No path between {source.name} and {destination.name}.")
            return
        way = []
        # avoid shadowing the builtin iter()
        for i in range(1, len(path)):
            way.append(self.map.edges[path[i - 1], path[i]]["label"])
        return way
class TestFuzzyMatcher(unittest.TestCase):

    def setUp(self):
        with open('../bulk/cnpjs.txt') as f:
            self.fuzzy_set = f.read().splitlines()
            self.fuzzy_matcher = FuzzySet(self.fuzzy_set)

    def test_validate(self):
        self.assertEqual(self.fuzzy_matcher.get('06389497000195')[0][1],
                         '04389697000195')
        self.assertEqual(self.fuzzy_matcher.get('15574828000190')[0][1],
                         '15575829000190')
        self.assertEqual(self.fuzzy_matcher.get('15911974000144')[0][1],
                         '15922975000144')
        self.assertEqual(self.fuzzy_matcher.get('12919223000129')[0][1],
                         '12291923000129')
        self.assertEqual(self.fuzzy_matcher.get('557135900011')[0][1],
                         '55713579000121')
        self.assertEqual(self.fuzzy_matcher.get('40194766000116')[0][1],
                         '49794166000116')

        # expected best match: '49794166000116'
        print(self.fuzzy_matcher.get('40194766000116')[0][1])
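
A typical runner for the test case above (assuming ../bulk/cnpjs.txt exists and `unittest` is already imported in this file):

if __name__ == '__main__':
    unittest.main()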
Example #26
class WordFixer:
    def __init__(self, word2vec: Word2VecKeyedVectors):
        self.__word2vec = word2vec
        self.__fixed_word_dict: Dict[str, str] = dict()

        self.__approximate_matcher = FuzzySet(word2vec.vocab)

    def is_word_correct(self, word: str):
        return word in self.__word2vec

    def fix(self, word: str):

        if word in self.__fixed_word_dict:
            return self.__fixed_word_dict[word]

        candidate = self.__approximate_matcher.get(word)
        if candidate is not None and len(candidate) > 0:
            fixed_word = candidate[0][1]
            self.__fixed_word_dict[word] = fixed_word
            return fixed_word

        raise Exception("Cannot be fixed")
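
A usage sketch, assuming a gensim Word2VecKeyedVectors model is already loaded as `w2v`:

fixer = WordFixer(w2v)
if not fixer.is_word_correct("recieve"):
    print(fixer.fix("recieve"))  # nearest in-vocabulary spelling, e.g. "receive"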
Example #27
##FuzzyPlotter(v)

### pic1
##name = 'Parametric Membership Functions'
##uod = np.arange(-10, 50, 0.01)
##terms = ['trimf, [-5,2,12]', 'trapmf, [8,14,22,28]', 'gaussmf, [30,4]']
##v = FuzzyVariable(name, uod, terms)
##v.setmf([('trimf', [-5,2,12]),
##         ('trapmf', [8,14,22,28]),
##         ('gaussmf', [30,4])])
##FuzzyPlotter(v)


### pic 2345
uod = np.arange(-10, 110, 0.1)
A = FuzzySet(uod)
A.set_mf('trimf', [0, 35, 60])
B = FuzzySet(uod)
B.set_mf('trimf', [20, 75, 95])
C = A | B   # union
D = A & B   # intersection
E = ~A      # complement
F = ~B
plt.figure()
plt.plot(A.x, A.m, 'r')
plt.plot(E.x, E.m, 'g')
plt.title('Complementary Operation')
#plt.title('The intersection of A and B')
plt.xlabel('Universe of Discourse')
plt.ylabel('Membership Degree')
plt.legend(['μA', 'μAcomplement'])
plt.show()  # display the figure (assuming a non-notebook backend)
class SequentialFuzzyCnpjMatcher:
    """
    Class that performs fuzzy string matching on CNPJs sequentially. For a small
    fuzzy set this class is the easiest way to get started. However, if you are
    going to build a large fuzzy set, we strongly recommend using
    LocalParallelFuzzyCnpjMatcher instead.
    """
    def __init__(self):
        """
        Default constructor
        :return: a SequentialFuzzyCnpjMatcher instance
        """
        self.__cnpj_bases = []

        for x in range(0, 100):
            idx = x * 1000000
            self.__cnpj_bases.append('../bulk/cnpjs_base_' +
                                     str(idx).zfill(7) + '.txt')

        self.__fuzzy_matcher = None

    def match_cnpj(self, cnpj, debug=False):
        """
        Search the closest valid CNPJ given an invalid one
        :param cnpj: an invalid CNPJ
        :param debug: whether you want to see debugging logs or not
        :return: a list of the most similar valid CNPJs to the one you've provided
        """
        best_matches = []

        for cnpj_base_str in self.__cnpj_bases:
            with open(cnpj_base_str) as f:
                # temp variables
                start_time = time.time()

                # Searching
                self.__log('Searching for %s on %s' % (cnpj, cnpj_base_str),
                           debug)
                self.__fuzzy_matcher = FuzzySet(f.read().splitlines())

                match = self.__fuzzy_matcher.get(cnpj)
                elapsed_time = time.time() - start_time

                self.__log(
                    'Best match for this file is %s and it took %d seconds' %
                    (match, elapsed_time), debug)
                # Appending to the best matches so far
                if match is not None:
                    for m in match:
                        best_matches.append(m[1])

        # Performing Fuzzy string match on the best results of each cnpj base file
        self.__fuzzy_matcher = FuzzySet(best_matches)
        return self.__fuzzy_matcher.get(cnpj)[0]

    def __log(self, msg, debug=False):
        """
        Prints a message to console depending on debug variable
        :param msg: a message string
        :param debug: a boolean value
        :return:
        """
        if debug:
            print(msg)
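
A usage sketch for the sequential matcher (assuming the ../bulk/cnpjs_base_*.txt files are in place):

matcher = SequentialFuzzyCnpjMatcher()
print(matcher.match_cnpj('06389497000195', debug=True))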
class VideoAnalyzer:

    def __init__(self, camera):
        self.Camera = camera
        self.ellipseFuzzySet = FuzzySet()#([0,1,2.5],[0,1,0])
        self.colorFuzzySet = FuzzySet()#([0,1,1.05],[0,1,0])
        self.lowerColorBnd = (0, 0, 0)
        self.upperColorBnd = (180, 255, 255)
        self.bgs = cv2.createBackgroundSubtractorMOG2()
        self.bgsLearningRate = 0.1
        self.imageAnalyzer = ImageAnalyzer()
        self.morphologyArray = []
        self.sizeBoundaries = ((0, 0), (0, 0))
        self.contourThreshold = 0.5

    def analyze(self):
        cnt = 0
        fcnt = 0
        result = []
        images = []

        while True:

            ret, frame = self.Camera.capture.read()

            if ret:

                if self.Camera.calibrationMatrix is not None:
                    w,  h = int(self.Camera.resolution[0]), int(self.Camera.resolution[1])
                    newCamMatrix, roi = cv2.getOptimalNewCameraMatrix(self.Camera.calibrationMatrix, self.Camera.distortCoefs, (w, h), 1, (w, h))
                    frame = cv2.undistort(frame, self.Camera.calibrationMatrix, self.Camera.distortCoefs, None, newCamMatrix)

                images.append(frame.copy())

                rectImg = frame.copy()
                hsvImg = frame.copy()

                hsvImg = cv2.cvtColor(hsvImg, cv2.COLOR_BGR2HSV)
                hsvMask = cv2.inRange(hsvImg, self.lowerColorBnd, self.upperColorBnd)  #(30,10,110), (60,255,255))

                frame = self.bgs.apply(frame, learningRate = self.bgsLearningRate)

                for morph in self.morphologyArray:
                    frame = self.imageAnalyzer.morphology(frame, morph)

                image, contours, hierarchy = cv2.findContours(frame.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)

                fittingContours = []
                for i,contour in enumerate(contours):
                    x,y,w,h = cv2.boundingRect(contour)
                    if(w >= self.sizeBoundaries[0][0] and w <= self.sizeBoundaries[0][1]
                       and h >= self.sizeBoundaries[1][0] and h <= self.sizeBoundaries[1][1]):

                        rotatedRect = cv2.fitEllipse(contour)
                        center = (rotatedRect[0][0], rotatedRect[0][1])
                        size = (rotatedRect[1][0], rotatedRect[1][1])
                        angle = (rotatedRect[2])

                        ellipseFit = None

                        if(size[0] >= size[1]):
                            ellipseFit = self.ellipseFuzzySet.fitLinear(size[1] / size[0])
                        elif(size[0] < size[1]):
                            ellipseFit = self.ellipseFuzzySet.fitLinear(size[0] / size[1])

                        if(ellipseFit is not None and ellipseFit >= self.ellipseFuzzySet.threshold):
                            foundBlack = 0
                            foundWhite = 0

                            for j in range (x,x+w):
                                for k in range(y,y+h):
                                    dist = cv2.pointPolygonTest(contour,(j,k),False)
                                    if dist>= 0:
                                        foundBlack += 1.0
                                        if(hsvMask[k,j] == 255):
                                            foundWhite+=1.0
                                            foundBlack-=1.0

                            colorFit = None

                            if(foundBlack > foundWhite):
                                colorFit = self.colorFuzzySet.fitLinear(foundWhite / foundBlack)
                            elif(foundWhite >= foundBlack and foundBlack > 0):
                                colorFit = 1 + self.colorFuzzySet.fitLinear(foundBlack / foundWhite)

                            if(colorFit is not None and colorFit >= self.colorFuzzySet.threshold):
                                fittingContours.append((center, size, angle,(colorFit+ellipseFit)/2, cnt))

                if len(fittingContours) > 0:
                    best = fittingContours[0]
                    for i in range (0,len(fittingContours)):
                        if(fittingContours[i][3] > best[3]):
                            best = fittingContours[i]
                    if(best[3] >= self.contourThreshold):
                        cv2.rectangle(rectImg,(int(best[0][0]-best[1][0]/2), int(best[0][1]-best[1][1]/2)),
                                     (int(best[0][0]+best[1][0]/2), int(best[0][1]+best[1][1]/2)),(255,0,0),2)

                        result.append(best)
                        fcnt += 1

                cv2.imshow('frame', rectImg)
            else:
                print("Frame not found")

            cnt += 1
            if (cnt == self.Camera.capture.get(cv2.CAP_PROP_FRAME_COUNT)) or cv2.waitKey(3) & 0xff == ord('q'):
                print('found', fcnt, 'has', self.Camera.capture.get(cv2.CAP_PROP_FRAME_COUNT))
                return result, images
 def _learn(self, command):
     fs = FuzzySet(self.function_names)
     fs.add(command)
     print("No command found! Please input commands")
     commands = raw_input()
     commandArray = commands.split('.')
Example #33
class MiniBaseIndex(object):
    def __init__(self,
                 field=None,
                 tokenizer=None,
                 similarity=None,
                 base=None,
                 idf_limit=0.05,
                 **kw):
        super(MiniBaseIndex, self).__init__(**kw)
        self.content = {}
        self.field = field
        self.tokenizer = tokenizer
        self.similarity = similarity
        self.base = base
        self.counts = {}
        self.fuzzwords = FuzzySet(rel_sim_cutoff=0.7, use_levenshtein=False)
        self.blacklist = set()
        self.idf_limit = idf_limit

    def add(self, tok, i):
        if tok not in self.content:
            if tok not in self.blacklist:
                self.content[tok] = set()
            self.counts[tok] = 0
        self.content[tok].add(i)
        self.counts[tok] += 1
        # if self.counts[tok]/len(self.base.entries) > self.idf_limit:
        #     self.blacklist.add(tok)
        #     del self.counts[tok]
        #     del self.content[tok]
        self.fuzzwords.add(tok)

    def finalize(self):
        for tok in self.content:
            pass
            # self.fuzzwords.add(tok)

    def search(self, x, expl=5000, top=25, maxtok=250, debug=False):
        tokenizer = self.tokenizer
        xtoks = tokenizer(x)
        # maxtok = maxtok * len(xtoks)
        results = {}
        # collect all toks
        alltoks = []
        alltoks_set = set()
        for xtok in xtoks:
            # get() returns None when nothing clears the relative cutoff
            for xtok_fuzz_score, xtok_fuzz_tok in (self.fuzzwords.get(xtok) or []):
                xtok_fuzz_sim = self.similarity(xtok, xtok_fuzz_tok)
                if xtok_fuzz_tok not in alltoks_set:
                    alltoks.append(
                        (xtok_fuzz_score, xtok_fuzz_tok, xtok_fuzz_sim))
                    alltoks_set.add(xtok_fuzz_tok)
        # alltoks = list(alltoks)
        # sort together by fuzziness
        alltoks = sorted(alltoks,
                         key=lambda x: x[2] * 100 + 1 / self.counts[x[1]],
                         reverse=True)
        # take maxtok only
        if debug:
            print(len(alltoks), maxtok)
            for tok in alltoks:
                print(tok, self.counts[tok[1]])
        alltoks = alltoks[:maxtok]
        # sort by inverse frequency
        # alltoks = sorted(alltoks, key=lambda x: self.counts[x[1]])
        # alltoksset = set(alltoks)
        for xtok_fuzz_score, xtok_fuzz_tok, xtok_fuzz_sim in alltoks:
            for _id in self.content[xtok_fuzz_tok]:
                if _id not in results:
                    results[_id] = 0
                results[_id] += xtok_fuzz_score
                if len(results) > expl:
                    break
            if len(results) > expl:
                break
        if debug:
            print(len(results))
        results = [(res[0], res[1],
                    self.similarity(x, self.base.entries[res[0]][self.field]))
                   for res in results.items()]

        def sortkey(r):
            # rank by similarity first, with population as a tie-breaker
            entid = r[0]
            pop = self.base.entries[entid]["pop"]
            sim = r[2]
            return sim * 1e2 + pop * 1e-3

        results = sorted(results, key=sortkey, reverse=True)
        results = results[:top]
        return results
Example #35
geoctrs = [feat['properties']['name'] for feat in geos['features']]

# list of countries from C19 data
c19ctrs = c19.columns.tolist()

# list of countries from population data
popsctrs = pops['Country'].tolist()

# geo data is king, we need to match everything else to it
# the country name becomes the key matching the tables / dictionaries

# some countries in c19 do not match any country in geo data
# let's print fuzzy matches
fzs = FuzzySet()
for c in geoctrs:
    fzs.add(c)

#for c in c19ctrs:
#    if c not in geoctrs:
#        print(c, fzs.get(c))


c19notfound = [
    'Andorra', 'Antigua and Barbuda', 'Bahrain', 'Barbados', 'Cabo Verde',
    'Comoros', 'Diamond Princess', 'Dominica', 'Grenada', 'Holy See',
    'Liechtenstein', 'MS Zaandam', 'Maldives', 'Mauritius', 'Monaco',
    'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Vincent and the Grenadines',
    'San Marino', 'Sao Tome and Principe', 'Seychelles', 'Singapore'