Example #1
 def execute(self, command):
     returned_json = json.loads(
         self._nlp.annotate(
             command, properties=self._nlp_properties))['sentences'][0]
     dependencies = []
     for dependency in returned_json['enhancedPlusPlusDependencies']:
         dependencies.append(
             (dependency['governorGloss'], dependency['dependentGloss'],
              dependency['dep']))
     # the 'dobj' (direct object) dependency pairs the command's verb with its object
     verb_tuple = [x for x in dependencies if x[2] == u'dobj'][0]
     functions = FuzzySet(self.function_names)
     query = functions.get(str(verb_tuple[0]))
     if query:
         query = query[0]
         print(query)
         if query[0] < .5:
             print("learning")
             self._learn(command)
         else:
             dyn = self.functions.get(query[1])
             obj = verb_tuple[1]
             obj_adj = str(" ".join([
                 x[1]
                 for x in dependencies if x[0] == obj and not x[2] == u'det'
             ]) + " " + obj)
             objects = FuzzySet(world.objects)
             object_query = objects.get(obj_adj)
             if object_query:
                 dyn(world.attributes.get(object_query[0][1]))
             else:
                 print("no objects found!")
     else:
         self._learn(command)
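
Everything on this page leans on the same contract: FuzzySet.get() returns a list of (score, stored_string) pairs with the best match first, or None when nothing clears the cutoff. A minimal sketch of the lookup-and-threshold pattern used above, with made-up function names:

from fuzzyset import FuzzySet

functions = FuzzySet(["pick_up", "put_down", "move_to"])  # hypothetical command names
query = functions.get("pick")
if query and query[0][0] >= .5:
    # query[0] is the best (score, name) pair
    print("matched function:", query[0][1])
else:
    print("no confident match - this is where _learn() would kick in")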
Example #2
 def matchedIds(postId, threshold):
     keywords = Model.getKeywords()
     postKeywords = list(filter(lambda x: x["id"] == postId,
                                keywords))[0]["keywords"]
     matches = []
     for keyword in keywords:
         if postId == keyword["id"]:
             # skip the post we are matching against
             continue
         fs = FuzzySet(keyword['keywords'])
         for pk in postKeywords:
             m = fs.get(pk)
             if m:
                 for score, val in m:
                     if score > threshold:
                         matches.append((keyword["id"], score, val))
     return matches
Example #3
class LocalParallelFuzzyCnpjMatcher(BaseParallelFuzzyCnpjMatcher):

    def __init__(self, cpu_count="autodetect"):
        super(LocalParallelFuzzyCnpjMatcher, self).__init__()
        self.__job_server = pp.Server(ncpus=cpu_count)

    def match_cnpj(self, cnpj, debug=False):
        best_matches = []

        # temp variables
        start_time = time.time()

        jobs = [(
                    cnpj_base_str,
                    self.__job_server.submit(
                        fuzzy_cnpj_search,
                        (cnpj_base_str, cnpj, debug,),
                        (log, ),
                        ("from fuzzyset import FuzzySet", "time")
                    )) for cnpj_base_str in self.cnpj_bases]

        for cnpj_base_str, job in jobs:
            result = job()
            print "Results for", cnpj_base_str, "are", result
            # collect the candidate matches returned by each parallel job
            best_matches.extend(result)

        elapsed_time = time.time() - start_time

        log('Parallel processes took %d seconds to finish' % elapsed_time, debug)

        # Performing Fuzzy string match on the best results of each cnpj base file
        self.fuzzy_matcher = FuzzySet(best_matches)
        return self.fuzzy_matcher.get(cnpj)[0]
Example #4
class FuzzyBaseIndex(object):
    def __init__(self, field=None, similarity=None, base=None, **kw):
        super(FuzzyBaseIndex, self).__init__(**kw)
        self.fuzz = FuzzySet(rel_sim_cutoff=1., use_levenshtein=False)
        self.content = {}
        self.field = field
        self.similarity = similarity
        self.base = base

    def add(self, x, i):
        self.fuzz.add(x)
        if x not in self.content:
            self.content[x] = set()
        self.content[x].add(i)

    def finalize(self):
        pass

    def search(self, x, top=25, debug=True):
        results = self.fuzz.get(x) or []
        ret = []
        for r in results:
            for i in self.content[r[1]]:
                sim = self.similarity(x, r[1])
                ret.append((i, r[0], sim))
        # sort by the custom similarity score, best first
        ret = sorted(ret, key=lambda item: item[2], reverse=True)
        ret = ret[:top]
        return ret
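
A rough usage sketch for FuzzyBaseIndex. The similarity argument only needs to be a callable that maps two strings to a number; difflib's ratio and the strings and ids below are purely illustrative:

from difflib import SequenceMatcher

def ratio(a, b):
    return SequenceMatcher(None, a, b).ratio()

index = FuzzyBaseIndex(field="name", similarity=ratio)
index.add("acetylsalicylic acid", 1)
index.add("acetaminophen", 2)
index.finalize()
# each hit is (stored id, fuzzy score, custom similarity), best similarity first
print(index.search("acetylsalicilic acid", top=5))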
Example #5
def get_oov_vocabulary_map(vocabulary_words_weights, wordset):
    # Map each out-of-vocabulary word to the vocabulary word with the best
    # fuzzy-match score weighted by that vocabulary word's weight.
    oov = wordset - set(vocabulary_words_weights.keys())
    vocabulary_words_set = FuzzySet(sorted(vocabulary_words_weights.keys()))
    mapping = {}
    for word in tqdm(oov):
        word_matches = vocabulary_words_set.get(word)
        if word_matches is None or len(word_matches) == 0:
            continue
        word_scores = {vocabulary_word: score * vocabulary_words_weights[vocabulary_word]
                       for score, vocabulary_word in word_matches}
        vocabulary_words_scored = sorted(word_scores.keys(),
                                         key=lambda vocabulary_word: -word_scores[vocabulary_word])
        mapping[word] = vocabulary_words_scored[0]
    return mapping
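
A tiny illustrative call, with made-up weights and words; each out-of-vocabulary word should map to its best weighted fuzzy match:

weights = {"apple": 2.0, "banana": 1.0, "cherry": 1.5}
words = {"apple", "appel", "banan"}
print(get_oov_vocabulary_map(weights, words))
# expected to be along the lines of {'appel': 'apple', 'banan': 'banana'}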
Example #6
def fuzzy_cnpj_search(cnpj_base_str, cnpj, debug=False):
    best_matches = []
    with open(cnpj_base_str) as f:
        # temp variables
        start_time = time.time()

        # Searching
        log('Searching for %s on %s' % (cnpj, cnpj_base_str), debug)
        fuzzy_matcher = FuzzySet(f.read().splitlines())

        match = fuzzy_matcher.get(cnpj)
        elapsed_time = time.time() - start_time

        log('Best match for this file is %s and it took %d seconds'
            % (match, elapsed_time), debug)
        # Appending to the best matches so far
        if match is not None:
            for m in match:
                best_matches.append(m[1])
        return best_matches
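
One self-contained way to exercise fuzzy_cnpj_search is to point it at a small temporary file; the digit strings are arbitrary examples and the log helper is assumed to be defined in the same module, as above:

import os
import tempfile

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write("04389697000195\n15575829000190\n")

print(fuzzy_cnpj_search(tmp.name, "06389497000195", debug=True))
os.remove(tmp.name)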
Example #7
class TestFuzzyMatcher(unittest.TestCase):

    def setUp(self):
        with open('../bulk/cnpjs.txt') as f:
            self.fuzzy_set = f.read().splitlines()
            self.fuzzy_matcher = FuzzySet(self.fuzzy_set)

    def test_validate(self):
        self.assertEqual(self.fuzzy_matcher.get('06389497000195')[0][1],
                         '04389697000195')
        self.assertEqual(self.fuzzy_matcher.get('15574828000190')[0][1],
                         '15575829000190')
        self.assertEqual(self.fuzzy_matcher.get('15911974000144')[0][1],
                         '15922975000144')
        self.assertEqual(self.fuzzy_matcher.get('12919223000129')[0][1],
                         '12291923000129')
        self.assertEqual(self.fuzzy_matcher.get('557135900011')[0][1],
                         '55713579000121')
        self.assertEqual(self.fuzzy_matcher.get('40194766000116')[0][1],
                         '49794166000116')

        #'49794166000116'
        print self.fuzzy_matcher.get('40194766000116')[0][1]
Example #8
def get_nutrition_data(image_class):
    url = "https://api.nal.usda.gov/ndb/search/?format=json&q=" + image_class + "&sort=n&max=25&offset=0&api_key=FLKBoKOh7C1apAA4bPL0jH4GAW6f2wS9Lw0a2iFu"
    r = requests.get(url).json()

    max_dist_ratio = 0
    ndbno = None
    # a single-element FuzzySet scores each item name against the detected class
    fs = FuzzySet()
    fs.add(image_class)
    for item in r["list"]["item"]:
        match = fs.get(item["name"])
        ratio = match[0][0] if match else 0

        if ratio > max_dist_ratio:
            max_dist_ratio = ratio
            ndbno = item["ndbno"]

    print(ndbno)

    nutrition_url = "https://api.nal.usda.gov/ndb/V2/reports?ndbno=" + str(ndbno) + "&type=f&format=json&api_key=FLKBoKOh7C1apAA4bPL0jH4GAW6f2wS9Lw0a2iFu"
    nutrition_data = requests.get(nutrition_url).json()

    nutrition_facts = {}
    nutrients = nutrition_data["foods"][0]["food"]["nutrients"]

    nutrition_facts["serve_size"] = str(
        nutrients[0]["measures"][0]["qty"]) + " ounces"
    nutrition_facts["kcal"] = str(
        nutrients[0]["measures"][0]["value"]) + " calories"
    nutrition_facts["fat"] = str(
        nutrients[2]["measures"][0]["value"]) + " grams"
    nutrition_facts["carbs"] = str(
        nutrients[3]["measures"][0]["value"]) + " grams"
    nutrition_facts["protein"] = str(
        nutrients[1]["measures"][0]["value"]) + " grams"
    nutrition_facts["sugar"] = str(
        nutrients[4]["measures"][0]["value"]) + " grams"
    nutrition_facts["sodium"] = str(
        nutrients[5]["measures"][0]["value"]) + " milligrams"

    return nutrition_facts
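
The single-element FuzzySet above is really a one-off similarity check between the detected image class and each item name. The same trick in isolation, with made-up strings:

from fuzzyset import FuzzySet

fs = FuzzySet()
fs.add("cheddar cheese")
match = fs.get("chedar chese")
if match:
    score, value = match[0]
    print(score, value)  # similarity of the query against the single stored string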
Example #9
 def _learn(self, command):
     functions = FuzzySet(self.function_names)
     rospy.logerr("No command found! Please input commands")
     self.function_names.append(command)
     self.functions[command] = []
     commands = raw_input()
     sentences = json.loads(
         self._nlp.annotate(commands,
                            properties=self._nlp_properties))['sentences']
     for sentence in sentences:
         dependencies = []
         for dependency in sentence['enhancedPlusPlusDependencies']:
             dependencies.append(
                 (dependency['governorGloss'], dependency['dependentGloss'],
                  dependency['dep']))
         verb_tuple = [x for x in dependencies if x[2] == u'dobj'][0]
         query = functions.get(str(verb_tuple[0]))
         if query:
             query = query[0]
             func = self.functions.get(query[1])
             self.functions[command].extend(func)
     rospy.logerr(self.functions)
Example #10
class WordFixer:
    def __init__(self, word2vec: Word2VecKeyedVectors):
        self.__word2vec = word2vec
        self.__fixed_word_dict: Dict[str, str] = dict()

        self.__approximate_matcher = FuzzySet(word2vec.vocab)

    def is_word_correct(self, word: str):
        return word in self.__word2vec

    def fix(self, word: str):

        if word in self.__fixed_word_dict:
            return self.__fixed_word_dict[word]

        candidate = self.__approximate_matcher.get(word)
        if candidate is not None and len(candidate) > 0:
            fixed_word = candidate[0][1]
            self.__fixed_word_dict[word] = fixed_word
            return fixed_word

        raise Exception("Cannot be fixed")
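
WordFixer only touches membership tests and the vocab mapping of its word2vec argument (as in gensim 3.x KeyedVectors), so a small stand-in object is enough to sketch its behaviour; the words below are made up:

class TinyVectors:
    """Stand-in exposing just the interface WordFixer uses."""

    def __init__(self, words):
        self.vocab = {w: None for w in words}

    def __contains__(self, word):
        return word in self.vocab

fixer = WordFixer(TinyVectors(["apple", "banana", "cherry"]))
print(fixer.is_word_correct("apple"))  # True
print(fixer.fix("aple"))               # most likely "apple"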
Example #11
class SequentialFuzzyCnpjMatcher:
    """
    Class that performs fuzzy string matching on CNPJs sequentially. For a small
    fuzzy set this class is the easiest way to get started. However, if you are
    working with a large fuzzy set, we strongly recommend using
    LocalParallelFuzzyCnpjMatcher instead.
    """
    def __init__(self):
        """
        Default constructor
        :return: a SequentialFuzzyCnpjMatcher instance
        """
        self.__cnpj_bases = []

        for x in xrange(0, 100):
            idx = x * 1000000
            self.__cnpj_bases.append('../bulk/cnpjs_base_' +
                                     str(idx).zfill(7) + '.txt')

        self.__fuzzy_matcher = None

    def match_cnpj(self, cnpj, debug=False):
        """
        Search the closest valid CNPJ given a invalid one
        :param cnpj: a invalid CNPJ
        :param debug: whether you want to see debugging logs or not
        :return: a list of the most similar valid CNPJs to the one you've provided
        """
        best_matches = []

        for cnpj_base_str in self.__cnpj_bases:
            with open(cnpj_base_str) as f:
                # temp variables
                start_time = time.time()

                # Searching
                self.__log('Searching for %s on %s' % (cnpj, cnpj_base_str),
                           debug)
                self.__fuzzy_matcher = FuzzySet(f.read().splitlines())

                match = self.__fuzzy_matcher.get(cnpj)
                elapsed_time = time.time() - start_time

                self.__log(
                    'Best match for this file is %s and it took %d seconds' %
                    (match, elapsed_time), debug)
                # Appending to the best matches so far
                if match is not None:
                    for m in match:
                        best_matches.append(m[1])

        # Performing Fuzzy string match on the best results of each cnpj base file
        self.__fuzzy_matcher = FuzzySet(best_matches)
        return self.__fuzzy_matcher.get(cnpj)[0]

    def __log(self, msg, debug=False):
        """
        Prints a message to console depending on debug variable
        :param msg: a message string
        :param debug: a boolean value
        :return:
        """
        if debug:
            print msg
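
A hedged usage sketch: it assumes the '../bulk/cnpjs_base_*.txt' files referenced in __init__ actually exist on disk, and the CNPJ below is just an example digit string:

matcher = SequentialFuzzyCnpjMatcher()
best = matcher.match_cnpj('06389497000195', debug=True)
print(best)  # a (score, closest_valid_cnpj) tuple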
Example #13
class TFIDFmatcher:
    def __init__(self, choices_corpus, ngram_range=(1, 2), use_cleaner=True, preprocess_func=None):
        """
        :param choices_corpus: should be a list of texts
        :param preprocess_func: is a str->str function
        """
        self.ngram_range = ngram_range
        self.use_cleaner = use_cleaner
        self.preprocess_func = preprocess_func

        self.initial_choices_corpus = choices_corpus
        if self.use_cleaner:
            choices_corpus = self.cleaner(choices_corpus)
        if self.preprocess_func:
            choices_corpus = [self.preprocess_func(k) for k in choices_corpus]

        self.tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True,  # strip_accents='ascii',
                                     lowercase=True, ngram_range=self.ngram_range, min_df=0).fit(choices_corpus)

        self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
        self.initial_corpus_tf_idf_dict = {}
        for k in range(len(choices_corpus)):
            self.initial_corpus_tf_idf_dict[choices_corpus[k]] = self.initial_corpus_tf_idf[k]
        self.vocabulary = self.tfidf.vocabulary_.keys()
        self.fset_vocabulary = FuzzySet()
        for brnd in self.vocabulary:
            self.fset_vocabulary.add(brnd)


    def cleaner(self, x, verbose=False):
        if verbose:
            print("Before cleaning", type(x), x)

        def cleaning_function(x):
            return clean_string(x).lower()

        if type(x) == list:
            x = [cleaning_function(el) for el in x]
        if type(x) in [str]:
            x = cleaning_function(x)
        if verbose:
            print("After cleaning", type(x), x)
        return x

    def extract(self, query, choices=None, limit=5, verbose=False):
        """
        :param choices: should be a list of texts
        :param query: TODO add an input type checker
        :param processor: TODO : add a cleaning process
        :param scorer: TODO : Add other distances
        :return:
        """
        # print("---------------------------\n"
        # Get rid of this case
        if choices == []:
            return []

        if choices:
            choices = list(set(choices))

            # Clean the choices corpus
            initial_choices = choices
            if self.use_cleaner:
                choices = self.cleaner(choices)
            if self.preprocess_func:
                choices = [self.preprocess_func(elk) for elk in choices]
            choices_corpus = choices

            corpus_tf_idf = self.tfidf.transform(choices_corpus)
        else:
            initial_choices = self.initial_choices_corpus
            choices_corpus = self.initial_choices_corpus
            corpus_tf_idf = self.initial_corpus_tf_idf
            # print("Defaulting"

        if self.use_cleaner:
            query = self.cleaner(query)
        if self.preprocess_func:
            query = self.preprocess_func(query)

        # building fuzzy query
        new_query = []
        # print("Vocabulary", vocabulary)
        for q in query.split():
            if q in self.vocabulary:
                new_query.append(q)
            else:
                fset_get = self.fset_vocabulary.get(q)
                if fset_get:
                    tmp_score, new_q = fset_get[0]
                    if verbose:
                        print("Modified", q, new_q, tmp_score)
                    if tmp_score >= 0.80:
                        new_query.append(new_q)
        query = " ".join(new_query)
        if verbose:
            print("NEW QUERY", query)
        x = self.tfidf.transform([query])

        cosine_similarities = linear_kernel(x, corpus_tf_idf).flatten()
        related_docs_indices = cosine_similarities.argsort().flatten()
        if choices:
            result = [(choices_corpus[k], cosine_similarities[k].flatten()[0]) for k in related_docs_indices if choices_corpus[k]]
        else:
            result = [(choices_corpus[k], cosine_similarities[k].flatten()[0]) for k in related_docs_indices]
        result.sort(key=lambda tup: tup[1], reverse=True)  # sorts in place
        # print("Query", query, "\nChoices", choices, "\nResult", result
        result = [(initial_choices[choices_corpus.index(k[0])], k[1]) for k in result]
        # print("Query", query, "\nChoices", choices, "\nResult", result
        if limit:
            return result[0:limit]
        return result

    def export_vocabulary(self, vocabulary_csv_destination, choices_corpus=None):
        if not choices_corpus:
            choices_corpus = self.initial_choices_corpus

        if self.use_cleaner:
            choices_corpus = [clean_string(x).lower() for x in choices_corpus]

        cnt_vec = CountVectorizer(ngram_range=self.ngram_range)
        transformed_data = cnt_vec.fit_transform(choices_corpus)
        l = [{'word':k, 'freq':v} for k, v in zip(cnt_vec.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))]
        df = pd.DataFrame(l)
        df = df[['word', 'freq']]
        df.sort_values('freq', ascending=False, inplace=True)
        df.to_csv(vocabulary_csv_destination, encoding='utf-8', index=False, sep=";", doublequote=True, quoting=csv.QUOTE_ALL)
        print('The vocabulary was exported at : ', vocabulary_csv_destination)
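
A sketch of TFIDFmatcher on a toy corpus, assuming only the scikit-learn imports the class already relies on; use_cleaner=False sidesteps the external clean_string helper, and the corpus and query strings are invented:

corpus = ["red wine", "white wine", "sparkling wine", "craft beer"]
matcher = TFIDFmatcher(corpus, ngram_range=(1, 2), use_cleaner=False)
print(matcher.extract("red wnie", limit=2))
# the top candidate should be ('red wine', <cosine score>)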
Example #14
    'Eswatini': 'Swaziland',
    'Timor-Leste': 'East Timor',
    'Taiwan*': 'Taiwan',
    'Tanzania': 'United Republic of Tanzania',
    'US': 'United States of America',
    'West Bank and Gaza': 'West Bank'
}

c19.rename(columns=c19partm, inplace=True)
c19.drop(columns=c19notfound, inplace=True)

# if nothing is printed by this loop,
# then every country in c19 matches a country in geo data
for c in c19.columns.tolist():
    if c not in geoctrs:
        print(c, fzs.get(c))

# In[9]:

# let's fix country names in population data

fzs = FuzzySet()
for c in popsctrs:
    fzs.add(c)

popnotfound = ['Kosovo', 'West Bank']

poppartm = {
    'Bahamas': 'The Bahamas',
    'Brunei ': 'Brunei',
    'DR Congo': 'Democratic Republic of the Congo',
Example #15
class MiniBaseIndex(object):
    def __init__(self,
                 field=None,
                 tokenizer=None,
                 similarity=None,
                 base=None,
                 idf_limit=0.05,
                 **kw):
        super(MiniBaseIndex, self).__init__(**kw)
        self.content = {}
        self.field = field
        self.tokenizer = tokenizer
        self.similarity = similarity
        self.base = base
        self.counts = {}
        self.fuzzwords = FuzzySet(rel_sim_cutoff=0.7, use_levenshtein=False)
        self.blacklist = set()
        self.idf_limit = idf_limit

    def add(self, tok, i):
        if tok not in self.content:
            if tok not in self.blacklist:
                self.content[tok] = set()
            self.counts[tok] = 0
        self.content[tok].add(i)
        self.counts[tok] += 1
        # if self.counts[tok]/len(self.base.entries) > self.idf_limit:
        #     self.blacklist.add(tok)
        #     del self.counts[tok]
        #     del self.content[tok]
        self.fuzzwords.add(tok)

    def finalize(self):
        for tok in self.content:
            pass
            # self.fuzzwords.add(tok)

    def search(self, x, expl=5000, top=25, maxtok=250, debug=False):
        tokenizer = self.tokenizer
        xtoks = tokenizer(x)
        # maxtok = maxtok * len(xtoks)
        results = {}
        # collect all toks
        alltoks = []
        alltoks_set = set()
        for xtok in xtoks:
            for xtok_fuzz_score, xtok_fuzz_tok \
                    in (self.fuzzwords.get(xtok) or []):
                xtok_fuzz_sim = self.similarity(xtok, xtok_fuzz_tok)
                if xtok_fuzz_tok not in alltoks_set:
                    alltoks.append(
                        (xtok_fuzz_score, xtok_fuzz_tok, xtok_fuzz_sim))
                    alltoks_set.add(xtok_fuzz_tok)
        # alltoks = list(alltoks)
        # sort together by fuzziness
        alltoks = sorted(alltoks,
                         key=lambda x: x[2] * 100 + 1 / self.counts[x[1]],
                         reverse=True)
        # take maxtok only
        if debug:
            print(len(alltoks), maxtok)
            for tok in alltoks:
                print(tok, self.counts[tok[1]])
        alltoks = alltoks[:maxtok]
        # sort by inverse frequency
        # alltoks = sorted(alltoks, key=lambda x: self.counts[x[1]])
        # alltoksset = set(alltoks)
        for xtok_fuzz_score, xtok_fuzz_tok, xtok_fuzz_sim in alltoks:
            for _id in self.content[xtok_fuzz_tok]:
                if _id not in results:
                    results[_id] = 0
                results[_id] += xtok_fuzz_score
                if len(results) > expl:
                    break
            if len(results) > expl:
                break
        if debug:
            print(len(results))
        results = [(res[0], res[1],
                    self.similarity(x, self.base.entries[res[0]][self.field]))
                   for res in results.items()]

        def sortkey(x):
            entid = x[0]
            pop = self.base.entries[entid]["pop"]
            sim = x[2]
            return sim * 1e2 + pop * 1e-3

        results = sorted(results, key=sortkey, reverse=True)
        results = results[:top]
        return results
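
A rough sketch of MiniBaseIndex. The base argument only needs an entries list carrying the indexed field plus a "pop" value, tokenizer is any str -> list-of-tokens callable, and similarity maps two strings to a number; everything below is invented for illustration:

from difflib import SequenceMatcher

class ToyBase:
    def __init__(self, entries):
        self.entries = entries

entries = [
    {"name": "acetylsalicylic acid", "pop": 10},
    {"name": "acetaminophen", "pop": 5},
]
index = MiniBaseIndex(field="name",
                      tokenizer=lambda s: s.lower().split(),
                      similarity=lambda a, b: SequenceMatcher(None, a, b).ratio(),
                      base=ToyBase(entries))
for i, entry in enumerate(entries):
    for tok in entry["name"].lower().split():
        index.add(tok, i)
index.finalize()
# each result is (entry index, accumulated fuzzy score, similarity to the full field)
print(index.search("acetylsalicilic acid", top=5))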
Example #16
class BrandMatcher:
    def __init__(self, ngram_range=(1, 3)):
        """
        :param choices_corpus: should be a list of texts
        :param preprocess_func: is a str->str function
        """
        self.ngram_range = ngram_range
        choices_corpus = [str(x) for x in list(brands['brnd'].dropna().unique())]

        l = brands[['brnd', 'equivalents']].dropna().to_dict('records')
        self.equivalents = {}
        for el in l:
            for eq in el['equivalents'].split(';'):
                self.equivalents[eq.strip()] = el['brnd']

        choices_corpus.extend(self.equivalents.keys())

        self.initial_choices_corpus = choices_corpus
        self.cleaned_choices_corpus = self.cleaner(choices_corpus)

        self.tfidf = TfidfVectorizer(analyzer='word', sublinear_tf=True,  # strip_accents='ascii',
                                     lowercase=True, ngram_range=self.ngram_range, min_df=0).fit(self.cleaned_choices_corpus)

        self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
        self.initial_corpus_tf_idf_dict = {}
        for k in range(len(choices_corpus)):
            self.initial_corpus_tf_idf_dict[choices_corpus[k]] = self.initial_corpus_tf_idf[k]

        # Creating fuzzy set
        self.fset_brands = FuzzySet()
        for token in [str(x) for x in list(brands['brnd'].dropna().unique())]:
            self.fset_brands.add(token)

        self.fset_tokens = FuzzySet()
        for token in list(self.tfidf.vocabulary_):
            self.fset_tokens.add(token)

        # Prepare the japanese matching
        jp_brands = brands[['brnd', 'brnd_jp_clean']]
        jp_brands = jp_brands[jp_brands.brnd_jp_clean.notnull()]
        jp_brands['brnd_jp_clean'] = jp_brands['brnd_jp_clean'].apply(lambda x: unicodedata.normalize('NFKC', x.replace('・', '').replace(' ', '')))
        jp_brands['brnd_jp_size'] = jp_brands['brnd_jp_clean'].apply(lambda x: len(x))
        jp_brands.sort_values(['brnd_jp_size', 'brnd'], ascending=[False, False], inplace=True)
        self.jp_brands = jp_brands
        # jp_brands.to_excel('/tmp/jp_brands.xlsx')

    def cleaner(self, x, verbose=False):
        if verbose:
            print("Before cleaning", type(x), x)

        def cleaning_function(x):
            return clean_string(x).lower()

        if type(x) == list:
            x = [cleaning_function(str(el)) for el in x]
        if type(x) in [str]:
            x = cleaning_function(x)
        if verbose:
            print("After cleaning", type(x), x)
        return x

    def extract(self, query, verbose=False):
        """
        :param choices should be a list of texts
        :param query: TODO add an input type checker
        :param processor: TODO : add a cleaning process
        :param scorer: TODO : Add other distances
        :return:
        """
        initial_choices = self.initial_choices_corpus
        choices_corpus = self.initial_choices_corpus
        corpus_tf_idf = self.initial_corpus_tf_idf
        query = self.cleaner(query)

        # building fuzzy query
        new_query = []

        for q in query.split():
            if verbose:
                print(q)
            fset_get = self.fset_tokens.get(q)
            if fset_get:
                tmp_score, new_q = fset_get[0]
                if verbose:
                    print("Modified", q, new_q, tmp_score)
                if tmp_score > 0.80:
                    new_query.append(new_q)
        query = " ".join(new_query)
        if verbose:
            print("NEW QUERY", query)

        x = self.tfidf.transform([query])

        cosine_similarities = linear_kernel(x, corpus_tf_idf).flatten()
        related_docs_indices = cosine_similarities.argsort().flatten()
        result = [(choices_corpus[k], cosine_similarities[k].flatten()[0]) for k in related_docs_indices]
        result = [(initial_choices[choices_corpus.index(k[0])], k[1]) for k in result]
        # correcting with fuzzyratio score between result and query
        # result = [(k[0], k[1] * 0.01 * 0.5 * (fuzz.token_set_ratio(k[0], query) + fuzz.ratio(k[0], query))) for k in result]
        # result = [(k[0], k[1]) for k in result]
        result.sort(key=lambda tup: tup[1], reverse=True)  # sorts in place

        if verbose:
            print("Query", query, "\nResult", result)
        max_score = max(result, key=itemgetter(1))[1]
        result = [k for k in result if k[1] == max_score]
        return result

    def find_brand(self, pdct_name_on_eretailer, special_country=None, verbose=False):

        if not pdct_name_on_eretailer:
            return {'brand': None, 'score': 0}
        assert special_country in ['JP', None]

        if bool(pattern_japanese_chinese_caracters.search(pdct_name_on_eretailer)) or special_country == 'JP':
            clean_jp_str = lambda x: unicodedata.normalize('NFKC', x.replace('・', '').replace(' ', '').replace('・', ''))
            clean_jp_name = clean_jp_str(pdct_name_on_eretailer)

            # Forbidden words:
            japanese_forbidden_words = [" shoulder ", ' bag ', '【CD】', "【SHM-CD】", 'dvd', 'helmet', 'rucksack',
                                        'daypack', 'daiken', 'ダイケン', "スリープスパ", 'リンゴビール', 'パターソン', 'ヘネシー澄子',
                                        ]
            clean_japanese_forbidden_words = [clean_jp_str(x).lower() for x in japanese_forbidden_words]
            # print(clean_jp_name, clean_japanese_forbidden_words)
            if any(x in clean_jp_name.lower() for x in clean_japanese_forbidden_words):
                return {'brand': None, 'score': 0}

            for br in self.jp_brands.to_dict(orient='records'):
                if br['brnd_jp_clean'] in clean_jp_name:
                    # print("clean_jp_name :", clean_jp_name, "candidate", br['brnd_jp_clean'])
                    return {'brand': br['brnd'], "score": 98.765}
            if "モエ " in pdct_name_on_eretailer and any(x in clean_jp_name for x in ["750", 'ml', 'cl']):
                return {'brand': "Moët & Chandon", "score": 98.765}
        # Ad-hoc rules
        if any([x in pdct_name_on_eretailer.lower() for x in ["moet ", "moët"]]) and 'dom p' in pdct_name_on_eretailer.lower():
            return {'brand': 'Dom Pérignon', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["moet ", "moët"]]):
            return {'brand': 'Moët & Chandon', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["clicquot"]]):
            return {'brand': 'Veuve Clicquot', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["ruinart"]]):
            return {'brand': 'Ruinart', 'score': 99}

        # # Forbidden words:
        # forbidden_words = ['leinwand', "hamper ", ' hamper', ' poster', 'poster ', 'chocolates ', ' chocolates',
        #                    'truffle ', ' truffle', 'birthday cake', ' cake', 'candle', 'poplin', ' sheet ', ' bed ',
        #                    ' cover ', ' kimono', 'towel', 'dvd']
        # if any(x in pdct_name_on_eretailer.lower() for x in forbidden_words):
        #     return {'brand': None, 'score': 0}

        # Cleaning
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('–', ' ')
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('-', ' ')
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('_', ' ')
        pdct_name_on_eretailer = ' '.join(w for w in pdct_name_on_eretailer.split() if w)
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace("'", "").replace('é', 'e').replace('Â', '').replace(
            'ë', 'e')
        # print(pdct_name_on_eretailer)
        candidates = self.extract(pdct_name_on_eretailer, verbose=verbose)
        if not candidates:
            return {'brand': None, 'score': 0}
        # print(candidates)
        # print("FIRST SCORE :", brand, score)
        # Post treatment
        clean_tokens = clean_string(pdct_name_on_eretailer).split()
        # s = FuzzySet()
        # s.add(candidate)
        # l = [deepcopy(s.get(ngram, candidate)) for ngram in ngrams]
        # l = [x[0][0] for x in l if type(x) == list]
        brand, score = candidates[0][0], 0
        for candidate in candidates:
            candidate_str = self.cleaner(candidate[0])
            candidate_str = " ".join(candidate_str.split()[:9])
            nb_token_candidate = len(candidate_str.split())
            ngrams = [" ".join(clean_tokens[start:start + length]) for start in range(len(clean_tokens))
                      for length in range(max(nb_token_candidate, min(4, len(clean_tokens) - start + 1)))]
            # print([("'" + ngram + "'", "'" + candidate + "'", fuzz.ratio(ngram, candidate)) for ngram in ngrams])
            l = [fuzz.ratio(ngram, candidate_str) for ngram in list(set(ngrams))]
            max_score = (max(l + [0])*0.01) ** 2
            if max_score > score:
                score = max_score
                brand = candidate[0]

        if brand in self.equivalents:
            brand = self.equivalents[brand]
        score = round(100 * score, 2)
        # print("SECOND SCORE :", brand, score)

        # Forbidden words
        if any([x in pdct_name_on_eretailer.lower() for x in ["poster", 'dvd']]):
            return {'brand': None, 'score': 0}

        if score >= 80:
            if brand in ['Mercier']:  # Add Krug ???
                if 'hampagne' in pdct_name_on_eretailer.lower():
                    return {'brand': brand, 'score': score}
            if brand in ["Krug"] and any([x.lower() in pdct_name_on_eretailer.lower() for x in ['butler']]):
                return {'brand': None, 'score': 0}
            elif brand == "Belvedere":
                if not any([x in pdct_name_on_eretailer.lower() for x in
                            ['zinfandel', 'chardonnay', 'sauvignon', 'pinot', 'merlot', 'syrah']]):
                    return {'brand': brand, 'score': score}
            else:
                return {'brand': brand, 'score': score}
        elif verbose:
            print("Score is too low for: ", pdct_name_on_eretailer, {'brand': brand, 'score': score})
        return {'brand': None, 'score': 0}
Example #17
class MovedBlocksDetector(object):
    def __init__(self, removed_lines_dicts, added_lines_dicts):
        self.removed_lines = []
        self.trim_text_to_array_of_added_lines = defaultdict(list)
        self.added_file_name_to_line_no_to_line = defaultdict(dict)
        self.removed_file_name_to_line_no_to_line = defaultdict(dict)
        self.added_lines_fuzzy_set = FuzzySet()

        for added_line_dict in added_lines_dicts:
            line = Line.from_dict(added_line_dict)
            self.trim_text_to_array_of_added_lines[line.trim_text].append(line)
            self.added_lines_fuzzy_set.add(line.trim_text)
            self.added_file_name_to_line_no_to_line[line.file][line.line_no] = line

        for removed_line_dict in removed_lines_dicts:
            line = Line.from_dict(removed_line_dict)
            self.removed_lines.append(line)
            self.removed_file_name_to_line_no_to_line[line.file][line.line_no] = line

    @staticmethod
    def from_diff(diff_text):
        parsed = diff_to_added_and_removed_lines(diff_text)
        return MovedBlocksDetector(parsed['removed_lines'], parsed['added_lines'])

    @measure_fun_time()
    def filter_out_block_inside_other_blocks(self, filtered_blocks: List[MatchingBlock]):
        filtered_blocks.sort(key=lambda fb: fb.get_filter_sort_tuple_for_remove())

        last_matching_block = None
        for matching_block in filtered_blocks:
            if last_matching_block is None:
                last_matching_block = matching_block
                continue
            if matching_block.last_removed_line.file == last_matching_block.last_removed_line.file \
                    and matching_block.first_removed_line.line_no >= last_matching_block.first_removed_line.line_no \
                    and matching_block.last_removed_line.line_no <= last_matching_block.last_removed_line.line_no:
                if matching_block.weighted_lines_count < last_matching_block.weighted_lines_count\
                   and matching_block.removed_lines_numbers.issubset(last_matching_block.removed_lines_numbers):
                    matching_block.remove_part_is_inside_larger_block = True
            else:
                last_matching_block = matching_block

        filtered_blocks.sort(key=lambda fb: fb.get_filter_sort_tuple_for_add())
        ok_blocks = []
        last_matching_block = None
        for matching_block in filtered_blocks:
            if getattr(matching_block, "remove_part_is_inside_larger_block", False): # TODO getattr was used to act like in javascript - rewrite it without getattr
                continue
            if last_matching_block is None:
                last_matching_block = matching_block
                ok_blocks.append(matching_block)
                continue
            if matching_block.last_added_line.file == last_matching_block.last_added_line.file \
                    and matching_block.first_added_line.line_no >= last_matching_block.first_added_line.line_no \
                    and matching_block.last_added_line.line_no <= last_matching_block.last_added_line.line_no\
                    and matching_block.weighted_lines_count < last_matching_block.weighted_lines_count\
                    and not matching_block.added_lines_numbers.issubset(last_matching_block.added_lines_numbers):
                pass
            else:
                last_matching_block = matching_block
                ok_blocks.append(matching_block)

        return ok_blocks

    def _filter_out_small_blocks(self, matching_blocks, min_lines_count):
        return [block for block in matching_blocks if block.weighted_lines_count >= min_lines_count and block.char_count >= 20]

    def _clear_not_matching_lines_at_end_and_filter_out_empty_blocks(self, matching_blocks):
        filtered_blocks = []
        for matching_block in matching_blocks:
            block_without_empty_end = matching_block.clear_empty_lines_at_end()
            if block_without_empty_end is not None:
                filtered_blocks.append(matching_block)
        return filtered_blocks

    def merge_blocks(self, block1, block2):
        new_block = MatchingBlock()
        new_block.lines.extend(block1.lines)
        new_block.lines.extend(block2.lines)
        # TODO what about lines between those 2 blocks?
        new_block.first_added_line = block1.first_added_line or block2.first_added_line
        new_block.first_removed_line = block1.first_removed_line or block2.first_removed_line
        new_block.last_added_line = block2.last_added_line or block1.last_added_line
        new_block.last_removed_line = block2.last_removed_line or block1.last_removed_line
        new_block.weighted_lines_count = block1.weighted_lines_count + block2.weighted_lines_count
        new_block.not_empty_lines = block1.not_empty_lines + block2.not_empty_lines
        new_block.char_count = block1.char_count + block2.char_count
        new_block.weighted_chars_count = block1.weighted_chars_count + block2.weighted_chars_count
        new_block.match_density = new_block.weighted_chars_count / new_block.char_count
        new_block.added_lines_numbers = block1.added_lines_numbers | block2.added_lines_numbers
        new_block.removed_lines_numbers = block1.removed_lines_numbers | block2.removed_lines_numbers
        return new_block

    @measure_fun_time()
    def join_nearby_blocks(self, matching_blocks: List[MatchingBlock], max_space_between=2):
        max_space_between += 1  # if we want to allow 2 lines between blocks difference between line numbers is 3
        blocks_grouped_by_files: Dict[tuple, List[MatchingBlock]] = defaultdict(list)
        for block in matching_blocks:
            blocks_grouped_by_files[(block.file_removed, block.file_added)].append(block)
        blocks_after_merge: List[MatchingBlock] = []

        merged_blocks = 0
        for block_list in blocks_grouped_by_files.values():
            loops_made = 0
            block_list.sort(key=lambda block: (block.first_removed_line.line_no, -block.match_density))
            indexes_of_merged_blocks = set()
            merged_blocks_list = []
            for i in range(len(block_list)):
                block = block_list[i]
                for j in range(i+1, len(block_list)):
                    loops_made += 1
                    next_block = block_list[j]
                    if next_block.first_removed_line.line_no - block.last_removed_line.line_no > max_space_between:
                        break
                    elif (next_block.first_removed_line.line_no > block.last_removed_line.line_no
                            and next_block.first_added_line.line_no - block.last_added_line.line_no <= max_space_between
                            and next_block.first_added_line.line_no > block.last_added_line.line_no):
                        block = self.merge_blocks(block, next_block)
                        merged_blocks += 1
                        indexes_of_merged_blocks.add(i)
                        indexes_of_merged_blocks.add(j)
                if i in indexes_of_merged_blocks:
                    merged_blocks_list.append(block)
            for i in range(len(block_list)):
                if i not in indexes_of_merged_blocks:
                    blocks_after_merge.append(block_list[i])
            blocks_after_merge.extend(merged_blocks_list)
        return blocks_after_merge

    @measure_fun_time()
    def filter_blocks(self, matching_blocks, min_lines_count=None):
        if min_lines_count is None:
            min_lines_count = 2
        filtered_blocks = self._filter_out_small_blocks(matching_blocks, min_lines_count)
        filtered_blocks = self._clear_not_matching_lines_at_end_and_filter_out_empty_blocks(filtered_blocks)
        return self.filter_out_block_inside_other_blocks(filtered_blocks)

    def extend_matching_blocks_with_empty_added_lines_if_possible(self, currently_matching_blocks):
        for matching_block in currently_matching_blocks:
            while True:
                last_line = matching_block.last_added_line
                next_added_line = self.added_file_name_to_line_no_to_line[last_line.file].get(last_line.line_no + 1)
                if next_added_line and next_added_line.trim_text == '':
                    matching_block.extend_with_empty_added_line(next_added_line)
                else:
                    break

    def extend_matching_blocks_with_empty_removed_lines_if_possible(self, currently_matching_blocks: List[MatchingBlock]):
        extended_blocks = []
        not_extended_blocks = []
        for matching_block in currently_matching_blocks:
            last_line = matching_block.last_removed_line
            next_removed_line = self.removed_file_name_to_line_no_to_line[last_line.file].get(last_line.line_no + 1)
            if next_removed_line and next_removed_line.trim_text == '':
                matching_block.extend_with_empty_removed_line(next_removed_line)
                extended_blocks.append(matching_block)
            else:
                not_extended_blocks.append(matching_block)

        return extended_blocks, not_extended_blocks

    @measure_fun_time()
    def detect_moved_blocks(self, min_lines_count=None) -> List[MatchingBlock]:
        detected_blocks: List[MatchingBlock] = []
        currently_matching_blocks = []
        new_matching_blocks = []

        for removed_line in self.removed_lines:
            if removed_line.trim_text:
                min_match_score = 0.5 if len(removed_line.trim_text) > 2 else 0.35
                fuzzy_matching_pairs = self.added_lines_fuzzy_set.get(
                    removed_line.trim_text, default=None, exact_match_only=False, min_match_score=min_match_score
                )
                # iterate over currently_matching_blocks and try to extend them with empty lines
                self.extend_matching_blocks_with_empty_added_lines_if_possible(currently_matching_blocks)
            else:
                fuzzy_matching_pairs = [[1, '']]

            if not fuzzy_matching_pairs:
                continue

            for fuzz_pair in fuzzy_matching_pairs:
                match_probability, text = fuzz_pair
                added_lines = self.trim_text_to_array_of_added_lines[text]
                for added_line in added_lines:
                    line_extended_any_block = False
                    already_added = set()
                    for i, matching_block in enumerate(currently_matching_blocks):
                        if i in already_added:
                            continue
                        extended = matching_block.try_extend_with_line(removed_line, added_line, match_probability)
                        if extended:
                            new_matching_blocks.append(matching_block)
                            line_extended_any_block = True
                            already_added.add(i)

                    if not line_extended_any_block and removed_line.trim_text != '':
                        new_matching_blocks.append(MatchingBlock.from_line(removed_line, added_line, match_probability))
                    currently_matching_blocks = [matching_block for i, matching_block in
                                                 enumerate(currently_matching_blocks) if i not in already_added]

            if removed_line.trim_text == '':
                extended_blocks, not_extended_blocks = \
                    self.extend_matching_blocks_with_empty_removed_lines_if_possible(currently_matching_blocks)
                new_matching_blocks.extend(extended_blocks)
                currently_matching_blocks = not_extended_blocks

            for matching_block in currently_matching_blocks:
                detected_blocks.append(matching_block)

            currently_matching_blocks = new_matching_blocks
            new_matching_blocks = []

        for matching_block in currently_matching_blocks:
            detected_blocks.append(matching_block)

        detected_blocks = self.join_nearby_blocks(detected_blocks)
        filtered_blocks = self.filter_blocks(detected_blocks, min_lines_count)
        logger.info(f'Detected {len(filtered_blocks)} blocks ({len(detected_blocks) - len(filtered_blocks)} filtered)')
        return filtered_blocks
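
The detector is typically driven straight from a unified diff via the from_diff helper above; the file name below is hypothetical, and Line, MatchingBlock and diff_to_added_and_removed_lines are assumed to come from the same module as this class:

with open("example.diff") as f:  # hypothetical output of `git diff`
    detector = MovedBlocksDetector.from_diff(f.read())

for block in detector.detect_moved_blocks(min_lines_count=2):
    print(block.file_removed, "->", block.file_added, block.weighted_lines_count)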