Example 1
def compare(s):
    # tia, tib, affna, affnb, cita, citb, ya, yb, orda, ordb, numaa, numab,
    # coaa and coab are column-name constants assumed to be defined at module level.
    from pandas import Series
    from fuzzywuzzy import fuzz

    tisim = "title_sim"
    affsim = "aff_sim"
    cpd = "cit_peryear_diff"
    yd = "year_diff"
    od = "ord_diff"
    rod = "rel_ord_diff"
    ncd = "num_coauth_diff"
    scc = "same_coauth_count"
    x = Series(
        {
            tisim: fuzz.token_set_ratio(s[tia], s[tib]),
            affsim: fuzz.token_set_ratio(s[affna], s[affnb]),
            cpd: abs((float(s[cita]) / (2015 - s[ya])) - (float(s[citb]) / (2015 - s[yb]))),
            yd: abs(s[ya] - s[yb]),
            od: abs(s[orda] - s[ordb]),
            rod: abs(((float(s[numaa]) - s[orda]) / s[numaa]) - ((float(s[numab]) - s[ordb]) / s[numab])),
            ncd: abs(s[numaa] - s[numab]),
            scc: len([x for x in s[coaa] if x in s[coab]]),
        }
    )
    return x
Example 2
def score_gr_details(search_query):
    gr_url = create_gr_url(search_query)
    response = requests.get(gr_url)
    pq_data = pq(response.content)
    books = pq_data("tr[itemtype='http://schema.org/Book']")

    biblio_info = []
    for book in books:
        # Extract the title and the first author from each result row.
        book_info = pq(book).children("td").eq(1)
        title = book_info.children("a.bookTitle").text().strip()
        author = book_info.find("a.authorName").eq(0).text().strip()
        if author:
            biblio_info.append((title, author))
    
    if not biblio_info:
        return None

    scored_info = []
    
    for info in biblio_info:
        # book_title is assumed to be defined in the enclosing scope (it is not a
        # parameter of this function); author still holds the last value from the
        # loop above, as in the original snippet.
        title_score = fuzz.token_set_ratio(info[0], book_title)
        author_score = fuzz.token_set_ratio(info[1], author)
        total_score = title_score + author_score
        scored_info.append( (total_score, info) )

    scored_info.sort()
    return scored_info[-1][1]
Example 3
def count(text,KW):
	text = text.lower()
	
	sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
	sentences = sent_detector.tokenize(text.strip())
	adjmentions = []
	tokenizer = RegexpTokenizer(r'\w+')
	wordcount = 0

	for sentence in sentences:
		tokens = tokenizer.tokenize(sentence)
		words = len(tokens)
		wordcount += words
		if fuzz.token_set_ratio(KW,sentence)<30:
			score = 0
		else:	
			score = fuzz.token_set_ratio(KW,sentence) *.01
		adjmentions.append(score)

	aggregate = 0 
	for x in range(len(adjmentions)):

		aggregate += adjmentions[x]

	
	text = text.replace('   ', '')
	text = text.replace('.', ' ')
	text = text.replace('   ', '')
	character_count = len(text)
	return [aggregate,wordcount,character_count]
Example 4
def compare_strings(string_one, string_two):
    # Return the highest score among three fuzzywuzzy scorers.
    return max(
        fuzz.ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
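For illustration, a minimal check (using the classic fuzzywuzzy README strings) of how the three scorers above can disagree, which is why the function returns the maximum of them:

from fuzzywuzzy import fuzz

a = "new york mets vs atlanta braves"
b = "atlanta braves vs new york mets"
print(fuzz.ratio(a, b))             # lower: characters appear in a different order
print(fuzz.token_sort_ratio(a, b))  # 100: identical tokens once sorted
print(fuzz.token_set_ratio(a, b))   # 100: identical token sets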
Example 5
 def row_similarity(row):
     same_email = row.author_email == row.author_email_other
     name_similarity = fuzz.token_set_ratio(row.author_name,
                                            row.author_name_other)
     email_name_similarity = fuzz.ratio(row.email_name,
                                        row.email_name_other)
     name_to_email_similarity = fuzz.token_set_ratio(row.author_name,
                                                     row.name_from_email_other)
     return pd.Series(
         [same_email, name_similarity, email_name_similarity,
          name_to_email_similarity])
Example 6
def match(song, gdic):
    ftype = song[song.rfind('.'):].lower()
    try:
        if ftype == ".mp3":
            smp = MP3(song)
        elif ftype == ".wma":
            print("wma")
            return "False"
        elif ftype == ".flac":
            smp = FLAC(song)
        elif ftype == ".ogg":
            print("ogg")
            return "False"
        elif ftype in (".mp4", ".m4a"):
            smp = MP4(song)
        else:
            return False
    except IOError:
        return "delete"
    if ftype == ".flac":
        name = smp['title'][0]
        artist = smp['artist'][0]
        album = smp['album'][0]
    elif ftype == ".m4a":
        name = smp['\xa9nam'][0]
        artist = smp['\xa9ART'][0]
        album = smp['\xa9alb'][0] 
    else:
        try:
            name = smp["TIT2"].pprint()[5:].replace('[','(').replace(']',')')
            artist = smp["TPE1"].pprint()[5:].replace("Feat", "Featuring").replace("Andre 3000", "OutKast").replace("Big Boi", "OutKast")
            album = smp["TALB"].pprint()[5:]
        except KeyError:
            return False
    pmatch = [i for i in gdic if fuzz.token_set_ratio(name, i['title']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    pmatch = [i for i in pmatch if fuzz.token_set_ratio(artist, i['artist']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    pmatch = [i for i in pmatch if fuzz.token_set_ratio(album, i['album']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    #pmatch = [i for i in pmatch if ((('(' not in name) and ('(' not in i['title'])) or ((('(' in name) and ('(' in i['title'])) and (name[name.rindex("(") + 1:name.rindex(")")].lower() == i['title'][i['title'].rindex("(") + 1:i['title'].rindex(")")].lower())))]
    pmatch = [i for i in gdic if fuzz.token_sort_ratio(name, i['title']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    #print ([(i['title'], i['artist'], i['album'], i['durationMillis']) for i in pmatch])
    pmatch = [i for i in pmatch if abs(smp.info.length * 1000 - int(i['durationMillis'].encode('utf-8'))) < 1000]
    if len(pmatch) == 1:
        return pmatch[0]
    else:
        #print(name, artist, album, smp.info.length * 1000)
        return False
Example 7
 def compare_two_texts(self, string_a, string_b, normalize_value=True):
     """
     Compare two strings and return the value of the token set ratio algorithm;
     the value is normalized between 0 and 1.
     """
     if ((isinstance(string_a, unicode) and isinstance(string_b, unicode)) or
             (isinstance(string_a, str) and isinstance(string_b, str))):
         if normalize_value:
             return self.__normalized_value(fuzz.token_set_ratio(string_a, string_b))
         else:
             return fuzz.token_set_ratio(string_a, string_b)
     else:
         raise TypeError
Example 8
def fuzzyNameMatch(name1, name2):
	name1 = name1.lower()
	name2 = name2.lower()
	name1 = fuzz.asciidammit(name1)
	name2 = fuzz.asciidammit(name2)
	ratio = fuzz.token_set_ratio(name1,name2)	
	return ratio 
Example 9
def calculate_confidence(model_a, model_b, mapping='FIRST_PASS'):
    """Determine the similarity between model_a and model_b.

    Goes through the mappings and compares those attrs
    between each of the modules produced in the ``search`` function.

    :rtype float: 0.0 to 1.0, the degree that the two models are similar.

    """
    attr_map = _get_mapping(mapping, model_a)
    if not attr_map:
        return 0.0

    total_match = 0.0
    # This becomes our denominator for the arithmetic mean.
    num_attrs = 0.0
    for a_attr, b_attr in attr_map:
        _trans, a_attr = _unpack_a_attr(a_attr)
        a_value = getattr(model_a, a_attr)
        b_value = getattr(model_b, b_attr)
        if not a_value or not b_value:
            continue

        num_attrs += 1.0

        # Because we want a ratio, not a percentage
        ratio = fuzz.token_set_ratio(
            unicode(a_value), unicode(b_value)
        ) / 100.0
        total_match += ratio

    return total_match / max(num_attrs, 1)
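As a minimal, hypothetical illustration of the scaling and averaging above (the attribute values are made up, not taken from any real model):

from fuzzywuzzy import fuzz

pairs = [("The Hobbit", "Hobbit, The"), ("J. R. R. Tolkien", "Tolkien, J.R.R.")]
scores = [fuzz.token_set_ratio(a, b) / 100.0 for a, b in pairs]
print(sum(scores) / max(len(scores), 1))  # arithmetic mean in the 0.0-1.0 range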
Example 10
 def get_max_candidates(self, candidates, word, wDict):
     if len(candidates) == 1:
         return max(candidates,
                    key=wDict.get)
     elif len(candidates) == 0:
         return 'NO SUGGESTION'
     else:
         matched = 0
         synonyms = None
         old_word = None
         mapped = dict()
         list_value = list()
         for value in candidates:
             ratio = fuzz.token_set_ratio(word.lower().strip(), value.lower().strip())
             mapped[value] = ratio
             list_value.append(ratio)
         max_key = max(mapped.items(), key=operator.itemgetter(1))[0]
         max_ratio = max(mapped.items(), key=operator.itemgetter(1))[1]
         if list_value.count(max_ratio) > 1:
             for value, ratio in mapped.items():
                 if matched <= ratio:
                     # print(old_word, synonyms, matched)
                     if old_word is not None:
                         synonyms = self.get_soundex(word.lower().strip(), value.lower().strip(), old_word)
                         old_word = synonyms
                     else:
                         synonyms = value.lower().strip()
                         old_word = synonyms
                     matched = ratio
         else:
             synonyms = max_key
         return synonyms
Example 11
def fuzz_comparisons(x):
    out = {}
    out['fuzz_partial_ratio'] = fuzz.partial_ratio(*x)
    out['fuzz_ratio'] = fuzz.ratio(*x)
    out['fuzz_token_sort_ratio'] = fuzz.token_sort_ratio(*x)
    out['fuzz_token_set_ratio'] = fuzz.token_set_ratio(*x)
    return pd.Series(out)
Example 12
def process(translations, cmdList):

    print "translations", translations[0]

    bestNode = cmdList[len(cmdList) - 1].tree[0][0]  # no match found command
    bestNode.confidence = 40

    for translation in translations[0]:  # 1-4
        for cmd in cmdList:  # 10-20
            for nodeListId in range(len(cmd.tree)):
                if nodeListId > 0 or "Sherlock" in translation:
                    nodeList = cmd.tree[nodeListId]
                    for node in nodeList:  # 2-6
                        if node.isOpen():
                            for key in node.keys:  # 1-4
                                confidence = fuzz.token_set_ratio(key, translation)
                                if confidence > bestNode.confidence:
                                    bestNode.keys = node.keys
                                    bestNode.open = node.open
                                    bestNode.func = node.func
                                    if confidence > 60:
                                        print "Executing:", node.keys
                                        return cmdList.index(cmd), node
                                        # return node
    print "null command"
    return 0, cmdList[0].tree[0][0]
Example 13
def search123(request):
  global itertool
  itertool=itertools.count()
  searc=search1()
  if request.method=="POST":        
        searc=search1(request.POST or None)
        if 'button7' in request.POST:
            if 'd_box' in request.POST and request.POST['d_box'] and not request.POST['name']:
                item_map=item.objects.raw('SELECT * FROM `item` WHERE `category_id`=%s', [request.POST['d_box']])
                lis=[]
                for e in (item_map):
                    lis.append(e.id)
                price_map=item_done.objects.filter(item_id__in=lis).order_by('item_id')
                return render(request,'index.html',{'posts':price_map,'posts1':searc,'itertools':itertool})
            
            else:
              x=request.POST['name']
              sql="SELECT * FROM `item`"
              cursor.execute(sql)
              query=cursor.fetchall()
              lis=[]
              for e in range(len(query)):
                    y=str(query[e][1])
                    rat=fuzz.token_set_ratio(x,y)
                    if rat >= 75:
                        lis.append(query[e][0])
              price_map=item_done.objects.filter(item_id__in=lis).order_by('item_id','site_price')
              return render(request,'index.html',{'posts':price_map,'posts1':searc,'itertools':itertool})
  
  return render_to_response('index.html',{'posts1':searc},RequestContext(request))
Example 14
 def renaming(self, path, filename):
     filename = self.preview(filename)
     for element in os.listdir(path):
         if fuzz.token_set_ratio(filename, element) == 100:
             path_file = os.path.join(path, element)
             target = os.path.join(path, filename)
             os.rename(path_file, target)
Example 15
def check_match(row, class_type):
  and_accepted = True
  or_accepted = False
  columns = list()
  ratio = 0
  max_ratio = 0
  total_ratio = 0
  count = 0

  for entry in row:
    for word in inputs:
      ratio = fuzz.token_set_ratio(word, row[entry])
      if (ratio > 50):
        columns.append(entry) # to be highlighted later
        or_accepted = True
        total_ratio += ratio
        count += 1
        if ratio > max_ratio:
          max_ratio = ratio 
  if count != len(inputs):
    and_accepted = False
       
  if and_accepted:
    and_results.append({'type':class_type, 'ratio':total_ratio//count, 'columns': columns, 'row':row})
  elif or_accepted:
    or_results.append({'type':class_type, 'ratio':max_ratio, 'columns': columns, 'row':row})
Example 16
def match_popit(name):

    # pandas.DataFrame.from_csv() has been removed; read_csv with index_col=0
    # preserves the old default behaviour.
    df = pandas.read_csv('data/popit-persons.csv', index_col=0)

    for popit_name in df.itertuples():
        if fuzz.token_set_ratio(name.upper(),popit_name[1].upper()) > 95:
            return popit_name
Example 17
 def _enhance_element_info(self, sen, elements):
     lower_sen = sen.lower()
     lower_placeholder_set_ratio = []
     for i in elements:
         if i.placeholder:
             lower_placeholder_set_ratio.append(fuzz.token_set_ratio(i.placeholder.lower(), lower_sen))
     vec = help2vec.input_help_to_vec(lower_sen)
     enhanced = len(lower_placeholder_set_ratio) > 0 and max(lower_placeholder_set_ratio) > 50
     if len(vec) > 0:  # Means it might be a help text. Now we have to link this with the corresponding input
         logging.debug("Following sentence was detected as input help")
         logging.debug("Sentence: %s" % (sen))
         logging.debug("Vector: %s" % (str(vec)))
         e = utilities.match_help_to_element_NLP(elements, lower_sen)
         if e and not e.help:  # We found reference to a placeholder so fine.
             logging.debug("Found following element for input help by placeholder reference")
             logging.debug("Element: %s" % (str(e)))
             e.help = sen
             e.help_vector_string = json.dumps(vec)
             self._update_element(e)
             enhanced = True
         else: # We couldn't find reference to placeholder. So visual correlation
             try:  # Check if element still exists
                 elem = self.d.find_element_by_xpath("//*[contains(text(), '%s')]" % (sen))
                 if elem:
                     e = utilities.match_help_to_element_visually(elements, elem.location, elem.size)
                     if e and not e.help:  # We found reference to a placeholder so fine.
                         logging.debug("Found following element for input help by visual reference")
                         logging.debug("Element: %s" % (str(e)))
                         e.help = sen
                         e.help_vector_string = json.dumps(vec)
                         self._update_element(e)
                         enhanced = True
             except (InvalidSelectorException, NoSuchElementException):
                 pass
Example 18
def get_needed_songs(queries):
    global songs

    for query in queries:
        for i in range(0, 3):
            response = pool.get_next_api().audio.search(q = query
                                             , auto_complete = 1
                                             , performer_only = 1
                                             , offset = i * 250
                                             , count = 250
                                             , v = "5.44")
            count = response["count"]
            goted_songs = response["items"]
            for goted_song in goted_songs:
                if fuzz.token_set_ratio(goted_song["artist"], query) > 90:
                    song_id = goted_song["id"]
                    if songs.get(song_id) is None:
                        songs[song_id] = defaultdict()
                        songs[song_id]["id"] = goted_song["id"]
                        songs[song_id]["owner_id"] = goted_song["owner_id"]
                        songs[song_id]["artist"] = goted_song["artist"]
                        songs[song_id]["title"] = goted_song["title"]
            dumpData(songs, neededSongs)
            sleep(1)
        print("Needed songs count: ", len(songs.keys()))
Example 19
	def _author_similarity(self, other_author):
		if self.author and other_author:
			if fuzz.token_set_ratio(self.author, other_author) > 95:
				return True
			else: 
				return False
		return 'NAN'
Example 20
def top_token_set_ratio(values):
    """Return the best token set ratio match from fuzzywuzzy module."""
    scores = []
    for combo in combinations(values, 2):
        score = fuzz.token_set_ratio(combo[0], combo[1])
        tokens_0 = len(combo[0].split())
        tokens_1 = len(combo[1].split())
        if tokens_0 > tokens_1:
            value = combo[0]
            tokens = tokens_0
        elif tokens_0 < tokens_1:
            value = combo[1]
            tokens = tokens_1
        else:
            tokens = tokens_0
            value = combo[1]
            if len(combo[0]) <= len(combo[1]):
                value = combo[0]
        scores.append(FuzzySetScore(score, value, tokens))

    ordered = sorted(
        scores,
        reverse=True,
        key=lambda s: (s.score, s.tokens, 1000000 - len(s.value)))
    return ordered[0]
Example 21
def fw_token_set_ratio(question1, question2):
    fuzzy = []
    for q1, q2 in zip(question1, question2):
        token_set_score = fuzz.token_set_ratio(str(q1), str(q2)) / 100
        fuzzy.append([token_set_score])
    print("Created fuzz token_set_ratio feature")
    return np.array(fuzzy)
Example 22
def get_filter_link(link_choice,goal=None,min_score=None,max_limit=4,type=0):
    """
    Return the relevant links from a list of links.
    """
    if min_score:
        min_score = int(min_score)
    else:
        min_score = 60
    scored_link_list = []
    scored_link_list_raw = process.extract(goal,link_choice,limit=max_limit)
    logger.info("Score details for goal {0} with statistics {1}. minimum score {2}".format(goal,scored_link_list_raw,min_score))
    try:
        if scored_link_list_raw:
            for i in list(scored_link_list_raw):
                link = i[0]
                if int(type) != 1:
                    score = i[1]
                    if int(score) >= min_score:
                        scored_link_list.append(link)
                    logger.info("PARTIAL MATCH : Final score is {0} of url {1}  for goal {2}".format(score,link,goal))
                else:
                    score = fuzz.token_set_ratio(goal,link)
                    logger.info("EXACT MATCH : Final score is {0} of url {1}  for goal {2}".format(score,link,goal))
                    if int(score) >= min_score:
                        scored_link_list.append(link)
    except Exception:
        logger.exception("Error occurred in get_filter_link()")
    return scored_link_list
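A minimal sketch (made-up goal and URLs) of the two code paths above: process.extract returns (choice, score) pairs using its default scorer, while the type == 1 branch re-scores each link with token_set_ratio:

from fuzzywuzzy import fuzz, process

goal = "pricing page"
links = ["https://example.com/login", "https://example.com/pricing", "https://example.com/contact"]

for link, score in process.extract(goal, links, limit=2):
    print("extract:", score, link)

print("token_set_ratio:", fuzz.token_set_ratio(goal, links[1]))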
Example 23
	def _title_similarity(self, other_title):
		if self.title and other_title:
			if fuzz.token_set_ratio(self.title, other_title) > 95:
				return True
			else:
				return False
		return 'NAN'
Example 24
def fuzzy(products_name_set, listings):
    """
    The function that uses Levenshtein distance to determine matching pairs of
    products and listings
    :param products_name_set: Indexed product names (for faster matching)
    :param listings: Listings to be matched
    :return: A dictionary containing the matched products with all their listings
    """
    final_products = defaultdict(list)
    for listing in listings:
        possible_products = set()
        for product_name in products_name_set:
            token_set_ratio = fuzz.token_set_ratio(listing["new_title"], product_name)
            if token_set_ratio == 100:
                possible_products.add(product_name)

        #More than one possible product found
        if len(possible_products) > 1:
            for possible_product in possible_products:
                partial_ratio = fuzz.partial_ratio(listing["new_title"], possible_product)
                if partial_ratio == 100:
                    final_products[possible_product].append(listing)
        else:
            for possible_product in possible_products:
                final_products[possible_product].append(listing)
    return final_products
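A small, hypothetical illustration of the two-step rule used in fuzzy() above: token_set_ratio == 100 selects candidate products, and partial_ratio == 100 breaks ties when several candidates remain (the product names and listing title are invented):

from fuzzywuzzy import fuzz

listing_title = "canon powershot sx130 is black 12.1 mp"
for product in ["Canon PowerShot SX130 IS", "Canon PowerShot SX30 IS"]:
    print(product,
          fuzz.token_set_ratio(listing_title, product),
          fuzz.partial_ratio(listing_title, product))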
Example 25
 async def _on_answer(self, conv):
     channel = conv.channel
     answer = conv.meta['answer'].lower()
     win_event = None
     for event in conv.events:
         if 'trivia' in event:
             continue
         score = fuzz.token_set_ratio(event['text_clean'], answer)
         if score > 80:
             win_event = event
             break
         elif score > 50:
             try:
                 await self.bot.send_message(channel,
                     "<@{}> Not quite...".format(event['user']))
             except KeyError as e:
                 import traceback
                 print("\n\nSomething went wrong in trivia")
                 traceback.print_exc()
         event['trivia'] = True
         conv.meta['attempts'] += 1
     if win_event is not None:
         user = win_event['user']
         self.status[channel]['scores'][user] += conv.meta['value']
         await self.bot.send_message(channel,
             "<@{}> got it right! The answer was {}".format(user, answer))
         conv.done()
         await self.ask_question(channel)
     return True
Example 26
def getRatio(var1, var2, alg):

    r1test = 40
    r2test = 100
    r3test = 100
    r4test = 90 # 85 is probably too low --- too many FP
    
    # Keep alg as a dummy parameter; it may turn out to be unimportant.
    # The quality of the results seems to improve when two or more of the scores
    # are correlated: r1test can be lowered as long as r4 remains high.
    
    r1 = fuzz.ratio(var1,var2)
    r2 = fuzz.partial_ratio(var1,var2)
    r3 = fuzz.token_sort_ratio(var1,var2)
    r4 = fuzz.token_set_ratio(var1,var2)

    if r1 >= r1test:
        if r4 >= r4test:
            ratio = 100
            #reportRatio(var1, var2)
        else:
            ratio = 0
    else:
        ratio = 0

    return(ratio)
Example 27
def match_bus(h,st):
	global store_bus	
	l=[]
	for x in h:
		temp = x
		no_list = re.findall(r'\d+', x)
		for n in no_list:
			# Pad every number with spaces. (The original reset temp from x on each
			# pass, so only the last number ended up padded.)
			temp = temp.replace(n, ' ' + n + ' ')
		temp = temp.lower().replace('-', ' ').replace('.', ' ')
		temp = ' '.join(temp.split())
		if fuzz.token_set_ratio(st, temp) > 60:
			l.append([x, temp])
	print("buses",l)
	ma=0
	
	if len(l):
		for i in range(len(l)):
			if partial_ratio2(st,l[i][1]) > partial_ratio2(st,l[ma][1]):
				ma=i
		p=l[ma][1]
		
		for i in range(len(l)):
			if partial_ratio2(st,l[i][1]) == partial_ratio2(st,l[ma][1]) and i!=ma:
				print(l[i][1],l[ma][1])
				store_bus=l[ma][1]
				return 2
		
		print("found bus",l[ma][0])
		bus_no_l.add(l[ma][0])
		return 1
	return 0
Example 28
    def match(self, listing):
        ''' Decide if this listing matches this product. In this version,
        we only match one Product at most ("A single price listing may match
        at most one product."), even though some listings are for items that
        are suitable for several products.

        Returns True or False.

        :param listing: A Listing object.
        '''
        # token_set_ratio() checks for all the 'words' in the first argument
        # existing in the second argument. Case-insensitive even. Exactly how
        # I was going to code it up until I found the fuzzywuzzy library,
        # which has the advantage of being previously debugged.
        score = fuzz.token_set_ratio(self.name, listing.title)
        if score == 100:
            # Exact fuzzy match on my product name inside the listing.
            return True

        manu_score = fuzz.token_set_ratio(self.manufacturer, listing.title)
        family_score = fuzz.token_set_ratio(self.family, listing.title) if self.family else 0
        model_score = fuzz.token_set_ratio(self.model, listing.title) if self.model else 0
        if ' ' in self.model and model_score < 100:
            # Canon SX130 IS vs. SX130IS...
            model_nospaces = self.model.replace(' ', '')
            if fuzz.token_set_ratio(model_nospaces, listing.title) == 100:
                model_score = 100

        if manu_score == 100 and family_score == 100 and model_score == 100:
            # Seems legit.
            return True

        # Generating false positives (for example 'Canon_IXUS_300_HS' is
        # matching "Canon PowerShot ELPH 300 HS (Black)". Turning this
        # off does make us miss "Canon NB-7L Lithium-Ion Battery for G10, G11,
        # G12 Cameras" unfortunately.
        #
        #if manu_score == 100 and model_score == 100:
        #    # People sometimes call things by manufacturer and model number.
        #    # Might be ambiguous though...
        #    return True

        if family_score == 100 and model_score == 100:
            # I'm typing on an IdeaPad Y500, for example.
            return True

        return False
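The comment at the top of match() relies on token_set_ratio() scoring 100 whenever every token of one string also occurs in the other, regardless of case or extra words. A minimal check of that behaviour (the product and listing strings are invented):

from fuzzywuzzy import fuzz

product_name = "Canon PowerShot SX130 IS"
listing_title = "canon powershot sx130 is 12.1 mp digital camera (black)"

# Every token of product_name appears in listing_title, so the set-based score is 100
# even though the listing has extra words and different casing.
assert fuzz.token_set_ratio(product_name, listing_title) == 100

# A plain ratio on the same pair stays well below 100 because of the extra text.
print(fuzz.ratio(product_name, listing_title))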
Example 29
def best_match(s, categories, top_n=5):
    """Return the top N best matches from your categories."""
    scores = []
    for cat in categories:
        scores.append((cat, fuzz.token_set_ratio(s, cat)))

    scores = sorted(scores, key=lambda x: x[1])
    return scores[-top_n:]
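A quick usage sketch, assuming best_match() above is in scope; the category names and query are made up. Because the scores are sorted in ascending order, the last element of the returned list is the single best match:

categories = ["groceries", "restaurants", "gasoline", "rent"]
matches = best_match("SHELL GAS STATION 042", categories, top_n=2)
print(matches)         # two (category, score) pairs, best match last
print(matches[-1][0])  # the best-matching category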
Example 30
def fuzzy_search(search):
	#import json file with wine ratings/reviews into pandas dataframe
	df_ratings = pd.read_json('../data/wine_ratings.json')
	#execute fuzzy search and save results
	df_ratings['score'] = df_ratings['name'].map(lambda x: fuzz.token_set_ratio(search, x))
	# DataFrame.sort() was removed in newer pandas; sort_values is the equivalent.
	df_sorted = df_ratings.sort_values(by='score', ascending=False)

	return df_sorted
Example 31
# Common features
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = np.abs(data.len_q1 - data.len_q2)
data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
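# A minimal, hypothetical check of the fuzz-based features above on a two-row frame
# (the real script applies the same lambdas to the full Quora `data` frame).
# import pandas as pd; from fuzzywuzzy import fuzz
# toy = pd.DataFrame({
#     "question1": ["How do I learn Python?", "What is machine learning?"],
#     "question2": ["What is the best way to learn Python?", "Explain machine learning"],
# })
# toy["fuzz_token_set_ratio"] = toy.apply(
#     lambda x: fuzz.token_set_ratio(str(x["question1"]), str(x["question2"])), axis=1)
# print(toy[["fuzz_token_set_ratio"]])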

print('vector features ...')
# 
model = gensim.models.KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

# 
norm_model = gensim.models.KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)
norm_model.init_sims(replace=True)
data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)


question1_vectors = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question1.values)):
Example 32
def predict_chip_dict(wdir, input_pattern_str, bamExt, fromBAM=None):
    """
    Predict a chip_dict from a set of BAM files.
    ChIP input/control samples are identified from input_pattern (default: 'input');
    for each remaining sample the best input sample (by fuzzywuzzy score) is selected.
    The chip_dict is written as YAML to the workflow working directory.
    Whether a sample is broad or narrow is predicted from its histone mark pattern.
    """
    pat = "|".join(re.split(',| |\\||;', input_pattern_str))
    input_pat = r".*(" + pat + ")"
    clean_pat = r"" + pat + ""
    pat1 = re.compile(clean_pat, re.IGNORECASE)

    if fromBAM:
        infiles = sorted(glob.glob(os.path.join(fromBAM, '*' + bamExt)))
    else:
        infiles = sorted(
            glob.glob(os.path.join(wdir, 'filtered_bam/', '*.bam')))
    samples = get_sample_names_bam(infiles, bamExt)

    chip_dict_pred = {}
    chip_dict_pred["chip_dict"] = {}
    print(
        "---------------------------------------------------------------------------------------"
    )
    print("Predict Chip-seq sample configuration")
    print(
        "---------------------------------------------------------------------------------------"
    )
    print("\nSearch for Input/control samples...")

    input_samples = set([])
    for i in samples:
        if re.match(input_pat, i, re.IGNORECASE):
            print("...found: ", i)
            input_samples.add(i)

    print("\nTry to find corresponding ChIP samples...")

    for i in samples:
        if i in input_samples:
            continue

        print(
            "\n sample: ",
            i,
        )
        matches_sim = {}
        for j in input_samples:
            c_clean = pat1.sub("", j)
            sim1 = fuzz.ratio(c_clean, i) + fuzz.partial_ratio(
                c_clean, i) + fuzz.token_sort_ratio(
                    c_clean, i) + fuzz.token_set_ratio(c_clean, i)
            matches_sim[j] = sim1 / 4

        sim = 0
        final_matches = set([])
        for key, value in sorted(matches_sim.items(),
                                 key=lambda k: (k[1], k[0]),
                                 reverse=True):
            if value >= sim:
                final_matches.add(key)
                print("   top matching input sample by score: %s = %s" %
                      (key, value))
                sim = value

        tmp = ':'.join(list(final_matches))

        if len(final_matches) > 1:
            tmp = "__PLEASE_SELECT_ONLY_ONE_CONTROL__:" + tmp
        elif len(final_matches) == 0:
            print("No control sample found!")

        chip_dict_pred["chip_dict"][i] = {}
        chip_dict_pred["chip_dict"][i]['control'] = tmp
        if re.match(".*(H3K4me1|H3K36me3|H3K9me3|H3K27me3).*", i,
                    re.IGNORECASE):
            chip_dict_pred["chip_dict"][i]['broad'] = True
        else:
            chip_dict_pred["chip_dict"][i]['broad'] = False

    outfile = os.path.join(wdir, "chip_seq_sample_config.PREDICTED.yaml")
    write_configfile(outfile, chip_dict_pred)
    print(
        "---------------------------------------------------------------------------------------"
    )
    print("Chip-seq sample configuration is written to file ", outfile)
    print(
        "Please check and modify this file - this is just a guess! Then run the workflow with it."
    )
    print(
        "---------------------------------------------------------------------------------------"
    )
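A minimal sketch of how the matching score above is formed: the input pattern is stripped from the control sample name and four fuzzywuzzy scores are averaged (the sample names and the simple replace() stand-in for pat1.sub() are hypothetical):

from fuzzywuzzy import fuzz

chip_sample = "H3K27ac_liver_rep1"
input_sample = "Input_liver_rep1"
c_clean = input_sample.replace("Input_", "")  # stands in for pat1.sub("", j)

sim = (fuzz.ratio(c_clean, chip_sample)
       + fuzz.partial_ratio(c_clean, chip_sample)
       + fuzz.token_sort_ratio(c_clean, chip_sample)
       + fuzz.token_set_ratio(c_clean, chip_sample)) / 4
print(sim)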
	# print "Row id: ", loc[0]

	query_result = google_places.nearby_search(
		lat_lng={'lat' : lat, 'lng' : lon},
		rankby='prominence', radius=100)

	similarity = 0.0
	bestPlace = ''
	bestPlaceType = ''

	#returns a list of places
	for place in query_result.places:
		#get the place name
		placeName = (place.name).encode("utf8").lower()
		#do fuzzy match with the tag string for that location coordinate
		simNew = fuzz.token_set_ratio(text, placeName)
		
		#find the place with the highest similarity with the tag string
		if simNew > similarity:
			similarity = simNew
			bestPlace = placeName
			# query for place type or category only if it isn't present in the dictionary
			# saving API limit
			if placeName not in placeCategory.keys():
				place.get_details()
				bestPlaceType = place.details[u'types']
			
			else:

				bestPlaceType = placeCategory[placeName]
Example 34
def get_fuzzy_similarity(sent1, sent2):
    sim = fuzz.token_set_ratio(sent1, sent2)
    if sim == 0:
        return 0
    else:
        return sim / 100
Example 35
    def run_quickstart(self):
        from fuzzywuzzy import fuzz
        # [START language_quickstart]
        # Imports the Google Cloud client library
        # [START language_python_migration_imports]
        from google.cloud import language
        from google.cloud.language import enums
        from google.cloud.language import types
        # [END language_python_migration_imports]
        from database import DatabaseManager, Note
        dataIn = DatabaseManager("notes")
        dataOut = DatabaseManager("super_notes")

        # Instantiates a client
        # [START language_python_migration_client]
        client = language.LanguageServiceClient()
        # [END language_python_migration_client]

        text1 = dataOut.get_note_key(self.note1_key)["note"]
        text2 = dataIn.get_note_key(self.note2_key)["note"]

        # __________________________
        #       READ VALUE 1
        # __________________________

        # with open('input1.txt', 'r') as file:
        #     text1 = file.read().replace('\n', '')

        # ___________________________
        #       READ VALUE 2
        # ___________________________

        # with open('input2.txt', 'r') as file2:
        #     text2 = file2.read().replace('\n', '')

        words1 = text1.split(".")
        words2 = text2.split(".")

        # Strip a leading space from each sentence. (The original loops reassigned
        # the loop variable only, which left the lists unchanged.)
        words1 = [x[1:] if x[:1] == " " else x for x in words1]
        words2 = [x[1:] if x[:1] == " " else x for x in words2]

        keywords1 = []
        key_sentances1 = ""
        key_sent_array_1 = []
        keywords2 = []
        key_sentances2 = ""
        key_sent_array_2 = []

        # The text to analyze
        document1 = types.Document(content=text1,
                                   type=enums.Document.Type.PLAIN_TEXT)

        document2 = types.Document(content=text2,
                                   type=enums.Document.Type.PLAIN_TEXT)

        outputText = ""

        # Detects the sentiment of the text
        response1 = client.analyze_entities(
            document=document1,
            encoding_type='UTF32',
        )
        for entity in response1.entities:
            if entity.salience > 0.015:
                keywords1.append(entity.name)
                print('=' * 20)
                print('name: {0}'.format(entity.name))
                print('type: {0}'.format(entity.type))
                print('metadata: {0}'.format(entity.metadata))
                print('salience: {0}'.format(entity.salience))

        response2 = client.analyze_entities(
            document=document2,
            encoding_type='UTF32',
        )
        for entity in response2.entities:
            if entity.salience > 0.015:
                keywords2.append(entity.name)
                print('=' * 20)
                print('name: {0}'.format(entity.name))
                print('type: {0}'.format(entity.type))
                print('metadata: {0}'.format(entity.metadata))
                print('salience: {0}'.format(entity.salience))

        print("Keys 1:", keywords1)
        print("Keys 2:", keywords2)

        for x in words1:
            for i in keywords1:
                if (x.find(i) > -1) and x not in key_sentances1:
                    key_sentances1 += x + "\n"
                    key_sent_array_1.append(x)

        for x in words2:
            for i in keywords2:
                if (x.find(i) > -1) and x not in key_sentances2:
                    key_sentances2 += x + "\n"
                    key_sent_array_2.append(x)

        #print(key_sentances2)

        #out = open("output1.txt", "w")
        #out.write(key_sentances1)
        #out.close()

        #out = open("output2.txt", "w")
        #out.write(key_sentances2)
        #out.close()

        newVals = [" "]

        for x in key_sent_array_1:
            canAdd = True
            for i in newVals:
                Token_Set_Ratio = fuzz.token_set_ratio(x, i)
                if Token_Set_Ratio > 80:
                    canAdd = False
            if canAdd:
                newVals.append(x)

        for x in key_sent_array_2:
            canAdd = True
            for i in newVals:
                Token_Set_Ratio = fuzz.token_set_ratio(x, i)
                if Token_Set_Ratio > 50:
                    canAdd = False
            if canAdd:
                newVals.append(x)

        newValsString = ""

        for x in newVals:
            newValsString += x + "\n"

        #writing to database
        super_note = Note(2, "physics", newValsString)
        dataOut.add_note_to_db(super_note)

        #_______________________________________
        #       ADDING OUTPUT
        #_______________________________________

        # final = open("final.txt", "w")
        # final.write(newValsString)
        # final.close()

        return newValsString


# n = NoteAnalysis("-LqyiulvtclaFSFsC4_Q", "-Lqyl7NHN9vWsMeJYBIM")
Example 36
    def run(self):
        st = datetime.datetime.now()
        print('1. [start] init_run ------------------------------')

        init_confidence = self.confidence
        self.init_run()
        et = datetime.datetime.now()
        print('1. [end] init_run => ', et - st)

        st = datetime.datetime.now()
        print('2. [start] run_batch ------------------------------')

        # Cluster tightly, based mainly on nouns (accuracy drops if this is repeated too many times)
        self.confidence = init_confidence
        for i in range(2):

            if self.confidence >= 70:

                st_1 = datetime.datetime.now()

                self.run_batch(noun=True)
                self.confidence = self.confidence - 5

                et_1 = datetime.datetime.now()
                print('2-1. [end] run_batch noun-------', i + 1,
                      'th run_batch => ', et_1 - st_1)

            elif self.confidence < 70:
                break

        # Loosen the clustering and include verbs as well
        self.confidence = init_confidence
        for i in range(self.batch_size):
            if self.confidence >= 70:

                st_1 = datetime.datetime.now()
                self.run_batch(noun=False)
                self.confidence = self.confidence - 3

                et_1 = datetime.datetime.now()
                print('2-2. [end] run_batch verb-------', i + 1,
                      'th run_batch => ', et_1 - st_1)

            elif self.confidence < 70:
                break

        et = datetime.datetime.now()
        print('2. [end] run_batch => ', et - st)

        # merge run > compare the representative texts between groups and merge the groups that match
        # reform run > groups with at most two examples but a long representative text are unlikely to
        #              absorb other examples, so split them and re-match them against the big groups
        if self.merge:
            st = datetime.datetime.now()
            print('3. [start] merge_run ------------------------------')
            self.merge_run()
            et = datetime.datetime.now()
            print('3. [end] merge_run => ', et - st)

            st = datetime.datetime.now()
            print('4. [start] reform_run ------------------------------')
            self.reform_run()
            et = datetime.datetime.now()
            print('4. [end] reform_run => ', et - st)

        # Final comparison

        self.confidence = init_confidence
        tmp_clusters = []
        for cluster in self.clusterings:
            if len(cluster['texts']) > 2:
                cluster['texts'] = []
                tmp_clusters.append(cluster)

        text_test = []
        for i, text in enumerate(self.before_texts):
            convert_text = self.filtering(str_list=[text], noun=True)
            for cluster in tmp_clusters:
                this_ratio = fuzz.token_set_ratio(cluster['totalText'], text)
                conv_ratio = fuzz.token_set_ratio(cluster['totalText'],
                                                  convert_text)
                if this_ratio >= 70 or conv_ratio >= 75:
                    cluster['texts'].append(text)
                    text_test.append(text)
                    break
        print(len(self.before_texts), len(list(set(self.before_texts))),
              len(text_test))

        final_clusters = []
        for cluster in tmp_clusters:
            if len(cluster['texts']) > 0:
                final_clusters.append(cluster)

        self.clusterings = final_clusters

        return self.clusterings
Example 37
def get_location_data(location_name, api_key):
    data_dict = {'q': location_name, 'key': api_key}

    loc_by_query = LocationByQuery(data_dict)
    q_result_list = loc_by_query.get_address

    time.sleep(1)

    print(f'{location_name}: # of results:- {len(q_result_list)}')

    main_results_list = []
    for result in q_result_list:
        if 'countryRegionIso2' not in result.keys():
            result['countryRegionIso2'] = None
        if result['countryRegionIso2'] != 'US':
            continue
        else:
            result_index = q_result_list.index(result)
            q_f_address = result['formattedAddress']
            if 'locality' in result.keys():
                q_f_locality = result['locality']
            else:
                q_f_locality = None
            item_list = [
                result_index, data_dict['q'], q_f_locality, q_f_address
            ]
            main_results_list.append(item_list)
    #             print(item_list, result)

    pre_output_dict = {}
    for res_list in main_results_list:
        locality_TokenSetRatio = fuzz.token_set_ratio(res_list[1], res_list[2])
        f_address_TokenSetRatio = fuzz.token_set_ratio(res_list[1],
                                                       res_list[3])
        pre_output_dict[res_list[0]] = [
            locality_TokenSetRatio, f_address_TokenSetRatio
        ]

    # -------------------------------Filtering Based on Locality match score
    locality_score_list = []
    max_locality_score = 0
    max_locality_score_key = 0

    for key, val in pre_output_dict.items():
        locality_score = pre_output_dict[key][0]

        if locality_score > max_locality_score:
            max_locality_score = locality_score
            max_locality_score_key = key
        else:
            continue
    # -----------------------------Filtering original dict based on locality score
    filtered_pre_output_dict = {}
    for key, val in pre_output_dict.items():
        if pre_output_dict[key][0] == max_locality_score:
            filtered_pre_output_dict[key] = val
        else:
            continue
    #     print(filtered_pre_output_dict)

    # ------------------Get required index based on resulting f_address_score
    max_f_address_score = 0
    f_address_indices = []

    if max_locality_score == 0:
        used_output_dict = pre_output_dict
    else:
        used_output_dict = filtered_pre_output_dict

    for key, val in used_output_dict.items():
        f_address_score = val[1]
        if f_address_score > max_f_address_score:
            max_f_address_score = f_address_score

    # Keep only the indices that actually reach the maximum formatted-address score.
    # (The original appended every running improvement, so index [0] below could
    # point at a lower-scoring result.)
    f_address_indices = [
        key for key, val in used_output_dict.items()
        if val[1] == max_f_address_score and max_f_address_score > 0
    ]

    if len(f_address_indices) == 0:
        return ''
    min_index_for_max_f_address_score = f_address_indices[0]

    index_to_use = min_index_for_max_f_address_score
    required_data = q_result_list[index_to_use]
    print(f'{location_name} : {required_data}')
    print('------------------------------------------------------------')

    return required_data
Example 38
 def testPartialTokenSetRatio(self):
     self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5),100)
Example 39
from fuzzywuzzy import fuzz
import re
for s in scrutins.values():
    estamd = re.match(r'.*l\'amendement n. *([0-9]+) de (.*)', s['desc'])
    if 0:
        namd = estamd.groups()[0]
        sig = estamd.groups()[1]
        dos = amds.get(s['dossierlien'], None)
        if not dos:
            continue
        candidats = []
        for _amds in dos:
            if namd in _amds.keys():
                candidats.append((fuzz.token_set_ratio(
                    sig,
                    _amds[namd]['signataires'].split(',')[0]), _amds[namd]))

        _amds = sorted(candidats, key=lambda x: x[0], reverse=True)[0][1]
        amd_detail = json.loads(
            requests.get(
                'http://www2.assemblee-nationale.fr/recherche/query_amendements?id='
                + _amds['id'] + '&leg=15&typeRes=doc').content)
        fields = amd_detail['infoGenerales']['description_schema'].split(
            '|') + ['autre', 'autre1', 'autre2']
        _amdscompl = [
            dict((fields[i], v) for i, v in enumerate(elt.split('|')))
            for elt in amd_detail['data_table']
        ][0]
        _amdscompl.update(_amds)
        s['reference'] = _amdscompl
Example 40
 def _similarityScore(s1, s2):
     return fuzz.token_set_ratio(s1, s2)
Example 41
    async def lookup(self, ctx, *args):
        """Takes in subject name and allows you to look up courses and sections in that subject"""

        # Store their lookup string, store list of subject strings
        lookup_string = " ".join(args)
        subject_options = [x["description"] for x in subject_list]
        subject_match = (0, "")

        # Go through all subject options and find the best fuzzy search match to lookup string
        for subject in subject_options:
            ratio = fuzz.token_set_ratio(lookup_string, subject)
            if ratio > subject_match[0]:
                subject_match = (ratio, subject)

        # Initialize mapping from course titles to objects, subject titles to objects, and list for course titles
        courses = {x["title"]: x for x in classes_dict[subject_match[1]]}
        subjects = {x["description"]: x for x in subject_list}
        course_titles = [k for (k, v) in courses.items()]

        # Separate course titles by commas
        course_str = ", ".join(course_titles)

        # Display all the course options from the best match
        embed = discord.Embed(
            title=f"Showing results for {subject_match[1]}",
            description="Please choose a course",
            color=0xFF0000,
        )
        embed.add_field(name="Type in a course name to see available sections",
                        value=course_str)
        await ctx.send(embed=embed)

        def check(m):
            """Quick check to make sure only the person in the game and channel can respond"""
            return m.channel == ctx.channel and m.author == ctx.author

        # Wait for their course choice
        msg = await self.bot.wait_for("message", check=check)

        # Keep prompting until it's a valid course
        while msg.content.upper() not in course_titles:
            await ctx.send(
                "Please enter course name exactly, case doesn't matter")
            msg = await self.bot.wait_for("message", check=check)

        # Get their course object from the map
        chosen_course = courses[msg.content.upper()]

        # Set up an embed title and description with their course
        embed = discord.Embed(
            title=f"{subject_match[1]}, {msg.content.upper()}",
            description=
            f'01:{subjects[subject_match[1]]["code"]}:{chosen_course["courseNumber"]}',
            color=0xFF0000,
        )

        # Go through all the sections of the course
        for section in chosen_course["sections"]:
            # Set up variables describing the section
            index = section["index"]
            status = "open" if check_open(index) else "closed"
            number = section["number"]
            profs = "; ".join([x["name"] for x in section["instructors"]])

            # Add a field for that section
            embed.add_field(name=f"{number}, {index}\n{profs}",
                            value=f"Status: {status}")

        # Send the section data
        await ctx.send(embed=embed)
Example 42
    reader = csv.DictReader(csvfile)
    for row in reader:
        nameList.append(str(row[columnName]))
counter = len(nameList)
f=csv.writer(open(fileName[:fileName.index('.')]+'NearMatches.csv','w'))
f.writerow(['percentage']+['name1']+['name2'])
completeNearMatches = []
for name in nameList:
    counter -= 1
    print('Rows remaining: ', counter)
    for name2 in nameList:
        if name != name2:
            ratio = fuzz.ratio(name, name2)
            partialRatio = fuzz.partial_ratio(name, name2)
            tokenSort = fuzz.token_sort_ratio(name, name2)
            tokenSet = fuzz.token_set_ratio(name, name2)
            avg = (ratio+partialRatio+tokenSort+tokenSet)/4
            if avg > threshold:
                # Sort only the name pair so (A, B) and (B, A) deduplicate; sorting
                # the mixed [avg, name, name2] list would fail under Python 3.
                nearMatch = [avg] + sorted([name, name2])
                if nearMatch not in completeNearMatches:
                    completeNearMatches.append(nearMatch)
        else:
            pass

for nearMatch in completeNearMatches:
    f.writerow([nearMatch[0]]+[nearMatch[1]]+[nearMatch[2]])

elapsedTime = time.time() - startTime
m, s = divmod(elapsedTime, 60)
h, m = divmod(m, 60)
Example 43
    def create_check(self,
                     df,
                     columns_merge=['nom_etablissement', 'adresse'],
                     additional_columns_in_check=[],
                     TRESHOLD_INF=50,
                     TRESHOLD_SUP=90,
                     common_words=[],
                     merge_also_equality=False):
        """
        Create consolidate_check.xlsx, in which the checker puts one cross per match
        (i.e. for a triplet you put 3 crosses).

        Inputs:
        An Excel file containing a column 'code_postal', converted into a pandas DataFrame.

        columns_merge : names of the columns on which the merge is performed.
        Do not include the postal code.
        More than one feature can be put in the columns of df_check;
        however, if you put just nom_etablissement in columns_merge, the algorithm works nicely.
        We do not recommend including the address.

        Return:
        The algorithm returns an Excel file in which the human checker
        has to put an 'x' in the 'check' column.
        If the score falls between the two thresholds, a manual check is required.
        """

        # Conformity tests on the Excel files are assumed to have been done already in Alteryx.
        # Possible future improvement: count each word and drop the most frequent ones, or weight by the inverse of their frequency.

        # Put the index in the slicing so the rows can easily be written to duplicates_check.xlsx
        df = df.reset_index(drop=True)
        df["id_duplicates"] = df.index.values

        # Normalise the inputs without changing the output of final_duplicates.xlsx.
        # Not strictly necessary, but in the Excel files code_postal is sometimes encoded as int and sometimes as str.
        for feature in columns_merge:
            df[feature] = df[feature].apply(str)

        # df which will be exported into Excel as consolidate_check
        df_check = pd.DataFrame()

        postal_codes = list(set(df['code_postal']))
        match_id = -1
        print("Analyzing the duplicates ...")
        for nbr_postal_code, postal_code in enumerate(postal_codes):
            # One matrix per postal code

            print(nbr_postal_code, len(postal_codes))

            df_postal_code = df[df['code_postal'] == postal_code]

            # for each line in the short_excel, we seek the corresponding line in the long_excel
            for i, row_short in df_postal_code.iterrows():
                match_id += 1

                if len(df_postal_code) > 0:

                    # Contains all the lines of the long Excel which match sufficiently with line i of the short Excel
                    df_match = pd.DataFrame()

                    for j, row_long in df_postal_code.iterrows():
                        if j > i:
                            # matching_score > TRESHOLD_SUP : very very probably a match
                            # matching_score between the two TRESHOLD : manual check
                            # matching_score < TRESHOLD_INF : very very unlikely to be a match

                            # We want to calculate the minimum between the matching score of the address and of the name
                            min_score = 100
                            for elem_merge in columns_merge:
                                str_long = row_long[elem_merge].lower()
                                str_short = row_short[elem_merge].lower()

                                # Remove the frequent words so the duplicate check is also performed without them
                                str_long_without = strip_frequently_used_word(
                                    str_long, common_words)
                                str_short_without = strip_frequently_used_word(
                                    str_short, common_words)
                                score_without = fuzz.token_set_ratio(
                                    str_long_without, str_short_without)
                                score_with = fuzz.token_set_ratio(
                                    str_long, str_short)

                                # We want to find all the duplicates, so we take the maximum between score_without and score_with
                                score = max(score_with, score_without)
                                if score > TRESHOLD_INF:
                                    # min_score = min(matching_score(address), matching_score(name))
                                    if score < min_score:
                                        min_score = score
                                else:
                                    min_score = score
                                    # We break because min_score = min(matching_score(address), matching_score(name)),
                                    # and we already know that either the two addresses or the two names are sufficiently
                                    # different, so there is no need to compare the other features.
                                    break

                            # We copy the line in the df_match if the lines matches sufficiently
                            if min_score > TRESHOLD_INF:
                                row_long['match_id'] = match_id
                                row_long['source_duplicates'] = 'long'
                                row_long['first_line_match_id'] = 0
                                row_long['matching_score'] = min_score

                                df_match = df_match.append(row_long,
                                                           sort=False)

                    # We write down the potential best candidates after the short line
                    row_short['match_id'] = match_id
                    row_short['first_line_match_id'] = 1

                    if len(df_match) == 0:
                        # If no potential match has been found, we just write down the single short line
                        row_short['source_duplicates'] = 'single'
                        df_check = df_check.append(row_short, sort=False)
                    else:
                        # We sort the matches by putting the best matches first
                        df_match = df_match[
                            df_match['matching_score'] > TRESHOLD_INF]
                        df_match = df_match.sort_values(by=['matching_score'],
                                                        ascending=False)
                        df_match_max = df_match[
                            df_match['matching_score'] == max(
                                df_match['matching_score'])]
                        df_match_max = df_match_max.reset_index(drop=True)

                        if df_match_max.loc[0,
                                            'matching_score'] > TRESHOLD_SUP:
                            # if the score of the best matches is high enough, they are duplicates
                            if len(df_match_max) == 1 or len(
                                    df_match_max) > 1 and merge_also_equality:
                                row_short[
                                    'source_duplicates'] = 'automatically merged'
                                df_match_max[
                                    'source_duplicates'] = 'automatically merged'
                                row_short['check'] = 'x'
                                df_match_max.loc[:, 'check'] = 'x'
                            else:
                                # if several rows tie on the same best matching score, we let the human checker select the duplicates
                                row_short[
                                    'source_duplicates'] = 'not merged because equality'
                                df_match_max[
                                    'source_duplicates'] = 'not merged because equality'
                            df_check = df_check.append(row_short, sort=False)
                            df_check = df_check.append(df_match_max,
                                                       sort=False)

                            #  every time we do an append like df_check = df_check.append(df_match_max, sort=False), the code slows down considerably because append copies the whole df_check. This problem was patched in the consolidation step; the same approach could be reused here if needed
                        else:
                            # if all matching scores are under TRESHOLD_SUP, we write down all the matches
                            row_short['source_duplicates'] = 'manual check'
                            row_short['check'] = ''
                            df_match['source_duplicates'] = 'manual check'
                            df_check = df_check.append(row_short, sort=False)
                            df_check = df_check.append(df_match, sort=False)

                else:
                    print("No ", postal_code, "fund in the long Excel file.")

        # Keeping only the merge columns
        df_check = df_check[columns_merge + additional_columns_in_check + [
            'code_postal', 'match_id', "first_line_match_id", 'matching_score',
            'source_duplicates', 'id_duplicates', "check"
        ]]
        df_check = df_check.reset_index()
        return df_check
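
A minimal, self-contained sketch of the scoring rule used above, assuming placeholder threshold values and a simplified stand-in for strip_frequently_used_word (both hypothetical here, not the project's actual settings): each merge column is scored with and without the common words, the better of the two scores is kept, and the minimum across columns decides the pair, so a single clearly different column rules the match out early.

from fuzzywuzzy import fuzz

# Hypothetical thresholds; the real values are defined elsewhere in the project.
TRESHOLD_INF = 60
TRESHOLD_SUP = 90

def strip_frequently_used_word(text, common_words):
    # Simplified stand-in for the project's helper: drop the frequent tokens.
    return ' '.join(w for w in text.split() if w not in common_words)

def pair_score(row_long, row_short, columns_merge, common_words):
    # min over columns of max(score with common words, score without them),
    # with the same early exit as the loop above.
    min_score = 100
    for col in columns_merge:
        str_long, str_short = row_long[col].lower(), row_short[col].lower()
        score_with = fuzz.token_set_ratio(str_long, str_short)
        score_without = fuzz.token_set_ratio(
            strip_frequently_used_word(str_long, common_words),
            strip_frequently_used_word(str_short, common_words))
        score = max(score_with, score_without)
        if score <= TRESHOLD_INF:
            return score  # this column is clearly different: not a duplicate
        min_score = min(min_score, score)
    return min_score

row_long = {'name': 'boulangerie dupont sarl', 'address': '12 rue de la paix paris'}
row_short = {'name': 'dupont boulangerie', 'address': '12 rue de la paix, 75002 paris'}
print(pair_score(row_long, row_short, ['name', 'address'], {'sarl'}))
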
Esempio n. 44
0
def imageanalise(id_gen,source_pdfs,folders):
	from fuzzywuzzy import fuzz
	from fuzzywuzzy import process
	import os
	import pyocr
	import pyocr.builders
	import re
	from PIL import Image

	# Creating a report file id_source,spath,pages,id_gen,destination,types,docid,outcome
	GenerateReport('ID Source','Source Path','Pages','ID Destination','Destination','Type','Doc ID','Outcome', True)

	# Verify subfolder in main folder
	for i,folder in folders:
		vals = pathmapping(folder,'croped/croped_txt1_*.jpg',False,True)
		pdf_pages_number = []
		pdf_pages_sv = []
		fdfd = list(vals)
		
		# Check for validation images inside folder
		for numpage,val in reversed(fdfd):
			print(val)
			#green_grade = 0
			im = Image.open(val)

			jpg = val.replace('croped/croped_txt1_','')

			# Saving PDF pages
			pdf_pages_number.append(numpage)
			
			# Check for green grade in image
			# for pixel in im.getdata():
			# 	if (pixel[1]>(pixel[2]+10) and pixel[1]>(pixel[0]+10)):
			# 		green_grade += 1
			
			# Check the text inside the main area of analysis
			# if (green_grade >=200):

			# Build the txt image so it can be analysed
			cropimage(folder,jpg,100,120,700,270,'croped_txt1_')

			jpg_text = val.replace('val1','txt1')

			# Convert image into text mode
			tools = pyocr.get_available_tools()[0]
			text_txt1 = tools.image_to_string(Image.open(jpg_text), builder=pyocr.builders.DigitBuilder())
			print(fuzz.token_set_ratio('ALVARA', text_txt1))
			print(fuzz.token_set_ratio('HABITESSE', text_txt1))
			startnum = val.rfind('_')
			endnum = val.rfind('.')

			if fuzz.token_set_ratio('ALVARA', text_txt1) > 70 and fuzz.token_set_ratio('CUMPRIMENTO', text_txt1) < 30:

				if len(pdf_pages_number)>1:
					pdf_pages_sv.append(pdf_pages_number.pop())
					pdf_pages_sv.append(pdf_pages_number.pop())
					pdf_pages_sv.reverse()
					id_gen = GenerateDoc(id_gen,source_pdfs,'ALVARA',folder,jpg,550,180,770,330,pdf_pages_sv,None)
					print('\n ============ DOCUMENT FOUND (ALVARA) =========== \n')
					if len(pdf_pages_number)>0:
						id_gen = GenerateDoc(id_gen,source_pdfs,'NotRecon',folder,jpg,0,0,1,1,pdf_pages_number,None)
						print('\n ============ DOCUMENT NOT FOUND =========== \n')
				
				else:
					id_gen = GenerateDoc(id_gen,source_pdfs,'ALVARA',folder,jpg,550,180,770,330,pdf_pages_number,None)
					print('\n ============ DOCUMENT FOUND (ALVARA) =========== \n')

			elif fuzz.token_set_ratio('HABITESSE', text_txt1) > 70 and fuzz.token_set_ratio('VALIDAMOS', text_txt1) < 30 :
				# Saving PDF pages
				if len(pdf_pages_number)>1:
					pdf_pages_sv.append(pdf_pages_number.pop())
					pdf_pages_sv.append(pdf_pages_number.pop())
					pdf_pages_sv.reverse()
					id_gen = GenerateDoc(id_gen,source_pdfs,'HABITESSE',folder,jpg,250,260,420,300,pdf_pages_sv,None)
					print('\n ============ DOCUMENT FOUND (HABITESSE) =========== \n')
					if len(pdf_pages_number)>0:
						id_gen = GenerateDoc(id_gen,source_pdfs,'NotRecon',folder,jpg,0,0,1,1,pdf_pages_number,None)
						print('\n ============ DOCUMENT NOT FOUND =========== \n')
				else:
					id_gen = GenerateDoc(id_gen,source_pdfs,'HABITESSE',folder,jpg,250,260,420,300,pdf_pages_number,None)
					print('\n ============ DOCUMENT FOUND (HABITESSE) =========== \n')
			elif int(val[startnum+1:endnum])==0:
				id_gen = GenerateDoc(id_gen,source_pdfs,'NotRecon',folder,jpg,0,0,1,1,pdf_pages_number,None)
				print('\n ============ DOCUMENT NOT FOUND =========== \n')
			# else:
			# 	jpg_text2 = val.replace('val1','txt2')

			# 	tools = pyocr.get_available_tools()[0]
			# 	text_txt2 = tools.image_to_string(Image.open(jpg_text2), builder=pyocr.builders.DigitBuilder())

			# 	if fuzz.partial_ratio('Sistema de Tratamento de Efluentes', text_txt2) > 70:
			# 		id_gen = GenerateDoc(id_gen,source_pdfs,'STE',folder,jpg,380,60,620,130,pdf_pages_number,None)
			# 		print('\n ============ DOCUMENT FOUND (STE) =========== \n')
			# 	else:
			# 		jpg_text3 = val.replace('val1','txt3')

			# 		tools = pyocr.get_available_tools()[0]
			# 		text_txt3 = tools.image_to_string(Image.open(jpg_text3), builder=pyocr.builders.DigitBuilder())
			# 		startnum = val.rfind('_')
			# 		endnum = val.rfind('.')
			# 		doc_num = re.findall(r'r\d+/\d+|$', text_txt3)
			# 		doc_num = ''.join(doc_num[0])
			# 		doc_num = doc_num.replace('/','.')
			# 		if doc_num == '':
			# 			doc_num = str(id_gen)

			# 		if fuzz.partial_ratio('LICENGA ESPECIAL', text_txt3) > 70:
			# 			id_gen = GenerateDoc(id_gen,source_pdfs,'LE',None,jpg,0,0,0,0,pdf_pages_number,doc_num)
			# 			print('\n ============ DOCUMENT FOUND (LE) =========== \n')
			# 		elif int(val[startnum+1:endnum])==0:
			# 			id_gen = GenerateDoc(id_gen,source_pdfs,'NotRecon',None,jpg,0,0,0,0,pdf_pages_number,doc_num)
			# 			print('\n ============ DOCUMENT NOT FOUND =========== \n')
	
	os.system('rm -r -f docclass/')
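
The classification above reduces to fuzzy keyword matching on the OCR output: a document type is accepted when its keyword scores high and its exclusion word scores low. A minimal sketch of just that decision, independent of the cropping and PDF handling (the keyword pairs and thresholds mirror the checks above but are only illustrative):

from fuzzywuzzy import fuzz

def classify_page(ocr_text):
    # (positive keyword, exclusion keyword) pairs, as in the ALVARA / HABITESSE checks above
    rules = [
        ('ALVARA', 'CUMPRIMENTO'),
        ('HABITESSE', 'VALIDAMOS'),
    ]
    for keyword, exclusion in rules:
        if (fuzz.token_set_ratio(keyword, ocr_text) > 70
                and fuzz.token_set_ratio(exclusion, ocr_text) < 30):
            return keyword
    return 'NotRecon'

print(classify_page('ALVARA DE CONSTRUCAO N 123/2019'))
print(classify_page('pagina sem palavras-chave reconhecidas'))
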
for i in ax.patches:
    # get_width pulls left or right; get_y pushes up or down
    ax.text(i.get_width()+.1, i.get_y()+.31, str(round((i.get_width()), 2)), fontsize=9, color='dimgrey')
# invert for largest on top 
ax.invert_yaxis()
plt.gcf().subplots_adjust(left=0.3)


# Remove df's fuzzyAutoAdd1, etc., FuzzyWuzzyProcResult1, etc., fuzzySourceZ, etc., GoldStandard, others


#%%
# ==================================================================
# 3. FuzzyWuzzyListToCheck - Set up manual matching UI
# ==================================================================
'''
Now that the safe bets have been taken out, let's allow more liberal matching
and finish some assignments using human review.

Over time you can change the parameters to match your time and desired level
of effort. You can reduce the list, change the type of match (full phrase or 
any word), and change the score, to change the number of candidates to 
match how much time you want to spend in the browser. When starting with a
new site you should probably spend a good deal of time here, to make connections
the other steps can't make. Decisions you make here will provide training 
data that the machine learning component can use.

Some options described at https://www.neudesic.com/blog/fuzzywuzzy-using-python/.
See for example fuzz.ratio (conservative) vs. fuzz.partial_ratio (medium) vs.
fuzz.token_set_ratio (any single word in the phrases, very liberal). The more
liberal you get here, the more you will see multiple-concept searches, which
Esempio n. 46
0
    def lookup(self):

        self.donor = str(self.donor)
        company_number = None
        people_links = []
        found = False

        # ugly hack corrections
        if self.donor in ['Tresco Estate', 'James Hay', 'Think BDW Ltd']:
            self.status = 'company'

        if self.status == 'company, no 10120655':
            company_number = 10120655

        if 'Armed Forces Parliamentary Trust' == self.donor:
            self.status = 'other'
        if u'Buck’s Club 1919' in self.donor:
            self.donor = "Buck's Club 1919"
            self.status = 'members'
        if u'Pratt’s Club' in self.donor:
            self.donor = "Pratt's Club"
            self.status = 'members'
        if 'carlton club' in self.donor.lower():
            self.donor = 'Carlton Club'
            self.status = 'members'
        if 'National Liberal Club' in self.donor:
            self.donor = 'National Liberal Club'
            self.status = 'members'
        if 'The Public Interest Foundation (UK charity)' == self.donor:
            self.status = 'charity'

        # apply patches
        if self.donor in urls.keys():
            company_number = urls[self.donor].split('/')[-1]

        if self.donor in people.keys():
            people_links = people[self.donor]

        if not company_number:
            # use the supplied company number from the register of interests
            # if 'company' in self.status:
            company_number_search = re.search('registration [0-9|a-z|A-Z]+',
                                              self.status)
            if company_number_search:
                company_number = company_number_search.group().split(
                    'registration ')[-1]

                # needs padding to 8 digits, if it starts with an int
                if re.match('[0-9]', company_number):
                    company_number = '%08d' % (int(company_number))

        self.company = {
            'company_name': self.donor,
            'company_number': 'N/A',
            'company_status': 'Active'
        }
        self.persons = []
        self.officers = []
        self.link = None
        self.appointments = []

        if company_number:

            # we have a company number, no need to search for it
            self.company = getlink(
                {'links': {
                    'self': '/company/%s' % str(company_number)
                }}, 'self')
            persons = getlink(self.company, 'persons_with_significant_control')
            self.persons = persons['items']
            officers = getlink(self.company, 'officers')
            self.officers = officers['items']

            if not self.company.has_key('errors'):
                self.link = 'https://beta.companieshouse.gov.uk' + self.company[
                    'links']['self']
                found = True
            else:
                self.company = {
                    'company_name': self.donor,
                    'company_number': 'N/A',
                    'company_status': 'Active'
                }
                self.link = ''

        else:

            if 'individual' in self.status.lower(
            ) or 'private' in self.status.lower():
                # found = True
                # for individuals, we store the appointments, then the company, officers etc as children
                # of the appointment

                if people_links != []:

                    for pl in people_links:
                        bit = pl.split(
                            'https://beta.companieshouse.gov.uk')[-1]
                        appointments = getlink({'links': {
                            'self': '%s' % bit
                        }}, 'self')
                        for i in appointments['items']:
                            if i not in self.appointments:
                                self.appointments.append(i)

                    # just take the last one
                    self.link = pl
                    found = True

                for app in self.appointments:
                    # add the company, officers and persons record to appointment record
                    app['company'] = getlink(app, 'company')
                    app['officers'] = getlink(app['company'],
                                              'officers')['items']
                    app['persons_with_significant_control'] = getlink(
                        app['company'],
                        'persons_with_significant_control')['items']

            # everything below here should generate a company / entity
            elif 'trade' in self.status.lower():
                self.type = 'union'
                if self.donor in trade_union.keys():
                    self.donor = trade_union[self.donor]
                    found = True

            elif 'charity' in self.status.lower():
                self.type = 'charity'
                if self.donor in charities.keys():
                    self.donor = charities[self.donor]
                    found = True

            elif 'unincorporated' in self.status.lower():
                self.type = 'club'
                if self.donor in clubs.keys():
                    self.donor = clubs[self.donor]
                    found = True

            elif 'members' in self.status.lower():
                self.type = 'club'
                if self.donor in clubs.keys():
                    self.donor = clubs[self.donor]
                    found = True

            elif 'friendly' in self.status.lower():
                self.type = 'club'
                if self.donor in clubs.keys():
                    self.donor = clubs[self.donor]
                    found = True

            elif 'other' in self.status.lower():
                self.type = 'other'
                if self.donor in others.keys():
                    self.donor = others[self.donor]
                    found = True

            elif 'trust' in self.status.lower():
                self.type = 'other'
                if self.donor in others.keys():
                    self.donor = others[self.donor]
                    found = True

            elif 'provident' in self.status.lower():
                self.type = 'company'
                if self.donor in others.keys():
                    self.donor = others[self.donor]
                    found = True

            elif 'visit' in self.status:
                # TODO
                self.type = 'visit'

            else:
                # we don't have a company number, so do a company search
                if 'llp' in self.status.lower(
                ) or 'limited' in self.status.lower():
                    self.type = 'company'
                else:
                    self.type = 'other'

                # these are the remaining things to search - can only do a company search really
                companies = CompaniesHouseCompanySearch([self.donor])

                for i in companies.data:

                    # we need the name and address to fuzzy match

                    name_ratio = fuzz.token_set_ratio(i['title'].lower(),
                                                      self.donor)

                    if name_ratio > 90:

                        if i['address_snippet']:

                            addr_ratio = fuzz.token_set_ratio(
                                i['address_snippet'].lower(), self.address)

                            # if the address matches enough
                            if addr_ratio > 90:

                                self.link = 'https://beta.companieshouse.gov.uk' + i[
                                    'links']['self']
                                self.company = getlink(i, 'self')
                                persons = getlink(
                                    self.company,
                                    'persons_with_significant_control')
                                self.persons = persons['items']
                                officers = getlink(self.company, 'officers')
                                self.officers = officers['items']
                                # print 'FOUND %s: , %s' % (self.status.upper(), self.company['company_name'])
                                found = True
                                break

        # print self.donor, self.address
        # if 'sw1p 3ql' in self.address.lower():
        # 	print '*'*100
        # 	print '55 TUFTON STREET: %s' % self.donor
        # 	print '*'*100

        if found:
            pass
            # print '\tFOUND %s: %s' % (self.status.upper(), self.donor)
        else:
            # pass
            print '\tMISSING %s: %s' % (self.status.upper(), self.donor)
def fuzzyScore(string1, string2):
    return fuzz.token_set_ratio(string1, string2)
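
The company-search branch above only accepts a Companies House result when both the name and the address clear a 90 token_set_ratio threshold. A small sketch of that two-factor check on its own (the candidate record and threshold here are illustrative, not real data):

from fuzzywuzzy import fuzz

def is_same_company(candidate, donor_name, donor_address, threshold=90):
    # Both the registered name and the address snippet have to match well,
    # mirroring the name_ratio / addr_ratio checks in lookup() above.
    name_ratio = fuzz.token_set_ratio(candidate['title'].lower(), donor_name.lower())
    addr_ratio = fuzz.token_set_ratio(candidate['address_snippet'].lower(), donor_address.lower())
    return name_ratio > threshold and addr_ratio > threshold

candidate = {'title': 'Acme Widgets Limited',
             'address_snippet': '55 Example Street, London, SW1A 1AA'}
print(is_same_company(candidate, 'Acme Widgets Limited', '55 Example Street London SW1A 1AA'))  # -> True
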
Esempio n. 48
0
                                if multi_match[ic][0] == list1[j][
                                        5] and multi_match[ic][1] == list2[i][
                                            5]:
                                    multi_match[ic][2] = multi_match[ic][
                                        2] + list2[i][4] + " " + list2[i][
                                            1] + " " + list2[i][0] + "\n"
                                    multi_match[ic][8] = multi_match[ic][
                                        8] + " " + list1[j][4]
                                    multi_match[ic][9] = multi_match[ic][
                                        9] + " " + list2[i][4]
                                    ff = 1
                                    break

                            if ff == 0:
                                multi_match.append([list1[j][5], list2[i][5], list2[i][4]+" "+list2[i][1]+" "+list2[i][0]+"\n", \
                                                    "", "", "", fuzz.token_set_ratio(list1[j][3], list2[i][3])/100, 1, list1[j][4], list2[i][4]])
                                cnt_m = cnt_m + 1
                            break
                        else:  #If Event_Name is not same, Close_match
                            # Close_Match=Close_Match+" "+list2[i][4]+" "+list2[i][1]+" "+list2[i][0]+"\n"
                            # F01 = F01 + " " + list1[j][4]
                            # F02 = F02 + " " + list2[i][4]
                            ff = 0
                            for ic in range(cnt_m):
                                if multi_match[ic][0] == list1[j][
                                        5] and multi_match[ic][1] == list2[i][
                                            5]:
                                    multi_match[ic][3] = multi_match[ic][2] + list2[i][4] + " " + list2[i][1] + " " + \
                                                         list2[i][0] + "\n"
                                    multi_match[ic][8] = multi_match[ic][
                                        8] + " " + list1[j][4]
Esempio n. 49
0
def get_similarity(first_string: str, second_string: str):
    return fuzz.token_set_ratio(first_string, second_string)
Esempio n. 50
0
def ExtractSymbol(title):
    final = ""
    df_matchedSym = []
    matchedSymList = ""
    for i in range(0, len(Symbols)):
        sym = Symbols[i].lower()
        st = title.lower()
        if(Symbols[i] in excep):
            st = title
            sym = Symbols[i]
        temp = []
        reg = r"\b"+sym+r"\b"
        reg = re.compile(reg)
        if(reg.search(st)):
            temp.append(Symbols[i])
            temp.append(Names[i])
            temp.append(True)
        else:
            temp.append(Symbols[i])
            temp.append(Names[i])
            temp.append(False)
        df_matchedSym.append(temp)
    df_matchedSym = [z for z in df_matchedSym if z[2] == True]
    if df_matchedSym:
        for term in df_matchedSym:
            matchedSymList = matchedSymList+"|"+term[0]
    df_matchedSym = []
    temp2 = re.sub('[^A-Za-z0-9. ]+', '', title)
    for i in range(0, len(Names)):
        thres=50
        tsor = fuzz.token_sort_ratio(temp2.lower(), Names[i].lower())
        tser = fuzz.token_set_ratio(temp2.lower(), Names[i].lower())
        r = fuzz.ratio(temp2.lower(), Names[i].lower())
        pr = fuzz.partial_ratio(temp2.lower(), Names[i].lower())
        avg = (tsor+tser+r+pr)/4
        if(Names[i]=="CONSOLIDATED CONSTRUCTION"):
            thres = 60
        temp = []
        if(avg >= thres):
            temp.append(Symbols[i])
            temp.append(Names[i])
            temp.append(True)
        else:
            temp.append(Symbols[i])
            temp.append(Names[i])
            temp.append(False)
        df_matchedSym.append(temp)
    
    df_matchedSym = [z for z in df_matchedSym if z[2] == True]
    if(len(df_matchedSym) == 0):
        if(matchedSymList == ""):
            final = ""
        else:
            final = final+matchedSymList
    else:
        for i in range(0, len(df_matchedSym)):
            reg = r"\b"+df_matchedSym[i][0].lower()+r"\b"
            reg = re.compile(reg)
            if(reg.search(matchedSymList.lower())):
                continue
            symSplit = df_matchedSym[i][1].split(" ")
            reg = r"\b"+symSplit[0].lower()+r"\b"
            reg = re.compile(reg)
            if(reg.search(temp2.lower())):
                if(len(symSplit) < 3):
                    matchedSymList = matchedSymList+"|"+df_matchedSym[i][0]
                else:
                    
                    reg = r"\b"+symSplit[1].lower()
                    reg = re.compile(reg)
                    if(reg.search(temp2.lower())):
                        matchedSymList = matchedSymList+"|"+df_matchedSym[i][0]
        final = final+matchedSymList
    return final
Esempio n. 51
0
        return False
    ratio_ = difflib.SequenceMatcher(None, source_str, target_str).ratio()
    #print "Ratio for threshold {0} : {1}".format(threshold, ratio_)
    return  ratio_ > threshold

city_mapped = mapped.loc[mapped['City'].str.upper() == 'CHARLOTTE']
city_target = usi_target[usi_target.City.str.upper() == 'CHARLOTTE']
city_brokers = usi_brokers[usi_brokers['Physical City'].str.upper() == 'CHARLOTTE']



for target_index, target_row in city_target.iterrows():
    top_ratio = 0
    top_index = 0
    for broker_index, broker_row in city_brokers.iterrows():
        ratio = fuzz.token_set_ratio(target_row['Address1'], broker_row['Physical Street Address Line 1'])
        if top_ratio < ratio:
            top_ratio = ratio
            top_index = broker_index
    #variables
    source_zip = str(city_brokers.loc[top_index, ['Physical Zip (All)']].values[0])
    target_zip = str(city_target.loc[target_index, ['PostalCode']].values[0])
    source_name = str(city_brokers.loc[top_index, ['Business Name']].values[0])
    target_name = str(city_target.loc[target_index, ['PartyCompanyName']].values[0])
    zip_matched = str_matched(source_zip, target_zip)
    name_matched = str_matched(source_name, target_name, threshold = 0.2999)
    address_matched = (top_ratio/100.0) > 0.6299
    
    print "Ratio: {0} : {1} - {2}% ZIP:{3}, NAME:{4}".format(target_row['Address1'], 
              city_brokers.loc[top_index, ['Physical Street Address Line 1']].values[0], top_ratio,
              zip_matched, name_matched)
Esempio n. 52
0
        for author2 in authors2:
            # Don't compare the author to itself
            if (author != author2):
                # First check for a high token_set_ratio value
                # token_set_ratio
                # The strings being compared are tokenized and preprocessed (made lower case
                # without punctuation).  Then, a set operation identifies the common tokens
                # (the intersection) and ratio() comparisons between the following new strings:
                #    s1 = Sorted_tokens_in_intersection
                #    s2 = Sorted_tokens_in_intersection + sorted_rest_of_str1_tokens
                #    s3 = Sorted_tokens_in_intersection + sorted_rest_of_str2_tokens
                # The logic behind these comparisons is that since Sorted_tokens_in_intersection
                # is always the same, the score will tend to go up as these words make up a larger
                # chunk of the original strings or the remaining tokens are closer to each other.
                TokenSetRatio = fuzz.token_set_ratio(authors[author],
                                                     authors2[author2])

                # If token_set_ratio is at least 75, then look closer at these names
                if TokenSetRatio >= 75:
                    # Get the other ratios for these two authors

                    # ratio
                    # Levenshtein distance similarity ratio
                    Ratio = fuzz.ratio(authors[author], authors2[author2])

                    # partial_ratio
                    # If the shorter string being compared has length k and the longer string has
                    # length m, then partial_ratio seeks the score of the best matching length-k
                    # substring.
                    PartialRatio = fuzz.partial_ratio(authors[author],
                                                      authors2[author2])
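
To make the comparison described in the comments above concrete, here is a small standalone check of the three scorers on a pair of author strings; the exact numbers depend on the fuzzywuzzy version, so treat them as illustrative rather than exact:

from fuzzywuzzy import fuzz

a = 'Smith, John A.'
b = 'John Smith; Jane Doe'

print(fuzz.ratio(a, b))            # plain Levenshtein-style comparison of the raw strings
print(fuzz.partial_ratio(a, b))    # best-matching substring of the longer string
print(fuzz.token_set_ratio(a, b))  # shared-token comparison; typically the highest when the
                                   # same name appears in a different order with extra tokens
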
Esempio n. 53
0
dfASOS = pd.read_pickle("webScrapeASOS.pkl")[:]
dfASOS = dfASOS.add_prefix('ASOS_')

dfHnM = pd.read_pickle("webScrapeHM.pkl")
dfHnM = dfHnM.add_prefix('HnM_')

dfASOScopy = dfASOS.copy(deep=True)
dfHnMcopy = dfHnM.copy(deep=True)

pairedNames = []

for ASOSName in dfASOS['ASOS_Name']:
    matchName = []
    matchScore = 0
    for HnMName in dfHnM['HnM_Name']:
        stringCompareScore = fuzz.token_set_ratio(ASOSName, HnMName)
        if stringCompareScore >= matchScore:
            matchScore = stringCompareScore
            matchName = HnMName

    pairedNames.append([ASOSName, matchName, matchScore])

with open('pairedNamesListPKL.pkl', 'wb') as f:
    pickle.dump(pairedNames, f)

#------------------------------------------------------------------------------
groupDF = pd.DataFrame([])
listArb = []
for namePair in pairedNames:
    ASOSNamePair = namePair[0]
    HnMNamePair = namePair[1]
Esempio n. 54
0
# Try paralell computation with dask
#Train
print('extra fuzzy features, train....')
train_dd = from_pandas(train_df[['question1', 'question2']], npartitions=8)

start_time = time.time()
train_df['fuzz_qratio'] = train_dd.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])),
    axis=1,
    meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get)
train_df['fuzz_WRatio'] = train_dd.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])),
    axis=1,
    meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get)
train_df['fuzz_token_set_ratio'] = train_dd.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1,
    meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get)
train_df['fuzz_token_sort_ratio'] = train_dd.apply(
    lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])),
    axis=1,
    meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get)
print((time.time() - start_time))
del train_dd
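
Note that compute(get=dask.multiprocessing.get) relies on an older dask API; in recent dask releases the get= keyword was removed in favour of scheduler=. A hedged sketch of the same token_set_ratio feature with the newer keyword (sample data and column values are made up; assumes a current dask install):

import dask.dataframe as dd
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz

if __name__ == '__main__':  # guard needed for the process-based scheduler on Windows/macOS
    df = pd.DataFrame({
        'question1': ['How do I learn Python?', 'What is AI?'],
        'question2': ['What is the best way to learn Python?', 'What is artificial intelligence?'],
    })
    ddf = dd.from_pandas(df, npartitions=2)
    df['fuzz_token_set_ratio'] = ddf.apply(
        lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
        axis=1,
        meta=('a', np.dtype('int64'))).compute(scheduler='processes')
    print(df)
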

#Test
print('extra fuzzy features, test....')
test_dd = from_pandas(test_df[['question1', 'question2']], npartitions=8)

start_time = time.time()
test_df['fuzz_qratio'] = test_dd.apply(
Esempio n. 55
0
def find_and_store_duplicate_syllabi(grid_name, year, field_name):
    global stop
    try:
        # connect to existing database
        conn = psycopg2.connect(
            "dbname='litindex' user='******' host='0.0.0.0' password='******'"
        )
        # Open a cursor to perform database operations
        cur = conn.cursor()

        param_list = [grid_name, year, field_name]
        select_query = "SELECT id, text_md5, text from open_syllabi where grid_name='{}' and year='{}' and field_name='{}'".format(
            *param_list)  #unpack the list
        cur.execute(select_query)
        df = pd.DataFrame(cur.fetchall(), columns=['id', 'text_md5', 'text'])
        print("\tNO OF RECORDS = {}", len(df))

        punctuation_translator = str.maketrans('', '', string.punctuation)

        # PRE-PROCESSING REQUIRED:
        # normalize by lowering the case, removing punctuations, removing numbers and english stop words
        df['text_lower_case_words'] = df['text'].apply(lambda x: ' '.join([
            word for word in x.lower().translate(punctuation_translator).split(
            ) if not word.isdigit() and word not in stop
        ]))
        # the following pre-processing is required to improve quality of LSH results
        # especially considering highly templated text in course descriptions
        df['text_unique_words'] = df['text'].apply(lambda x: ' '.join([
            word for word in list(
                set(x.lower().translate(punctuation_translator).split()))
            if not word.isdigit() and word not in stop
        ]))
        common_words_series = pd.Series(' '.join(
            df['text_unique_words']).lower().strip(
                string.punctuation).split()).value_counts()
        most_common_words_series = common_words_series[common_words_series > (
            0.5 * len(df))].dropna()
        most_common_words_list = most_common_words_series.index.tolist()
        df['text_without_common_words'] = df['text'].apply(lambda x: ' '.join([
            word for word in x.lower().translate(punctuation_translator).split(
            ) if word not in (most_common_words_list) and word not in stop
        ]))

        # STEP 1: use LSH algorithm to find candidate duplicates
        # find duplicates
        # run through adding documents to the LSH cache
        hasher = minhash.MinHasher(seeds=100, char_ngram=5, hashbytes=4)
        lshcache = cache.Cache(bands=10, hasher=hasher)

        for idx in range(0, (len(df) - 1)):
            lshcache.add_fingerprint(
                hasher.fingerprint(df.loc[idx, 'text_without_common_words']),
                df.loc[idx, 'id'])

        # for every bucket in the LSH cache get the candidate duplicates
        # note: this is a fast way to get candidate pairs with reasonable accuracy; they will be filtered later
        candidate_pairs = set()
        for b in lshcache.bins:
            for bucket_id in b:
                if len(
                        b[bucket_id]
                ) > 1:  # if the bucket contains more than a single document
                    pairs_ = set(itertools.combinations(b[bucket_id], r=2))
                    candidate_pairs.update(pairs_)
        list_candidate_pairs = list(candidate_pairs)
        tsl = []
        # df = df.set_index('id')
        print("\tcandidate pairs found = {}", len(list_candidate_pairs))

        # STEP 2: use TFIDF to process the records associated with the candidate duplicates and generate signature text
        tf = TfidfVectorizer(analyzer='word',
                             ngram_range=(1, 1),
                             min_df=0,
                             stop_words='english')
        tfidf_matrix = tf.fit_transform(df['text_lower_case_words'])
        feature_names = tf.get_feature_names()
        dense = tfidf_matrix.todense()

        for item in list_candidate_pairs:
            idx1 = df.index[df['id'] == int(item[0])]
            idx2 = df.index[df['id'] == int(item[1])]
            episode1 = dense[idx1].tolist()[0]
            episode2 = dense[idx2].tolist()[0]
            phrase_scores1 = [
                pair for pair in zip(range(0, len(episode1)), episode1)
                if pair[1] > 0
            ]
            sorted_phrase_scores1 = sorted(phrase_scores1,
                                           key=lambda t: t[1] * -1)
            phrase_scores2 = [
                pair for pair in zip(range(0, len(episode2)), episode2)
                if pair[1] > 0
            ]
            sorted_phrase_scores2 = sorted(phrase_scores2,
                                           key=lambda t: t[1] * -1)
            list_summarized_text1 = []
            list_summarized_text2 = []
            for phrase, score in [(feature_names[word_id], score)
                                  for (word_id, score) in sorted_phrase_scores1
                                  ][:10]:
                # print('{0: <20} {1}'.format(phrase, score))
                list_summarized_text1.append(phrase)
            for phrase, score in [(feature_names[word_id], score)
                                  for (word_id, score) in sorted_phrase_scores2
                                  ][:10]:
                # print('{0: <20} {1}'.format(phrase, score))
                list_summarized_text2.append(phrase)

            summarized_text1 = ' '.join(list_summarized_text1)
            summarized_text2 = ' '.join(list_summarized_text2)
            # STEP 3: apply fuzzy match for the two signature texts to generate accuracy score
            fuzz_ratio = fuzz.token_set_ratio(summarized_text1,
                                              summarized_text2)
            tsl.append(
                (grid_name, field_name, int(year), int(item[0]), int(item[1]),
                 summarized_text1, summarized_text2, fuzz_ratio))
        # for item in list_candidate_pairs:
        insert_duplicate_pairs(tsl)

        df = df.set_index('id')
        return df
    except Exception as e:
        if conn:
            conn.rollback()
        # print("Unexpected error:", sys.exc_info()[0]])
        print(e)
        sys.exit(1)
    finally:
        # Close communication with the database
        if cur:
            cur.close()
        if conn:
            conn.close()
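
The verification part of the pipeline above (STEP 2 and STEP 3) can be sketched on its own: reduce each candidate document to its top TF-IDF terms and fuzzy-compare the two signatures. This is a simplified, standalone version under those assumptions; it skips the LSH candidate generation and the database plumbing, and the sample documents are invented:

from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    'Introduction to databases: relational model, SQL, normalization, transactions.',
    'Databases course covering the relational model, SQL queries, normalization and transactions.',
]

tf = TfidfVectorizer(analyzer='word', stop_words='english')
matrix = tf.fit_transform(docs)
terms = tf.get_feature_names_out()  # get_feature_names() on older scikit-learn releases

def signature(row_index, top_n=10):
    # Keep the top_n highest-weighted terms as a short "signature" of the document.
    row = matrix[row_index].toarray()[0]
    ranked = sorted(((w, row[i]) for i, w in enumerate(terms) if row[i] > 0),
                    key=lambda t: -t[1])
    return ' '.join(w for w, _ in ranked[:top_n])

sig1, sig2 = signature(0), signature(1)
print(sig1, '|', sig2)
print(fuzz.token_set_ratio(sig1, sig2))  # a high score suggests the pair are duplicates
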
Esempio n. 56
0
def post_form():
    form = request.form.to_dict()
    datafile_name = form.get('dataFileName')
    # records = pd.read_excel(request.files['dataFile'], sheet_name=None)
    records = utils.read_spreadsheet(request.files['dataFile'], datafile_name)
    date_cols = []
    if datafile_name.endswith('.xlsx') or datafile_name.endswith('.xls'):
        records_with_format = load_workbook(request.files['dataFile'])
        for sheet in records_with_format.sheetnames:
            for row in records_with_format[sheet].iter_rows(min_row=2):
                for cell in row:
                    # MRN
                    column_letter = get_column_letter(cell.column)
                    column_header = records_with_format[sheet][column_letter +
                                                               '1'].value
                    if column_header in records[
                            sheet].columns and cell.number_format == '00000000':
                        current_list = list(records[sheet][column_header])
                        current_list = [
                            str(i).rjust(8, '0') if isinstance(i, int) else i
                            for i in current_list
                        ]
                        records[sheet][column_header] = current_list
                    if column_header in records[
                            sheet].columns and cell.number_format == 'mm-dd-yy':
                        date_cols.append(column_header)
                        current_list = list(records[sheet][column_header])
                        current_list = [
                            i.strftime('%m/%d/%Y') if isinstance(i, datetime)
                            and not pd.isnull(i) else i for i in current_list
                        ]
                        records[sheet][column_header] = current_list
                break
    token = form.get('token')
    env = form.get('env')
    mappings = None
    existing_records = None
    form_names = set()
    form_name_to_dd_fields = {}
    data_field_to_redcap_field_map = {}
    data_field_to_choice_map = {}
    original_to_correct_value_map = {}
    no_match_redcap_fields = []

    if 'mappingsFile' in request.files:
        mappings = pd.read_excel(request.files['mappingsFile'],
                                 sheet_name="Sheet1")

        if list(mappings["dataFieldToRedcapFieldMap"]):
            data_field_to_redcap_field_map = json.loads(
                list(mappings["dataFieldToRedcapFieldMap"])[0])
        if list(mappings["dataFieldToChoiceMap"]):
            data_field_to_choice_map = json.loads(
                list(mappings["dataFieldToChoiceMap"])[0])
        if list(mappings["originalToCorrectedValueMap"]):
            original_to_correct_value_map = json.loads(
                list(mappings["originalToCorrectedValueMap"])[0])
        if list(mappings["noMatchRedcapFields"]):
            no_match_redcap_fields = json.loads(
                list(mappings["noMatchRedcapFields"])[0])

    redcap_api = RedcapApi(env)

    project_info = {
        'secondary_unique_field': '',
        'record_autonumbering_enabled': 0,
        'repeatable_instruments': [],
        'next_record_name': 1
    }

    data_dictionary = None
    existing_records = None
    if token:
        try:
            data_dictionary = redcap_api.fetch_data_dictionary(token)
            project_info = redcap_api.fetch_project_info(token)
            project_info[
                'next_record_name'] = redcap_api.generate_next_record_name(
                    token)
            if project_info.get('secondary_unique_field'):
                project_info['secondary_unique_field'] = [
                    project_info.get('secondary_unique_field')
                ]
            if project_info['has_repeating_instruments_or_events'] == 1:
                repeatable_instruments = redcap_api.fetch_repeatable_instruments(
                    token)
                project_info['repeatable_instruments'] = [
                    i['form_name'] for i in repeatable_instruments
                ]
            if project_info['record_autonumbering_enabled'] == 0:
                data_dictionary[0]['required'] = 'Y'
            dd = [RedcapField.from_json(field) for field in data_dictionary]
        except Exception as e:
            logging.warning(e)
            results = {'error': "Error: {0}".format(e)}
            response = flask.jsonify(results)
            response.headers.add('Access-Control-Allow-Origin', '*')
            return response
    else:
        data_dictionary_name = form.get('dataDictionaryName')
        if data_dictionary_name.endswith('.csv'):
            dd_df = pd.read_csv(request.files['dataDictionary'])
            dd_df.fillna('', inplace=True)
        elif data_dictionary_name.endswith(
                '.xlsx') or data_dictionary_name.endswith('.xls'):
            dd_df = pd.read_excel(request.files['dataDictionary'])
        dd = [
            RedcapField.from_data_dictionary(dd_df, field)
            for field in list(dd_df['Variable / Field Name'])
        ]
        if dd[0].field_name == 'record_id':
            project_info['record_autonumbering_enabled'] = 1
        if 'existingRecordsFile' in request.files:
            existing_records = pd.read_csv(
                request.files['existingRecordsFile'])
            existing_records = json.loads(
                existing_records.to_json(orient='records', date_format='iso'))

    all_csv_headers = []
    dd_headers = []
    dd_data = {}
    dd_data_raw = {}
    if data_dictionary is not None:
        dd_headers = list(data_dictionary[0].keys())
        dd_data_raw = data_dictionary
    else:
        dd_headers = list(dd_df.columns)
        dd_data_raw = json.loads(
            dd_df.to_json(orient='records', date_format='iso'))

    dd_data = [field.__dict__ for field in dd]

    for dd_field in dd:
        if not form_name_to_dd_fields.get(dd_field.form_name):
            form_name_to_dd_fields[dd_field.form_name] = []
        form_name_to_dd_fields.get(dd_field.form_name).append(
            dd_field.field_name)
        form_names.add(dd_field.form_name)

    recordid_field = dd[0].field_name

    form_names = list(form_names)

    for sheet_name, sheet in records.items():
        all_csv_headers += list(sheet.columns)
        all_csv_headers = [i for i in all_csv_headers if 'Unnamed' not in i]

    all_field_names = [f.field_name for f in dd]

    redcap_field_candidates = {}
    data_field_candidates = {}
    csv_headers = {}
    fields_not_in_redcap = {}
    duplicate_fields = {}

    for sheet_name, sheet in records.items():
        duplicate_fields[sheet_name] = {}
        # Remove empty rows
        sheet.dropna(axis=0, how='all', inplace=True)
        csv_headers[sheet_name] = list(sheet.columns)
        csv_headers[sheet_name] = [
            item for item in csv_headers[sheet_name] if 'Unnamed' not in item
        ]
        for header in csv_headers[sheet_name]:
            duplicate_fields[sheet_name][
                header] = duplicate_fields[sheet_name].get(header, 0) + 1
        duplicate_fields[sheet_name] = [
            k for k, v in duplicate_fields[sheet_name].items() if v > 1
        ]
        normalized_headers = utils.parameterize_list(csv_headers[sheet_name])
        fields_not_in_redcap[sheet_name] = [
            header for header, normalized_header in zip(
                csv_headers[sheet_name], normalized_headers)
            if normalized_header not in all_field_names
        ]

    all_csv_headers = list(set(all_csv_headers))

    unmatched_data_fields = {}

    for sheet in csv_headers:
        data_field_to_redcap_field_map[
            sheet] = data_field_to_redcap_field_map.get(sheet, {})
        unmatched_data_fields[sheet] = unmatched_data_fields.get(sheet, [])
        for header in csv_headers[sheet]:
            normalized_header = utils.parameterize(header)
            if data_field_to_redcap_field_map[sheet].get(header):
                continue
            if normalized_header in all_field_names:
                data_field_to_redcap_field_map[sheet][
                    header] = normalized_header
            else:
                unmatched_data_fields[sheet].append(header)

    selected_columns = {}

    matched_redcap_fields = []
    matched_redcap_fields += no_match_redcap_fields
    for sheet_name, field_map in data_field_to_redcap_field_map.items():
        selected_columns[sheet_name] = field_map.keys()
        matched_redcap_fields += field_map.values()
    unmatched_redcap_fields = [
        f for f in all_field_names
        if f not in matched_redcap_fields and f != 'record_id'
    ]
    for f1 in all_field_names:
        dd_field = [f for f in dd_data if f['field_name'] == f1][0]
        redcap_field_candidates[f1] = []
        for sheet in csv_headers:
            for f2 in csv_headers[sheet]:
                redcap_field_candidates[f1].append({
                    'candidate':
                    f2,
                    'sheets': [sheet],
                    'score':
                    max(fuzz.token_set_ratio(f1, f2),
                        fuzz.token_set_ratio(dd_field['field_label'], f2))
                })

    for sheet in csv_headers:
        for f1 in csv_headers[sheet]:
            if data_field_candidates.get(f1):
                continue
            data_field_candidates[f1] = []
            for f2 in all_field_names:
                dd_field = [f for f in dd_data if f['field_name'] == f2][0]
                data_field_candidates[f1].append({
                    'candidate':
                    f2,
                    'form_name':
                    dd_field['form_name'],
                    'score':
                    max(fuzz.token_set_ratio(f1, f2),
                        fuzz.token_set_ratio(dd_field['field_label'], f1))
                })

    malformed_sheets = []

    form_names = [redcap_field.form_name for redcap_field in dd]
    form_names = list(set(form_names))
    for sheet_name in records.keys():
        sheet = records.get(sheet_name)

        redcap_field_names = [f.field_name for f in dd]

        matching_fields = [f for f in sheet.columns if f in redcap_field_names]
        if not matching_fields and not data_field_to_redcap_field_map.get(
                sheet_name):
            malformed_sheets.append(sheet_name)

    json_data = {}

    for sheet_name, sheet in records.items():
        json_data[sheet_name] = json.loads(
            sheet.to_json(orient='records', date_format='iso'))

    results = {
        'csvHeaders': csv_headers,
        'jsonData': json_data,
        'ddHeaders': dd_headers,
        'ddData': dd_data,
        'ddDataRaw': dd_data_raw,
        'formNames': form_names,
        'dateColumns': date_cols,
        'duplicateFields': duplicate_fields,
        'malformedSheets': malformed_sheets,
        'recordFieldsNotInRedcap': fields_not_in_redcap,
        'formNameToDdFields': form_name_to_dd_fields,
        'projectInfo': project_info,
        'existingRecords': existing_records,
        'recordidField': recordid_field,
        'redcapFieldCandidates': redcap_field_candidates,
        'dataFieldCandidates': data_field_candidates,
        'unmatchedRedcapFields': unmatched_redcap_fields,
        'unmatchedDataFields': unmatched_data_fields,
        'dataFileName': datafile_name,
        'token': token,
    }
    if data_field_to_redcap_field_map:
        results['dataFieldToRedcapFieldMap'] = data_field_to_redcap_field_map
    if data_field_to_choice_map:
        results['dataFieldToChoiceMap'] = data_field_to_choice_map
    if original_to_correct_value_map:
        results['originalToCorrectedValueMap'] = original_to_correct_value_map
    if no_match_redcap_fields:
        results['noMatchRedcapFields'] = no_match_redcap_fields

    response = flask.jsonify(results)
    return response
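
The candidate scoring above pairs each spreadsheet header with every data-dictionary field and keeps the better of two token_set_ratio comparisons, one against the field name and one against its human-readable label. A compact sketch of just that scoring step (the field records here are invented for illustration):

from fuzzywuzzy import fuzz

dd_fields = [
    {'field_name': 'date_of_birth', 'field_label': 'Date of Birth'},
    {'field_name': 'mrn',           'field_label': 'Medical Record Number'},
]

def candidates_for_header(header):
    scored = [{
        'candidate': f['field_name'],
        'score': max(fuzz.token_set_ratio(header, f['field_name']),
                     fuzz.token_set_ratio(header, f['field_label'])),
    } for f in dd_fields]
    return sorted(scored, key=lambda c: -c['score'])

print(candidates_for_header('Birth Date'))   # 'date_of_birth' should score highest here
print(candidates_for_header('DOB'))          # only weak matches for an abbreviation
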
Esempio n. 57
0
def cosine(a, b):
    # Note: despite its name, this returns fuzzywuzzy's token_set_ratio, not a cosine similarity.
    return fuzz.token_set_ratio(a, b)
#print(cou_list)

from fuzzywuzzy import fuzz

#READING EACH LINE FROM THE LIST FILE AND COMPARING WITH THE ARTICLE, AND USING FUZZY
news_open = open("TEST_2.txt")
news_read = news_open.readlines()
for keywords in news_read:
    for word in uni_list:
        if fuzz.token_sort_ratio(keywords, word) >= 90:
            uni_set.add(word.rstrip())
        if word in keywords:
            #print("UNIVERSITY: ",word)
            uni_set.add(word)
    for word_1 in deg_list:
        if fuzz.token_set_ratio(keywords, word_1) >= 90:
            deg_set.add(word_1.rstrip())
        if word_1 in keywords:
            #print("DEGREE: ",word_1)
            deg_set.add(word_1)
    for word_2 in exa_list:
        if fuzz.token_set_ratio(keywords, word_2) >= 92:
            exa_set.add(word_2.rstrip())
        if word_2 in keywords:
            #print("EXAMS: ",word_2)
            exa_set.add(word_2)
    for word_3 in streams_list:
        if fuzz.token_sort_ratio(keywords, word_3) >= 90:
            streams_set.add(word_3.rstrip())
        if word_3 in keywords:
            #print("STREAM: ",word_3)
Esempio n. 59
0
#update stopword_list
for i in list_remove_user_skills:
    stopword_list.append(i)
for i in educational_list:
    stopword_list.append(i)
for i in cert_list:
    stopword_list.append(i)
###############################################################################################################################
#In this section
####remove the stopwords from the user skills column based on fuzzy matching; this could also have been done in the data cleaning code.
for d in range(len(dataset)):
    if pd.notnull(dataset[d][6]):
        words = dataset[d][6].split(",")
        for w in words:
            for s in stopword_list:
                match_score = fuzz.token_set_ratio(w, s)
                if match_score > 70:
                    dataset[d][6] = dataset[d][6].replace(w, "")
##############################################################################################################################
#In this section

#1. Topic modelling is performed for each sub function. The objective was to find the key topics for each sub function and match them with the
#   user skill column based on fuzzy matching at a threshold of 70% (this can vary) to find out whether the mentioned skill is relevant and by what percent.
#2. Intuitively, 25 topics were built for each sub function with the top 40 words

for i in industry_list:
    print i
    Tfidf_Vectorizer = TfidfVectorizer(max_df=.95,
                                       min_df=2,
                                       stop_words=stopword_list,
                                       norm="l2",
Esempio n. 60
0
from fuzzywuzzy import process
st = "apple inc"
strOptions = ["Apple Inc.","apple park","apple incorporated","iphone","apple inc"]
Ratios = process.extract(st,strOptions)
print(Ratios)
# You can also select the string with the highest matching percentage
highest = process.extractOne(st,strOptions)
print(highest)

from fuzzywuzzy import fuzz
Str1 =  dataset['p_body'][37]
Str2 = dataset['p_body'][38]
Ratio = fuzz.ratio(Str1.lower(),Str2.lower())
Partial_Ratio = fuzz.partial_ratio(Str1.lower(),Str2.lower())
Token_Sort_Ratio = fuzz.token_sort_ratio(Str1,Str2)
Token_Set_Ratio = fuzz.token_set_ratio(Str1,Str2)
print(Ratio)
print(Partial_Ratio)
print(Token_Sort_Ratio)
print(Token_Set_Ratio)

Str1= dataset['p_body'][37]
Str2 = dataset['p_body'][38]
# process.extract expects an iterable of choices, so wrap the single string in a list
Ratios = process.extract(Str1, [Str2])
print(Ratios)
# You can also select the string with the highest matching percentage
highest = process.extractOne(Str1, [Str2])
print(highest)