def compare(s):
    import numpy as np
    from pandas import Series
    from fuzzywuzzy import fuzz

    tisim = "title_sim"
    affsim = "aff_sim"
    cpd = "cit_peryear_diff"
    yd = "year_diff"
    od = "ord_diff"
    rod = "rel_ord_diff"
    ncd = "num_coauth_diff"
    scc = "same_coauth_count"
    # tia, tib, affna, affnb, cita, citb, ya, yb, orda, ordb, numaa, numab,
    # coaa and coab are column-key constants expected to be defined elsewhere.
    x = Series(
        {
            tisim: fuzz.token_set_ratio(s[tia], s[tib]),
            affsim: fuzz.token_set_ratio(s[affna], s[affnb]),
            cpd: np.abs((float(s[cita]) / (2015 - s[ya])) - (float(s[citb]) / (2015 - s[yb]))),
            yd: np.abs(s[ya] - s[yb]),
            od: np.abs(s[orda] - s[ordb]),
            rod: np.abs(((float(s[numaa]) - s[orda]) / s[numaa]) - ((float(s[numab]) - s[ordb]) / s[numab])),
            ncd: np.abs(s[numaa] - s[numab]),
            scc: len([c for c in s[coaa] if c in s[coab]]),
        }
    )
    return x

def score_gr_details(search_query):
    gr_url = create_gr_url(search_query)
    response = requests.get(gr_url)
    pq_data = pq(response.content)
    books = pq_data("tr[itemtype='http://schema.org/Book']")
    all_info = pq_data(books).children("td").eq(1)
    book_info = pq_data(all_info).children("a.bookTitle")
    author_info = pq_data(all_info).find("a.authorName").eq(0)
    # will books here
    biblio_info = []
    for book in books:
        # or is it for book in book_info ?
        # query_obj = pq(book)
        title = pq_data(book_info).text().strip()
        author = pq_data(author_info).text().strip()
        if author:
            biblio_info.append((title, author))
    if not biblio_info:
        return None
    scored_info = []
    for info in biblio_info:
        # book_title and author appear to be the title/author being searched
        # for; they are not defined inside this function as written.
        title_score = fuzz.token_set_ratio(info[0], book_title)
        author_score = fuzz.token_set_ratio(info[1], author)
        total_score = title_score + author_score
        scored_info.append((total_score, info))
    scored_info.sort()
    return scored_info[-1][1]

def count(text, KW):
    text = text.lower()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sent_detector.tokenize(text.strip())
    adjmentions = []
    tokenizer = RegexpTokenizer(r'\w+')
    wordcount = 0
    for sentence in sentences:
        tokens = tokenizer.tokenize(sentence)
        words = len(tokens)
        wordcount += words
        if fuzz.token_set_ratio(KW, sentence) < 30:
            score = 0
        else:
            score = fuzz.token_set_ratio(KW, sentence) * .01
        adjmentions.append(score)
    aggregate = 0
    for x in range(len(adjmentions)):
        aggregate += adjmentions[x]
    text = text.replace(' ', '')
    text = text.replace('.', ' ')
    text = text.replace(' ', '')
    character_count = len(text)
    return [aggregate, wordcount, character_count]

def compare_strings(string_one, string_two):
    highest_ratio = 0
    if fuzz.ratio(string_one, string_two) > highest_ratio:
        highest_ratio = fuzz.ratio(string_one, string_two)
    if fuzz.token_sort_ratio(string_one, string_two) > highest_ratio:
        highest_ratio = fuzz.token_sort_ratio(string_one, string_two)
    if fuzz.token_set_ratio(string_one, string_two) > highest_ratio:
        highest_ratio = fuzz.token_set_ratio(string_one, string_two)
    return highest_ratio

def row_similarity(row):
    same_email = row.author_email == row.author_email_other
    name_similarity = fuzz.token_set_ratio(row.author_name, row.author_name_other)
    email_name_similarity = fuzz.ratio(row.email_name, row.email_name_other)
    name_to_email_similarity = fuzz.token_set_ratio(row.author_name, row.name_from_email_other)
    return pd.Series(
        [same_email, name_similarity, email_name_similarity, name_to_email_similarity])

def match(song, gdic):
    ftype = song[song.rfind('.'):].lower()
    try:
        if ftype == ".mp3":
            smp = MP3(song)
        elif ftype == ".wma":
            print("wma")
            return "False"
        elif ftype == ".flac":
            smp = FLAC(song)
        elif ftype == ".ogg":
            print("ogg")
            return "False"
        elif ftype in (".mp4", ".m4a"):
            smp = MP4(song)
        else:
            return False
    except IOError:
        return "delete"
    if ftype == ".flac":
        name = smp['title'][0]
        artist = smp['artist'][0]
        album = smp['album'][0]
    elif ftype == ".m4a":
        name = smp['\xa9nam'][0]
        artist = smp['\xa9ART'][0]
        album = smp['\xa9alb'][0]
    else:
        try:
            name = smp["TIT2"].pprint()[5:].replace('[', '(').replace(']', ')')
            artist = smp["TPE1"].pprint()[5:].replace("Feat", "Featuring").replace("Andre 3000", "OutKast").replace("Big Boi", "OutKast")
            album = smp["TALB"].pprint()[5:]
        except KeyError:
            return False
    pmatch = [i for i in gdic if fuzz.token_set_ratio(name, i['title']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    pmatch = [i for i in pmatch if fuzz.token_set_ratio(artist, i['artist']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    pmatch = [i for i in pmatch if fuzz.token_set_ratio(album, i['album']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    #pmatch = [i for i in pmatch if ((('(' not in name) and ('(' not in i['title'])) or ((('(' in name) and ('(' in i['title'])) and (name[name.rindex("(") + 1:name.rindex(")")].lower() == i['title'][i['title'].rindex("(") + 1:i['title'].rindex(")")].lower())))]
    pmatch = [i for i in gdic if fuzz.token_sort_ratio(name, i['title']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    #print ([(i['title'], i['artist'], i['album'], i['durationMillis']) for i in pmatch])
    pmatch = [i for i in pmatch if abs(smp.info.length * 1000 - int(i['durationMillis'].encode('utf-8'))) < 1000]
    if len(pmatch) == 1:
        return pmatch[0]
    else:
        #print(name, artist, album, smp.info.length * 1000)
        return False

def compare_two_texts(self, string_a, string_b, normalize_value=True):
    """
    Compare two strings and return the value of the Token Set Ratio algorithm;
    the value is normalized to lie between 0 and 1.
    """
    if ((isinstance(string_a, unicode) and isinstance(string_b, unicode))
            or (isinstance(string_a, str) and isinstance(string_b, str))):
        if normalize_value:
            return self.__normalized_value(fuzz.token_set_ratio(string_a, string_b))
        else:
            return fuzz.token_set_ratio(string_a, string_b)
    else:
        raise TypeError

def fuzzyNameMatch(name1, name2):
    name1 = name1.lower()
    name2 = name2.lower()
    name1 = fuzz.asciidammit(name1)
    name2 = fuzz.asciidammit(name2)
    ratio = fuzz.token_set_ratio(name1, name2)
    return ratio

def calculate_confidence(model_a, model_b, mapping='FIRST_PASS'):
    """Determine the similarity between model_a and model_b.

    Goes through the mappings and compares those attrs between each of the
    modules produced in the ``search`` function.

    :rtype float: 0.0 to 1.0, the degree that the two models are similar.
    """
    attr_map = _get_mapping(mapping, model_a)
    if not attr_map:
        return 0.0
    total_match = 0.0
    # This becomes our denominator for the arithmetic mean.
    num_attrs = 0.0
    for a_attr, b_attr in attr_map:
        _trans, a_attr = _unpack_a_attr(a_attr)
        a_value = getattr(model_a, a_attr)
        b_value = getattr(model_b, b_attr)
        if not a_value or not b_value:
            continue
        num_attrs += 1.0
        # Divide by 100 because we want a ratio, not a percentage.
        ratio = fuzz.token_set_ratio(unicode(a_value), unicode(b_value)) / 100.0
        total_match += ratio
    return total_match / max(num_attrs, 1)

def get_max_candidates(self, candidates, word, wDict):
    if len(candidates) == 1:
        return max(candidates, key=wDict.get)
    elif len(candidates) == 0:
        return 'NO SUGGESTION'
    else:
        matched = 0
        synonyms = None
        old_word = None
        mapped = dict()
        list_value = list()
        for value in candidates:
            ratio = fuzz.token_set_ratio(word.lower().strip(), value.lower().strip())
            mapped[value] = ratio
            list_value.append(ratio)
        max_key = max(mapped.items(), key=operator.itemgetter(1))[0]
        max_ratio = max(mapped.items(), key=operator.itemgetter(1))[1]
        if list_value.count(max_ratio) > 1:
            for value, ratio in mapped.items():
                if matched <= ratio:
                    # print(old_word, synonyms, matched)
                    if old_word is not None:
                        synonyms = self.get_soundex(word.lower().strip(), value.lower().strip(), old_word)
                        old_word = synonyms
                    else:
                        synonyms = value.lower().strip()
                        old_word = synonyms
                    matched = ratio
        else:
            synonyms = max_key
        return synonyms

def fuzz_comparisons(x):
    out = {}
    out['fuzz_partial_ratio'] = fuzz.partial_ratio(*x)
    out['fuzz_ratio'] = fuzz.ratio(*x)
    out['fuzz_token_sort_ratio'] = fuzz.token_sort_ratio(*x)
    out['fuzz_token_set_ratio'] = fuzz.token_set_ratio(*x)
    return pd.Series(out)

def process(translations, cmdList):
    print "translations", translations[0]
    bestNode = cmdList[len(cmdList) - 1].tree[0][0]  # no match found command
    bestNode.confidence = 40
    for translation in translations[0]:  # 1-4
        for cmd in cmdList:  # 10-20
            for nodeListId in range(len(cmd.tree)):
                if nodeListId > 0 or "Sherlock" in translation:
                    nodeList = cmd.tree[nodeListId]
                    for node in nodeList:  # 2-6
                        if node.isOpen():
                            for key in node.keys:  # 1-4
                                confidence = fuzz.token_set_ratio(key, translation)
                                if confidence > bestNode.confidence:
                                    bestNode.keys = node.keys
                                    bestNode.open = node.open
                                    bestNode.func = node.func
                                    if confidence > 60:
                                        print "Executing:", node.keys
                                        return cmdList.index(cmd), node
                                        # return node
    print "null command"
    return 0, cmdList[0].tree[0][0]

def search123(request):
    global itertool
    itertool = itertools.count()
    searc = search1()
    if request.method == "POST":
        searc = search1(request.POST or None)
        if 'button7' in request.POST:
            if 'd_box' in request.POST and request.POST['d_box'] and not request.POST['name']:
                item_map = item.objects.raw('SELECT * FROM `item` WHERE `category_id`=%s', [request.POST['d_box']])
                lis = []
                for e in (item_map):
                    lis.append(e.id)
                price_map = item_done.objects.filter(item_id__in=lis).order_by('item_id')
                return render(request, 'index.html', {'posts': price_map, 'posts1': searc, 'itertools': itertool})
            else:
                x = request.POST['name']
                sql = "SELECT * FROM `item`"
                cursor.execute(sql)
                query = cursor.fetchall()
                lis = []
                for e in range(len(query)):
                    y = str(query[e][1])
                    rat = fuzz.token_set_ratio(x, y)
                    if rat >= 75:
                        lis.append(query[e][0])
                price_map = item_done.objects.filter(item_id__in=lis).order_by('item_id', 'site_price')
                return render(request, 'index.html', {'posts': price_map, 'posts1': searc, 'itertools': itertool})
    return render_to_response('index.html', {'posts1': searc}, RequestContext(request))

def renaming(self, path, filename):
    filename = self.preview(filename)
    for element in os.listdir(path):
        if fuzz.token_set_ratio(filename, element) == 100:
            path_file = os.path.join(path, element)
            target = os.path.join(path, filename)
            os.rename(path_file, target)

def check_match(row, class_type):
    and_accepted = True
    or_accepted = False
    columns = list()
    ratio = 0
    max_ratio = 0
    total_ratio = 0
    count = 0
    for entry in row:
        for word in inputs:
            ratio = fuzz.token_set_ratio(word, row[entry])
            if (ratio > 50):
                columns.append(entry)  # to be highlighted later
                or_accepted = True
                total_ratio += ratio
                count += 1
                if ratio > max_ratio:
                    max_ratio = ratio
    if count != len(inputs):
        and_accepted = False
    if and_accepted:
        and_results.append({'type': class_type, 'ratio': total_ratio // count, 'columns': columns, 'row': row})
    elif or_accepted:
        or_results.append({'type': class_type, 'ratio': max_ratio, 'columns': columns, 'row': row})

def match_popit(name):
    df = pandas.DataFrame.from_csv('data/popit-persons.csv')
    for popit_name in df.itertuples():
        if fuzz.token_set_ratio(name.upper(), popit_name[1].upper()) > 95:
            return popit_name

def _enhance_element_info(self, sen, elements):
    lower_sen = sen.lower()
    lower_placeholder_set_ratio = []
    for i in elements:
        if i.placeholder:
            lower_placeholder_set_ratio.append(fuzz.token_set_ratio(i.placeholder.lower(), lower_sen))
    vec = help2vec.input_help_to_vec(lower_sen)
    enhanced = False or (len(lower_placeholder_set_ratio) > 0 and max(lower_placeholder_set_ratio) > 50)
    if len(vec) > 0:
        # Means it might be a help text. Now we have to link this with the corresponding input
        logging.debug("Following sentence was detected as input help")
        logging.debug("Sentence: %s" % (sen))
        logging.debug("Vector: %s" % (str(vec)))
        e = utilities.match_help_to_element_NLP(elements, lower_sen)
        if e and not e.help:
            # We found reference to a placeholder so fine.
            logging.debug("Found following element for input help by placeholder reference")
            logging.debug("Element: %s" % (str(e)))
            e.help = sen
            e.help_vector_string = json.dumps(vec)
            self._update_element(e)
            enhanced = True
        else:
            # We couldn't find reference to placeholder. So visual correlation
            try:
                # Check if element still exists
                elem = self.d.find_element_by_xpath("//*[contains(text(), '%s')]" % (sen))
                if elem:
                    e = utilities.match_help_to_element_visually(elements, elem.location, elem.size)
                    if e and not e.help:
                        # We found reference to a placeholder so fine.
                        logging.debug("Found following element for input help by visual reference")
                        logging.debug("Element: %s" % (str(e)))
                        e.help = sen
                        e.help_vector_string = json.dumps(vec)
                        self._update_element(e)
                        enhanced = True
            except (InvalidSelectorException, NoSuchElementException):
                pass

def get_needed_songs(queries):
    global songs
    for query in queries:
        for i in range(0, 3):
            response = pool.get_next_api().audio.search(q=query,
                                                        auto_complete=1,
                                                        performer_only=1,
                                                        offset=i * 250,
                                                        count=250,
                                                        v="5.44")
            count = response["count"]
            goted_songs = response["items"]
            for goted_song in goted_songs:
                if fuzz.token_set_ratio(goted_song["artist"], query) > 90:
                    song_id = goted_song["id"]
                    if songs.get(song_id) is None:
                        songs[song_id] = defaultdict()
                        songs[song_id]["id"] = goted_song["id"]
                        songs[song_id]["owner_id"] = goted_song["owner_id"]
                        songs[song_id]["artist"] = goted_song["artist"]
                        songs[song_id]["title"] = goted_song["title"]
            dumpData(songs, neededSongs)
            sleep(1)
    print("Needed songs count: ", len(songs.keys()))

def _author_similarity(self, other_author):
    if self.author and other_author:
        if fuzz.token_set_ratio(self.author, other_author) > 95:
            return True
        else:
            return False
    return 'NAN'

def top_token_set_ratio(values):
    """Return the best token set ratio match from fuzzywuzzy module."""
    scores = []
    for combo in combinations(values, 2):
        score = fuzz.token_set_ratio(combo[0], combo[1])
        tokens_0 = len(combo[0].split())
        tokens_1 = len(combo[1].split())
        if tokens_0 > tokens_1:
            value = combo[0]
            tokens = tokens_0
        elif tokens_0 < tokens_1:
            value = combo[1]
            tokens = tokens_1
        else:
            tokens = tokens_0
            value = combo[1]
            if len(combo[0]) <= len(combo[1]):
                value = combo[0]
        scores.append(FuzzySetScore(score, value, tokens))
    ordered = sorted(
        scores, reverse=True,
        key=lambda s: (s.score, s.tokens, 1000000 - len(s.value)))
    return ordered[0]

def fw_token_set_ratio(question1, question2):
    fuzzy = []
    for q1, q2 in zip(question1, question2):
        partial_ratio = fuzz.token_set_ratio(str(q1), str(q2)) / 100
        fuzzy.append([partial_ratio])
    print("Created fuzz token_set_ratio feature")
    return np.array(fuzzy)

def get_filter_link(link_choice, goal=None, min_score=None, max_limit=4, type=0):
    """Get the relevant links from a list of candidate links."""
    if min_score:
        min_score = int(min_score)
    else:
        min_score = 60
    scored_link_list = []
    scored_link_list_raw = process.extract(goal, link_choice, limit=max_limit)
    logger.info("Score details for goal {0} with statistics {1}. minimum score {2}".format(goal, scored_link_list_raw, min_score))
    try:
        if scored_link_list_raw:
            for i in list(scored_link_list_raw):
                link = i[0]
                if int(type) != 1:
                    score = i[1]
                    if int(score) >= min_score:
                        scored_link_list.append(link)
                        logger.info("PARTIAL MATCH : Final score is {0} of url {1} for goal {2}".format(score, link, goal))
                else:
                    score = fuzz.token_set_ratio(goal, link)
                    logger.info("EXACT MATCH : Final score is {0} of url {1} for goal {2}".format(score, link, goal))
                    if int(score) >= min_score:
                        scored_link_list.append(link)
    except:
        logger.exception("Error occurred in get_filter_link() function")
    return scored_link_list

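# get_filter_link() above indexes into the tuples returned by fuzzywuzzy's
# process.extract(). A quick, self-contained illustration of that return shape;
# the URLs and query are made up for the example, not taken from the original project.
from fuzzywuzzy import process

candidate_links = [
    "https://example.com/pricing",
    "https://example.com/contact-us",
    "https://example.com/about",
]
# process.extract() returns (choice, score) pairs, best matches first, which is
# why the function reads i[0] as the link and i[1] as its score.
print(process.extract("contact us page", candidate_links, limit=2))
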
def _title_similarity(self, other_title):
    if self.title and other_title:
        if fuzz.token_set_ratio(self.title, other_title) > 95:
            return True
        else:
            return False
    return 'NAN'

def fuzzy(products_name_set, listings):
    """
    Use Levenshtein distance to determine matching pairs of products and listings.

    :param products_name_set: Indexed product names (for faster matching)
    :param listings: Listings to be matched
    :return: A dictionary containing each matched product with all its listings
    """
    final_products = defaultdict(list)
    for listing in listings:
        possible_products = set()
        for product_name in products_name_set:
            token_set_ratio = fuzz.token_set_ratio(listing["new_title"], product_name)
            if token_set_ratio == 100:
                possible_products.add(product_name)
        # More than one possible product found
        if len(possible_products) > 1:
            for possible_product in possible_products:
                partial_ratio = fuzz.partial_ratio(listing["new_title"], possible_product)
                if partial_ratio == 100:
                    final_products[possible_product].append(listing)
        else:
            for possible_product in possible_products:
                final_products[possible_product].append(listing)
    return final_products

async def _on_answer(self, conv):
    channel = conv.channel
    answer = conv.meta['answer'].lower()
    win_event = None
    for event in conv.events:
        if 'trivia' in event:
            continue
        score = fuzz.token_set_ratio(event['text_clean'], answer)
        if score > 80:
            win_event = event
            break
        elif score > 50:
            try:
                await self.bot.send_message(channel, "<@{}> Not quite...".format(event['user']))
            except KeyError as e:
                import traceback
                print("\n\nSomething went wrong in trivia")
                traceback.print_exc()
        event['trivia'] = True
        conv.meta['attempts'] += 1
    if win_event is not None:
        user = win_event['user']
        self.status[channel]['scores'][user] += conv.meta['value']
        await self.bot.send_message(channel, "<@{}> got it right! The answer was {}".format(user, answer))
        conv.done()
        await self.ask_question(channel)
        return True

def getRatio(var1, var2, alg):
    r1test = 40
    r2test = 100
    r3test = 100
    r4test = 90
    # 85 is probably too low --- too many FP
    # let's keep alg as a dummy, but it may be unimportant
    # it seems that the quality of results can be improved if two (or)
    # -- more results are correlated: [1] can be lowered as long as [4] remains high
    r1 = fuzz.ratio(var1, var2)
    r2 = fuzz.partial_ratio(var1, var2)
    r3 = fuzz.token_sort_ratio(var1, var2)
    r4 = fuzz.token_set_ratio(var1, var2)
    if r1 >= r1test:
        if r4 >= r4test:
            ratio = 100
            # reportRatio(var1, var2)
        else:
            ratio = 0
    else:
        ratio = 0
    return ratio

def match_bus(h, st):
    global store_bus
    l = []
    for x in h:
        temp = x
        no_list = re.findall(r'\d+', x)
        for n in no_list:
            temp = x.replace(n, ' ' + n + ' ')
        temp = temp.lower().replace('-', ' ').replace('.', ' ')
        temp = ' '.join(temp.split())
        if fuzz.token_set_ratio(st, temp) > 60:
            l.append([x, temp])
    print("buses", l)
    ma = 0
    if len(l):
        for i in range(len(l)):
            if partial_ratio2(st, l[i][1]) > partial_ratio2(st, l[ma][1]):
                ma = i
        p = l[ma][1]
        for i in range(len(l)):
            if partial_ratio2(st, l[i][1]) == partial_ratio2(st, l[ma][1]) and i != ma:
                print(l[i][1], l[ma][1])
                store_bus = l[ma][1]
                return 2
        print("found bus", l[ma][0])
        bus_no_l.add(l[ma][0])
        return 1
    return 0

def match(self, listing):
    '''
    Decide if this listing matches this product.

    In this version, we only match one Product at most ("A single price
    listing may match at most one product."), even though some listings are
    for items that are suitable for several products.

    Returns True or False.

    :param listing: A Listing object.
    '''
    # token_set_ratio() checks for all the 'words' in the first argument
    # existing in the second argument. Case-insensitive even. Exactly how
    # I was going to code it up until I found the fuzzywuzzy library,
    # which has the advantage of being previously debugged.
    score = fuzz.token_set_ratio(self.name, listing.title)
    if score == 100:
        # Exact fuzzy match on my product name inside the listing.
        return True
    manu_score = fuzz.token_set_ratio(self.manufacturer, listing.title)
    family_score = fuzz.token_set_ratio(self.family, listing.title) if self.family else 0
    model_score = fuzz.token_set_ratio(self.model, listing.title) if self.model else 0
    if ' ' in self.model and model_score < 100:
        # Canon SX130 IS vs. SX130IS...
        model_nospaces = ''.join(self.model.split(' '))
        if fuzz.token_set_ratio(model_nospaces, listing.title) == 100:
            model_score = 100
    if manu_score == 100 and family_score == 100 and model_score == 100:
        # Seems legit.
        return True
    # Generating false positives (for example 'Canon_IXUS_300_HS' is
    # matching "Canon PowerShot ELPH 300 HS (Black)". Turning this
    # off does make us miss "Canon NB-7L Lithium-Ion Battery for G10, G11,
    # G12 Cameras" unfortunately.
    #
    #if manu_score == 100 and model_score == 100:
    #    # People sometimes call things by manufacturer and model number.
    #    # Might be ambiguous though...
    #    return True
    if family_score == 100 and model_score == 100:
        # I'm typing on an IdeaPad Y500, for example.
        return True
    return False

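# The comments in match() above rely on token_set_ratio() scoring 100 whenever
# every token of one argument also appears in the other. A minimal sketch of
# that behaviour with made-up product/listing strings (not from the original data):
from fuzzywuzzy import fuzz

product_name = "Canon PowerShot SX130 IS"
listing_title = "Canon PowerShot SX130 IS 12.1 MP Digital Camera (Black)"
# Every token of the product name occurs in the listing title, so the
# token-set score is a perfect 100 despite the extra words in the listing.
assert fuzz.token_set_ratio(product_name, listing_title) == 100

# Drop a model token from the listing and the score falls below 100.
print(fuzz.token_set_ratio(product_name, "Canon PowerShot Digital Camera"))
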
def best_match(s, categories, top_n=5):
    """Return the top N best matches from your categories."""
    scores = []
    for cat in categories:
        scores.append((cat, fuzz.token_set_ratio(s, cat)))
    scores = sorted(scores, key=lambda x: x[1])
    return scores[-top_n:]

def fuzzy_search(search):
    # import json file with wine ratings/reviews into pandas dataframe
    df_ratings = pd.read_json('../data/wine_ratings.json')
    # execute fuzzy search and save results
    df_ratings['score'] = df_ratings['name'].map(lambda x: fuzz.token_set_ratio(search, x))
    # DataFrame.sort() was removed from pandas; sort_values() is the current equivalent
    df_sorted = df_ratings.sort_values(by='score', ascending=False)
    return df_sorted

# Common features
data['len_q1'] = data.question1.apply(lambda x: len(str(x)))
data['len_q2'] = data.question2.apply(lambda x: len(str(x)))
data['diff_len'] = np.abs(data.len_q1 - data.len_q2)
data['len_char_q1'] = data.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
data['fuzz_qratio'] = data.apply(lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

print('vector features ...')

# model = gensim.models.KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)
data['wmd'] = data.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)

# norm_model = gensim.models.KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)
norm_model.init_sims(replace=True)
data['norm_wmd'] = data.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

question1_vectors = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question1.values)):

def predict_chip_dict(wdir, input_pattern_str, bamExt, fromBAM=None): """ Predict a chip_dict from set of bam files ChIP input/control samples are identified from input_pattern (default: 'input') for each sample then the best input sample (by fuzzywuzzy score) is selected chip_dict is written as yaml to workflow workingdir predicts whether a sample is broad or narrow based on histone mark pattern """ pat = "|".join(re.split(',| |\\||;', input_pattern_str)) input_pat = r".*(" + pat + ")" clean_pat = r"" + pat + "" pat1 = re.compile(clean_pat, re.IGNORECASE) if fromBAM: infiles = sorted(glob.glob(os.path.join(fromBAM, '*' + bamExt))) else: infiles = sorted( glob.glob(os.path.join(wdir, 'filtered_bam/', '*.bam'))) samples = get_sample_names_bam(infiles, bamExt) chip_dict_pred = {} chip_dict_pred["chip_dict"] = {} print( "---------------------------------------------------------------------------------------" ) print("Predict Chip-seq sample configuration") print( "---------------------------------------------------------------------------------------" ) print("\nSearch for Input/control samples...") input_samples = set([]) for i in samples: if re.match(input_pat, i, re.IGNORECASE): print("...found: ", i) input_samples.add(i) print("\nTry to find corresponding ChIP samples...") for i in samples: if i in input_samples: continue print( "\n sample: ", i, ) matches_sim = {} for j in input_samples: c_clean = pat1.sub("", j) sim1 = fuzz.ratio(c_clean, i) + fuzz.partial_ratio( c_clean, i) + fuzz.token_sort_ratio( c_clean, i) + fuzz.token_set_ratio(c_clean, i) matches_sim[j] = sim1 / 4 sim = 0 final_matches = set([]) for key, value in sorted(matches_sim.items(), key=lambda k: (k[1], k[0]), reverse=True): if value >= sim: final_matches.add(key) print(" top matching input sample by score: %s = %s" % (key, value)) sim = value tmp = ':'.join(list(final_matches)) if len(final_matches) > 1: tmp = "__PLEASE_SELECT_ONLY_ONE_CONTROL__:" + tmp elif len(final_matches) == 0: print("No control sample found!") chip_dict_pred["chip_dict"][i] = {} chip_dict_pred["chip_dict"][i]['control'] = tmp if re.match(".*(H3K4me1|H3K36me3|H3K9me3|H3K27me3).*", i, re.IGNORECASE): chip_dict_pred["chip_dict"][i]['broad'] = True else: chip_dict_pred["chip_dict"][i]['broad'] = False outfile = os.path.join(wdir, "chip_seq_sample_config.PREDICTED.yaml") write_configfile(outfile, chip_dict_pred) print( "---------------------------------------------------------------------------------------" ) print("Chip-seq sample configuration is written to file ", outfile) print( "Please check and modify this file - this is just a guess! Then run the workflow with it." ) print( "---------------------------------------------------------------------------------------" )
# print "Row id: ", loc[0]
query_result = google_places.nearby_search(
    lat_lng={'lat': lat, 'lng': lon}, rankby='prominence', radius=100)
similarity = 0.0
bestPlace = ''
bestPlaceType = ''
# returns a list of places
for place in query_result.places:
    # get the place name
    placeName = (place.name).encode("utf8").lower()
    # do fuzzy match with the tag string for that location coordinate
    simNew = fuzz.token_set_ratio(text, placeName)
    # find the place with the highest similarity with the tag string
    if simNew > similarity:
        similarity = simNew
        bestPlace = placeName
        # query for place type or category only if it isn't present in the dictionary
        # saving API limit
        if placeName not in placeCategory.keys():
            place.get_details()
            bestPlaceType = place.details[u'types']
        else:
            bestPlaceType = placeCategory[placeName]

def get_fuzzy_similarity(sent1, sent2):
    sim = fuzz.token_set_ratio(sent1, sent2)
    if sim == 0:
        return 0
    else:
        return sim / 100

def run_quickstart(self): from fuzzywuzzy import fuzz # [START language_quickstart] # Imports the Google Cloud client library # [START language_python_migration_imports] from google.cloud import language from google.cloud.language import enums from google.cloud.language import types # [END language_python_migration_imports] from database import DatabaseManager, Note dataIn = DatabaseManager("notes") dataOut = DatabaseManager("super_notes") # Instantiates a client # [START language_python_migration_client] client = language.LanguageServiceClient() # [END language_python_migration_client] text1 = dataOut.get_note_key(self.note1_key)["note"] text2 = dataIn.get_note_key(self.note2_key)["note"] # __________________________ # READ VALUE 1 # __________________________ # with open('input1.txt', 'r') as file: # text1 = file.read().replace('\n', '') # ___________________________ # READ VALUE 2 # ___________________________ # with open('input2.txt', 'r') as file2: # text2 = file2.read().replace('\n', '') words1 = text1.split(".") words2 = text2.split(".") for x in words1: if (x[:1] == " "): x = x[1:] for x in words2: if (x[:1] == " "): x = x[1:] keywords1 = [] key_sentances1 = "" key_sent_array_1 = [] keywords2 = [] key_sentances2 = "" key_sent_array_2 = [] # The text to analyze document1 = types.Document(content=text1, type=enums.Document.Type.PLAIN_TEXT) document2 = types.Document(content=text2, type=enums.Document.Type.PLAIN_TEXT) outputText = "" # Detects the sentiment of the text response1 = client.analyze_entities( document=document1, encoding_type='UTF32', ) for entity in response1.entities: if entity.salience > 0.015: keywords1.append(entity.name) print('=' * 20) print('name: {0}'.format(entity.name)) print('type: {0}'.format(entity.type)) print('metadata: {0}'.format(entity.metadata)) print('salience: {0}'.format(entity.salience)) response2 = client.analyze_entities( document=document2, encoding_type='UTF32', ) for entity in response2.entities: if entity.salience > 0.015: keywords2.append(entity.name) print('=' * 20) print('name: {0}'.format(entity.name)) print('type: {0}'.format(entity.type)) print('metadata: {0}'.format(entity.metadata)) print('salience: {0}'.format(entity.salience)) print("Keys 1:", keywords1) print("Keys 2:", keywords2) for x in words1: for i in keywords1: if (x.find(i) > -1) and x not in key_sentances1: key_sentances1 += x + "\n" key_sent_array_1.append(x) for x in words2: for i in keywords2: if (x.find(i) > -1) and x not in key_sentances2: key_sentances2 += x + "\n" key_sent_array_2.append(x) #print(key_sentances2) #out = open("output1.txt", "w") #out.write(key_sentances1) #out.close() #out = open("output2.txt", "w") #out.write(key_sentances2) #out.close() newVals = [" "] for x in key_sent_array_1: canAdd = True for i in newVals: Token_Set_Ratio = fuzz.token_set_ratio(x, i) if Token_Set_Ratio > 80: canAdd = False if canAdd: newVals.append(x) for x in key_sent_array_2: canAdd = True for i in newVals: Token_Set_Ratio = fuzz.token_set_ratio(x, i) if Token_Set_Ratio > 50: canAdd = False if canAdd: newVals.append(x) newValsString = "" for x in newVals: newValsString += x + "\n" #writing to database super_note = Note(2, "physics", newValsString) dataOut.add_note_to_db(super_note) #_______________________________________ # ADDING OUTPUT #_______________________________________ # final = open("final.txt", "w") # final.write(newValsString) # final.close() return newValsString # n = NoteAnalysis("-LqyiulvtclaFSFsC4_Q", "-Lqyl7NHN9vWsMeJYBIM")
def run(self): st = datetime.datetime.now() print('1. [start] init_run ------------------------------') init_confidence = self.confidence self.init_run() et = datetime.datetime.now() print('1. [end] init_run => ', et - st) st = datetime.datetime.now() print('2. [start] run_batch ------------------------------') # 명사 위주로 강하게 묶기 (단, 횟수가 많으면 정확도가 떨어짐) self.confidence = init_confidence for i in range(2): if self.confidence >= 70: st_1 = datetime.datetime.now() self.run_batch(noun=True) self.confidence = self.confidence - 5 et_1 = datetime.datetime.now() print('2-1. [end] run_batch noun-------', i + 1, '번째 run_batch => ', et_1 - st_1) elif self.confidence < 70: break # 동사 포함하여 약하게 풀면서 묶기 self.confidence = init_confidence for i in range(self.batch_size): if self.confidence >= 70: st_1 = datetime.datetime.now() self.run_batch(noun=False) self.confidence = self.confidence - 3 et_1 = datetime.datetime.now() print('2-2. [end] run_batch verb-------', i + 1, '번째 run_batch => ', et_1 - st_1) elif self.confidence < 70: break et = datetime.datetime.now() print('2. [end] run_batch => ', et - st) # merge run > 그룹간의 대표 텍스트를 비교하여 합칠 그룹이 있다면 매칭시킴 # reform run > 묶인 예시가 2개 이하인데 대표 텍스트가 많은 건 다른 예시가 합쳐질 확률이 적기 때문에 쪼갠 후 big그룹과 다시 비교 후 매칭 if self.merge: st = datetime.datetime.now() print('3. [start] merge_run ------------------------------') self.merge_run() et = datetime.datetime.now() print('3. [end] merge_run => ', et - st) st = datetime.datetime.now() print('4. [start] reform_run ------------------------------') self.reform_run() et = datetime.datetime.now() print('4. [end] reform_run => ', et - st) #최종 비교 self.confidence = init_confidence tmp_clusters = [] for cluster in self.clusterings: if len(cluster['texts']) > 2: cluster['texts'] = [] tmp_clusters.append(cluster) text_test = [] for i, text in enumerate(self.before_texts): convert_text = self.filtering(str_list=[text], noun=True) for cluster in tmp_clusters: this_ratio = fuzz.token_set_ratio(cluster['totalText'], text) conv_ratio = fuzz.token_set_ratio(cluster['totalText'], convert_text) if this_ratio >= 70 or conv_ratio >= 75: cluster['texts'].append(text) text_test.append(text) break print(len(self.before_texts), len(list(set(self.before_texts))), len(text_test)) final_clusters = [] for cluster in tmp_clusters: if len(cluster['texts']) > 0: final_clusters.append(cluster) self.clusterings = final_clusters return self.clusterings
def get_location_data(location_name, api_key): data_dict = {'q': location_name, 'key': api_key} loc_by_query = LocationByQuery(data_dict) q_result_list = loc_by_query.get_address time.sleep(1) print(f'{location_name}: # of results:- {len(q_result_list)}') main_results_list = [] for result in q_result_list: if 'countryRegionIso2' not in result.keys(): result['countryRegionIso2'] = None if result['countryRegionIso2'] != 'US': continue else: result_index = q_result_list.index(result) q_f_address = result['formattedAddress'] if 'locality' in result.keys(): q_f_locality = result['locality'] else: q_f_locality = None item_list = [ result_index, data_dict['q'], q_f_locality, q_f_address ] main_results_list.append(item_list) # print(item_list, result) pre_output_dict = {} for res_list in main_results_list: locality_TokenSetRatio = fuzz.token_set_ratio(res_list[1], res_list[2]) f_address_TokenSetRatio = fuzz.token_set_ratio(res_list[1], res_list[3]) pre_output_dict[res_list[0]] = [ locality_TokenSetRatio, f_address_TokenSetRatio ] # -------------------------------Filtering Based on Locality match score locality_score_list = [] max_locality_score = 0 max_locality_score_key = 0 for key, val in pre_output_dict.items(): locality_score = pre_output_dict[key][0] if locality_score > max_locality_score: max_locality_score = locality_score max_locality_score_key = key else: continue # -----------------------------Filtering original dict based on locality score filtered_pre_output_dict = {} for key, val in pre_output_dict.items(): if pre_output_dict[key][0] == max_locality_score: filtered_pre_output_dict[key] = val else: continue # print(filtered_pre_output_dict) # ------------------Get required index based on resulting f_address_score max_f_address_score = 0 f_address_indices = [] if max_locality_score == 0: used_output_dict = pre_output_dict else: used_output_dict = filtered_pre_output_dict for key, val in used_output_dict.items(): f_address_score = used_output_dict[key][1] if f_address_score > max_f_address_score: max_f_address_score = f_address_score f_address_indices.append(key) else: continue if len(f_address_indices) == 0: return '' min_index_for_max_f_address_score = f_address_indices[0] index_to_use = min_index_for_max_f_address_score required_data = q_result_list[index_to_use] print(f'{location_name} : {required_data}') print('------------------------------------------------------------') return required_data
def testPartialTokenSetRatio(self):
    self.assertEqual(fuzz.token_set_ratio(self.s4, self.s5), 100)

from fuzzywuzzy import fuzz
import re

for s in scrutins.values():
    estamd = re.match(r'.*l\'amendement n. *([0-9]+) de (.*)', s['desc'])
    if 0:
        namd = estamd.groups()[0]
        sig = estamd.groups()[1]
        dos = amds.get(s['dossierlien'], None)
        if not dos:
            continue
        candidats = []
        for _amds in dos:
            if namd in _amds.keys():
                candidats.append((fuzz.token_set_ratio(
                    sig, _amds[namd]['signataires'].split(',')[0]), _amds[namd]))
        _amds = sorted(candidats, key=lambda x: x[0], reverse=True)[0][1]
        amd_detail = json.loads(
            requests.get(
                'http://www2.assemblee-nationale.fr/recherche/query_amendements?id='
                + _amds['id'] + '&leg=15&typeRes=doc').content)
        fields = amd_detail['infoGenerales']['description_schema'].split(
            '|') + ['autre', 'autre1', 'autre2']
        _amdscompl = [
            dict((fields[i], v) for i, v in enumerate(elt.split('|')))
            for elt in amd_detail['data_table']
        ][0]
        _amdscompl.update(_amds)
        s['reference'] = _amdscompl

def _similarityScore(s1, s2):
    return fuzz.token_set_ratio(s1, s2)

async def lookup(self, ctx, *args): """Takes in subject name and allows you to look up courses and sections in that subject""" # Store their lookup string, store list of subject strings lookup_string = " ".join(args) subject_options = [x["description"] for x in subject_list] subject_match = (0, "") # Go through all subject options and find the best fuzzy search match to lookup string for subject in subject_options: ratio = fuzz.token_set_ratio(lookup_string, subject) if ratio > subject_match[0]: subject_match = (ratio, subject) # Initialize mapping from course titles to objects, subject titles to objects, and list for course titles courses = {x["title"]: x for x in classes_dict[subject_match[1]]} subjects = {x["description"]: x for x in subject_list} course_titles = [k for (k, v) in courses.items()] # Separate course titles by commas course_str = ", ".join(course_titles) # Display all the course options from the best match embed = discord.Embed( title=f"Showing results for {subject_match[1]}", description="Please choose a course", color=0xFF0000, ) embed.add_field(name="Type in a course name to see available sections", value=course_str) await ctx.send(embed=embed) def check(m): """Quick check to make sure only the person in the game and channel can respond""" return m.channel == ctx.channel and m.author == ctx.author # Wait for their course choice msg = await self.bot.wait_for("message", check=check) # Keep prompting until it's a valid course while msg.content.upper() not in course_titles: await ctx.send( "Please enter course name exactly, case doesn't matter") msg = await self.bot.wait_for("message", check=check) # Get their course object from the map chosen_course = courses[msg.content.upper()] # Set up an embed title and description with their course embed = discord.Embed( title=f"{subject_match[1]}, {msg.content.upper()}", description= f'01:{subjects[subject_match[1]]["code"]}:{chosen_course["courseNumber"]}', color=0xFF0000, ) # Go through all the sections of the course for section in chosen_course["sections"]: # Set up variables describing the section index = section["index"] status = "open" if check_open(index) else "closed" number = section["number"] profs = "; ".join([x["name"] for x in section["instructors"]]) # Add a field for that section embed.add_field(name=f"{number}, {index}\n{profs}", value=f"Status: {status}") # Send the section data await ctx.send(embed=embed)
reader = csv.DictReader(csvfile)
for row in reader:
    nameList.append(str(row[columnName]))
counter = len(nameList)
f = csv.writer(open(fileName[:fileName.index('.')] + 'NearMatches.csv', 'w'))
f.writerow(['percentage'] + ['name1'] + ['name2'])
completeNearMatches = []
for name in nameList:
    counter -= 1
    print('Rows remaining: ', counter)
    for name2 in nameList:
        if name != name2:
            ratio = fuzz.ratio(name, name2)
            partialRatio = fuzz.partial_ratio(name, name2)
            tokenSort = fuzz.token_sort_ratio(name, name2)
            tokenSet = fuzz.token_set_ratio(name, name2)
            avg = (ratio + partialRatio + tokenSort + tokenSet) / 4
            if avg > threshold:
                nearMatch = [avg, name, name2]
                nearMatch = sorted(nearMatch)
                if nearMatch not in completeNearMatches:
                    completeNearMatches.append(nearMatch)
                else:
                    pass
for nearMatch in completeNearMatches:
    f.writerow([nearMatch[0]] + [nearMatch[1]] + [nearMatch[2]])
elapsedTime = time.time() - startTime
m, s = divmod(elapsedTime, 60)
h, m = divmod(m, 60)

def create_check(self, df, columns_merge=['nom_etablissement', 'adresse'], additional_columns_in_check=[], TRESHOLD_INF=50, TRESHOLD_SUP=90, common_words=[], merge_also_equality=False): """ Create consolidate_check.xlsx in which you have to put a cross per match ie : for triplet you put 3 crosses Inputs: An Excel containing a column 'code_postal' converted into a pandas DataFrame columns_merge : Name of the colums on which we will perform the merge Do not put in it the postal code There is still the possibility to put more than one feature in columns df_check However, if you put just the nom_etablissement in columns_merge, the algorithm will work nicely. We do not recommand putting in it the address Return : At the end, the algorithme will return an Excel in which the human-checker will have to put x in the column 'check' If the score is contained between the two tresholds, there is a manual check """ # Think about the tests of conformity of the Excels : hypothesis : already done in Alteryx # Maybe a future improvement = count each word and delete the most frequent ones or ponderate by the inverse of their occurrence # Putting the index in the slicing in order to writting them down easily in the duplicates_check.xlsx df = df.reset_index(drop=True) df["id_duplicates"] = df.index.values # Uniformisation of the inputs, without changing the output of the final_duplicates.xlsx # That is not necessary but sometimes in the excels, code_postal is encoded as int or str for feature in columns_merge: df[feature] = df[feature].apply(str) # df which will be exported into Excel as consolidate_check df_check = pd.DataFrame() postal_codes = list(set(df['code_postal'])) match_id = -1 print("Analyzing the duplicates ...") for nbr_postal_code, postal_code in enumerate(postal_codes): # Une matrice par code postal print(nbr_postal_code, len(postal_codes)) df_postal_code = df[df['code_postal'] == postal_code] # for each line in the short_excel, we seek the corresponding line in the long_excel for i, row_short in df_postal_code.iterrows(): match_id += 1 if len(df_postal_code) > 0: # Contains all the lines of the long Excel which matches sufficiently with the line i of the shot Excel df_match = pd.DataFrame() for j, row_long in df_postal_code.iterrows(): if j > i: # matching_score > TRESHOLD_SUP : very very probably a match # matching_score between the two TRESHOLD : manual check # matching_score < TRESHOLD_INF : very very unlikely to be a match # We want to calculate the minimum between the matching score of the address and of the name min_score = 100 for elem_merge in columns_merge: str_long = row_long[elem_merge].lower() str_short = row_short[elem_merge].lower() # We delete the frequent words to also perform the duplicates check on it str_long_without = strip_frequently_used_word( str_long, common_words) str_short_without = strip_frequently_used_word( str_short, common_words) score_without = fuzz.token_set_ratio( str_long_without, str_short_without) score_with = fuzz.token_set_ratio( str_long, str_short) # We want to find all the duplicates, so we take the maximum between score_without and score_with score = max(score_with, score_without) if score > TRESHOLD_INF: # min_score = min(matching_score(address), matching_score(name)) if score < min_score: min_score = score else: min_score = score # We break because the min_score = min(matching_score(address), matching_score(name)) # and here we already now that eiher the two address or the two name are suffisciently different in order to not compare the other 
features break # We copy the line in the df_match if the lines matches sufficiently if min_score > TRESHOLD_INF: row_long['match_id'] = match_id row_long['source_duplicates'] = 'long' row_long['first_line_match_id'] = 0 row_long['matching_score'] = min_score df_match = df_match.append(row_long, sort=False) # We print the potential best candidates after printing the short line row_short['match_id'] = match_id row_short['first_line_match_id'] = 1 if len(df_match) == 0: # If no potential match have been found, we just write down the short single line row_short['source_duplicates'] = 'single' df_check = df_check.append(row_short, sort=False) else: # We sort the matches by putting the best matches first df_match = df_match[ df_match['matching_score'] > TRESHOLD_INF] df_match = df_match.sort_values(by=['matching_score'], ascending=False) df_match_max = df_match[ df_match['matching_score'] == max( df_match['matching_score'])] df_match_max = df_match_max.reset_index(drop=True) if df_match_max.loc[0, 'matching_score'] > TRESHOLD_SUP: # if the score of the best matches is high enough, thery are duplicates if len(df_match_max) == 1 or len( df_match_max) > 1 and merge_also_equality: row_short[ 'source_duplicates'] = 'automatically merged' df_match_max[ 'source_duplicates'] = 'automatically merged' row_short['check'] = 'x' df_match_max.loc[:, 'check'] = 'x' else: # if there is a triplet having the same matching score, we let the human checker select the duplicates row_short[ 'source_duplicates'] = 'not merged because equality' df_match_max[ 'source_duplicates'] = 'not merged because equality' df_check = df_check.append(row_short, sort=False) df_check = df_check.append(df_match_max, sort=False) # à chaque fois qu'on fait un append comme df_check = df_check.append(df_match_max, sort=False), le code est beaucoup ralenti car l'append recopie tout le df_check. Ce problème a été patché dans la consolidation. On pourra s'en inspirer ici si besoin else: # if all matching score are under TRESHOLD_SUP, we writte down all matches row_short['source_duplicates'] = 'manual check' row_short['check'] = '' df_match['source_duplicates'] = 'manual check' df_check = df_check.append(row_short, sort=False) df_check = df_check.append(df_match, sort=False) else: print("No ", postal_code, "fund in the long Excel file.") # Keeping only the merge columns df_check = df_check[columns_merge + additional_columns_in_check + [ 'code_postal', 'match_id', "first_line_match_id", 'matching_score', 'source_duplicates', 'id_duplicates', "check" ]] df_check = df_check.reset_index() return df_check
def imageanalise(id_gen,source_pdfs,folders): from fuzzywuzzy import fuzz from fuzzywuzzy import process import os import pyocr import pyocr.builders import re from PIL import Image # Creating a report file id_source,spath,pages,id_gen,destination,types,docid,outcome GenerateReport('ID Source','Source Path','Pages','ID Destination','Destination','Type','Doc ID','Outcome', True) # Verify subfolder in main folder for i,folder in folders: vals = pathmapping(folder,'croped/croped_txt1_*.jpg',False,True) pdf_pages_number = [] pdf_pages_sv = [] fdfd = list(vals) # Check for validation images inside folder for numpage,val in reversed(fdfd): print(val) #green_grade = 0 im = Image.open(val) jpg = val.replace('croped/croped_txt1_','') # Saving PDF pages pdf_pages_number.append(numpage) # Check for green grade in image # for pixel in im.getdata(): # if (pixel[1]>(pixel[2]+10) and pixel[1]>(pixel[0]+10)): # green_grade += 1 # Check text inside main area of analises # if (green_grade >=200): # Build txt image in order to be analised cropimage(folder,jpg,100,120,700,270,'croped_txt1_') jpg_text = val.replace('val1','txt1') # Convert image into text mode tools = pyocr.get_available_tools()[0] text_txt1 = tools.image_to_string(Image.open(jpg_text), builder=pyocr.builders.DigitBuilder()) print(fuzz.token_set_ratio('ALVARA', text_txt1)) print(fuzz.token_set_ratio('HABITESSE', text_txt1)) startnum = val.rfind('_') endnum = val.rfind('.') if fuzz.token_set_ratio('ALVARA', text_txt1) > 70 and fuzz.token_set_ratio('CUMPRIMENTO', text_txt1) < 30: if len(pdf_pages_number)>1: pdf_pages_sv.append(pdf_pages_number.pop()) pdf_pages_sv.append(pdf_pages_number.pop()) pdf_pages_sv.reverse() id_gen = GenerateDoc(id_gen,source_pdfs,'ALVARA',folder,jpg,550,180,770,330,pdf_pages_sv,None) print('\n ============ DOCUMENT FOUND (ALVARA) =========== \n') if len(pdf_pages_number)>0: id_gen = GenerateDoc(id_gen,source_pdfs,'NotRecon',folder,jpg,0,0,1,1,pdf_pages_number,None) print('\n ============ DOCUMENT NOT FOUND =========== \n') else: id_gen = GenerateDoc(id_gen,source_pdfs,'ALVARA',folder,jpg,550,180,770,330,pdf_pages_number,None) print('\n ============ DOCUMENT FOUND (ALVARA) =========== \n') elif fuzz.token_set_ratio('HABITESSE', text_txt1) > 70 and fuzz.token_set_ratio('VALIDAMOS', text_txt1) < 30 : # Saving PDF pages if len(pdf_pages_number)>1: pdf_pages_sv.append(pdf_pages_number.pop()) pdf_pages_sv.append(pdf_pages_number.pop()) pdf_pages_sv.reverse() id_gen = GenerateDoc(id_gen,source_pdfs,'HABITESSE',folder,jpg,250,260,420,300,pdf_pages_sv,None) print('\n ============ DOCUMENT FOUND (HABITESSE) =========== \n') if len(pdf_pages_number)>0: id_gen = GenerateDoc(id_gen,source_pdfs,'NotRecon',folder,jpg,0,0,1,1,pdf_pages_number,None) print('\n ============ DOCUMENT NOT FOUND =========== \n') else: id_gen = GenerateDoc(id_gen,source_pdfs,'HABITESSE',folder,jpg,250,260,420,300,pdf_pages_number,None) print('\n ============ DOCUMENT FOUND (HABITESSE) =========== \n') elif int(val[startnum+1:endnum])==0: id_gen = GenerateDoc(id_gen,source_pdfs,'NotRecon',folder,jpg,0,0,1,1,pdf_pages_number,None) print('\n ============ DOCUMENT NOT FOUND =========== \n') # else: # jpg_text2 = val.replace('val1','txt2') # tools = pyocr.get_available_tools()[0] # text_txt2 = tools.image_to_string(Image.open(jpg_text2), builder=pyocr.builders.DigitBuilder()) # if fuzz.partial_ratio('Sistema de Tratamento de Efluentes', text_txt2) > 70: # id_gen = GenerateDoc(id_gen,source_pdfs,'STE',folder,jpg,380,60,620,130,pdf_pages_number,None) # print('\n 
============ DOCUMENT FOUND (STE) =========== \n') # else: # jpg_text3 = val.replace('val1','txt3') # tools = pyocr.get_available_tools()[0] # text_txt3 = tools.image_to_string(Image.open(jpg_text3), builder=pyocr.builders.DigitBuilder()) # startnum = val.rfind('_') # endnum = val.rfind('.') # doc_num = re.findall(r'r\d+/\d+|$', text_txt3) # doc_num = ''.join(doc_num[0]) # doc_num = doc_num.replace('/','.') # if doc_num == '': # doc_num = str(id_gen) # if fuzz.partial_ratio('LICENGA ESPECIAL', text_txt3) > 70: # id_gen = GenerateDoc(id_gen,source_pdfs,'LE',None,jpg,0,0,0,0,pdf_pages_number,doc_num) # print('\n ============ DOCUMENT FOUND (LE) =========== \n') # elif int(val[startnum+1:endnum])==0: # id_gen = GenerateDoc(id_gen,source_pdfs,'NotRecon',None,jpg,0,0,0,0,pdf_pages_number,doc_num) # print('\n ============ DOCUMENT NOT FOUND =========== \n') os.system('rm -r -f docclass/')
for i in ax.patches:
    # get_width pulls left or right; get_y pushes up or down
    ax.text(i.get_width() + .1, i.get_y() + .31,
            str(round((i.get_width()), 2)),
            fontsize=9, color='dimgrey')
# invert for largest on top
ax.invert_yaxis()
plt.gcf().subplots_adjust(left=0.3)

# Remove df's fuzzyAutoAdd1, etc., FuzzyWuzzyProcResult1, etc., fuzzySourceZ, etc., GoldStandard, others

#%%
# ==================================================================
# 3. FuzzyWuzzyListToCheck - Set up manual matching UI
# ==================================================================
'''
Now that the safe bets have been taken out, let's allow more liberal matching
and finish some assignments using human review. Over time you can change the
parameters to match your time and desired level of effort. You can reduce the
list, change the type of match (full phrase or any word), and change the
score, to change the number of candidates to match how much time you want to
spend in the browser.

When starting with a new site you should probably spend a good deal of time
here, to make connections the other steps can't make. Decisions you make here
will provide training data that the machine learning component can use.

Some options described at https://www.neudesic.com/blog/fuzzywuzzy-using-python/.
See for example fuzz.ratio (conservative) vs. fuzz.partial_ratio (medium) vs.
fuzz.token_set_ratio (any single word in the phrases, very liberal). The more
liberal you get here, the more you will see multiple-concept searches, which

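# The note above ranks fuzz.ratio (conservative), fuzz.partial_ratio (medium)
# and fuzz.token_set_ratio (very liberal). A small illustration with made-up
# search phrases, not taken from the original site's data:
from fuzzywuzzy import fuzz

query = "heart attack"
page_title = "Signs and symptoms of a heart attack in women"

print(fuzz.ratio(query, page_title))            # well under 100: compares the whole strings
print(fuzz.partial_ratio(query, page_title))    # 100 here: the query occurs verbatim inside the title
print(fuzz.token_set_ratio(query, page_title))  # 100: every query token appears somewhere in the title
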
def lookup(self): self.donor = str(self.donor) company_number = None people_links = [] found = False # ugly hack corrections if self.donor in ['Tresco Estate', 'James Hay', 'Think BDW Ltd']: self.status = 'company' if self.status == 'company, no 10120655': company_number = 10120655 if 'Armed Forces Parliamentary Trust' == self.donor: self.status = 'other' if u'Buck’s Club 1919' in self.donor: self.donor = "Buck's Club 1919" self.status = 'members' if u'Pratt’s Club' in self.donor: self.donor = "Pratt's Club" self.status = 'members' if 'carlton club' in self.donor.lower(): self.donor = 'Carlton Club' self.status = 'members' if 'National Liberal Club' in self.donor: self.donor = 'National Liberal Club' self.status = 'members' if 'The Public Interest Foundation (UK charity)' == self.donor: self.status = 'charity' # apply patches if self.donor in urls.keys(): company_number = urls[self.donor].split('/')[-1] if self.donor in people.keys(): people_links = people[self.donor] if not company_number: # use the supplied company number from the register of interests # if 'company' in self.status: company_number_search = re.search('registration [0-9|a-z|A-Z]+', self.status) if company_number_search: company_number = company_number_search.group().split( 'registration ')[-1] # needs padding to 8 digits, if it starts with an int if re.match('[0-9]', company_number): company_number = '%08d' % (int(company_number)) self.company = { 'company_name': self.donor, 'company_number': 'N/A', 'company_status': 'Active' } self.persons = [] self.officers = [] self.link = None self.appointments = [] if company_number: # we have a company number, no need to search for it self.company = getlink( {'links': { 'self': '/company/%s' % str(company_number) }}, 'self') persons = getlink(self.company, 'persons_with_significant_control') self.persons = persons['items'] officers = getlink(self.company, 'officers') self.officers = officers['items'] if not self.company.has_key('errors'): self.link = 'https://beta.companieshouse.gov.uk' + self.company[ 'links']['self'] found = True else: self.company = { 'company_name': self.donor, 'company_number': 'N/A', 'company_status': 'Active' } self.link = '' else: if 'individual' in self.status.lower( ) or 'private' in self.status.lower(): # found = True # for individuals, we store the appointments, then the company, officers etc as children # of the appointment if people_links != []: for pl in people_links: bit = pl.split( 'https://beta.companieshouse.gov.uk')[-1] appointments = getlink({'links': { 'self': '%s' % bit }}, 'self') for i in appointments['items']: if i not in self.appointments: self.appointments.append(i) # just take the last one self.link = pl found = True for app in self.appointments: # add the company, officers and persons record to appointment record app['company'] = getlink(app, 'company') app['officers'] = getlink(app['company'], 'officers')['items'] app['persons_with_significant_control'] = getlink( app['company'], 'persons_with_significant_control')['items'] # eveything below here, should generate a company / entity elif 'trade' in self.status.lower(): self.type = 'union' if self.donor in trade_union.keys(): self.donor = trade_union[self.donor] found = True elif 'charity' in self.status.lower(): self.type = 'charity' if self.donor in charities.keys(): self.donor = charities[self.donor] found = True elif 'unincorporated' in self.status.lower(): self.type = 'club' if self.donor in clubs.keys(): self.donor = clubs[self.donor] found = True elif 'members' in 
self.status.lower(): self.type = 'club' if self.donor in clubs.keys(): self.donor = clubs[self.donor] found = True elif 'friendly' in self.status.lower(): self.type = 'club' if self.donor in clubs.keys(): self.donor = clubs[self.donor] found = True elif 'other' in self.status.lower(): self.type = 'other' if self.donor in others.keys(): self.donor = others[self.donor] found = True elif 'trust' in self.status.lower(): self.type = 'other' if self.donor in others.keys(): self.donor = others[self.donor] found = True elif 'provident' in self.status.lower(): self.type = 'company' if self.donor in others.keys(): self.donor = others[self.donor] found = True elif 'visit' in self.status: # TODO self.type = 'visit' else: # we dont have a company number, so do a company search if 'llp' in self.status.lower( ) or 'limited' in self.status.lower(): self.type = 'company' else: self.type = 'other' # these are the remaining things to search - can only do a company search really companies = CompaniesHouseCompanySearch([self.donor]) for i in companies.data: # we need the name and address to fuzzy match name_ratio = fuzz.token_set_ratio(i['title'].lower(), self.donor) if name_ratio > 90: if i['address_snippet']: addr_ratio = fuzz.token_set_ratio( i['address_snippet'].lower(), self.address) # if the address matches enough if addr_ratio > 90: self.link = 'https://beta.companieshouse.gov.uk' + i[ 'links']['self'] self.company = getlink(i, 'self') persons = getlink( self.company, 'persons_with_significant_control') self.persons = persons['items'] officers = getlink(self.company, 'officers') self.officers = officers['items'] # print 'FOUND %s: , %s' % (self.status.upper(), self.company['company_name']) found = True break # print self.donor, self.address # if 'sw1p 3ql' in self.address.lower(): # print '*'*100 # print '55 TUFTON STREET: %s' % self.donor # print '*'*100 if found: pass # print '\tFOUND %s: %s' % (self.status.upper(), self.donor) else: # pass print '\tMISSING %s: %s' % (self.status.upper(), self.donor)
def fuzzyScore(string1, string2):
    return fuzz.token_set_ratio(string1, string2)
# fragment of a larger event-matching loop; the enclosing for-loops over list1
# and list2 (and the Event_Name comparison this else belongs to) are not part
# of this excerpt, so the indentation below is a best-effort reconstruction
                    if multi_match[ic][0] == list1[j][5] and multi_match[ic][1] == list2[i][5]:
                        multi_match[ic][2] = (multi_match[ic][2] + list2[i][4] + " "
                                              + list2[i][1] + " " + list2[i][0] + "\n")
                        multi_match[ic][8] = multi_match[ic][8] + " " + list1[j][4]
                        multi_match[ic][9] = multi_match[ic][9] + " " + list2[i][4]
                        ff = 1
                        break
                if ff == 0:
                    multi_match.append([
                        list1[j][5], list2[i][5],
                        list2[i][4] + " " + list2[i][1] + " " + list2[i][0] + "\n",
                        "", "", "",
                        fuzz.token_set_ratio(list1[j][3], list2[i][3]) / 100,
                        1, list1[j][4], list2[i][4]
                    ])
                    cnt_m = cnt_m + 1
                break
            else:
                # If Event_Name is not the same, treat it as a close match
                # Close_Match = Close_Match + " " + list2[i][4] + " " + list2[i][1] + " " + list2[i][0] + "\n"
                # F01 = F01 + " " + list1[j][4]
                # F02 = F02 + " " + list2[i][4]
                ff = 0
                for ic in range(cnt_m):
                    if multi_match[ic][0] == list1[j][5] and multi_match[ic][1] == list2[i][5]:
                        multi_match[ic][3] = (multi_match[ic][2] + list2[i][4] + " "
                                              + list2[i][1] + " " + list2[i][0] + "\n")
                        multi_match[ic][8] = multi_match[ic][8] + " " + list1[j][4]
def get_similarity(first_string: str, second_string: str):
    return fuzz.token_set_ratio(first_string, second_string)
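# --- Hedged usage sketch (not part of the original source) -------------------
# token_set_ratio returns an integer between 0 and 100 and ignores word order
# and repeated tokens, so the wrapper above is symmetric in its arguments.
# Exact scores depend on the fuzzywuzzy version, so none are asserted here.
print(get_similarity("new york mets", "mets new york"))        # identical token sets -> typically 100
print(get_similarity("data analyst", "senior data analyst"))   # shared tokens dominate -> high score
print(get_similarity("apple", "orange"))                       # little overlap -> low score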
def ExtractSymbol(title):
    final = ""
    df_matchedSym = []
    matchedSymList = ""
    # first pass: exact, word-boundary matches of the ticker symbols themselves
    for i in range(0, len(Symbols)):
        sym = Symbols[i].lower()
        st = title.lower()
        if Symbols[i] in excep:
            st = title
            sym = Symbols[i]
        temp = []
        reg = r"\b" + sym + r"\b"
        reg = re.compile(reg)
        if reg.search(st):
            temp.append(Symbols[i])
            temp.append(Names[i])
            temp.append(True)
        else:
            temp.append(Symbols[i])
            temp.append(Names[i])
            temp.append(False)
        df_matchedSym.append(temp)
    df_matchedSym = [z for z in df_matchedSym if z[2] == True]
    if df_matchedSym:
        for term in df_matchedSym:
            matchedSymList = matchedSymList + "|" + term[0]
    # second pass: fuzzy-match the company names against the cleaned title
    df_matchedSym = []
    temp2 = re.sub('[^A-Za-z0-9. ]+', '', title)
    for i in range(0, len(Names)):
        thres = 50
        tsor = fuzz.token_sort_ratio(temp2.lower(), Names[i].lower())
        tser = fuzz.token_set_ratio(temp2.lower(), Names[i].lower())
        r = fuzz.ratio(temp2.lower(), Names[i].lower())
        pr = fuzz.partial_ratio(temp2.lower(), Names[i].lower())
        avg = (tsor + tser + r + pr) / 4
        if Names[i] == "CONSOLIDATED CONSTRUCTION":
            thres = 60
        temp = []
        if avg >= thres:
            temp.append(Symbols[i])
            temp.append(Names[i])
            temp.append(True)
        else:
            temp.append(Symbols[i])
            temp.append(Names[i])
            temp.append(False)
        df_matchedSym.append(temp)
    df_matchedSym = [z for z in df_matchedSym if z[2] == True]
    if len(df_matchedSym) == 0:
        if matchedSymList == "":
            final = ""
        else:
            final = final + matchedSymList
    else:
        for i in range(0, len(df_matchedSym)):
            reg = r"\b" + df_matchedSym[i][0].lower() + r"\b"
            reg = re.compile(reg)
            if reg.search(matchedSymList.lower()):
                continue
            symSplit = df_matchedSym[i][1].split(" ")
            reg = r"\b" + symSplit[0].lower() + r"\b"
            reg = re.compile(reg)
            if reg.search(temp2.lower()):
                if len(symSplit) < 3:
                    matchedSymList = matchedSymList + "|" + df_matchedSym[i][0]
                else:
                    reg = r"\b" + symSplit[1].lower()
                    reg = re.compile(reg)
                    if reg.search(temp2.lower()):
                        matchedSymList = matchedSymList + "|" + df_matchedSym[i][0]
        final = final + matchedSymList
    return final
    # tail of str_matched(source_str, target_str, threshold); the start of the
    # function is not part of this excerpt
        return False
    ratio_ = difflib.SequenceMatcher(None, source_str, target_str).ratio()
    # print("Ratio for threshold {0} : {1}".format(threshold, ratio_))
    return ratio_ > threshold


city_mapped = mapped.loc[mapped['City'].str.upper() == 'CHARLOTTE']
city_target = usi_target[usi_target.City.str.upper() == 'CHARLOTTE']
city_brokers = usi_brokers[usi_brokers['Physical City'].str.upper() == 'CHARLOTTE']

for target_index, target_row in city_target.iterrows():
    # find the broker address with the best token_set_ratio for this target address
    top_ratio = 0
    top_index = 0
    for broker_index, broker_row in city_brokers.iterrows():
        ratio = fuzz.token_set_ratio(target_row['Address1'],
                                     broker_row['Physical Street Address Line 1'])
        if top_ratio < ratio:
            top_ratio = ratio
            top_index = broker_index

    # variables
    source_zip = str(city_brokers.loc[top_index, ['Physical Zip (All)']].values[0])
    target_zip = str(city_target.loc[target_index, ['PostalCode']].values[0])
    source_name = str(city_brokers.loc[top_index, ['Business Name']].values[0])
    target_name = str(city_target.loc[target_index, ['PartyCompanyName']].values[0])

    zip_matched = str_matched(source_zip, target_zip)
    name_matched = str_matched(source_name, target_name, threshold=0.2999)
    address_matched = (top_ratio / 100.0) > 0.6299

    print("Ratio: {0} : {1} - {2}% ZIP:{3}, NAME:{4}".format(
        target_row['Address1'],
        city_brokers.loc[top_index, ['Physical Street Address Line 1']].values[0],
        top_ratio, zip_matched, name_matched))
for author2 in authors2:
    # Don't compare the author to itself
    if author != author2:
        # First check for a high token_set_ratio value
        #
        # token_set_ratio
        # The strings being compared are tokenized and preprocessed (made lower case,
        # without punctuation). Then, a set operation identifies the common tokens
        # (the intersection) and ratio() comparisons are made between the following
        # new strings:
        #   s1 = Sorted_tokens_in_intersection
        #   s2 = Sorted_tokens_in_intersection + sorted_rest_of_str1_tokens
        #   s3 = Sorted_tokens_in_intersection + sorted_rest_of_str2_tokens
        # The logic behind these comparisons is that since Sorted_tokens_in_intersection
        # is always the same, the score will tend to go up as these words make up a larger
        # chunk of the original strings or the remaining tokens are closer to each other.
        TokenSetRatio = fuzz.token_set_ratio(authors[author], authors2[author2])
        # If token_set_ratio is at least 75, then look closer at these names
        if TokenSetRatio >= 75:
            # Get the other ratios for these two authors
            # ratio: Levenshtein distance similarity ratio
            Ratio = fuzz.ratio(authors[author], authors2[author2])
            # partial_ratio: if the shorter string being compared has length k and the
            # longer string has length m, then partial_ratio seeks the score of the
            # best matching length-k substring.
            PartialRatio = fuzz.partial_ratio(authors[author], authors2[author2])
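# --- Hedged worked example (not from the original source) --------------------
# Illustration of the s1/s2/s3 construction described in the comments above,
# using two made-up author strings. The exact scores depend on the fuzzywuzzy
# version, so only their relative behaviour is pointed out.
from fuzzywuzzy import fuzz

a = "mary b. jones"
b = "jones, mary beth"
# After tokenizing, lower-casing and stripping punctuation:
#   tokens(a) = {mary, b, jones}        tokens(b) = {jones, mary, beth}
#   intersection = {jones, mary}
#   s1 = "jones mary"
#   s2 = "jones mary" + " b"     (sorted rest of a)
#   s3 = "jones mary" + " beth"  (sorted rest of b)
# token_set_ratio reports the maximum of the pairwise ratio() scores of s1/s2/s3,
# which is why shared tokens dominate the result.
print(fuzz.token_set_ratio(a, b))  # high: the shared tokens make up most of both strings
print(fuzz.ratio(a, b))            # much lower: plain Levenshtein ratio is hurt by reordering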
# imports needed by this snippet
import pickle
import pandas as pd
from fuzzywuzzy import fuzz

dfASOS = pd.read_pickle("webScrapeASOS.pkl")[:]
dfASOS = dfASOS.add_prefix('ASOS_')
dfHnM = pd.read_pickle("webScrapeHM.pkl")
dfHnM = dfHnM.add_prefix('HnM_')

dfASOScopy = dfASOS.copy(deep=True)
dfHnMcopy = dfHnM.copy(deep=True)

# pair each ASOS product name with the H&M name that scores highest on token_set_ratio
pairedNames = []
for ASOSName in dfASOS['ASOS_Name']:
    matchName = []
    matchScore = 0
    for HnMName in dfHnM['HnM_Name']:
        stringCompareScore = fuzz.token_set_ratio(ASOSName, HnMName)
        if stringCompareScore >= matchScore:
            matchScore = stringCompareScore
            matchName = HnMName
    pairedNames.append([ASOSName, matchName, matchScore])

with open('pairedNamesListPKL.pkl', 'wb') as f:
    pickle.dump(pairedNames, f)

#------------------------------------------------------------------------------
groupDF = pd.DataFrame([])
listArb = []
for namePair in pairedNames:
    ASOSNamePair = namePair[0]
    HnMNamePair = namePair[1]
# Try parallel computation with dask
# imports needed by this snippet
import time
import numpy as np
import dask
import dask.multiprocessing
from dask.dataframe import from_pandas
from fuzzywuzzy import fuzz

# Train
print('extra fuzzy features, train....')
train_dd = from_pandas(train_df[['question1', 'question2']], npartitions=8)
start_time = time.time()
train_df['fuzz_qratio'] = train_dd.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])),
    axis=1,
    meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get)
train_df['fuzz_WRatio'] = train_dd.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])),
    axis=1,
    meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get)
train_df['fuzz_token_set_ratio'] = train_dd.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1,
    meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get)
train_df['fuzz_token_sort_ratio'] = train_dd.apply(
    lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])),
    axis=1,
    meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get)
print((time.time() - start_time))
del train_dd

# Test
print('extra fuzzy features, test....')
test_dd = from_pandas(test_df[['question1', 'question2']], npartitions=8)
start_time = time.time()
test_df['fuzz_qratio'] = test_dd.apply(
def find_and_store_duplicate_syllabi(grid_name, year, field_name):
    global stop
    try:
        # connect to existing database
        conn = psycopg2.connect(
            "dbname='litindex' user='******' host='0.0.0.0' password='******'")
        # Open a cursor to perform database operations
        cur = conn.cursor()

        param_list = [grid_name, year, field_name]
        select_query = ("SELECT id, text_md5, text from open_syllabi "
                        "where grid_name='{}' and year='{}' and field_name='{}'").format(*param_list)  # unpack the list
        cur.execute(select_query)
        df = pd.DataFrame(cur.fetchall(), columns=['id', 'text_md5', 'text'])
        print("\tNO OF RECORDS = {}".format(len(df)))

        punctuation_translator = str.maketrans('', '', string.punctuation)
        # PRE-PROCESSING REQUIRED:
        # normalize by lowering the case, removing punctuation, numbers and English stop words
        df['text_lower_case_words'] = df['text'].apply(lambda x: ' '.join([
            word for word in x.lower().translate(punctuation_translator).split()
            if not word.isdigit() and word not in stop
        ]))
        # the following pre-processing is required to improve the quality of LSH results,
        # especially considering the highly templated text in course descriptions
        df['text_unique_words'] = df['text'].apply(lambda x: ' '.join([
            word for word in list(set(x.lower().translate(punctuation_translator).split()))
            if not word.isdigit() and word not in stop
        ]))
        common_words_series = pd.Series(
            ' '.join(df['text_unique_words']).lower().strip(string.punctuation).split()
        ).value_counts()
        most_common_words_series = common_words_series[common_words_series > (0.5 * len(df))].dropna()
        most_common_words_list = most_common_words_series.index.tolist()
        df['text_without_common_words'] = df['text'].apply(lambda x: ' '.join([
            word for word in x.lower().translate(punctuation_translator).split()
            if word not in most_common_words_list and word not in stop
        ]))

        # STEP 1: use the LSH algorithm to find candidate duplicates
        # run through, adding documents to the LSH cache
        hasher = minhash.MinHasher(seeds=100, char_ngram=5, hashbytes=4)
        lshcache = cache.Cache(bands=10, hasher=hasher)
        # note: range(0, len(df) - 1) skips the final row of df
        for idx in range(0, (len(df) - 1)):
            lshcache.add_fingerprint(
                hasher.fingerprint(df.loc[idx, 'text_without_common_words']),
                df.loc[idx, 'id'])
        # for every bucket in the LSH cache, get the candidate duplicates
        # note: this is a fast way to get candidate pairs with reasonable accuracy;
        # they are filtered later
        candidate_pairs = set()
        for b in lshcache.bins:
            for bucket_id in b:
                if len(b[bucket_id]) > 1:  # the bucket contains more than a single document
                    pairs_ = set(itertools.combinations(b[bucket_id], r=2))
                    candidate_pairs.update(pairs_)
        list_candidate_pairs = list(candidate_pairs)
        tsl = []
        # df = df.set_index('id')
        print("\tcandidate pairs found = {}".format(len(list_candidate_pairs)))

        # STEP 2: use TFIDF to process the records associated with the candidate
        # duplicates and generate signature text
        tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0,
                             stop_words='english')
        tfidf_matrix = tf.fit_transform(df['text_lower_case_words'])
        feature_names = tf.get_feature_names()
        dense = tfidf_matrix.todense()
        for item in list_candidate_pairs:
            idx1 = df.index[df['id'] == int(item[0])]
            idx2 = df.index[df['id'] == int(item[1])]
            episode1 = dense[idx1].tolist()[0]
            episode2 = dense[idx2].tolist()[0]
            phrase_scores1 = [pair for pair in zip(range(0, len(episode1)), episode1) if pair[1] > 0]
            sorted_phrase_scores1 = sorted(phrase_scores1, key=lambda t: t[1] * -1)
            phrase_scores2 = [pair for pair in zip(range(0, len(episode2)), episode2) if pair[1] > 0]
            sorted_phrase_scores2 = sorted(phrase_scores2, key=lambda t: t[1] * -1)
            list_summarized_text1 = []
            list_summarized_text2 = []
            for phrase, score in [(feature_names[word_id], score)
                                  for (word_id, score) in sorted_phrase_scores1][:10]:
                # print('{0: <20} {1}'.format(phrase, score))
                list_summarized_text1.append(phrase)
            for phrase, score in [(feature_names[word_id], score)
                                  for (word_id, score) in sorted_phrase_scores2][:10]:
                # print('{0: <20} {1}'.format(phrase, score))
                list_summarized_text2.append(phrase)
            summarized_text1 = ' '.join(list_summarized_text1)
            summarized_text2 = ' '.join(list_summarized_text2)

            # STEP 3: apply a fuzzy match to the two signature texts to generate an accuracy score
            fuzz_ratio = fuzz.token_set_ratio(summarized_text1, summarized_text2)
            tsl.append((grid_name, field_name, int(year), int(item[0]), int(item[1]),
                        summarized_text1, summarized_text2, fuzz_ratio))

        insert_duplicate_pairs(tsl)
        df = df.set_index('id')
        return df
    except Exception as e:
        if conn:
            conn.rollback()
        # print("Unexpected error:", sys.exc_info()[0])
        print(e)
        sys.exit(1)
    finally:
        # Close communication with the database
        if cur:
            cur.close()
        if conn:
            conn.close()
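# --- Hedged minimal sketch (not from the original source) --------------------
# The pipeline above in miniature: MinHash/LSH candidate generation followed by
# a token_set_ratio score for each candidate pair, on toy strings. It mirrors
# the MinHasher / Cache calls used in the function above and assumes the same
# `lsh` package; the document ids and texts below are invented.
import itertools
from lsh import cache, minhash
from fuzzywuzzy import fuzz

docs = {
    1: "introduction to programming with python loops functions recursion",
    2: "python programming introduction functions loops and recursion basics",
    3: "organic chemistry reactions stereochemistry and spectroscopy",
}

hasher = minhash.MinHasher(seeds=100, char_ngram=5, hashbytes=4)
lshcache = cache.Cache(bands=10, hasher=hasher)
for doc_id, text in docs.items():
    lshcache.add_fingerprint(hasher.fingerprint(text), doc_id)

# any bucket holding more than one document yields candidate duplicate pairs
candidate_pairs = set()
for bin_ in lshcache.bins:
    for bucket_id in bin_:
        if len(bin_[bucket_id]) > 1:
            candidate_pairs.update(itertools.combinations(bin_[bucket_id], r=2))

# score each candidate pair with token_set_ratio, as in STEP 3 above
for left, right in candidate_pairs:
    print(left, right, fuzz.token_set_ratio(docs[left], docs[right]))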
def post_form():
    form = request.form.to_dict()
    datafile_name = form.get('dataFileName')
    # records = pd.read_excel(request.files['dataFile'], sheet_name=None)
    records = utils.read_spreadsheet(request.files['dataFile'], datafile_name)
    date_cols = []
    if datafile_name.endswith('.xlsx') or datafile_name.endswith('.xls'):
        records_with_format = load_workbook(request.files['dataFile'])
        for sheet in records_with_format.sheetnames:
            for row in records_with_format[sheet].iter_rows(min_row=2):
                for cell in row:
                    # MRN
                    column_letter = get_column_letter(cell.column)
                    column_header = records_with_format[sheet][column_letter + '1'].value
                    if column_header in records[sheet].columns and cell.number_format == '00000000':
                        current_list = list(records[sheet][column_header])
                        current_list = [
                            str(i).rjust(8, '0') if isinstance(i, int) else i
                            for i in current_list
                        ]
                        records[sheet][column_header] = current_list
                    if column_header in records[sheet].columns and cell.number_format == 'mm-dd-yy':
                        date_cols.append(column_header)
                        current_list = list(records[sheet][column_header])
                        current_list = [
                            i.strftime('%m/%d/%Y')
                            if isinstance(i, datetime) and not pd.isnull(i) else i
                            for i in current_list
                        ]
                        records[sheet][column_header] = current_list
                # only the first data row is inspected for number formats
                break
    token = form.get('token')
    env = form.get('env')
    mappings = None
    existing_records = None
    form_names = set()
    form_name_to_dd_fields = {}
    data_field_to_redcap_field_map = {}
    data_field_to_choice_map = {}
    original_to_correct_value_map = {}
    no_match_redcap_fields = []
    if 'mappingsFile' in request.files:
        mappings = pd.read_excel(request.files['mappingsFile'], sheet_name="Sheet1")
        if list(mappings["dataFieldToRedcapFieldMap"]):
            data_field_to_redcap_field_map = json.loads(
                list(mappings["dataFieldToRedcapFieldMap"])[0])
        if list(mappings["dataFieldToChoiceMap"]):
            data_field_to_choice_map = json.loads(
                list(mappings["dataFieldToChoiceMap"])[0])
        if list(mappings["originalToCorrectedValueMap"]):
            original_to_correct_value_map = json.loads(
                list(mappings["originalToCorrectedValueMap"])[0])
        if list(mappings["noMatchRedcapFields"]):
            no_match_redcap_fields = json.loads(
                list(mappings["noMatchRedcapFields"])[0])
    redcap_api = RedcapApi(env)
    project_info = {
        'secondary_unique_field': '',
        'record_autonumbering_enabled': 0,
        'repeatable_instruments': [],
        'next_record_name': 1
    }
    data_dictionary = None
    existing_records = None
    if token:
        try:
            data_dictionary = redcap_api.fetch_data_dictionary(token)
            project_info = redcap_api.fetch_project_info(token)
            project_info['next_record_name'] = redcap_api.generate_next_record_name(token)
            if project_info.get('secondary_unique_field'):
                project_info['secondary_unique_field'] = [
                    project_info.get('secondary_unique_field')
                ]
            if project_info['has_repeating_instruments_or_events'] == 1:
                repeatable_instruments = redcap_api.fetch_repeatable_instruments(token)
                project_info['repeatable_instruments'] = [
                    i['form_name'] for i in repeatable_instruments
                ]
            if project_info['record_autonumbering_enabled'] == 0:
                data_dictionary[0]['required'] = 'Y'
            dd = [RedcapField.from_json(field) for field in data_dictionary]
        except Exception as e:
            logging.warning(e)
            results = {'error': "Error: {0}".format(e)}
            response = flask.jsonify(results)
            response.headers.add('Access-Control-Allow-Origin', '*')
            return response
    else:
        data_dictionary_name = form.get('dataDictionaryName')
        if data_dictionary_name.endswith('.csv'):
            dd_df = pd.read_csv(request.files['dataDictionary'])
            dd_df.fillna('', inplace=True)
        elif data_dictionary_name.endswith('.xlsx') or data_dictionary_name.endswith('.xls'):
            dd_df = pd.read_excel(request.files['dataDictionary'])
        dd = [
            RedcapField.from_data_dictionary(dd_df, field)
            for field in list(dd_df['Variable / Field Name'])
        ]
        if dd[0].field_name == 'record_id':
            project_info['record_autonumbering_enabled'] = 1
    if 'existingRecordsFile' in request.files:
        existing_records = pd.read_csv(request.files['existingRecordsFile'])
        existing_records = json.loads(
            existing_records.to_json(orient='records', date_format='iso'))
    all_csv_headers = []
    dd_headers = []
    dd_data = {}
    dd_data_raw = {}
    if data_dictionary is not None:
        dd_headers = list(data_dictionary[0].keys())
        dd_data_raw = data_dictionary
    else:
        dd_headers = list(dd_df.columns)
        dd_data_raw = json.loads(dd_df.to_json(orient='records', date_format='iso'))
    dd_data = [field.__dict__ for field in dd]
    for dd_field in dd:
        if not form_name_to_dd_fields.get(dd_field.form_name):
            form_name_to_dd_fields[dd_field.form_name] = []
        form_name_to_dd_fields.get(dd_field.form_name).append(dd_field.field_name)
        form_names.add(dd_field.form_name)
    recordid_field = dd[0].field_name
    form_names = list(form_names)
    for sheet_name, sheet in records.items():
        all_csv_headers += list(sheet.columns)
    all_csv_headers = [i for i in all_csv_headers if 'Unnamed' not in i]
    all_field_names = [f.field_name for f in dd]
    redcap_field_candidates = {}
    data_field_candidates = {}
    csv_headers = {}
    fields_not_in_redcap = {}
    duplicate_fields = {}
    for sheet_name, sheet in records.items():
        duplicate_fields[sheet_name] = {}
        # Remove empty rows
        sheet.dropna(axis=0, how='all', inplace=True)
        csv_headers[sheet_name] = list(sheet.columns)
        csv_headers[sheet_name] = [
            item for item in csv_headers[sheet_name] if 'Unnamed' not in item
        ]
        for header in csv_headers[sheet_name]:
            duplicate_fields[sheet_name][header] = duplicate_fields[sheet_name].get(header, 0) + 1
        duplicate_fields[sheet_name] = [
            k for k, v in duplicate_fields[sheet_name].items() if v > 1
        ]
        normalized_headers = utils.parameterize_list(csv_headers[sheet_name])
        fields_not_in_redcap[sheet_name] = [
            header for header, normalized_header in zip(csv_headers[sheet_name], normalized_headers)
            if normalized_header not in all_field_names
        ]
    all_csv_headers = list(set(all_csv_headers))
    unmatched_data_fields = {}
    for sheet in csv_headers:
        data_field_to_redcap_field_map[sheet] = data_field_to_redcap_field_map.get(sheet, {})
        unmatched_data_fields[sheet] = unmatched_data_fields.get(sheet, [])
        for header in csv_headers[sheet]:
            normalized_header = utils.parameterize(header)
            if data_field_to_redcap_field_map[sheet].get(header):
                continue
            if normalized_header in all_field_names:
                data_field_to_redcap_field_map[sheet][header] = normalized_header
            else:
                unmatched_data_fields[sheet].append(header)
    selected_columns = {}
    matched_redcap_fields = []
    matched_redcap_fields += no_match_redcap_fields
    for sheet_name, field_map in data_field_to_redcap_field_map.items():
        selected_columns[sheet_name] = field_map.keys()
        matched_redcap_fields += field_map.values()
    unmatched_redcap_fields = [
        f for f in all_field_names
        if f not in matched_redcap_fields and f != 'record_id'
    ]
    for f1 in all_field_names:
        dd_field = [f for f in dd_data if f['field_name'] == f1][0]
        redcap_field_candidates[f1] = []
        for sheet in csv_headers:
            for f2 in csv_headers[sheet]:
                redcap_field_candidates[f1].append({
                    'candidate': f2,
                    'sheets': [sheet],
                    'score': max(fuzz.token_set_ratio(f1, f2),
                                 fuzz.token_set_ratio(dd_field['field_label'], f2))
                })
    for sheet in csv_headers:
        for f1 in csv_headers[sheet]:
            if data_field_candidates.get(f1):
                continue
            data_field_candidates[f1] = []
            for f2 in all_field_names:
                dd_field = [f for f in dd_data if f['field_name'] == f2][0]
                data_field_candidates[f1].append({
                    'candidate': f2,
                    'form_name': dd_field['form_name'],
                    'score': max(fuzz.token_set_ratio(f1, f2),
                                 fuzz.token_set_ratio(dd_field['field_label'], f1))
                })
    malformed_sheets = []
    form_names = [redcap_field.form_name for redcap_field in dd]
    form_names = list(set(form_names))
    for sheet_name in records.keys():
        sheet = records.get(sheet_name)
        redcap_field_names = [f.field_name for f in dd]
        matching_fields = [f for f in sheet.columns if f in redcap_field_names]
        if not matching_fields and not data_field_to_redcap_field_map.get(sheet_name):
            malformed_sheets.append(sheet_name)
    json_data = {}
    for sheet_name, sheet in records.items():
        json_data[sheet_name] = json.loads(sheet.to_json(orient='records', date_format='iso'))
    results = {
        'csvHeaders': csv_headers,
        'jsonData': json_data,
        'ddHeaders': dd_headers,
        'ddData': dd_data,
        'ddDataRaw': dd_data_raw,
        'formNames': form_names,
        'dateColumns': date_cols,
        'duplicateFields': duplicate_fields,
        'malformedSheets': malformed_sheets,
        'recordFieldsNotInRedcap': fields_not_in_redcap,
        'formNameToDdFields': form_name_to_dd_fields,
        'projectInfo': project_info,
        'existingRecords': existing_records,
        'recordidField': recordid_field,
        'redcapFieldCandidates': redcap_field_candidates,
        'dataFieldCandidates': data_field_candidates,
        'unmatchedRedcapFields': unmatched_redcap_fields,
        'unmatchedDataFields': unmatched_data_fields,
        'dataFileName': datafile_name,
        'token': token,
    }
    if data_field_to_redcap_field_map:
        results['dataFieldToRedcapFieldMap'] = data_field_to_redcap_field_map
    if data_field_to_choice_map:
        results['dataFieldToChoiceMap'] = data_field_to_choice_map
    if original_to_correct_value_map:
        results['originalToCorrectedValueMap'] = original_to_correct_value_map
    if no_match_redcap_fields:
        results['noMatchRedcapFields'] = no_match_redcap_fields
    response = flask.jsonify(results)
    return response
def cosine(a, b):
    # despite the name, this returns fuzzywuzzy's token_set_ratio (0-100),
    # not a cosine similarity
    return fuzz.token_set_ratio(a, b)
#print(cou_list)
from fuzzywuzzy import fuzz

# READ EACH LINE FROM THE LIST FILE, COMPARE IT WITH THE ARTICLE, AND USE FUZZY MATCHING
news_open = open("TEST_2.txt")
news_read = news_open.readlines()
for keywords in news_read:
    for word in uni_list:
        if fuzz.token_sort_ratio(keywords, word) >= 90:
            uni_set.add(word.rstrip())
        if word in keywords:
            #print("UNIVERSITY: ", word)
            uni_set.add(word)
    for word_1 in deg_list:
        if fuzz.token_set_ratio(keywords, word_1) >= 90:
            deg_set.add(word_1.rstrip())
        if word_1 in keywords:
            #print("DEGREE: ", word_1)
            deg_set.add(word_1)
    for word_2 in exa_list:
        if fuzz.token_set_ratio(keywords, word_2) >= 92:
            exa_set.add(word_2.rstrip())
        if word_2 in keywords:
            #print("EXAMS: ", word_2)
            exa_set.add(word_2)
    for word_3 in streams_list:
        if fuzz.token_sort_ratio(keywords, word_3) >= 90:
            streams_set.add(word_3.rstrip())
        if word_3 in keywords:
            #print("STREAM: ", word_3)
            streams_set.add(word_3)
# update stopword_list
for i in list_remove_user_skills:
    stopword_list.append(i)
for i in educational_list:
    stopword_list.append(i)
for i in cert_list:
    stopword_list.append(i)

###############################################################################
# In this section:
# remove the stopwords from the user-skills column based on fuzzy matching;
# this could also have been done in the data-cleaning code.
for d in range(len(dataset)):
    if pd.notnull(dataset[d][6]):
        words = dataset[d][6].split(",")
        for w in words:
            for s in stopword_list:
                match_score = fuzz.token_set_ratio(w, s)
                if match_score > 70:
                    dataset[d][6] = dataset[d][6].replace(w, "")

###############################################################################
# In this section:
# 1. Topic modelling is performed for each sub-function. The objective is to find
#    the key topics for each sub-function and match them against the user-skill
#    column with fuzzy matching at a threshold of 70% (this can vary), to find out
#    whether a mentioned skill is relevant and by how much.
# 2. Intuitively, 25 topics were built for each sub-function, with the top 40 words.
for i in industry_list:
    print(i)
    Tfidf_Vectorizer = TfidfVectorizer(max_df=.95, min_df=2,
                                       stop_words=stopword_list, norm="l2",
from fuzzywuzzy import process

st = "apple inc"
strOptions = ["Apple Inc.", "apple park", "apple incorporated", "iphone", "apple inc"]
Ratios = process.extract(st, strOptions)
print(Ratios)
# You can also select the string with the highest matching percentage
highest = process.extractOne(st, strOptions)
print(highest)

from fuzzywuzzy import fuzz

Str1 = dataset['p_body'][37]
Str2 = dataset['p_body'][38]
Ratio = fuzz.ratio(Str1.lower(), Str2.lower())
Partial_Ratio = fuzz.partial_ratio(Str1.lower(), Str2.lower())
Token_Sort_Ratio = fuzz.token_sort_ratio(Str1, Str2)
Token_Set_Ratio = fuzz.token_set_ratio(Str1, Str2)
print(Ratio)
print(Partial_Ratio)
print(Token_Sort_Ratio)
print(Token_Set_Ratio)

Str1 = dataset['p_body'][37]
Str2 = dataset['p_body'][38]
# note: process.extract expects a collection of candidate strings as its second
# argument; passing a single string here means it is iterated character by
# character rather than treated as one candidate
Ratios = process.extract(Str1, Str2)
print(Ratios)
# You can also select the string with the highest matching percentage
highest = process.extractOne(Str1, Str2)
print(highest)