def isstandup(sniptxt):
    lines = sniptxt.split("\n")
    matchst = fuzz.partial_ratio("standup", lines[0].lower())
    matchsy = fuzz.partial_ratio("sync", lines[0].lower())
    return matchst > 70 or matchsy > 70
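# A minimal usage sketch for isstandup() above, assuming fuzzywuzzy is
# installed and the function is in scope; the sample snippet texts are made up.
from fuzzywuzzy import fuzz

print(isstandup("Team standup notes\n- shipped the parser"))  # True: "standup" in first line
print(isstandup("Weekly sync agenda\n- roadmap review"))      # True: "sync" in first line
print(isstandup("Release checklist\n- tag the build"))        # False: neither keyword matches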
def fuzzy_contains(event):
    """Checks if an event exists in the database.

    Searches by name and date range in case name or date is slightly
    different.

    :event Event - The event being checked
    :return Event - The best matching event in the database, or None
        if no match is found
    """
    time_fuzz = dt.timedelta(days=3)
    query = (Event.start < event.start + time_fuzz) & \
            (Event.start > event.start - time_fuzz)
    if event.end is not None:
        # Use is_() for a SQL NULL check; a plain `is None` is evaluated in
        # Python and is always False for a column expression.
        query &= (Event.end.is_(None)) | \
                 ((Event.end < event.end + time_fuzz) &
                  (Event.end > event.end - time_fuzz))
    # Search for good date matches.
    date_matches = Event.query.filter(query).all()
    # Filter for good name matches.
    res = [e for e in date_matches
           if fuzz.partial_ratio(e.name, event.name) >= 70]
    # Return None if no results come up.
    if not res:
        return None
    # Get best search result from the results.
    e = max(res, key=lambda e: (fuzz.partial_ratio(e.name, event.name),
                                fuzz.ratio(e.name, event.name)))
    return e
def match_products(ingredients, limit=5):
    all_matches = []
    for ingr_dict in ingredients:
        amount = ingr_dict.get('amount', None)
        ingredient = ingr_dict.get('ingredient', None)
        try:
            stemmed_ingredient = elastic_stemmer(ingredient)
            product_ratios = []
            for product in products:
                # Fuzzy matching
                name_ratio = fuzz.partial_ratio(stemmed_ingredient, product['stemmed_name'])
                cat3_ratio = fuzz.partial_ratio(stemmed_ingredient, product['stemmed_category3'])
                # name_ratio = fuzz.partial_ratio(ingredient, product['name'])
                # cat3_ratio = fuzz.partial_ratio(ingredient, product['category3'])
                # gen_name_ratio = fuzz.partial_ratio(general_name, remove_accents(product['name']).lower())
                product_ratios.append((
                    product,
                    # [name_ratio, cat3_ratio, gen_name_ratio]
                    [name_ratio, cat3_ratio]
                ))
            # Order by sum of ratios
            ingr_matches = sorted(product_ratios, key=lambda p: sum(p[1]), reverse=True)[:limit]
            ingr_products = [match[0] for match in ingr_matches]
        except Exception:
            ingr_products = []
        all_matches.append({
            'ingredient': ingredient,
            'amount': amount,
            'products': ingr_products,
        })
    # List of dicts: ingredient name, amount and its matched products
    return all_matches
def checker(url, params, headers, GET, delay, payload, positions, timeout, encoding):
    checkString = 'st4r7s' + payload + '3nd'
    if encoding:
        checkString = encoding(unquote(checkString))
    response = requester(url, replaceValue(
        params, xsschecker, checkString, copy.deepcopy),
        headers, GET, delay, timeout).text.lower()
    reflectedPositions = []
    for match in re.finditer('st4r7s', response):
        reflectedPositions.append(match.start())
    filledPositions = fillHoles(positions, reflectedPositions)
    # Iterating over the reflections
    num = 0
    efficiencies = []
    for position in filledPositions:
        allEfficiencies = []
        try:
            reflected = response[reflectedPositions[num]
                                 :reflectedPositions[num] + len(checkString)]
            efficiency = fuzz.partial_ratio(reflected, checkString.lower())
            allEfficiencies.append(efficiency)
        except IndexError:
            pass
        if position:
            reflected = response[position:position + len(checkString)]
            if encoding:
                checkString = encoding(checkString.lower())
            efficiency = fuzz.partial_ratio(reflected, checkString)
            if reflected[:-2] == ('\\%s' % checkString.replace('st4r7s', '').replace('3nd', '')):
                efficiency = 90
            allEfficiencies.append(efficiency)
            efficiencies.append(max(allEfficiencies))
        else:
            efficiencies.append(0)
        num += 1
    return list(filter(None, efficiencies))
def get_best_match(self, term):
    response = self.call("search", {"query": term, "types": "Track"})
    # This is a hack that should be fixed
    print " + Searching for %s" % term
    try:
        artist, track = term.lower().split(" - ")
    except ValueError:
        artist, whatever, track = term.lower().split(" - ")
    artist = re.sub(r"\([^)]+\)", "", artist)
    track = re.sub(r"\([^)]+\)", "", track)
    for result in response["result"]["results"]:
        r_artist, r_track = result["artist"].lower(), result["name"].lower()
        artist_score, track_score = fuzz.partial_ratio(artist, r_artist), fuzz.partial_ratio(track, r_track)
        at_score, ta_score = fuzz.partial_ratio(artist, r_track), fuzz.partial_ratio(track, r_artist)
        print "%s - %s (%d/%d)" % (r_artist, r_track, artist_score, track_score)
        if artist_score > 75 and track_score > 75 or at_score > 75 and ta_score > 75:
            print " + Song added: %s - %s" % (r_artist, r_track)
            return result["key"]
    print " + Song not found: %s" % term
    return None
def matchGND(data):
    for n in data:
        for dnb in data[n]['dnb']:
            if dnb:
                dnb = dnb.replace('/about/rdf', '')
                resp = requests.get(dnb + '/about/lds',
                                    headers={'Accept': 'application/turtle'})
                try:
                    grp = rdflib.Graph().parse(data=resp.content, format='turtle')
                    for o in grp.objects(rdflib.term.URIRef(dnb), DC.identifier):
                        if '(OColc)' in o and o.replace('(OColc)', '') not in data[n]['oclc']:
                            newoclc = 'http://worldcat.org/oclc/' + o.replace('(OColc)', '')
                            print(newoclc)
                            data[n]['oclc'].append(newoclc)
                    doc_title = data[n]['title'][0]
                    resp_title = ''
                    title_score = title2_score = 0
                    for o in grp.objects(rdflib.term.URIRef(dnb), DC.title):
                        title_score = fuzz.partial_ratio(doc_title, o)
                        resp_title = o
                    for o in grp.objects(rdflib.term.URIRef(dnb), RDAU.P60493):
                        title2_score = fuzz.partial_ratio(doc_title, o)
                    if max(title_score, title2_score) < 80:
                        print(doc_title)
                        print(resp_title)
                except rdflib.plugins.parsers.notation3.BadSyntax:
                    if dnb == 'http://d-nb.info/gnd/':
                        data[n]['dnb'].remove(dnb)
                    print('ERROR URI: ' + n)
    return data
def _employees(self, company_name="", keyword=None): ''' Linkedin Scrape ''' # TODO - add linkedin directory search ''' Linkedin Scrape''' args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates" -inurl:"/title/" -inurl:"/pulse/"' args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"' qry = '"at {0}" {1} {2} site:linkedin.com' qry = qry.format(company_name, args, keyword) #results = Google().search(qry, 10) results = Google().search(qry, 1) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) print results.columns if results.empty: print "No employees found for", company_name, keyword return results if " " in company_name: results['company_score'] = [fuzz.partial_ratio(_name, company) for company in results.company_name] else: results['company_score'] = [fuzz.ratio(_name, company) for company in results.company_name] if keyword: results['score'] = [fuzz.partial_ratio(keyword, title) for title in results.title] results = results[results.score > 75] results = results[results.company_score > 49] results = results.drop_duplicates() return results
def sort_func(self, row_a, row_b, data=None):
    if self.b.get_object("searchentry1").get_text():
        if fuzz.partial_ratio(self.current_search, row_a.entry_name.lower()) > \
           fuzz.partial_ratio(self.current_search, row_b.entry_name.lower()):
            return False
        else:
            return True
    return False
def testIssueSeven(self):
    s1 = "HSINCHUANG"
    s2 = "SINJHUAN"
    s3 = "LSINJHUANG DISTRIC"
    s4 = "SINJHUANG DISTRICT"

    self.assertTrue(fuzz.partial_ratio(s1, s2) > 75)
    self.assertTrue(fuzz.partial_ratio(s1, s3) > 75)
    self.assertTrue(fuzz.partial_ratio(s1, s4) > 75)
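# A small standalone check of the behavior the test above relies on:
# partial_ratio aligns the shorter string against the best-matching window of
# the longer one, so near-substrings score high. Exact scores can vary by
# fuzzywuzzy version, so they are printed rather than asserted.
from fuzzywuzzy import fuzz

for other in ("SINJHUAN", "LSINJHUANG DISTRIC", "SINJHUANG DISTRICT"):
    print(other, fuzz.partial_ratio("HSINCHUANG", other))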
def match(dh, shas, index, ratio=100):
    found_dict = {}
    list_of_found_links = []
    found_dict['index'] = index
    found_dict['dh'] = dh
    found = 0
    global too_match
    global matched
    global non_match
    global count
    for line_n, line in enumerate(shas):
        if dh in line and line in dh:
            found += 1
            the_line = line_n
            list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(dh, line) >= ratio and fuzz.partial_ratio(line, dh) >= ratio:
            found += 1
            the_line = line_n
            list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(dh, line) >= ratio:
            found += 1
            the_line = line_n
            list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(line, dh) >= ratio:
            found += 1
            the_line = line_n
            list_of_found_links.append(line_n)
    if found > 1:
        found_dict['lines'] = list_of_found_links
        too_match += 1
        list_of_many_finds[index] = dh
        found_dict["more_than_one"] = "TRUE"
        found_list.append(found_dict)
    if found == 1:
        found_dict['lines'] = list_of_found_links
        found_dict["more_than_one"] = "FALSE"
        dafamud = convert_inf_to_daf(index)
        # if len(links_list) > 0:
        #     previos = re.split(":", links_list[len(links_list) - 1])[1]
        #     if int(previos) > the_line + 1:
        #         print "previos", previos, "this", the_line + 1
        link = "Rashi on %s" % masechet + " " + dafamud + ":" + str(the_line + 1) + " " + dh
        links_list.append(link)
        print "found!!", dafamud, ":", the_line, " ", dh
        matched += 1
        found_list.append(found_dict)
    if found == 0:
        if ratio > 60:
            match(dh, shas, index, ratio - 2)
        else:
            list_of_found_links.append(-1)
            found_dict['lines'] = list_of_found_links
            found_dict["more_than_one"] = "FALSE"
            print len(dh)
            non_match += 1
            found_list.append(found_dict)
def match(self, orig_dh, page, dh_position, ratio=85):
    partial_ratios = []
    self.found_dict[dh_position] = {}
    self.found_dict[dh_position][orig_dh] = []
    dh = self.removeEtcFromDH(orig_dh)
    found = 0
    dh_acronym_list = []
    if dh.find('"') >= 0 or dh.find("'") >= 0:
        dh_acronym_list = self.replaceAcronyms(dh)
    for line_n, para in enumerate(page):
        found_this_line = False
        para = self.removeHTMLtags(para)
        para = para.encode('utf-8')
        if dh in para:
            found += 1
            self.found_dict[dh_position][orig_dh].append((line_n, 100))
            continue
        para_pr = fuzz.partial_ratio(dh, para)
        if para_pr < 40:
            # not worth checking
            continue
        elif para_pr >= ratio:
            found += 1
            self.found_dict[dh_position][orig_dh].append((line_n, para_pr))
            continue
        phrases = self.splitPara(para, len(dh))
        for phrase in phrases:
            phrase_pr = fuzz.partial_ratio(dh, phrase)
            if found_this_line:
                break
            if dh in phrase:
                found += 1
                self.found_dict[dh_position][orig_dh].append((line_n, 100))
                break
            elif phrase_pr >= ratio:
                found += 1
                self.found_dict[dh_position][orig_dh].append((line_n, phrase_pr))
                break
            for expanded_acronym in dh_acronym_list:
                # only happens if there is an acronym, found_dh refers to expanded acronym
                acronym_pr = fuzz.partial_ratio(expanded_acronym, phrase)
                if expanded_acronym in phrase:
                    found += 1
                    self.found_dict[dh_position][orig_dh].append((line_n, 100))
                    found_this_line = True
                    break
                elif acronym_pr >= ratio:
                    found += 1
                    self.found_dict[dh_position][orig_dh].append((line_n, acronym_pr))
                    found_this_line = True
                    break
    if found == 0:
        if ratio > self.min_ratio:
            self.match(orig_dh, page, dh_position, ratio - self.step)
        else:
            self.non_match_file.write(orig_dh)
            self.non_match_file.write("\n")
def post_title_extract(self, sel, response):
    title = None
    title_score = 0
    slug_score = 0
    title_xpath = None
    blog = self.get_domain(response.url)
    slug = response.url.split('/')[-1] or response.url.split('/')[-2]
    # Strip a trailing '.html' suffix explicitly; rstrip('.html') would strip
    # any trailing run of those characters rather than the suffix.
    if slug.endswith('.html'):
        slug = slug[:-len('.html')]
    slug = slug.replace('-', ' ')
    head_title = sel.xpath('//title/text()').extract()
    head_title = head_title[0] if head_title else ''
    if '|' in head_title:
        pos = [head_title.split('|')[0], head_title.split('|')[-1]]
        word = pos[0] if fuzz.partial_ratio(pos[0], blog) > fuzz.partial_ratio(pos[-1], blog) else pos[-1]
        head_title_clean = head_title.replace(word, '').replace('|', '')
    else:
        head_title_clean = head_title
    text_to_remove = sel.xpath('//link[@rel="alternate"]/@title').extract()
    if text_to_remove and head_title:
        words = (' '.join(text_to_remove) + head_title).split()
        if Counter(words).most_common(3):
            for wor in Counter(words).most_common(3):
                head_title_clean = head_title_clean.replace(wor[0], '')
    [h1, h1a, h2, h2a, h3, h3a] = ["//h1", "//h1/a", "//h2", "//h2/a", "//h3", "//h3/a"]
    head_xpaths = [h1a, h1, h2a, h2, h3a, h3]
    title_lists = [sel.xpath(head + '//text()').extract() for head in head_xpaths]
    title_dict = OrderedDict(zip(head_xpaths, title_lists))
    for title_xpaths, title_list in title_dict.iteritems():
        if title_list:
            for titles in title_list:
                # to prevent one word from getting a higher score
                if titles.count(' ') > 0 or head_title_clean.count(' ') < 1:
                    title_ratio = fuzz.partial_token_sort_ratio(titles, head_title_clean)
                    if title_ratio > title_score:
                        title_score = title_ratio
                        title = titles
                        title_xpath = title_xpaths
                    if title_score == 100 and title.count(' ') > 0:
                        break
                # slug_ratio to be added in case
                slug_ratio = fuzz.partial_ratio(titles.lower(), slug)
                if slug_ratio > 80:
                    slug_score = slug_ratio
                    title = titles
                    title_xpath = title_xpaths
                if slug_score == 100:
                    break
            if slug_score == 100:
                break
        if title_score == 100:
            break
    if title_score < 51 and slug_score < 81:
        title = head_title_clean
    return title, title_xpath
def search():
    input_job_title = seniority_detection(request.args.get('job_title'),
                                          seniority_descriptors_grouped_list,
                                          acronyms_job_title_list)
    results_true_job_titles = []
    for job_title in true_job_titles_list:
        input_job_title_without_seniority = input_job_title[0]
        for sen in input_job_title[1]:
            if sen not in job_title.keys()[0]:
                input_job_title_without_seniority = input_job_title_without_seniority.replace(sen.lower(), '')
        input_job_title_without_seniority = input_job_title_without_seniority.replace(" " * 2, " ").strip().lower()
        similarity_factor_value = max(
            similarity_factor(input_job_title[0], job_title.values()[0]),
            similarity_factor(input_job_title_without_seniority, job_title.values()[0])
        )
        results_true_job_titles.append({
            'Similarity factor': similarity_factor_value,
            'Canonical job title': job_title.keys()[0],
            'Seniority': input_job_title[1]
        })
    results_true_job_titles = sorted(results_true_job_titles,
                                     key=lambda k: k['Similarity factor'],
                                     reverse=True)[:10]
    need_change_list = False
    for num, item in enumerate(results_true_job_titles):
        if num == 0 and item['Canonical job title'].lower() in input_job_title[0] \
                and fuzz.partial_ratio(item['Canonical job title'].lower(), input_job_title[0]) == 100 \
                and input_job_title[0][:input_job_title[0].find(item['Canonical job title'].lower()) - 1].capitalize() in item['Seniority']:
            break
        elif item['Canonical job title'].lower() in input_job_title[0] \
                and fuzz.partial_ratio(item['Canonical job title'].lower(), input_job_title[0]) == 100 \
                and input_job_title[0][:input_job_title[0].find(item['Canonical job title'].lower()) - 1].capitalize() not in item['Seniority']:
            need_change_list = True
            short_input_job_title = input_job_title[0][input_job_title[0].find(item['Canonical job title'].lower()):]
            break
    if need_change_list:
        for item in results_true_job_titles:
            item['Similarity factor'] = (item['Similarity factor'] +
                                         similarity_factor(short_input_job_title,
                                                           item['Canonical job title'].lower())) / 2.
        results_true_job_titles = sorted(results_true_job_titles,
                                         key=lambda k: k['Similarity factor'],
                                         reverse=True)
    output_results = []
    for item in results_true_job_titles[:3]:
        seniorities = list(filter(lambda x: x not in item['Canonical job title'] and
                                  len(filter(lambda y: y in replace_acronyms(item['Canonical job title'],
                                                                             acronyms_job_title_list),
                                             x.split())) == 0,
                                  item['Seniority']))
        if item['Seniority'] == [] or seniorities == []:
            output_results.append('({0}) {1}'.format(round(item['Similarity factor'], 1),
                                                     item['Canonical job title']))
        else:
            output_results.append('({0}) {1}, {2}'.format(round(item['Similarity factor'], 1),
                                                          item['Canonical job title'],
                                                          ', '.join(seniorities)))
    return jsonify(true_job_title=output_results)
def mergeCSV(c1, c2, outfile, field_name):
    csvfile1 = file(str(c1), 'r')
    csvfile2 = file(str(c2), 'r')
    new_csvfile = file(str(outfile), 'w+')
    csv1 = csv.reader(csvfile1, delimiter=",")
    csv2 = csv.reader(csvfile2, delimiter=",")
    output = csv.writer(new_csvfile)
    csv2_rows = [row for row in csv2]
    if HEADER_SEARCH:
        print "headers in " + str(c2) + " " + str(zip(csv2_rows[0], range(0, len(csv2_rows[0]))))
        csv2_extract = int(raw_input("text above formatted ('data','index') select a header index to merge: "))
    first_pass = True
    for csv1_row in csv1:
        match = False
        for csv2_row in csv2_rows:
            output_row = csv1_row
            if first_pass:
                output_row = output_row + [str(field_name) + item
                                           for item in csv2_row[csv2_extract:csv2_extract + MULTIMERGE]]
                match = True
                break
            elif str(csv1_row[0]) == str(csv2_row[0]):
                match = True
                for data in csv2_row[csv2_extract:csv2_extract + MULTIMERGE]:
                    output_row.append(data.split('.')[0].replace(',', ''))
        if (not match) and FUZZY_MATCHING:
            output_rows = []
            output_names = []
            fuzzy_match = False
            for csv2_row in csv2_rows:
                output_row = []
                output_row.extend(csv1_row)
                if (fuzz.partial_ratio(str(csv1_row[0]), str(csv2_row[0])) > 95) or \
                   (fuzz.partial_ratio(str(csv2_row[0]), str(csv1_row[0])) > 95):
                    for data in csv2_row[csv2_extract:csv2_extract + MULTIMERGE]:
                        output_row.append(data.split('.')[0].replace(',', ''))
                    output_rows.append(output_row)
                    output_names.append(csv2_row[0])
                    fuzzy_match = True
            if fuzzy_match:
                print "Found fuzzy matches for {" + str(csv1_row[0]) + "}:"
                for i, name in enumerate(output_names):
                    print "  (" + str(i) + ") " + name
                extraction_val = str(raw_input("Please select an index or enter 'n' for none: "))
                if extraction_val == "n":
                    output_row = csv1_row
                else:
                    match = True
                    output_row = output_rows[int(extraction_val)]
        if (not match) and (not first_pass):
            print "failed to find " + csv1_row[0] + " in second csv file"
        output.writerow(output_row)
        first_pass = False
def match(dh, shas, index, dibur, ratio=100):
    found_dict = {}
    list_of_found_links = []
    found_dict['index'] = index
    found_dict['dh'] = dh
    found_dict['dibbur'] = dibur
    found = 0
    global too_match
    global matched
    global non_match
    global count
    for line_n, line in enumerate(shas):
        if dh in line and line in dh:
            found += 1
            the_line = line_n
            list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(dh, line) >= ratio and fuzz.partial_ratio(line, dh) >= ratio:
            found += 1
            the_line = line_n
            list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(dh, line) >= ratio:
            found += 1
            the_line = line_n
            list_of_found_links.append(line_n)
        elif fuzz.partial_ratio(line, dh) >= ratio:
            found += 1
            the_line = line_n
            list_of_found_links.append(line_n)
    if found > 1:
        found_dict['lines'] = list_of_found_links
        too_match += 1
        list_of_many_finds[index] = dh
        found_dict["more_than_one"] = "TRUE"
        found_list.append(found_dict)
    if found == 1:
        found_dict['lines'] = list_of_found_links
        found_dict["more_than_one"] = "FALSE"
        dafamud = convert_inf_to_daf(index)
        link = "Rashi on %s" % masechet + " " + dafamud + ":" + str(the_line + 1) + " " + dh
        add_rashi(index, dibur, the_line)
        links_list.append(link)
        print "found!!", dafamud, ":", the_line, " ", dh
        matched += 1
        found_list.append(found_dict)
    if found == 0:
        if ratio > min_ratio:
            match(dh, shas, index, dibur, ratio - step)
        else:
            list_of_found_links.append(-1)
            found_dict['lines'] = list_of_found_links
            found_dict["more_than_one"] = "FALSE"
            print len(dh)
            non_match += 1
            found_list.append(found_dict)
def strict_compare_strings(string_one, string_two):
    # Return the highest score across the four fuzzywuzzy scorers.
    return max(
        fuzz.ratio(string_one, string_two),
        fuzz.partial_ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
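# A quick usage sketch for strict_compare_strings() above; sample strings are
# hypothetical. Because the function takes the maximum of the four scorers,
# reordered-but-identical token sets score 100 via token_set_ratio.
print(strict_compare_strings("new york mets vs atlanta braves",
                             "atlanta braves vs new york mets"))  # 100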
def similarity(n1, n2):
    """
    Returns the mean of the partial_ratio score for each field in the two
    entities. Note that if they don't have fields that match, the score
    will be zero.
    """
    scores = [
        fuzz.partial_ratio(n1, n2),
        fuzz.partial_ratio(G.node[n1]['type'], G.node[n2]['type'])
    ]
    return float(sum(s for s in scores)) / float(len(scores))
def compare_two_texts(self, string_a, string_b, normalize_value=True):
    """
    Compare two strings and return the value of the Partial Ratio
    algorithm; the value is normalized between 0 and 1.
    """
    if ((isinstance(string_a, unicode) and isinstance(string_b, unicode)) or
            (isinstance(string_a, str) and isinstance(string_b, str))):
        if normalize_value:
            return self.__normalized_value(fuzz.partial_ratio(string_a, string_b))
        else:
            return fuzz.partial_ratio(string_a, string_b)
    else:
        raise TypeError
def resolve_prp(self, s: list, people: list):
    # PRP/WP are the Penn Treebank tags for personal and wh-pronouns.
    print('\t\tTrying to resolve pronouns')
    if people:
        print('\t\tPeople found to resolve')
        for n in s:
            if re.match(r'(PRP.*|WP.*)', n[1]):
                person = people[0]
                if n[0].lower() in ['he', 'she', 'his', 'her']:
                    if not self.determiner.guessed_gender:
                        print('\t\tAssigning Gender')
                        if n[0].lower() in ['he', 'his']:
                            self.determiner.guessed_gender = ['he', 'his']
                        else:
                            self.determiner.guessed_gender = ['she', 'her']
                        print(self.determiner.guessed_gender)
                    if n[0].lower() in self.determiner.guessed_gender:
                        print('\t\tPronoun matched guessed gender')
                        if fuzz.partial_ratio(self.poi, person) > 90:
                            print('\t\tMatched pronoun to poi, updating string')
                            txt = ' '.join([w[0] for w in s[:s.index(n) + 1]]).strip()
                            txt += ' (' + person + ')'
                            txt += ' ' + ' '.join([w[0] for w in s[s.index(n) + 1:]]).strip()
                            return txt
                elif n[0].lower() in ['who', 'whom']:
                    if fuzz.partial_ratio(self.poi, person) > 90:
                        print('\t\tMatched pronoun to poi, updating string')
                        txt = ' '.join([w[0] for w in s[:s.index(n) + 1]]).strip()
                        txt += ' (' + person + ')'
                        txt += ' ' + ' '.join([w[0] for w in s[s.index(n) + 1:]]).strip()
                        return txt
    else:
        print('\t\tNo people found to resolve, using default')
        for n in s:
            if re.match(r'(PRP.*|WP.*)', n[1]):
                person = self.determiner.default
                if n[0].lower() in ['he', 'she', 'his', 'her']:
                    if n[0].lower() in self.determiner.guessed_gender:
                        print('\t\tMatched pronoun to poi, updating string')
                        txt = ' '.join([w[0] for w in s[:s.index(n) + 1]]).strip()
                        txt += ' (' + person + ')'
                        txt += ' ' + ' '.join([w[0] for w in s[s.index(n) + 1:]]).strip()
                        return txt
                elif n[0].lower() in ['who', 'whom']:
                    print('\t\tMatched pronoun to poi, updating string')
                    txt = ' '.join([w[0] for w in s[:s.index(n) + 1]]).strip()
                    txt += ' (' + person + ')'
                    txt += ' ' + ' '.join([w[0] for w in s[s.index(n) + 1:]]).strip()
                    return txt
    return None
def search(word):
    maxCost = int(len(word) * .6)
    # build first row
    currentRow = range(len(word) + 1)
    results = {}
    # recursively search each branch of the trie
    for letter in trie.children:
        searchRecursive(trie.children[letter], letter, word,
                        currentRow, results, maxCost)
    if not results.keys():
        return None
    results = results[min(results.keys())]
    if len(results) > 1:
        best = 0
        for result in results:
            ratio = fuzz.partial_ratio(word, result)
            if ratio > best:
                best_result = result
                best = ratio
    else:
        best_result = results[0]
    return best_result
def addUtter(self, utter, translations):
    output = {'utter_index': utter['utter_index']}
    top_hyp = ''
    if len(translations['translated']) > 0:
        top_hyp = translations['translated'][0]['hyp']
    topic = utter['segment_info']['topic']
    if utter['segment_info']['target_bio'] == 'B':
        self.frame = {}
    if topic in self.tagsets:
        for slot in self.tagsets[topic]:
            for value in self.tagsets[topic][slot]:
                ratio = fuzz.partial_ratio(value, top_hyp)
                if ratio > 80:
                    if slot not in self.frame:
                        self.frame[slot] = []
                    if value not in self.frame[slot]:
                        self.frame[slot].append(value)
        if topic == 'ATTRACTION' and 'PLACE' in self.frame and \
                'NEIGHBOURHOOD' in self.frame and \
                self.frame['PLACE'] == self.frame['NEIGHBOURHOOD']:
            del self.frame['PLACE']
        output['frame_label'] = self.frame
    return output
def fw_partial_ratio(question1, question2):
    fuzzy = []
    for q1, q2 in zip(question1, question2):
        partial_ratio = fuzz.partial_ratio(str(q1), str(q2)) / 100
        fuzzy.append([partial_ratio])
    print("Created fuzz partial_ratio feature")
    return np.array(fuzzy)
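# A toy call of fw_partial_ratio() above; the question pairs are made up, and
# numpy/fuzzywuzzy are assumed imported as in the snippet.
q1 = ["How do I learn Python?", "What is machine learning?"]
q2 = ["How can I learn Python?", "Who invented the telephone?"]
features = fw_partial_ratio(q1, q2)
print(features.shape)  # (2, 1): one scaled score per question pair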
def addUtter(self, utter, translations):
    output = {'utter_index': utter['utter_index']}
    transcript = utter['transcript']
    topic = utter['segment_info']['topic']
    if utter['segment_info']['target_bio'] == 'B':
        self.frame = {}
    if topic in self.translated_tagsets:
        for slot in self.translated_tagsets[topic]:
            for value_obj in self.translated_tagsets[topic][slot]:
                entry_en = value_obj['entry_en']
                if len(value_obj['translated_cn']) > 0:
                    top_hyp = value_obj['translated_cn'][0]
                    ratio = fuzz.partial_ratio(top_hyp, transcript)
                    if ratio > 80:
                        if slot not in self.frame:
                            self.frame[slot] = []
                        if entry_en not in self.frame[slot]:
                            self.frame[slot].append(entry_en)
        if topic == 'ATTRACTION' and 'PLACE' in self.frame and \
                'NEIGHBOURHOOD' in self.frame and \
                self.frame['PLACE'] == self.frame['NEIGHBOURHOOD']:
            del self.frame['PLACE']
        output['frame_label'] = self.frame
    return output
def fuzzy(products_name_set, listings):
    """
    Uses Levenshtein distance to determine matching pairs of products and listings.

    :param products_name_set: Indexed product names (for faster matching)
    :param listings: Listings to be matched
    :return: A dictionary containing each matched product with all its listings
    """
    final_products = defaultdict(list)
    for listing in listings:
        possible_products = set()
        for product_name in products_name_set:
            token_set_ratio = fuzz.token_set_ratio(listing["new_title"], product_name)
            # Compare with ==, not 'is': identity checks on ints are unreliable.
            if token_set_ratio == 100:
                possible_products.add(product_name)
        # More than one possible product found
        if len(possible_products) > 1:
            for possible_product in possible_products:
                partial_ratio = fuzz.partial_ratio(listing["new_title"], possible_product)
                if partial_ratio == 100:
                    final_products[possible_product].append(listing)
        else:
            for possible_product in possible_products:
                final_products[possible_product].append(listing)
    return final_products
def getRatio(var1, var2, alg):
    r1test = 40
    r2test = 100
    r3test = 100
    r4test = 90  # 85 is probably too low --- too many FP
    # let's keep alg as a dummy, but it may be unimportant
    # it seems that the quality of results can be improved if two (or)
    # -- more results are correlated: [1] can be lowered as long as [4] remains high
    r1 = fuzz.ratio(var1, var2)
    r2 = fuzz.partial_ratio(var1, var2)
    r3 = fuzz.token_sort_ratio(var1, var2)
    r4 = fuzz.token_set_ratio(var1, var2)
    if r1 >= r1test and r4 >= r4test:
        ratio = 100
        # reportRatio(var1, var2)
    else:
        ratio = 0
    return ratio
def scorePage(page, title, year):
    defaultBonus = 10
    bonuses = []
    bonusToScore = {'filmInTitle': defaultBonus, 'filmInSummary': 20,
                    'yearInTitle': defaultBonus, 'yearInSummary': 20}
    # TODO: consider how (film) or year in the page.title could negatively
    # affect fuzz.ratio. Consider replacing '(film)' with ''.
    pageScore = max(
        fuzz.partial_ratio(title, page.summary),
        fuzz.ratio(title, page.title))
    if 'film' in page.title:
        bonuses.append('filmInTitle')
        pageScore += bonusToScore['filmInTitle']
    if 'film' in page.summary:
        bonuses.append('filmInSummary')
        pageScore += bonusToScore['filmInSummary']
    if str(year) in page.title:
        bonuses.append('yearInTitle')
        pageScore += bonusToScore['yearInTitle']
    if str(year) in page.summary:
        bonuses.append('yearInSummary')
        pageScore += bonusToScore['yearInSummary']
    pageConfidence = pageScore / (100 + sum(bonusToScore.values()))
    safePrint('\t\tPage Title-> ' + page.title)
    print('\t\tPage Bonus-> ' + ','.join(bonuses))
    safePrint('\t\tConfidence-> ' + str(pageConfidence))
    return pageConfidence
def cycle(source, header, new_folder):
    folder_path = os.path.expanduser('~') + "/Desktop/" + source + "/"
    file_names = seek(folder_path)
    save_pathway = direct(new_folder)
    problem_pathway = direct("Problem/")
    for x in range(0, len(file_names)):
        print('Cycle start')
        # Catches errors with the try/except structure.
        try:
            filename = os.path.splitext(file_names[x])[0]
            case = None
            issues = ['Summa', 'VCU']
            for key in issues:
                if fuzz.partial_ratio(filename, key) > 85:
                    case = "odd_header"
            raw_data, date_mode = read_txt_file("%s%s" % (folder_path, file_names[x]))
            col = order(general_parse(simplify(header, colify(raw_data, case)),
                                      header, date_mode), header)
            row = rowify(col)
            convert(row, filename, save_pathway)
        except Exception:
            # Move file to the problem folder.
            shutil.copy("%s%s" % (folder_path, file_names[x]),
                        "%s%s" % (problem_pathway, file_names[x]))
    for file in file_names:
        os.remove("%s%s" % (folder_path, file))
def _employees(self, company_name="", keyword=""): ''' Linkedin Scrape ''' # TODO - add linkedin directory search ''' Linkedin Scrape''' args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"' args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"' qry = '"at {0}" {1} {2} site:linkedin.com' qry = qry.format(company_name, args, keyword) results = Google().search(qry, 10) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) if " " in company_name: results['company_score'] = [fuzz.partial_ratio(_name, company) for company in results.company] else: results['company_score'] = [fuzz.ratio(_name, company) for company in results.company] if keyword != "": results['score'] = [fuzz.ratio(keyword, title) for title in results.title] results = results[results.score > 75] results = results[results.company_score > 64] results = results.drop_duplicates() data = {'data': results.to_dict('r'), 'company_name':company_name} CompanyExtraInfoCrawl()._persist(data, "employees", "") job = rq.get_current_job() print job.meta.keys() if "queue_name" in job.meta.keys(): if RQueue()._has_completed(job.meta["queue_name"]): q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"]) return results
def getId(self, title):
    apiArgs = {'api_key': self.api_key, 'query': title}
    query = API_URL + self.api_search + "?" + urlencode(apiArgs)
    apiRequest = Request(query, headers=HEADERS)
    result = urlopen(apiRequest).read()
    data = json.loads(result)
    movieId = None
    found = {}
    alt = {}
    for i in data['results']:
        if i is None:
            continue
        if fuzz.token_sort_ratio(title, i[self.title]) == 100:
            movieId = str(i['id'])
            found[movieId] = {'title': i[self.title], 'date': i[self.date]}
        elif fuzz.token_sort_ratio(title, i[self.title]) > 85 and \
                fuzz.partial_ratio(title, i[self.title]) > 90:
            altId = str(i['id'])
            alt[altId] = {'title': i[self.title], 'date': i[self.date]}
    if len(found) == 1:
        return movieId
    elif len(found) > 1:
        print "DUPLICATES FOUND, ENTER THE ID OR -1 TO SKIP"
        movieId = self.movieSelect(found)
    elif len(alt) > 0:
        print "ALTERNATES FOUND, ENTER THE ID OR -1 TO SKIP"
        movieId = self.movieSelect(alt)
    return movieId
def parselocation(htmldoc):
    location_path = base_xpath + '''/div[@class='html5-section body']/div[@class='polizeimeldung'][2]'''
    elem_location = htmldoc.xpath(location_path)
    if elem_location[0].text is not None:
        location = elem_location[0].text
    else:
        return None, ''
    matches = []
    boundaries = []
    district_descriptions = ''
    from fuzzywuzzy import fuzz
    for district in districts:
        ratio = fuzz.partial_ratio(district.name, location)
        if ratio >= 90:
            matches.append(district)
            district.geometry.transform(4326)
            boundaries.append(district.geometry)
            district_descriptions += district.name
    if len(boundaries) == 0:
        return None, ''
    boundary = boundaries[0]
    for district_boundary in boundaries[1:]:
        # union() returns a new geometry, so reassign the result.
        boundary = boundary.union(district_boundary)
    return boundary, district_descriptions
def rankNotes(self):
    self.note_counter = Counter()
    for note in self.patient_data['Notes']:
        for i in range(len(self.top)):
            self.note_counter[note] += fuzz.partial_ratio(
                self.top[i][0], note) * (len(self.top) - i)
def get_kind(kind):
    # Artifact slot names, kept in the source language because the incoming
    # text is matched against them directly.
    kind_list = ["生之花", "死之羽", "时之沙", "空之杯", "理之冠"]
    for set_kind in kind_list:
        if partial_ratio(set_kind, kind) > 80:
            return set_kind
def testEmptyStringsScore0(self):
    self.assertEqual(fuzz.ratio("", ""), 0)
    self.assertEqual(fuzz.partial_ratio("", ""), 0)
def testPartialRatio(self):
    self.assertEqual(fuzz.partial_ratio(self.s1, self.s3), 100)
def predict_chip_dict(wdir, input_pattern_str, bamExt, fromBam=None):
    """
    Predict a chip_dict from a set of bam files.

    ChIP input/control samples are identified from input_pattern
    (default: 'input'); for each sample, the best input sample
    (by fuzzywuzzy score) is then selected.

    chip_dict is written as yaml to the workflow working directory.
    Also predicts whether a sample is broad or narrow based on
    histone mark patterns.
    """
    pat = "|".join(re.split(',| |\\||;', input_pattern_str))
    input_pat = r".*(" + pat + ")"
    clean_pat = r"" + pat + ""
    pat1 = re.compile(clean_pat, re.IGNORECASE)

    if fromBam:
        infiles = sorted(glob.glob(os.path.join(fromBam, '*' + bamExt)))
    else:
        infiles = sorted(glob.glob(os.path.join(wdir, 'filtered_bam/', '*.bam')))
    samples = get_sample_names_bam(infiles, bamExt)

    chip_dict_pred = {}
    chip_dict_pred["chip_dict"] = {}
    print("---------------------------------------------------------------------------------------")
    print("Predict Chip-seq sample configuration")
    print("---------------------------------------------------------------------------------------")
    print("\nSearch for Input/control samples...")

    input_samples = set([])
    for i in samples:
        if re.match(input_pat, i, re.IGNORECASE):
            print("...found: ", i)
            input_samples.add(i)

    print("\nTry to find corresponding ChIP samples...")
    for i in samples:
        if i in input_samples:
            continue
        print("\n  sample: ", i)

        matches_sim = {}
        for j in input_samples:
            c_clean = pat1.sub("", j)
            sim1 = fuzz.ratio(c_clean, i) + fuzz.partial_ratio(c_clean, i) + \
                fuzz.token_sort_ratio(c_clean, i) + fuzz.token_set_ratio(c_clean, i)
            matches_sim[j] = sim1 / 4

        sim = 0
        final_matches = set([])
        for key, value in sorted(matches_sim.items(),
                                 key=lambda k: (k[1], k[0]), reverse=True):
            if value >= sim:
                final_matches.add(key)
                print("   top matching input sample by score: %s = %s" % (key, value))
                sim = value

        tmp = ':'.join(list(final_matches))
        if len(final_matches) > 1:
            tmp = "__PLEASE_SELECT_ONLY_ONE_CONTROL__:" + tmp
        elif len(final_matches) == 0:
            print("No control sample found!")

        chip_dict_pred["chip_dict"][i] = {}
        chip_dict_pred["chip_dict"][i]['control'] = tmp
        if re.match(".*(H3K4me1|H3K36me3|H3K9me3|H3K27me3).*", i, re.IGNORECASE):
            chip_dict_pred["chip_dict"][i]['broad'] = True
        else:
            chip_dict_pred["chip_dict"][i]['broad'] = False

    outfile = os.path.join(wdir, "chip_seq_sample_config.PREDICTED.yaml")
    write_configfile(outfile, chip_dict_pred)
    print("---------------------------------------------------------------------------------------")
    print("Chip-seq sample configuration is written to file ", outfile)
    print("Please check and modify this file - this is just a guess! Then run the workflow with it.")
    print("---------------------------------------------------------------------------------------")
def get_data(url, interest):
    url = str(url)
    interest = str(interest)
    print interest
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    details = []
    contact_details = []
    names = []
    research = []
    emails = []
    links = []
    for id in soup.find_all("a", href=True):
        if "./" in id['href']:
            if "Dr." in id.text:
                links.append("http://ee.iitd.ernet.in/people" + id['href'].replace("./", "/"))
                names.append(id.text.replace("\n", ""))
    for name in soup.find_all("td", {'width': '70%'}):
        details.append(name.text)
        tmp = name.text
        a = tmp.find("Area")
        p = tmp.find("Phone")
        e = tmp.find("Email")
        research.append(tmp[a + 5:])
        contact_details.append(tmp[p:e].replace("\n", ""))
        emails.append(tmp[e:a - 9].replace("\n", ""))
    research.pop(0)
    emails.pop(0)
    contact_details.pop(0)
    # 2-d array which holds all research areas in a list at each index
    fields = []
    pos = 0
    i = 0
    for data in research:
        pos = 0
        fields.append([])
        j = 0
        x = 0
        for a in data:
            if a == '(':
                x += 1
            if a == ')':
                x -= 1
            if a == ',':
                if x == 0:
                    fields[i].append(data[pos:j])
                    if data[j + 1] == " ":
                        pos = j + 2
                    else:
                        pos = j + 1
            if j == len(data) - 1:
                fields[i].append(data[pos:j])
            j += 1
        i += 1
    # renamed from 'list' to avoid shadowing the builtin
    people = []
    for i in xrange(0, len(names)):
        people.append([])
        people[i].append(names[i].strip())
        people[i].append(emails[i].strip())
        people[i].append(links[i].strip())
        people[i].append(contact_details[i].strip())
        people[i].append(fields[i])
    i = 0
    k = 0
    final_list = []
    for x in people:
        count = 0
        for y in x[4]:
            rat = fuzz.partial_ratio(interest.lower(), y.lower())
            if count == 1:
                break
            if rat >= 80:
                final_list.append({})
                final_list[k]['name'] = names[i]
                final_list[k]['email'] = emails[i]
                final_list[k]['link'] = links[i]
                final_list[k]['contact_detail'] = contact_details[i]
                final_list[k]['field'] = fields[i]
                count += 1
                k = k + 1
        i += 1
    pprint(final_list)
    return final_list
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Simple Ratio
r = fuzz.ratio("this is a test", "this is a test!")
print(r)

# Partial Ratio
r = fuzz.partial_ratio("this is a test", "this is a test!")
print(r)

# Token Sort Ratio
r1 = fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
r2 = fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
print(r1, r2)

# Token Set Ratio
r1 = fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
r2 = fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
print(r1, r2)
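# The snippet above imports `process` but never calls it; this sketch shows the
# two process helpers from the same library (the choices follow the fuzzywuzzy
# README).
from fuzzywuzzy import process

choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
print(process.extract("new york jets", choices, limit=2))  # ranked (match, score) pairs
print(process.extractOne("cowboys", choices))              # single best (match, score) pair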
def WRatio(s1, s2, force_ascii=True):
    """
    Return a measure of the sequences' similarity between 0 and 100,
    using different algorithms.

    **Steps in the order they occur**

    #. Run full_process from utils on both strings
    #. Short circuit if this makes either string empty
    #. Take the ratio of the two processed strings (fuzz.ratio)
    #. Run checks to compare the length of the strings
        * If one of the strings is more than 1.5 times as long as the other
          use partial_ratio comparisons - scale partial results by 0.9
          (this makes sure only full results can return 100)
        * If one of the strings is over 8 times as long as the other
          instead scale by 0.6
    #. Run the other ratio functions
        * if using partial ratio functions call partial_ratio,
          partial_token_sort_ratio and partial_token_set_ratio
          scale all of these by the ratio based on length
        * otherwise call token_sort_ratio and token_set_ratio
        * all token based comparisons are scaled by 0.95
          (on top of any partial scalars)
    #. Take the highest value from these results
       round it and return it as an integer.

    :param s1:
    :param s2:
    :param force_ascii: Allow only ascii characters
    :type force_ascii: bool
    :return:
    """
    p1 = utils.full_process(s1, force_ascii=force_ascii)
    p2 = utils.full_process(s2, force_ascii=force_ascii)

    if not utils.validate_string(p1):
        return 0
    if not utils.validate_string(p2):
        return 0

    # should we look at partials?
    try_partial = True
    unbase_scale = .60
    partial_scale = .90

    base = fuzz.ratio(p1, p2)
    len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2))

    # if strings are similar length, don't use partials
    if abs(len(p2) - len(p1)) <= 1:
        try_partial = True
        partial_scale = 0.95
        unbase_scale = 0.65
    if abs(len(p2) - len(p1)) <= 2 and max(len(p2), len(p1)) > 6:
        try_partial = False
    if abs(len(p2) - len(p1)) >= 3 and max(len(p2), len(p1)) > 6:
        try_partial = True
        partial_scale = 0.85
    if len_ratio > 2:
        try_partial = True
        partial_scale = 0.65
    # if one string is much much shorter than the other
    if len_ratio > 8:
        partial_scale = .60

    if try_partial:
        partial = fuzz.partial_ratio(p1, p2) * partial_scale
        ptsor = fuzz.partial_token_sort_ratio(p1, p2, full_process=False) \
            * unbase_scale * partial_scale
        ptser = fuzz.partial_token_set_ratio(p1, p2, full_process=False) \
            * unbase_scale * partial_scale
        return utils.intr(max(base, partial, ptsor, ptser))
    else:
        tsor = fuzz.token_sort_ratio(p1, p2, full_process=False) * unbase_scale
        tser = fuzz.token_set_ratio(p1, p2, full_process=False) * unbase_scale
        return utils.intr(max(base, tsor, tser))
def testPartialRatioUnicodeString(self):
    s1 = "\u00C1"
    s2 = "ABCD"
    score = fuzz.partial_ratio(s1, s2)
    self.assertEqual(0, score)
def fuzzycheck(string1, string2):
    return fuzz.partial_ratio(string1, string2) > 90
def similar(a, b):
    return fuzz.partial_ratio(a, b)
import requests
from BeautifulSoup import BeautifulSoup
import re
import webbrowser
import os
from fuzzywuzzy import fuzz  # needed for the partial_ratio call below

url = 'https://en.wikipedia.org/w/api.php?action=query&list=random&rnnamespace=0&rnlimit=10&format=xml'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)

ids = []
for item in soup.findAll(id=re.compile("[0-9]")):
    ids.append(item['id'])

titles = []
for name in soup.findAll(title=re.compile(".*")):
    titles.append(name['title'])

print('')
for i in range(len(titles)):
    ans = raw_input("Do you want to read about " + titles[i] + "?")
    if fuzz.partial_ratio(ans, "Yes") > 50:
        soup = BeautifulSoup(
            requests.get('https://en.wikipedia.org/wiki?curid=' + ids[i]).content)
        result = soup.find("div", {"id": "mw-content-text"}).find('p').text
        print(result)

os.system(r'P:\Python\Projects\wiki_v2.py')
        if oldAttrname[0] not in outputData2[key].keys():
            outputData2[key][oldAttrname[0]] = 0
        outputData2[key][oldAttrname[0]] += 1
    else:
        if key not in outputData2[key].keys():
            outputData2[key][key] = 0
        outputData2[key][key] += 1

for key, value in outputData2.items():
    keyList = list(value.keys())
    for k2 in keyList:
        if outputData2[key][k2] < 3:
            score2 = fuzz.partial_ratio(key, k2)
            if score2 < 65:
                if key not in keySimInv[k2].keys():
                    print(f"Removing {k2} from {key}")
                    del outputData2[key][k2]
                else:
                    print(key, k2, score2)

CommonUtilities.writeDictToJson(outputData2, f"{PHASE_3_SOURCE_DIR}/big_clusterkey_2.json")

### Step 3: compute the score | compare each key's list of value keys against all the others
### Score computed as the inclusion percentage of the smaller set
keylist = list(outputData2.keys())
for k1 in keylist:
data['len_char_q1'] = data.question1.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_char_q2'] = data.question2.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
data['len_word_q1'] = data.question1.apply(lambda x: len(str(x).split()))
data['len_word_q2'] = data.question2.apply(lambda x: len(str(x).split()))
data['common_words'] = data.apply(lambda x: len(
    set(str(x['question1']).lower().split()).intersection(
        set(str(x['question2']).lower().split()))), axis=1)
data['fuzz_qratio'] = data.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']),
                                           str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']),
                                            str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
data['fuzz_token_sort_ratio'] = data.apply(
    lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
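# A self-contained sketch of running one of the engineered columns above on a
# toy DataFrame; the rows are made up, and the column names mirror the snippet.
import pandas as pd
from fuzzywuzzy import fuzz

data = pd.DataFrame({
    'question1': ["How do I learn Python?", "What is AI?"],
    'question2': ["How can I learn Python fast?", "What is artificial intelligence?"],
})
data['fuzz_partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])),
    axis=1)
print(data[['fuzz_partial_ratio']])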
def create_new_features(self):
    # normal custom features
    self.data["question1"] = self.data["question1"].fillna("").apply(
        self.question_preprocess)
    self.data["question2"] = self.data["question2"].fillna("").apply(
        self.question_preprocess)
    self.data['freq_qid1'] = self.data.groupby(
        ['qid1'])['qid1'].transform('count')
    self.data['freq_qid2'] = self.data.groupby('qid2')['qid2'].transform(
        'count')
    self.data['q1len'] = self.data['question1'].str.len()
    self.data['q2len'] = self.data['question2'].str.len()
    self.data['q1_n_words'] = self.data['question1'].apply(
        lambda row: len(row.split(" ")))
    self.data['q2_n_words'] = self.data['question2'].apply(
        lambda row: len(row.split(" ")))
    self.data['word_Common'] = self.data.apply(self.normalized_word_Common,
                                               axis=1)
    self.data['word_share'] = self.data.apply(self.normalized_word_share,
                                              axis=1)
    self.data['word_Total'] = self.data['q1_n_words'] + self.data['q2_n_words']
    self.data['freq_q1+q2'] = self.data['freq_qid1'] + self.data['freq_qid2']
    self.data['freq_q1-q2'] = abs(self.data['freq_qid1'] -
                                  self.data['freq_qid2'])

    # advanced features
    self.token_features = self.data.apply(
        lambda x: self.get_token_features(x["question1"], x["question2"]),
        axis=1)
    self.data["cwc_min"] = list(map(lambda x: x[0], self.token_features))
    self.data["cwc_max"] = list(map(lambda x: x[1], self.token_features))
    self.data["csc_min"] = list(map(lambda x: x[2], self.token_features))
    self.data["csc_max"] = list(map(lambda x: x[3], self.token_features))
    self.data["ctc_min"] = list(map(lambda x: x[4], self.token_features))
    self.data["ctc_max"] = list(map(lambda x: x[5], self.token_features))
    self.data["last_word_eq"] = list(map(lambda x: x[6], self.token_features))
    self.data["first_word_eq"] = list(map(lambda x: x[7], self.token_features))
    self.data["abs_len_diff"] = list(map(lambda x: x[8], self.token_features))
    self.data["mean_len"] = list(map(lambda x: x[9], self.token_features))
    self.data["token_set_ratio"] = self.data.apply(
        lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    self.data["token_sort_ratio"] = self.data.apply(
        lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]),
        axis=1)
    self.data["fuzz_ratio"] = self.data.apply(
        lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    self.data["fuzz_partial_ratio"] = self.data.apply(
        lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    self.data["longest_substr_ratio"] = self.data.apply(
        lambda x: self.get_longest_substr_ratio(x["question1"],
                                                x["question2"]), axis=1)
    return self.data
            splitted_attribute_where = i.split(" ")
            for e in splitted_attribute_where:
                if e in filtered_sentence_where:
                    filtered_sentence_where.remove(e)
        if Where_attribute_exist == True:
            continue
        words_matching_dic_where = {}
        att_Dict_where = {}
        for n in DF_att:  # iterating on table names
            # print(n)
            # print(DF_att[n])
            for word1, word2 in product(filtered_sentence_where, DF_att[n]):
                Ratio = fuzz.ratio(word1.lower(), word2.lower())
                Partial_Ratio = fuzz.partial_ratio(word1.lower(), word2.lower())
                Token_Sort_Ratio = fuzz.token_sort_ratio(word1, word2)
                Token_Set_Ratio = fuzz.token_set_ratio(word1, word2)
                if Ratio >= 65:
                    print(word1 + " matches with attribute " + word2 +
                          " with ratio " + str(Ratio) + " in the where clause ")
                    list_attributes_matched.append(word2)
                    ID_list = [" ID ", "_ID"]
                    ID_conflict = False
                    for d in ID_list:
                        if d in word2 and d not in word1:
                            ID_conflict = True
                        elif d in word1 and d in word2:
                            mapping_Dic["Where"].append(word2)
                            # Df_name.append(n)
def imageanalise(id_gen, source_pdfs, folders):
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    # import glob
    import pyocr
    import pyocr.builders
    import re
    from PIL import Image
    from os import system

    # Creating a report file: id_source, spath, pages, id_gen, destination, types, docid, outcome
    GenerateReport('ID Source', 'Source Path', 'Pages', 'ID Destination',
                   'Destination', 'Type', 'Doc ID', 'Outcome', '', True)
    id_gen = int(id_gen)

    # Verify subfolders in the main folder
    for i, folder in folders:
        vals = pathmapping(folder, 'croped/croped_val1_*.jpg', False, True)
        pdf_pages_number = []
        # Check for validation images inside the folder
        for numpage, val in reversed(list(vals)):
            print(val)
            green_grade = 0
            im = Image.open(val)
            jpg = val.replace('croped/croped_val1_', '')
            # Saving PDF pages
            pdf_pages_number.append(numpage)
            # Check for green grade in image
            for pixel in im.getdata():
                if pixel[1] > (pixel[2] + 10) and pixel[1] > (pixel[0] + 10):
                    green_grade += 1
            # Check text inside main area of analysis
            if green_grade >= 200:
                # Build txt image in order to be analysed
                cropimage(folder, jpg, 100, 120, 700, 270, 'croped_txt1_')
                jpg_text = val.replace('val1', 'txt1')
                # Convert image into text mode
                tools = pyocr.get_available_tools()[0]
                text_txt1 = tools.image_to_string(Image.open(jpg_text),
                                                  builder=pyocr.builders.DigitBuilder())
                if fuzz.partial_ratio('INSTALACAO', text_txt1) > 70:
                    id_gen = GenerateDoc(id_gen, source_pdfs, 'LI', folder, jpg,
                                         280, 400, 715, 475, pdf_pages_number, None)
                    print('\n ============ DOCUMENT FOUND (LI) =========== \n')
                elif fuzz.partial_ratio('OPERACAO', text_txt1) > 70:
                    id_gen = GenerateDoc(id_gen, source_pdfs, 'LO', folder, jpg,
                                         280, 400, 715, 475, pdf_pages_number, None)
                    print('\n ============ DOCUMENT FOUND (LO) =========== \n')
                elif fuzz.partial_ratio('PREVIA', text_txt1) > 70:
                    id_gen = GenerateDoc(id_gen, source_pdfs, 'LP', folder, jpg,
                                         280, 400, 715, 475, pdf_pages_number, None)
                    print('\n ============ DOCUMENT FOUND (LP) =========== \n')
                elif fuzz.partial_ratio('LOCALIZACAO', text_txt1) > 70:
                    id_gen = GenerateDoc(id_gen, source_pdfs, 'LL', folder, jpg,
                                         100, 410, 715, 475, pdf_pages_number, None)
                    print('\n ============ DOCUMENT FOUND (LL) =========== \n')
                else:
                    id_gen = GenerateDoc(id_gen, source_pdfs, 'ERROR', folder, jpg,
                                         350, 410, 715, 475, pdf_pages_number, None)
                    print('\n ============ DOCUMENT FOUND (NL) =========== \n')
            else:
                jpg_text2 = val.replace('val1', 'txt2')
                tools = pyocr.get_available_tools()[0]
                text_txt2 = tools.image_to_string(Image.open(jpg_text2),
                                                  builder=pyocr.builders.DigitBuilder())
                if fuzz.partial_ratio('Sistema de Tratamento de Efluentes', text_txt2) > 70:
                    id_gen = GenerateDoc(id_gen, source_pdfs, 'STE', folder, jpg,
                                         380, 60, 620, 130, pdf_pages_number, None)
                    print('\n ============ DOCUMENT FOUND (STE) =========== \n')
                else:
                    jpg_text3 = val.replace('val1', 'txt3')
                    tools = pyocr.get_available_tools()[0]
                    text_txt3 = tools.image_to_string(Image.open(jpg_text3),
                                                      builder=pyocr.builders.DigitBuilder())
                    startnum = val.rfind('_')
                    endnum = val.rfind('.')
                    doc_num = re.findall(r'r\d+/\d+|$', text_txt3)
                    doc_num = ''.join(doc_num[0])
                    doc_num = doc_num.replace('/', '.')
                    if doc_num == '':
                        doc_num = str(id_gen)
                    if fuzz.partial_ratio('LICENGA ESPECIAL', text_txt3) > 70:
                        id_gen = GenerateDoc(id_gen, source_pdfs, 'LE', None, jpg,
                                             0, 0, 0, 0, pdf_pages_number, doc_num)
                        print('\n ============ DOCUMENT FOUND (LE) =========== \n')
                    elif int(val[startnum + 1:endnum]) == 0:
                        id_gen = GenerateDoc(id_gen, source_pdfs, 'NotRecon', None, jpg,
                                             0, 0, 0, 0, pdf_pages_number, doc_num)
                        print('\n ============ DOCUMENT NOT FOUND =========== \n')
        system('rm -r docclass/')
def email_match(authors, emails):
    # create a copy of authors (not changing the input)
    author_list = authors[:]
    # result = reordered emails list
    result = [''] * len(author_list)
    matrix = []
    for email in emails:
        ratios = []
        for author in author_list:
            try:
                email_id = email.split('@')[0]
                try:
                    lname = author.split(', ')[0].lower()
                    fname = author.split(', ')[1].lower()
                except:
                    fname = author.split(' ')[0].lower()
                    lname = author.split(' ')[1].lower()
                initial = ''.join(
                    [i[0].lower() for i in re.findall(r"[\w']+", fname)]) + ''.join(
                    [j[0].lower() for j in re.findall(r"[\w']+", lname)])
                f_lastname = fname[0] + lname
                name = fname + lname
                ratios.append([
                    fuzz.partial_ratio(lname, email_id),
                    fuzz.partial_ratio(fname, email_id),
                    fuzz.partial_ratio(initial, email_id),
                    fuzz.partial_ratio(f_lastname, email_id),
                    fuzz.ratio(name, email_id)
                ])
                # ratios.append([fuzz.ratio(lname, email_id), fuzz.ratio(fname, email_id),
                #                fuzz.ratio(initial, email_id), fuzz.ratio(f_lastname, email_id),
                #                fuzz.ratio(name, email_id)])
            except:
                ratios.append([0] * 5)
        ratios = np.array(ratios)
        ratios = np.transpose(ratios)
        matrix.extend(ratios)
    matrix = np.array(matrix)
    indices = {}
    for score in sorted(set(matrix.flat), reverse=True):
        cord = np.where(matrix == score)
        for c in list(zip(cord[0], cord[1])):
            if c[0] // 5 not in indices.keys() and c[1] not in indices.values():
                indices[c[0] // 5] = c[1]
        if len(indices) == len(emails):
            break
    for k, v in indices.items():
        result[v] = emails[k]
    return result
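# A toy call of email_match() above; the names and addresses are invented, and
# numpy/re/fuzzywuzzy are assumed imported as in the snippet.
import re
import numpy as np
from fuzzywuzzy import fuzz

authors = ["Smith, John", "Doe, Jane"]
emails = ["jdoe@example.org", "jsmith@example.org"]
# The greedy highest-score assignment should pair each author with the
# address derived from their last name.
print(email_match(authors, emails))
# expected: ['jsmith@example.org', 'jdoe@example.org']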
def get_categories(part_info: dict, supplier_only=False) -> list:
    ''' Find categories from part supplier data, use "somewhat automatic" matching '''
    categories = [None, None]

    try:
        supplier_category = str(part_info['category'])
        supplier_subcategory = str(part_info['subcategory'])
    except KeyError:
        return categories

    # Return supplier category, if match not needed
    if supplier_only:
        categories[0] = supplier_category
        categories[1] = supplier_subcategory
        return categories

    function_filter = False
    # TODO: Make 'filter_parameter' user defined?
    filter_parameter = 'Function Type'

    ### Check existing matches
    # Load inversed category map
    category_map = config_interface.load_supplier_categories_inversed(
        supplier_config_path=settings.CONFIG_DIGIKEY_CATEGORIES)

    try:
        for inventree_category in category_map.keys():
            for key, inventree_subcategory in category_map[inventree_category].items():
                if supplier_subcategory == key:
                    categories[0] = inventree_category
                    # Check if filtering by function
                    if inventree_subcategory.startswith(config_interface.FUNCTION_FILTER_KEY):
                        function_filter = True
                    # Save subcategory if not function filtered
                    if not function_filter:
                        categories[1] = inventree_subcategory
                    break
    except:
        pass

    ### Function Filter
    if not categories[1] and function_filter:
        cprint(f'[INFO]\tSubcategory is filtered using "{filter_parameter}" parameter',
               silent=settings.SILENT, end='')

        # Load parameter map
        parameter_map = config_interface.load_category_parameters(
            categories[0], settings.CONFIG_DIGIKEY_PARAMETERS)

        # Build compare list
        compare = []
        for supplier_parameter, inventree_parameter in parameter_map.items():
            if (supplier_parameter in part_info['parameters'].keys() and
                    inventree_parameter == filter_parameter):
                compare.append(part_info['parameters'][supplier_parameter])

        # Load subcategory map
        category_map = config_interface.load_supplier_categories(
            supplier_config_path=settings.CONFIG_DIGIKEY_CATEGORIES)[categories[0]]

        for inventree_subcategory in category_map.keys():
            for item in compare:
                fuzzy_match = fuzz.partial_ratio(inventree_subcategory, item)
                display_result = f'"{inventree_subcategory}" ?= "{item}"'.ljust(50)
                cprint(f'{display_result} => {fuzzy_match}', silent=settings.HIDE_DEBUG)

                if fuzzy_match >= settings.CATEGORY_MATCH_RATIO_LIMIT:
                    categories[1] = inventree_subcategory.replace(
                        config_interface.FUNCTION_FILTER_KEY, '')
                    break

            if categories[1]:
                cprint(f'\t[ PASS ]', silent=settings.SILENT)
                break

        if not categories[1] and function_filter:
            cprint(f'\t[ FAILED ]', silent=settings.SILENT)

    ### Automatic Match
    if not (categories[0] and categories[1]):
        # Load category map
        category_map = config_interface.load_supplier_categories(
            supplier_config_path=settings.CONFIG_DIGIKEY_CATEGORIES)

        def find_supplier_category_match(supplier_category: str):
            # Check for match with Inventree categories
            category_match = None
            subcategory_match = None

            for inventree_category in category_map.keys():
                fuzzy_match = fuzz.partial_ratio(supplier_category, inventree_category)
                display_result = f'"{supplier_category}" ?= "{inventree_category}"'.ljust(50)
                cprint(f'{display_result} => {fuzzy_match}', silent=settings.HIDE_DEBUG)

                if fuzzy_match < settings.CATEGORY_MATCH_RATIO_LIMIT and \
                        category_map[inventree_category]:
                    # Compare to subcategories
                    for inventree_subcategory in category_map[inventree_category]:
                        fuzzy_match = fuzz.partial_ratio(supplier_category,
                                                         inventree_subcategory)
                        display_result = f'"{supplier_category}" ?= "{inventree_subcategory}"'.ljust(50)
                        cprint(f'{display_result} => {fuzzy_match}',
                               silent=settings.HIDE_DEBUG)
                        if fuzzy_match >= settings.CATEGORY_MATCH_RATIO_LIMIT:
                            subcategory_match = inventree_subcategory
                            break

                if fuzzy_match >= settings.CATEGORY_MATCH_RATIO_LIMIT:
                    category_match = inventree_category
                    break

            return category_match, subcategory_match

        # Find category and subcategories match
        category, subcategory = find_supplier_category_match(supplier_category)
        if category:
            categories[0] = category
        if subcategory:
            categories[1] = subcategory

        # Run match with supplier subcategory
        if not categories[0] or not categories[1]:
            category, subcategory = find_supplier_category_match(supplier_subcategory)
            if category and not categories[0]:
                categories[0] = category
            if subcategory and not categories[1]:
                categories[1] = subcategory

    # Final checks
    if not categories[0]:
        cprint(f'[INFO]\tWarning: "{part_info["category"]}" did not match any supplier category ',
               silent=settings.SILENT)
    else:
        cprint(f'[INFO]\tCategory: "{categories[0]}"', silent=settings.SILENT)
    if not categories[1]:
        cprint(f'[INFO]\tWarning: "{part_info["subcategory"]}" did not match any supplier subcategory ',
               silent=settings.SILENT)
    else:
        cprint(f'[INFO]\tSubcategory: "{categories[1]}"', silent=settings.SILENT)

    return categories
def hosts(year):
    year = str(year)
    lst = []
    file_name = 'pruned_tweets_' + year + '.json'
    with open(file_name, encoding="utf8") as infile:
        for line in infile:
            text = json.loads(line)['text']
            lst.append(text.lower())

    # run it through tagger w/ "host" tweets
    relevant_tweets = []
    for tweet in lst:
        if "hosted" in tweet:
            relevant_tweets.append(tweet)
    # print(len(relevant_tweets), " is the number of tweets containing host")
    # print("\n\nTagging them with spacy now...")

    countt = 0
    shorter_list = []
    while countt < len(relevant_tweets):
        ppl_lst = tagger(relevant_tweets[countt])
        if ppl_lst != []:
            for person in ppl_lst:
                if "'" in person:
                    person = person[:person.index("'")]
                if "’" in person:
                    person = person[:person.index("’")]
                flag = False
                for ii in range(len(shorter_list)):
                    if person in shorter_list[ii][0] and "http" not in person \
                            and len(person.split()) < 3:
                        shorter_list[ii][1] += 1
                        flag = True
                if not flag and "http" not in person and len(person.split()) < 3:
                    shorter_list.append([person, 1])
        countt += 1

    shorter_list.sort(key=lambda x: -x[1])
    shorter_list = shorter_list[:10]
    curr = len(shorter_list) - 1
    new_shorter_list = []
    while curr >= 0:
        name = shorter_list[curr][0]
        score = shorter_list[curr][1]
        chk = curr - 1
        found = False
        while chk >= 0:
            name2 = shorter_list[chk][0]
            # check if fuzz is above a threshold
            threshold = max(fuzz.partial_ratio(name, name2),
                            fuzz.ratio(name, name2))
            if threshold > 85:
                found = True
                # name at top is not a good name
                if len(name2.split()) < 2:
                    shorter_list[chk][0] = name
                shorter_list[chk][1] += score
                break
            chk -= 1
        if found == False:
            new_shorter_list.append([name, score])
        curr -= 1

    new_shorter_list.sort(key=lambda x: -x[1])
    if new_shorter_list[0][1] > 4 * new_shorter_list[1][1]:
        # print("Host is", new_shorter_list[0][0].title())
        return [new_shorter_list[0][0].title()]
    else:
        return [new_shorter_list[0][0].title(), new_shorter_list[1][0].title()]
def fitness(self, txt, qst):
    self.qstType(qst)
    if self.thisType == 'UK':
        _, sim = self.bin_answer(qst, txt)
        return sim > self.threshold

    qstType = self.thisType
    self.candidateAnswer = []
    self.candidateSentence = []
    extendList = []
    for thisSent in [txt]:
        extendList.append(thisSent)
        thisParseTree = self.qgPipeline.getParseTree(thisSent)
        no_conj_list = self.qgPipeline.splitConj(thisParseTree)
        simpl_sents = self.qgPipeline.simplify_sentence(no_conj_list)
        for i in simpl_sents:
            extendList.append(i)

    for txt in extendList:
        tree = self.sNLP.parser_sents([txt, ])
        for i in tree:
            self.dropTotal = 0
            self.dropFlag = 1
            while self.dropFlag:
                self.findFlag = 0
                nowTree = copy.deepcopy(i)
                self.dropTime = 0
                nowTree = self.dropFragment(nowTree, qstType)
                if self.dropTime <= self.dropTotal:
                    self.dropFlag = 0
                self.dropTotal += 1

    best_dis = 0
    best_ans = '_'
    for i in range(len(self.candidateSentence)):
        # Guard before use: partial_ratio would fail on a None query
        if self.qstSim is None:
            continue
        nowSentence = ' '.join(self.candidateSentence[i])
        score = fuzz.partial_ratio(self.qstSim, nowSentence)
        this_ans = ' '.join(self.candidateAnswer[i])
        if score >= best_dis:
            # Tie-breaking: prefer shorter answers for WHADVP/WHPP,
            # longer ones for WHNP
            if score == best_dis and len(this_ans) >= len(best_ans) \
                    and self.thisType in ['WHADVP', 'WHPP']:
                continue
            if score == best_dis and len(this_ans) <= len(best_ans) \
                    and self.thisType in ['WHNP']:
                continue
            best_dis = score
            best_ans = this_ans
    return self.threshold < best_dis
def run_topical_analysis(string):
    '''Searches for a user given string and performs sentiment analysis for it.'''
    sia = SIA()
    print("Searching for " + string + "...")
    with open(r"out/dataset.csv", "r") as infile_posts, \
            open(r"out/dataset_comments.csv", "r") as infile_comments:
        post_reader = csv.reader(infile_posts)
        comment_reader = csv.reader(infile_comments)
        include_list = []
        positive_count = 0
        negative_count = 0
        total_count = 0
        for row in post_reader:
            print("Analyzing posts and comment rows: " + str(total_count + 1), end="\r")
            match_post_title = fuzz.partial_ratio(string, row[0])
            match_post_flair = fuzz.partial_ratio(string, row[4])
            if row[6] == "''":
                # No selftext: match on title and flair only
                if match_post_flair >= 85 or match_post_title >= 85:
                    result_0 = sia.polarity_scores(row[0])
                    result_1 = None
                    include_list.append(row[7])
                else:
                    continue
            else:
                match_post_selftext = fuzz.partial_ratio(string, row[6])
                if match_post_flair >= 85 or match_post_title >= 85 or match_post_selftext >= 85:
                    result_0 = sia.polarity_scores(row[0])
                    result_1 = sia.polarity_scores(row[4])
                    include_list.append(row[7])
                else:
                    continue
            if result_0['compound'] > 0.2:
                with open(r"out/positive_list_" + "%r" % string + r".txt", "a",
                          encoding="utf-8") as outfile_posts:
                    outfile_posts.write(row[0] + "\n")
                positive_count += 1
                total_count += 1
            elif result_0['compound'] < -0.2:
                with open(r"out/negative_list_" + "%r" % string + r".txt", "a",
                          encoding="utf-8") as outfile_posts:
                    outfile_posts.write(row[0] + "\n")
                negative_count += 1
                total_count += 1
            if result_1 is not None:
                if result_1['compound'] > 0.2:
                    with open(r"out/positive_list_" + "%r" % string + r".txt", "a",
                              encoding="utf-8") as outfile_posts:
                        outfile_posts.write(row[0] + "\n")
                    positive_count += 1
                elif result_1['compound'] < -0.2:
                    with open(r"out/negative_list_" + "%r" % string + r".txt", "a",
                              encoding="utf-8") as outfile_posts:
                        outfile_posts.write(row[0] + "\n")
                    negative_count += 1
        for row in comment_reader:
            print("Analyzing posts and comment rows: " + str(total_count + 1), end="\r")
            if row[1] in include_list:
                total_count += 1
                result = sia.polarity_scores(row[0])
                if result['compound'] > 0.2:
                    with open(r"out/positive_list_" + "%r" % string + r".txt", "a",
                              encoding="utf-8") as outfile_comments:
                        outfile_comments.write(row[0] + "\n")
                    positive_count += 1
                elif result['compound'] < -0.2:
                    with open(r"out/negative_list_" + "%r" % string + r".txt", "a",
                              encoding="utf-8") as outfile_comments:
                        outfile_comments.write(row[0] + "\n")
                    negative_count += 1
    print("\nDone.")
    plot_word_types(total_count, negative_count, positive_count, string)
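# Minimal usage sketch for run_topical_analysis, assuming SIA above is NLTK's
# VADER SentimentIntensityAnalyzer (an assumption) and that out/dataset.csv
# and out/dataset_comments.csv already exist:
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
run_topical_analysis("climate")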
def answer(self, txtList, qst):
    self.head = word_tokenize(qst)[0].lower()
    self.qstType(qst)
    if self.thisType == 'UK':
        best_score = 0
        best_ans = 'Yes'
        best_sent = '_'
        for txt in txtList:
            ans, sim = self.bin_answer(qst, txt)
            if sim > best_score:
                best_ans = ans
                best_score = sim
                best_sent = txt
        print(best_ans + '.')
        return

    qstType = self.thisType
    self.candidateAnswer = []
    self.candidateSentence = []
    extendList = []
    for thisSent in txtList:
        thisSent = self.preProcessText(thisSent)
        # Skip very short or very long sentences
        n_tokens = len(word_tokenize(thisSent))
        if n_tokens < 4 or n_tokens > 25:
            continue
        extendList.append(thisSent)
        thisParseTree = self.qgPipeline.getParseTree(thisSent)
        no_conj_list = self.qgPipeline.splitConj(thisParseTree)
        simpl_sents = self.qgPipeline.simplify_sentence(no_conj_list)
        for i in simpl_sents:
            extendList.append(i)

    for txt in extendList:
        tree = self.sNLP.parser_sents([txt, ])
        for i in tree:
            self.dropTotal = 0
            self.dropFlag = 1
            while self.dropFlag:
                self.findFlag = 0
                nowTree = copy.deepcopy(i)
                self.dropTime = 0
                nowTree = self.dropFragment(nowTree, qstType)
                if self.dropTime <= self.dropTotal:
                    self.dropFlag = 0
                self.dropTotal += 1

    best_dis = 0
    best_sen = None
    best_ans = '_'
    for i in range(len(self.candidateSentence)):
        # Guard before use: partial_ratio would fail on a None query
        if self.qstSim is None:
            continue
        nowSentence = ' '.join(self.candidateSentence[i])
        score = fuzz.partial_ratio(self.qstSim, nowSentence)
        this_ans = ' '.join(self.candidateAnswer[i])
        if score >= best_dis:
            # Tie-breaking: prefer shorter answers for WHADVP/WHPP,
            # longer ones for WHNP
            if score == best_dis and len(this_ans) >= len(best_ans) \
                    and self.thisType in ['WHADVP', 'WHPP']:
                continue
            if score == best_dis and len(this_ans) <= len(best_ans) \
                    and self.thisType in ['WHNP']:
                continue
            # Penalize answers whose NER types do not fit the question word
            if self.head == 'who':
                ners = getExhaustiveNERs(this_ans)
                if 'PERSON' not in ners[0] and 'ORGANIZATION' not in ners[0]:
                    if score - best_dis < 10:
                        continue
                    else:
                        score = score - 10
            if self.head == 'when':
                ners = getExhaustiveNERs(this_ans)
                if 'DATE' not in ners[0]:
                    if score - best_dis < 10:
                        continue
                    else:
                        score = score - 10
            if self.head == 'where':
                ners = getExhaustiveNERs(this_ans)
                if ('LOCATION' not in ners[0] and 'CITY' not in ners[0]
                        and 'ORGANIZATION' not in ners[0]
                        and 'STATE_OR_PROVINCE' not in ners[0]
                        and 'COUNTRY' not in ners[0]):
                    if score - best_dis < 10:
                        continue
                    else:
                        score = score - 10
            best_dis = score
            best_sen = nowSentence
            best_ans = this_ans

    if best_ans == '_':
        print('I cannot answer that question: ' + qst)
    else:
        print(best_ans.capitalize() + '.')
def bin_answer(self, question, sent):
    qstTree = self.sNLP.dependency_parse(question)
    qstTree = list(next(qstTree).triples())
    sentTree = self.sNLP.dependency_parse(sent)
    sentTree = list(next(sentTree).triples())
    qstSub = []
    sentSub = []
    flag = False
    neg = False
    for x in qstTree:
        if x[1] in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
            qstSub.append(self.parseDep(x))
        if x[1] == 'neg':
            neg = True
    for x in sentTree:
        if x[1] in ['nsubj', 'nsubjpass', 'csubj', 'csubjpass']:
            sentSub.append(self.parseDep(x))
            if self.parseDep(x) in qstSub:
                flag = True
    # Shared subject: answer directly, flipping on question negation
    if flag:
        if neg:
            return ('No', 100)
        else:
            return ('Yes', 100)

    question = question.lower()
    sent = sent.lower()
    q_tokens = word_tokenize(question)
    s_tokens = word_tokenize(sent)
    negations = set(['not', 'never', "aren't"])
    ans = ''
    # case 1: negations
    for neg in negations:
        if (neg in q_tokens) and (neg not in s_tokens):
            ans = "Yes" if ans == "No" else "No"
        if (neg in q_tokens) and (neg in s_tokens):
            ans = "No" if ans == "Yes" else "Yes"
    # case 2: similarity (only decides when the negation rules did not;
    # previously this overwrote the case 1 result unconditionally)
    sim = fuzz.partial_ratio(question, sent)
    if not ans:
        ans = "Yes" if sim > 90 else "No"
    return (ans, sim)
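# Quick illustration of the similarity fallback in bin_answer (a sketch):
# partial_ratio scores the best-matching substring, so a question that is
# contained verbatim in a sentence scores 100.
from fuzzywuzzy import fuzz
print(fuzz.partial_ratio("the sky is blue", "most people agree that the sky is blue"))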
max_fuzz_token_sort_ratio = 0.0
for parag_sent in segmenter.split(paragraph):
    parag_stems = stemmize(parag_sent)

    # chars_dist = edit_distance(normalize_word2(quest), normalize_word2(parag),
    #                            substitution_cost=1, transpositions=True)
    # min_chars_edit_dist = min(min_chars_edit_dist, chars_dist)

    # (shingles3 is computed here, but its running max is not updated in this fragment)
    shingles3 = distance.jaccard(quest_shingles3, get_shingles3(parag_stems))

    fuzz_qratio = 0.01 * fuzz.QRatio(quest_stems, parag_stems)
    max_fuzz_qratio = max(max_fuzz_qratio, fuzz_qratio)

    fuzz_WRatio = 0.01 * fuzz.WRatio(quest_stems, parag_stems)
    max_fuzz_WRatio = max(max_fuzz_WRatio, fuzz_WRatio)

    fuzz_partial_ratio = 0.01 * fuzz.partial_ratio(quest_stems, parag_stems)
    max_fuzz_partial_ratio = max(max_fuzz_partial_ratio, fuzz_partial_ratio)

    fuzz_partial_token_set_ratio = 0.01 * fuzz.partial_token_set_ratio(quest_stems, parag_stems)
    max_fuzz_partial_token_set_ratio = max(max_fuzz_partial_token_set_ratio,
                                           fuzz_partial_token_set_ratio)

    fuzz_partial_token_sort_ratio = 0.01 * fuzz.partial_token_sort_ratio(quest_stems, parag_stems)
    max_fuzz_partial_token_sort_ratio = max(max_fuzz_partial_token_sort_ratio,
                                            fuzz_partial_token_sort_ratio)

    fuzz_token_set_ratio = 0.01 * fuzz.token_set_ratio(quest_stems, parag_stems)
    max_fuzz_token_set_ratio = max(max_fuzz_token_set_ratio, fuzz_token_set_ratio)

    fuzz_token_sort_ratio = 0.01 * fuzz.token_sort_ratio(quest_stems, parag_stems)
    max_fuzz_token_sort_ratio = max(max_fuzz_token_sort_ratio, fuzz_token_sort_ratio)

df.loc[index, 'max_shingles3_str'] = max_shingles3
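# The loop above assumes per-question accumulators initialized before it runs;
# a minimal sketch of that setup (names inferred from the assignments above,
# max_shingles3 included on the assumption it is tracked the same way):
max_fuzz_qratio = 0.0
max_fuzz_WRatio = 0.0
max_fuzz_partial_ratio = 0.0
max_fuzz_partial_token_set_ratio = 0.0
max_fuzz_partial_token_sort_ratio = 0.0
max_fuzz_token_set_ratio = 0.0
max_shingles3 = 0.0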
def main():
    """Entry point to the script

    This function does the following:
        a. calls the file_download function to download the files from a web location
        b. generates file download success report
        c. checks for plagiarism
        d. generates plagiarism check report
    """
    import datetime
    try:
        test_takers_list = test_takers()
        types_of_encoding = ["utf-8", "cp1252", "cp850", "utf8"]

        # creating required directories
        if not os.path.exists(rep_dir):
            os.mkdir(rep_dir)
        if not os.path.exists(answer_folder):
            os.mkdir(answer_folder)
        date_time_stamp_raw = str(datetime.datetime.now())
        date_time_stamp = date_time_stamp_raw.replace(":", ".")
        ans_folder_name = os.path.join(answer_folder, "Answers_" + date_time_stamp)
        if not os.path.exists(ans_folder_name):
            os.mkdir(ans_folder_name)
        rep_folder_name = os.path.join(rep_dir, "Report_" + date_time_stamp)
        if not os.path.exists(rep_folder_name):
            os.mkdir(rep_folder_name)

        # downloading answers and working on success report file
        with open(os.path.join(rep_dir, rep_folder_name,
                               "report " + date_time_stamp + ".csv"), "w") as report_file:
            report_file.write("_" * 75 + "\n")
            report_file.write(" Test Taker " + "| Tasks " + " | Status |" + " File Name " + "\n")
            report_file.write("-" * 75 + "\n")
            for test_taker in tqdm(test_takers_list):
                for tasks_folder in tasks_folders:
                    if "." in test_taker:
                        usr_folder = web_url + test_taker
                        folder_url = web_url + test_taker + "/" + tasks_folder
                    else:
                        usr_folder = web_url + "~" + test_taker
                        folder_url = web_url + "~" + test_taker + "/" + tasks_folder
                    if file_fldr_exists(usr_folder):
                        folder_name = file_download(folder_url)
                        if folder_name != 0:
                            os.rename(tasks_folder, tasks_folder + ".html")
                            existing_files = filenames_from_html(
                                os.path.join(os.getcwd(), tasks_folder + ".html"))
                            os.remove(os.path.join(os.getcwd(), tasks_folder + ".html"))
                            if len(existing_files) > 0:
                                for file_name in existing_files:
                                    file_url = folder_url + "/" + file_name
                                    file_name = file_download(file_url)
                                    dest_usr = os.path.join(ans_folder_name, test_taker)
                                    if not os.path.exists(dest_usr):
                                        os.mkdir(dest_usr)
                                    dest_task = os.path.join(ans_folder_name, test_taker, tasks_folder)
                                    if not os.path.exists(dest_task):
                                        os.mkdir(dest_task)
                                    try:
                                        move_file(file_name, test_taker, tasks_folder, ans_folder_name)
                                        # Report specific data
                                        success_text_report = (
                                            " " + test_taker + " | " + tasks_folder
                                            + " | Files successfully downloaded | " + file_name + "\n")
                                        report_file.write(success_text_report)
                                    except Exception:
                                        success_text_report = (
                                            " " + str(test_taker) + " | " + str(tasks_folder)
                                            + " | Files too big to download | " + str(file_name) + "\n")
                                        report_file.write(success_text_report)
                            else:
                                # Report specific data
                                no_files_found = (" " + test_taker + " | " + tasks_folder
                                                  + " |No files found in the folder to download |" + "\n")
                                report_file.write(no_files_found)
                        else:
                            # Report specific data
                            folder_not_found = (" " + test_taker + " | " + tasks_folder
                                                + " | Folder named " + tasks_folder + " not found |" + "\n")
                            report_file.write(folder_not_found)
                    else:
                        # Report specific data
                        error_dwnld_file = (" " + test_taker + " | " + tasks_folder
                                            + " |Can't access url or user folder not found|" + "\n")
                        report_file.write(error_dwnld_file)
                report_file.write(" " + "." * 75 + "\n")
            report_file.write(" " + "-" * 75 + "\n")

        if easygui.ynbox(
                "Done downloading files and creating report."
                "\n\nDo you want to run the plagiarism check now?",
                "Run plagiarism check?",
                choices=("[<F1>]Yes", "[<F2>]No"),
                default_choice="[<F1>]Yes",
                cancel_choice="[<F2>]No"):
            if not hash_check:
                combs = {}
                final_results = {}
                for Answer_folder in tqdm(retrieve_folder_content(answer_folder)):
                    for Student_folder in retrieve_folder_content(Answer_folder):
                        for Task_folder in retrieve_folder_content(Student_folder):
                            for Ans_file in retrieve_folder_content(Task_folder, True):
                                # Student_folder2 is the student folder to compare the Ans_file content with
                                for Student_folder2 in retrieve_folder_content(Answer_folder):
                                    if Student_folder2 == Student_folder:
                                        continue
                                    # Task_folder2 is the Task folder inside Student_folder2 to compare the Ans_file with
                                    for Task_folder2 in retrieve_folder_content(Student_folder2):
                                        if os.path.basename(Task_folder2) != os.path.basename(Task_folder):
                                            continue
                                        stu_fol_1 = os.path.basename(Student_folder)
                                        stu_fol_2 = os.path.basename(Student_folder2)
                                        temp_comb = [
                                            os.path.basename(Task_folder) + "_" + stu_fol_1 + "_" + stu_fol_2,
                                            os.path.basename(Task_folder) + "_" + stu_fol_2 + "_" + stu_fol_1
                                        ]
                                        if temp_comb[0] in combs:
                                            continue
                                        for Ans_file2 in retrieve_folder_content(Task_folder2, True):
                                            for encoding_type in types_of_encoding:
                                                if temp_comb[0] in combs:
                                                    break
                                                with codecs.open(Ans_file, encoding=encoding_type,
                                                                 errors='replace') as fp, \
                                                        codecs.open(Ans_file2, encoding=encoding_type,
                                                                    errors='replace') as fp2:
                                                    s = fp.read()
                                                    s_tocomp = fp2.read()
                                                    if type_of_check == "Simple Ratio":
                                                        result = fuzz.ratio(s, s_tocomp)
                                                    elif type_of_check == "Partial Ratio":
                                                        result = fuzz.partial_ratio(s, s_tocomp)
                                                    elif type_of_check == "Token Sort Ratio":
                                                        result = fuzz.token_sort_ratio(s, s_tocomp)
                                                    elif type_of_check == "Token Set Ratio":
                                                        result = fuzz.token_set_ratio(s, s_tocomp)
                                                    combs.update({temp_comb[0]: result,
                                                                  temp_comb[1]: result})
                                                    final_results.update({temp_comb[0]: result})
            else:
                combs = {}
                final_results = {}
                for Answer_folder in tqdm(retrieve_folder_content(answer_folder)):
                    for Student_folder in retrieve_folder_content(Answer_folder):
                        for Task_folder in retrieve_folder_content(Student_folder):
                            for Ans_file in retrieve_folder_content(Task_folder, True):
                                # Student_folder2 is the student folder to compare the Ans_file content with
                                for Student_folder2 in retrieve_folder_content(Answer_folder):
                                    if Student_folder2 == Student_folder:
                                        continue
                                    # Task_folder2 is the Task folder inside Student_folder2 to compare the Ans_file with
                                    for Task_folder2 in retrieve_folder_content(Student_folder2):
                                        if os.path.basename(Task_folder2) != os.path.basename(Task_folder):
                                            continue
                                        stu_fol_1 = os.path.basename(Student_folder)
                                        stu_fol_2 = os.path.basename(Student_folder2)
                                        temp_comb = [
                                            os.path.basename(Task_folder) + "_" + stu_fol_1 + "_" + stu_fol_2,
                                            os.path.basename(Task_folder) + "_" + stu_fol_2 + "_" + stu_fol_1
                                        ]
                                        if temp_comb[0] in combs:
                                            continue
                                        for Ans_file2 in retrieve_folder_content(Task_folder2, True):
                                            for encoding_type in types_of_encoding:
                                                if temp_comb[0] in combs:
                                                    break
                                                with codecs.open(Ans_file, encoding=encoding_type,
                                                                 errors='replace') as fp, \
                                                        codecs.open(Ans_file2, encoding=encoding_type,
                                                                    errors='replace') as fp2:
                                                    # Hash both files; identical files hash equal,
                                                    # so a ratio of 100 flags exact copies
                                                    s = hashlib.md5(fp.read().encode('utf-8')).hexdigest()
                                                    s_tocomp = hashlib.md5(fp2.read().encode('utf-8')).hexdigest()
                                                    if type_of_check == "Simple Ratio":
                                                        result = fuzz.ratio(s, s_tocomp)
                                                    elif type_of_check == "Partial Ratio":
                                                        result = fuzz.partial_ratio(s, s_tocomp)
                                                    elif type_of_check == "Token Sort Ratio":
                                                        result = fuzz.token_sort_ratio(s, s_tocomp)
                                                    elif type_of_check == "Token Set Ratio":
                                                        result = fuzz.token_set_ratio(s, s_tocomp)
                                                    combs.update({temp_comb[0]: result,
                                                                  temp_comb[1]: result})
                                                    final_results.update({temp_comb[0]: result})

            # Creating HTML file with plagiarism check results
            h_H2 = "<h2> %s </h2>"
            h_div = "<div> %s </div>"
            t_table = "<table> %s </table>"
            t_row = "<tr> %s </tr>"
            t_header = "<th bgcolor=\"#F5F1F1\"> %s </th>"
            t_data = "<td> %s </td>"
            t_data_red = "<td bgcolor=\"#FF6747\"> %s </td>"
            html_beg = """
            <html>
            <head>
            <title>Plagiarism Results</title>
            <style>
            table {
                font-family: arial, sans-serif;
                border-collapse: collapse;
                width: 100%;
            }
            td, th {
                border: 1px solid #dddddd;
                text-align: left;
                padding: 8px;
            }
            h1 {
                color: #111;
                font-family: 'Helvetica Neue', sans-serif;
                font-size: 80px;
                font-weight: bold;
                letter-spacing: -1px;
                line-height: 1;
                text-align: center;
            }
            h2 {
                color: #111;
                font-family: 'Open Sans', sans-serif;
                font-size: 30px;
                font-weight: bold;
                line-height: 32px;
                margin: 0 0 10px;
                text-align: left;
            }
            tr:nth-child(even) {
                background-color: #dddddd;
            }
            </style>
            </head>
            <body>
            <h1> Plagiarism Check Results </h1>
            """
            html_end = """
            </body>
            </html>
            """
            html_div = ""
            for tasks_folder in tasks_folders:
                before_ = []
                _after = []
                for key in final_results:
                    if tasks_folder in key:
                        if key.split("_")[1] not in _after:
                            before_.append(key.split("_")[1])
                        if key.split("_")[2] not in before_:
                            _after.append(key.split("_")[2])
                before_ = list(set(before_))
                _after = list(set(_after))
                heading = h_H2 % tasks_folder
                table_rows = ""
                table_headers = t_header % "Test Takers"
                for b in before_:
                    table_headers += t_header % b
                table_rows += t_row % table_headers
                for a in _after:
                    new_data = t_data % a
                    for b in before_:
                        if a != b:
                            try:
                                sim_res = final_results[tasks_folder + "_" + b + "_" + a]
                            except KeyError:
                                sim_res = 0
                            if sim_res > 80:
                                new_data += t_data_red % sim_res
                            else:
                                new_data += t_data % sim_res
                    table_rows += t_row % new_data
                tables = t_table % table_rows
                html_div += heading + h_div % tables + "<br>" + "<br>"
            whole_html = html_beg + html_div + html_end
            with open("plagiarism_check.html", "w+") as f:
                f.write(whole_html)

            # # Generating csv from dictionary combs
            # final_results_json = json.dumps(final_results)
            # final_results = json.loads(final_results_json)
            # f = csv.writer(open("plagiarism_check.csv", "wb+"))
            # # Write CSV Header
            # f.writerow(["student1_student2_task#", "similarity"])
            # for final_result in final_results:
            #     f.writerow([final_result.key(), final_result.value()])

        easygui.msgbox(
            "Finished running the script.\nCheck the download report in the Reports folder.",
            "Run Result")
    except Exception as err:
        easygui.msgbox("Error!" + "\n" + str(err), "Run Result")
        raise SystemExit(str(err))
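# Hypothetical entry point for running the downloader/plagiarism script
# directly (a sketch; the original module may define its own):
if __name__ == "__main__":
    main()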
# Predicting a new result
y_pred_p1_rf = regressor.predict(X_test)
y_pred_p1_rf = round_val(y_pred_p1_rf)
cm_p1_rf = confusion_matrix(y_test, y_pred_p1_rf)
accuracy(cm_p1_rf)  # ~70%

# ===== Phase 2: Feature Extraction =====
data_p2 = data.copy()  # copy so the fuzzy features do not mutate `data`
data_p2['fw_qratio'] = data_p2.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data_p2['fw_WRatio'] = data_p2.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data_p2['fw_par_ratio'] = data_p2.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data_p2['fw_par_token_set_ratio'] = data_p2.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data_p2['fw_par_token_sort_ratio'] = data_p2.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data_p2['fw_token_set_ratio'] = data_p2.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data_p2['fw_token_sort_ratio'] = data_p2.apply(
    lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)

data_p2.to_csv('quora_features_extraction_p2.csv', index=False)
list(data_p2)

data_p2_upd = data_p2
data_p2_upd = data_p2_upd.drop(['id', 'question1', 'question2'], axis=1)
list(data_p2_upd)

data_p1_p2 = pd.concat([data.reset_index(drop=True), data_p2_upd], axis=1)
list(data_p1_p2)
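# Toy sanity check for the fuzzy features above (a sketch; the two questions
# are made up, and 'fuzz' is fuzzywuzzy's fuzz module as used throughout):
import pandas as pd
from fuzzywuzzy import fuzz

toy = pd.DataFrame({'question1': ['How do I learn Python?'],
                    'question2': ['What is the best way to learn Python?']})
toy['fw_par_ratio'] = toy.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
print(toy[['question1', 'question2', 'fw_par_ratio']])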
def linking_data_one_file(id2question, index_ent, index_reach, index_names, ent_resultpath):
    ent_lineids, id2queries = get_query_texts(
        ent_resultpath)  # ent_lineids may have some examples missing
    id2mids = {}
    data = defaultdict(list)
    for i, lineid in enumerate(ent_lineids):
        if lineid not in id2question:
            continue
        if i % 1000 == 0:
            print("line {}".format(i))
        truth_mid, truth_name, truth_rel, question = id2question[lineid]
        queries = id2queries[lineid]
        C = []  # candidate entities
        C_counts = []
        for query_text in queries:
            query_tokens = query_text.split()
            N = min(len(query_tokens), 3)
            for n in range(N, 0, -1):
                ngrams_set = find_ngrams(query_tokens, n)
                for ngram_tuple in ngrams_set:
                    ngram = " ".join(ngram_tuple)
                    # unigram stopwords have too many candidates so just skip over
                    if ngram in stopwords:
                        continue
                    try:
                        cand_mids = index_ent[ngram]  # search entities
                    except KeyError:
                        continue
                    C.extend(cand_mids)
                if C:
                    break  # early termination
        for mid in set(C):
            count_mid = C.count(mid)  # count number of times mid appeared in C
            C_counts.append((mid, count_mid))
        for mid, count_mid in C_counts:
            if mid in index_names:
                cand_ent_name = pick_best_name(question, index_names[mid])
                # exact name match
                data['exact_name_match'].append(1 if cand_ent_name == truth_name else 0)
                # close (but not exact) name match
                if cand_ent_name != truth_name and fuzz.ratio(cand_ent_name, truth_name) >= 60:
                    data['partial_name_match'].append(1)
                else:
                    data['partial_name_match'].append(0)
                data['true_label'].append(1 if mid == truth_mid else 0)
                data['lineid'].append(lineid)
                data['query'].append(query_text)
                data['length_name'].append(len(cand_ent_name.split()))
                data['length_question'].append(len(question.split()))
                data['length_query'].append(len(query_tokens))
                data['tf'].append(count_mid)
                data['idf'].append(calc_idf(question, cand_ent_name, index_ent))
                data['sques'].append(fuzz.ratio(cand_ent_name, question) / 100.0)
                data['squer'].append(fuzz.ratio(cand_ent_name, query_text) / 100.0)
                data['pques'].append(fuzz.partial_ratio(cand_ent_name, question) / 100.0)
                data['pquer'].append(fuzz.partial_ratio(cand_ent_name, query_text) / 100.0)
    df = pd.DataFrame(data)
    return df
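# The find_ngrams helper used above is not shown in this snippet; a minimal
# sketch of a compatible implementation (an assumption about its contract:
# it yields all contiguous n-grams of a token list as tuples):
def find_ngrams(tokens, n):
    return zip(*[tokens[i:] for i in range(n)])

# e.g. list(find_ngrams(['who', 'wrote', 'hamlet'], 2))
# -> [('who', 'wrote'), ('wrote', 'hamlet')]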
data['common_words'] = data.apply(
    lambda x: len(set(str(x['question1']).lower().split())
                  .intersection(set(str(x['question2']).lower().split()))), axis=1)

fs_1 = ['len_q1', 'len_q2', 'diff_len', 'len_char_q1',
        'len_char_q2', 'len_word_q1', 'len_word_q2', 'common_words']
pprint(fs_1)
print('---- Computed ----')

print('---- Computing fuzzy features ----')
data['fuzz_qratio'] = data.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_WRatio'] = data.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_set_ratio'] = data.apply(
    lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_partial_token_sort_ratio'] = data.apply(
    lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_set_ratio'] = data.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['fuzz_token_sort_ratio'] = data.apply(
    lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
def fuzzy(i):
    """Case-insensitive partial-ratio score for a 2-tuple of strings."""
    return fuzz.partial_ratio(i[0].upper(), i[1].upper())
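# Usage sketch: the single tuple argument suggests fuzzy() is meant for a
# mapping call such as multiprocessing.Pool.map over string pairs (an
# assumption; any iterable of 2-tuples works):
pairs = [("Apple iPhone 12", "apple iphone"), ("Galaxy S21", "iPhone 12")]
scores = [fuzzy(p) for p in pairs]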