def test_decorate():
    @proto.decorate
    def test():
        r'''Test function
        test 1 x = x + 1
        test 2 x = x - 2
        test x y = 10
        '''

    expected = [
        (1, var('x')),
        (2, var('x')),
        (var('x'), var('y')),
    ]
    actual = [pattern for pattern, _, _ in test.patterns]
    for e, a in zip(expected, actual):
        assert match(e, a)
        assert match(a, e)
    assert test(1, 2) == 3
    assert test(2, 1) == -1
    assert test(5, 6) == 10

def get_refmod_mine_nlm_matches(model1, model2):
    matches = []
    nodes1temp = commonFunctions.get_nodes(model1, consideredComponents)
    nodes2temp = commonFunctions.get_nodes(model2, consideredComponents)
    nodes1 = create_refmod_nodes(nodes1temp)
    nodes2 = create_refmod_nodes(nodes2temp)
    entries1 = [node_a.labelTokens for node_a in nodes1]
    entries2 = [node_b.labelTokens for node_b in nodes2]
    global wiki_entries
    wiki_entries = fetch_entry_from_wiktionary(entries1 + entries2)
    for n1 in nodes1:
        for n2 in nodes2:
            if is_identical_condition(n1, n2):
                matches.append(
                    matcher.match(node1=model1.diagram_graph.node[n1.id],
                                  node2=model2.diagram_graph.node[n2.id],
                                  score=1.0))
                continue
            if is_cross_category_condition(n1, n2):
                matches.append(
                    matcher.match(node1=model1.diagram_graph.node[n1.id],
                                  node2=model2.diagram_graph.node[n2.id],
                                  score=1.0))
                continue
    return matches

def test_decorate_method():
    class TestClass(object):
        def __init__(self, value):
            self.value = value

        @proto.decorate_method
        def get_value():
            r'''Returns first argument, gets instance attribute 'value' otherwise.
            value x = x
            value = self.value
            '''

    matcher = TestClass.__dict__['get_value']
    expected = [(var('x'), ), tuple()]
    actual = [pattern for pattern, _, _ in matcher.patterns]
    for e, a in zip(expected, actual):
        assert match(e, a)
        assert match(a, e)
    v = TestClass(2)
    assert v.get_value(1) == 1
    assert v.get_value() == 2

def test_matcher_gets_reset_after_complex_match(self):
    matcher = self.example_comment_matcher()
    self.assertEqual(
        ' first comment ',
        matcher.match('/* first comment */').raw_value)
    self.assertEqual(
        ' second comment ',
        matcher.match('/* second comment */').raw_value)

def main():
    st.title("Dataset")
    data, titles, texts = utils.load_data()  # careful, loads 12 GB of data
    preprocess_wiki_text(data, "preped_wikitexts.jsonl")  # prep the text from Wiki
    get_infobox_data(
        "./data/preped_wikitexts.jsonl",
        "./data/matched_texts.jsonl")  # map articles to their infoboxes
    # or, for a speedup: get_infobox_data_multi("./data/preped_wikitexts.jsonl", "./data/matched_texts.jsonl")
    matcher.match(
        "./data/matched_texts.jsonl", "/data/train_data.jsonl"
    )  # match sentences with triples (infobox values), saving to data as jsonl

def tampilDeadline(usrMsg):
    deadline = db.getList_Daftar_Tugas_Status(False)
    for i in range(len(deadline)):
        # General case for "tugas" (tasks)
        if (matcher.match(usrMsg, "tugas")):
            if (matcher.match(usrMsg, deadline[i][2].lower())):
                return deadline[i][1]
        # Specific case: tucil, tubes, or pr
        else:
            if (matcher.match(usrMsg, deadline[i][2].lower())
                    and matcher.match(usrMsg, deadline[i][3].lower())):
                return deadline[i][1]
    return "Tidak ada deadline itu"

def callback_boxes(message):
    d = jsonpickle.decode(message.data)
    print("@", time.time(), "\n", d)
    matched = match(d, clusters)
    fus_pub.publish(jsonpickle.encode(matched))

def test_link_is_encoded_as_phrase(self):
    links_match = []
    for para in paras_from('bookmark with link.docx'):
        phrase_contents = in_para_allcontent.pick_contents(
            in_para_allcontent.contentlist(para),
            lambda x: x["type"] == "phrase")
        for content in phrase_contents:
            links_match.append(
                matcher.match(in_para_phrase.content_regex, content))
    self.assertAllAreOk(links_match)

def test_bookmark_is_encoded_as_anchor(self):
    anchors_match = []
    for para in paras_from('anchor.docx'):
        anchor_contents = in_para_allcontent.pick_contents(
            in_para_allcontent.contentlist(para),
            lambda x: x['type'] == "anchor")
        for content in anchor_contents:
            anchors_match.append(
                matcher.match(in_para_bookmark.content_regex, content))
    self.assertAllAreOk(anchors_match)

def test_reference_is_encoded_as_external(self):
    extrefs_match = []
    for para in paras_from('externalref.docx'):
        extref_contents = in_para_allcontent.pick_contents(
            in_para_allcontent.contentlist(para),
            lambda x: x["type"] == "extref")
        for content in extref_contents:
            extrefs_match.append(
                matcher.match(in_para_externalref.content_regex, content))
    self.assertAllAreOk(extrefs_match)

def test_link_to_html_is_encoded_as_phrase(self):
    links_match = []
    para_with_link = paras_from('link to html.docx')[0]
    phrase_contents = in_para_allcontent.pick_contents(
        in_para_allcontent.contentlist(para_with_link),
        lambda x: x["type"] == "phrase")
    for content in phrase_contents:
        links_match.append(
            matcher.match(in_para_phrase.content_regex, content))
    self.assertAllAreOk(links_match)

def daftar_katakunci(text):
    data = []
    textlist = text.split(" ")
    for kata in bd.getList_Kata_Penting()[1:]:
        for i in range(len(textlist)):
            index = matcher.match(textlist[i].lower(), kata.lower())
            if (index and len(kata) + 1 >= len(textlist[i]) + 1):
                data.append(kata)
                break
        if (index):
            break
    for kata in bd.getList_Kata_Tampil_Deadline():
        for i in range(len(textlist)):
            index = matcher.match(textlist[i].lower(), kata.lower())
            if (index):
                data.append(kata)
    return data

def process(usrMsg):
    result = at.ValidasiInput(usrMsg)
    if (result == "-1"):
        text = str(usrMsg).lower()
        # Display help
        for pattern in kata_help:
            if (matcher.match(text, pattern)):
                return help()
        # Mark a task as done
        for pattern in kata_task_selesai:
            if (matcher.match(text, pattern)):
                return tandaiTask(text)
        # Display the deadline date of a task
        for pattern in kata_tampil_deadline:
            if (matcher.match(text, pattern)):
                return tampilDeadline(text)
        kata_penting = db.getList_Kata_Help()
        kata_penting += db.getList_Kata_Tampil_Deadline()
        kata_penting += db.getList_Kata_Task_Selesai()
        kata_input = text.split(" ")
        found = False
        for kata in kata_input:
            for pattern in kata_penting:
                if kata not in kata_penting:
                    if matcher.similarity(pattern, kata) >= 0.75:
                        text = text.replace(kata, pattern)
                        found = True
        if found:
            return "Mungkin maksud kamu:\n" + text
        return "Maaf, pesan tidak dikenali"
    else:
        return result

def diundurTask(usrMsg):
    found = False
    text = str(usrMsg).split(" ")
    for i in range(len(text)):
        if (matcher.match(text[i], "undur") or matcher.match(text[i], "ubah")):
            if (len(text[i + 2]) > 2):
                tanggal = text[i + 2]
            else:
                bulan_int = bulan.get(text[i + 3].lower())
                tanggal = text[i + 2] + "/" + bulan_int + "/" + text[i + 4]
            (tgl, bln, th) = re.split("/", tanggal)
            date = datetime.date(int(th), int(bln), int(tgl))
            bd.update_Daftar_Tugas(text[i - 1], date)
            found = True
            output = "Deadline Tugas ID " + text[i - 1] + "<br>"
            output += "berhasil diperbarui menjadi " + str(date) + " <br>"
            return output
    if (found == False):
        return "-1"

def prune_results(res_1, res_2, threshold=3.0):
    to_return = set()
    # flatten all candidate matches proposed for res_1 and res_2 entities
    res_1_all_values = []
    for value in res_1.values():
        res_1_all_values = res_1_all_values + value
    res_2_all_values = []
    for value in res_2.values():
        res_2_all_values = res_2_all_values + value
    for entity in res_1.keys():
        do_rule_3 = len(res_1[entity])
        for match in res_1[entity]:
            # RULE 1
            if match in res_1_all_values and match in res_2_all_values:
                to_return.add(
                    matcher.match(node1=entity.node,
                                  node2=get_node_from_entities(match.id2, res_2.keys()),
                                  score=1.0))
            # RULE 2
            elif match in res_1_all_values or match in res_2_all_values:
                if match.score >= threshold:
                    to_return.add(
                        matcher.match(node1=entity.node,
                                      node2=get_node_from_entities(match.id2, res_2.keys()),
                                      score=1.0))
                else:
                    do_rule_3 = do_rule_3 - 1
        # RULE 3
        if do_rule_3 == 0:
            for match in get_two_best(res_1[entity]):
                to_return.add(
                    matcher.match(node1=entity.node,
                                  node2=get_node_from_entities(match.id2, res_2.keys()),
                                  score=1.0))
    for entity in res_2.keys():
        do_rule_3 = len(res_2[entity])
        for match in res_2[entity]:
            # RULE 1
            if match in res_1_all_values and match in res_2_all_values:
                to_return.add(
                    matcher.match(node1=entity.node,
                                  node2=get_node_from_entities(match.id2, res_1.keys()),
                                  score=1.0))
            # RULE 2
            elif match in res_1_all_values or match in res_2_all_values:
                if match.score >= threshold:
                    to_return.add(
                        matcher.match(node1=entity.node,
                                      node2=get_node_from_entities(match.id2, res_1.keys()),
                                      score=1.0))
                else:
                    do_rule_3 = do_rule_3 - 1
        # RULE 3
        if do_rule_3 == 0:
            for match in get_two_best(res_2[entity]):
                to_return.add(
                    matcher.match(node1=entity.node,
                                  node2=get_node_from_entities(match.id2, res_1.keys()),
                                  score=1.0))
    return to_return

def getMatchSSSmatches(diagram1, diagram2, threshold=0.5):
    # bag has structure: {nodeId: normalizedLabel}
    bag1 = extract_normalize_step(diagram1, activeComponents)
    bag2 = extract_normalize_step(diagram2, activeComponents)
    matches_alpha = calculate_similarity_step(bag1, bag2)
    # Identify step
    matches_final = []
    for m in matches_alpha:
        if m.similiraty_score >= threshold:
            matches_final.append(
                matcher.match(node1=diagram1.diagram_graph.node[m.id1],
                              node2=diagram2.diagram_graph.node[m.id2],
                              score=m.similiraty_score))
    return matches_final

def student_listing_matcher(student, listing):
    """
    Match student to listing by student fields and desired listing fields
    :param student: Student obj
    :param listing: Listing obj
    :return: ratio between 0 and 1
    """
    same_job_type = False
    for i in student.looking_for:
        if i in listing.job_type:
            same_job_type = True
    if not same_job_type:
        return 0
    if len(student.skills) == 0:
        return 0
    skill_ratio = matcher.match(student.skills, listing.desired_skills)
    return skill_ratio

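A minimal usage sketch for student_listing_matcher, under stated assumptions: the project's real Student and Listing classes are not shown here, so hypothetical namedtuples stand in for them, and matcher.match is taken to return the skill-overlap ratio described in the docstring.

from collections import namedtuple

# Hypothetical stand-ins for the project's Student and Listing objects.
Student = namedtuple('Student', ['looking_for', 'skills'])
Listing = namedtuple('Listing', ['job_type', 'desired_skills'])

student = Student(looking_for=['internship'], skills=['python', 'sql'])
listing = Listing(job_type=['internship', 'full-time'],
                  desired_skills=['python', 'git'])

# Returns 0 when job types don't overlap or the student lists no skills;
# otherwise returns whatever ratio matcher.match computes for the two skill lists.
ratio = student_listing_matcher(student, listing)
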
def antaraTanggal_Jenis(text, jenis):
    data = []
    textlist = text.split(" ")
    for i in range(len(textlist)):
        index = matcher.match(textlist[i].lower(), "antara")
        if (index):
            if len(textlist[i + 1]) > 2:
                data.append(textlist[i + 1])
                data.append(str(textlist[i + 3]).replace("?", ""))
            else:
                bulan_int = bulan.get(textlist[i + 1].lower())
                tanggal = textlist[i + 1] + "/" + bulan_int + "/" + textlist[i + 3]
                data.append(tanggal)
                bulan_int = bulan.get(textlist[i + 3].lower())
                tanggal = textlist[i + 1] + "/" + bulan_int + "/" + textlist[i + 3]
                data.append(tanggal)
            break
    if (len(data) == 2):
        (tgl, bln, th) = re.split("/", data[0])
        date1 = datetime.date(int(th), int(bln), int(tgl))
        (tgl, bln, th) = re.split("/", data[1])
        date2 = datetime.date(int(th), int(bln), int(tgl))
        # change the return into a list
        output = "[Menampilkan daftar " + jenis + str(date1) + " - " + str(date2) + "] <br>"
        daftar = bd.getList_Daftar_Tugas_Jenis_tgl(jenis, date1, date2, False)
        if (len(daftar) == 0):
            return "Tidak ada " + jenis + " antara " + str(date1) + " - " + str(date2)
        for tugas in daftar:
            output += "(ID: " + tugas[0] + ") " + tugas[1] + " " + tugas[2] + " " + tugas[3] + " <br>"
        return output
    else:
        return "-1"

def filelist(self, roots, **kwargs):
    '''
    Parameters
    ----------
    roots: file_roots, pillar_roots, cache_roots, etc to walk
    kwargs: Contains any extra variables to pass to element
    '''
    for root, abspath in walk(roots):
        element = self.element(root, abspath, **kwargs)
        if self.match_each and not all(
                matcher.match([element], self.pattern)):
            continue
        self.add_element(element, **kwargs)
    return self.as_sequence

def get_matches_by_combine(activity_pairs_all, bots_results, bot_thresholds,
                           best, second_best, model_pairs):
    final_matches = dict()
    for pair in model_pairs:
        final_matches[pair] = []
    for act_pair in activity_pairs_all:
        # bots_results[best][act_pair] returns the similarity score of act_pair
        # under the best BOT configuration
        if bots_results[best][act_pair] >= bot_thresholds[best][0] or \
                bots_results[second_best][act_pair] >= bot_thresholds[second_best][0]:
            sim_score = max(bots_results[best][act_pair],
                            bots_results[second_best][act_pair])
            node1 = act_pair.model_pair.bpmn1.diagram_graph.node[act_pair.node1_id]
            node2 = act_pair.model_pair.bpmn2.diagram_graph.node[act_pair.node2_id]
            final_matches[act_pair.model_pair].append(
                matcher.match(node1=node1, node2=node2, score=sim_score))
    return final_matches

def get_triple_s_matches(bpmn1, bpmn2, syntactic_weight=0.5,
                         semantic_weight=0.35, ratio_weight=0.05,
                         position_weight=0.1, threshold=0.5):
    matches = []
    nodes1 = commonFunctions.get_nodes(bpmn1, activeComponents)
    nodes2 = commonFunctions.get_nodes(bpmn2, activeComponents)
    graph1 = commonFunctions.get_graph_with_id_nodes(bpmn1)
    graph2 = commonFunctions.get_graph_with_id_nodes(bpmn2)
    for node_a in nodes1:
        for node_b in nodes2:
            l1 = node_a['node_name'].lower()
            l2 = node_b['node_name'].lower()
            syntactic_score = get_syntactic_score(l1, l2)
            semantic_score = get_semantic_score(l1, l2)
            structural_score = get_structural_score(
                node_a, node_b, graph_a=graph1, graph_b=graph2,
                bpmn_a=bpmn1, bpmn_b=bpmn2,
                ratio_weight=ratio_weight, position_weight=position_weight)
            final_score = (syntactic_weight * syntactic_score +
                           semantic_weight * semantic_score +
                           structural_score)
            if final_score >= threshold:
                matches.append(
                    matcher.match(node1=node_a, node2=node_b, score=final_score))
    return matches

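A worked example of the score combination above, with illustrative numbers only: the syntactic and semantic scores carry explicit weights, while ratio_weight and position_weight are folded into the structural score inside get_structural_score.

# Illustrative values, not taken from the source:
syntactic_score = 0.8
semantic_score = 0.6
structural_score = 0.06   # already weighted internally by the ratio/position weights

final_score = 0.5 * syntactic_score + 0.35 * semantic_score + structural_score
assert abs(final_score - 0.67) < 1e-9   # 0.67 >= default threshold of 0.5, so the pair is kept
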
def haritask(text):
    found = False
    textlist = text.split(" ")
    for i in range(len(textlist)):
        index = matcher.match(textlist[i].lower(), "hari")
        if (index):
            found = True
            N = int(textlist[i - 1])
            dateEnd = nHariKedepan(N)
            output = "[Menampilkan Tugas " + str(N) + " hari ke depan]<br>"
            daftar = bd.getList_Daftar_Tugas_tgl(datetime.date.today(), dateEnd, 0)
            if (len(daftar) == 0):
                return "Tidak ada " + "deadline" + " " + str(N) + " hari ke depan"
            for tugas in daftar:
                output += "(ID: " + tugas[0] + ") " + tugas[1] + " " + tugas[2] + " " + tugas[3] + " <br>"
            return output
    if (not found):
        return "-1"

def minggutask_Jenis(text, jenis):
    found = False
    textlist = text.split(" ")
    for i in range(len(textlist)):
        index = matcher.match(textlist[i].lower(), "minggu")
        if (index):
            found = True
            N = int(textlist[i - 1])
            dateEnd = nHariKedepan(N * 7)
            output = "[Menampilkan " + jenis + " " + str(N) + " minggu ke depan]<br>"
            daftar = bd.getList_Daftar_Tugas_Jenis_tgl(jenis, datetime.date.today(), dateEnd, 0)
            if (len(daftar) == 0):
                return "Tidak ada " + jenis + " " + str(N) + " minggu ke depan"
            for tugas in daftar:
                output += "(ID: " + tugas[0] + ")" + tugas[1] + " " + tugas[2] + " " + tugas[3] + "<br>"
            return output
    if (not found):
        return "-1"

def parse(lines):
    """Parses the given text lines and returns an AST that represents the
    simple HTML document from the text. Raises a ParseError if parsing fails.
    Raises a TokenizeError if tokenizing fails."""
    return match(SimpHtmlParser().parse(lines))

import matcher

classSize = 4

matcher.studentMaker('A', 90, 'male', 5, 4, 4, [2], [4])
matcher.studentMaker('B', 90, 'female', 5, 4, 4, [1], [])
matcher.studentMaker('C', 65, 'male', 3, 2, 1, [4], [])
matcher.studentMaker('D', 65, 'female', 3, 2, 1, [3], [1])
# expected pairing: A and C
# expected pairing: B and D
matcher.groupMaker(classSize)
matcher.match()
matcher.printMatch()

numPics = len(imageFiles)
print numPics
button_pressed = 1
button_released = 0
i = 0
while True:
    try:
        button_status = controller.check_button()
        if button_status == button_pressed:
            print "Got new photo!"
            controller.turn_light_on()
            if (i >= numPics):
                i = 0
            imagePath = (imageFiles)[i]
            print "Matching photo " + imagePath
            template = cv2.imread('badguy.jpg')
            matcher.match(imagePath, template)
            i = i + 1
            button_status = button_released
        else:
            controller.turn_light_off()
    except (KeyboardInterrupt, SystemExit):
        controller.turn_light_off()
        controller.turn_buzzer_off()
        sys.exit()
        break
    except (IOError, TypeError) as e:
        print("Error")

def test_matches_and_has_remaining_text(self):
    matcher = self.example_string_matcher()
    matcher.match("'foo\\'s bar and fig\\'s foo' and stuff")
    self.assertEqual(
        " and stuff",
        matcher.remaining_text)

def test_match_for_complex_delims_remaining_text(self):
    matcher = self.example_comment_matcher()
    matcher.match('/* comment */this text remains')
    self.assertEqual('this text remains', matcher.remaining_text)

def test_match_calculates_correct_remaining_text(self):
    matcher = self.example_exact_literal_matcher()
    matcher.match('->later text')
    self.assertEqual('later text', matcher.remaining_text)

def test_matches_has_correct_remaining_value(self):
    matcher = self.example_while_matcher()
    matcher.match("foobar and stuff")
    self.assertEqual(
        " and stuff",
        matcher.remaining_text)

def validate_model(
    prob_thresh=load_config()["machine_learning"]["prboability_thresholds"]["general"],
    test=False
):
    """Compares new model with status quo production model and compiles/reports
    the results. Based on results, will either replace model and archive old one
    or just maintain status quo.

    Parameters:
     - `prob_thresh` (float): probability threshold which the classifier will use
       to determine whether or not there is a match.
     - `test` (bool): whether in testing or not, will determine flow of operations
       and mute emails appropriately.
    """
    match_query = """
        SELECT
            company_projects.job_number,
            company_projects.city,
            company_projects.address,
            company_projects.title,
            company_projects.owner,
            company_projects.contractor,
            company_projects.engineer,
            company_projects.address_lat,
            company_projects.address_lng,
            company_projects.receiver_emails_dump,
            web_certificates.url_key,
            web_certificates.cert_id,
            attempted_matches.ground_truth,
            attempted_matches.multi_phase,
            web_certificates.pub_date,
            web_certificates.source,
            CONCAT(base_urls.base_url, web_certificates.url_key) AS link
        FROM web_certificates
        LEFT JOIN attempted_matches
            ON web_certificates.cert_id = attempted_matches.cert_id
        LEFT JOIN company_projects
            ON attempted_matches.project_id = company_projects.project_id
        LEFT JOIN base_urls
            ON base_urls.source = web_certificates.source
        WHERE
            company_projects.closed=1
            AND attempted_matches.ground_truth=1
            AND attempted_matches.multi_phase=0
            AND attempted_matches.validate=1
    """
    corr_web_certs_query = """
        SELECT web_certificates.*
        FROM web_certificates
        LEFT JOIN attempted_matches
            ON web_certificates.cert_id = attempted_matches.cert_id
        LEFT JOIN company_projects
            ON attempted_matches.project_id = company_projects.project_id
        LEFT JOIN base_urls
            ON base_urls.source = web_certificates.source
        WHERE
            company_projects.closed=1
            AND attempted_matches.ground_truth=1
            AND attempted_matches.multi_phase=0
            AND attempted_matches.validate=1
    """
    with create_connection() as conn:
        validate_company_projects = pd.read_sql(match_query, conn)
        validate_web_df = pd.read_sql(corr_web_certs_query, conn)
    new_results = match(
        version="new",
        company_projects=validate_company_projects,
        df_web=validate_web_df,
        test=True,
        prob_thresh=prob_thresh,
    )
    analysis_df = pd.merge(
        new_results[['job_number', 'cert_id', 'pred_prob', 'pred_match', 'total_score']],
        validate_company_projects[['job_number', 'cert_id', 'ground_truth']],
        how='left',
        on=['job_number', 'cert_id'],
    )
    analysis_df['ground_truth'] = analysis_df.ground_truth.apply(
        lambda x: 1 if x == 1.0 else 0)
    tp = len(analysis_df[(analysis_df.pred_match == 1) & (analysis_df.ground_truth == 1)])
    fp = len(analysis_df[(analysis_df.pred_match == 1) & (analysis_df.ground_truth == 0)])
    tn = len(analysis_df[(analysis_df.pred_match == 0) & (analysis_df.ground_truth == 0)])
    fn = len(analysis_df[(analysis_df.pred_match == 0) & (analysis_df.ground_truth == 1)])
    if fn:
        logger.warning(f"match for project #{list(analysis_df[(analysis_df.pred_match == 0) & (analysis_df.ground_truth == 1)]['job_number'])} was not detected.")
    logger.info(f"true positives: {tp}")
    logger.info(f"false positives: {fp}")
    logger.info(f"true negatives: {tn}")
    logger.info(f"false negatives: {fn}")
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    logger.info(f"recall: {recall}")
    logger.info(f"precision: {precision}")
    min_prob = min(analysis_df[analysis_df.ground_truth == 1.0]['pred_prob'])
    logger.info(f"minimum probability threshold to achieve 100% recall: {min_prob}")
    analysis_df['adj_pred_match'] = analysis_df.pred_prob.apply(lambda x: x >= min_prob)
    avg_prob = mean(analysis_df[analysis_df.ground_truth == 1.0]['pred_prob'])
    logger.debug(analysis_df[analysis_df.adj_pred_match])
    signal_and_noise = analysis_df[analysis_df.pred_prob > -0.1]
    signal = signal_and_noise[signal_and_noise.ground_truth == 1.0]['pred_prob']
    noise = signal_and_noise[signal_and_noise.ground_truth != 1.0]['pred_prob']
    interval = 0.1
    bottom_ranges = np.arange(0, 1, interval)
    ground_truths, false_matches = [], []
    for bottom_range in bottom_ranges:
        bottom_range = round(bottom_range, 1)
        upper_range = round((bottom_range + interval), 1)
        if bottom_range == 0.0:
            # capture all the false matches scored at exactly 0
            bottom_range = -0.1
        ground_truths.append(
            len([value for value in signal
                 if value <= upper_range and value > bottom_range]))
        false_matches.append(
            len([value for value in noise
                 if value <= upper_range and value > bottom_range]))
    df = pd.DataFrame({
        'probability score': bottom_ranges,
        'true match': ground_truths,
        'false match': false_matches,
    })
    p1 = plt.bar(df['probability score'], df['true match'], width=0.07,
                 align='edge', color=(112/255, 94/255, 204/255, 1))
    p2 = plt.bar(df['probability score'], df['false match'], width=0.07,
                 align='edge', bottom=df['true match'],
                 color=(112/255, 94/255, 134/255, 1))
    t = plt.axvline(x=prob_thresh, color=(70/255, 70/255, 80/255, 1), linestyle='--')
    plt.ylabel('# of matches')
    plt.xlabel('predicted probability of match')
    ax = plt.axes()
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
    # ax.set_yscale('log', nonposy='clip')  # too glitchy to use
    plt.xticks([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])
    plt.title('Precision Spread on Validation Data\n')
    plt.legend((p1[0], p2[0]), ('true match', 'false match'))
    # ax = plt.axes()
    # for spine in ax.spines:
    #     ax.spines[spine].set_visible(False)
    legend = plt.legend((p1[0], p2[0], t),
                        ('true match', 'false match', 'decision threshold'),
                        frameon=1)
    frame = legend.get_frame()
    frame.set_alpha(0)
    if not test:
        # will also display inside jupyter notebook regardless (if %matplotlib inline)
        plt.savefig('static/precision_spread.png', transparent=True, dpi=300)
    if recall < 1.0:
        adj_tp = len(analysis_df[(analysis_df.adj_pred_match == 1) & (analysis_df.ground_truth == 1)])
        adj_fp = len(analysis_df[(analysis_df.adj_pred_match == 1) & (analysis_df.ground_truth == 0)])
        adj_tn = len(analysis_df[(analysis_df.adj_pred_match == 0) & (analysis_df.ground_truth == 0)])
        adj_fn = len(analysis_df[(analysis_df.adj_pred_match == 0) & (analysis_df.ground_truth == 1)])
        logger.info(f"adjusted true positives: {adj_tp}")
        logger.info(f"adjusted false positives: {adj_fp}")
        logger.info(f"adjusted true negatives: {adj_tn}")
        logger.info(f"adjusted false negatives: {adj_fn}")
        adj_recall = adj_tp / (adj_tp + adj_fn)
        adj_precision = adj_tp / (adj_tp + adj_fp)
        logger.info(f"adjusted recall: {adj_recall}")
        logger.info(f"adjusted precision: {adj_precision}")
        logger.info(f"Would have had {adj_fp} false positives ({adj_precision}% precision) if threshold was adjusted down to achieve 100%")
    try:
        sq_results = match(
            version="status_quo",
            company_projects=validate_company_projects,
            df_web=validate_web_df,
            test=True,
            prob_thresh=prob_thresh,
        )
    except FileNotFoundError:
        logger.info(
            "could not find any status quo models to use for baseline validation."
        )
        if not test:
            logger.info("adopting new model by default and skipping rest of validation")
            for filename in ["rf_model.pkl", "rf_features.pkl"]:
                os.rename("new_" + filename, filename)
            return  # exit function because there is no baseline to validate against
        else:
            logger.info(
                "will keep testing validation using new model as baseline. "
                "Just for testing purposes."
            )
            sq_results = match(
                version="new",
                company_projects=validate_company_projects,
                df_web=validate_web_df,
                test=True,
                prob_thresh=prob_thresh,
            )
    sq_analysis_df = pd.merge(
        sq_results[['job_number', 'cert_id', 'pred_prob', 'pred_match', 'total_score']],
        validate_company_projects[['job_number', 'cert_id', 'ground_truth']],
        how='left',
        on=['job_number', 'cert_id'],
    )
    sq_analysis_df['ground_truth'] = sq_analysis_df.ground_truth.apply(
        lambda x: 1 if x == 1.0 else 0)
    sq_tp = len(sq_analysis_df[(sq_analysis_df.pred_match == 1) & (sq_analysis_df.ground_truth == 1)])
    sq_fp = len(sq_analysis_df[(sq_analysis_df.pred_match == 1) & (sq_analysis_df.ground_truth == 0)])
    sq_tn = len(sq_analysis_df[(sq_analysis_df.pred_match == 0) & (sq_analysis_df.ground_truth == 0)])
    sq_fn = len(sq_analysis_df[(sq_analysis_df.pred_match == 0) & (sq_analysis_df.ground_truth == 1)])
    if sq_fn:
        logger.warning(f"match for project #{list(sq_analysis_df[(sq_analysis_df.pred_match == 0) & (sq_analysis_df.ground_truth == 1)]['job_number'])} was not detected.")
    logger.info(f"true positives: {sq_tp}")
    logger.info(f"false positives: {sq_fp}")
    logger.info(f"true negatives: {sq_tn}")
    logger.info(f"false negatives: {sq_fn}")
    sq_recall = sq_tp / (sq_tp + sq_fn)
    sq_precision = sq_tp / (sq_tp + sq_fp)
    logger.info(f"recall: {sq_recall}")
    logger.info(f"precision: {sq_precision}")
    sq_min_prob = min(sq_analysis_df[sq_analysis_df.ground_truth == 1.0]['pred_prob'])
    logger.info(f"minimum probability threshold to achieve 100% recall: {sq_min_prob}")
    sq_analysis_df['adj_pred_match'] = sq_analysis_df.pred_prob.apply(lambda x: x >= sq_min_prob)
    sq_avg_prob = mean(sq_analysis_df[sq_analysis_df.ground_truth == 1.0]['pred_prob'])
    logger.debug(sq_analysis_df[sq_analysis_df.adj_pred_match])
    update_results({
        "probability threshold": prob_thresh,
        "SMOTE": load_config()["machine_learning"]["use_smote"],
        "100% recall acheived": True if int(recall) == 1 else False,
        'minimum probability required for status quo model': sq_min_prob,
        'minimum probability required for new model': min_prob,
        'average probability required for status quo model': sq_avg_prob,
        'average probability required for new model': avg_prob,
        'false positives with status quo': sq_fp,
        'false positives with new': fp,
        'precision': precision,
    })
    if recall < 1.0:
        logger.warning(
            "100% recall not achieved with new model - archiving it "
            "and maintaining status quo!"
        )
        if test:
            logger.info("skipping files transfers because running in test mode")
        else:
            for artifact in ["model", "features"]:
                os.rename(
                    f"new_rf_{artifact}.pkl",
                    f"model_archive/rf_new_{artifact}-{datetime.datetime.now().date()}.pkl",
                )
    else:
        logger.info("100% recall achieved! Adopting new model and archiving old one.")
        if test:
            logger.info("skipping files transfers because running in test mode")
        else:
            for artifact in ["model", "features"]:
                os.rename(
                    f"rf_{artifact}.pkl",
                    f"model_archive/rf_{artifact}-{datetime.datetime.now().date()}.pkl",
                )
                os.rename(f"new_rf_{artifact}.pkl", f"rf_{artifact}.pkl")
    for metric, new, sq in zip(
        ("false positive(s)", "max threshold", "average prediction probability"),
        (fp, min_prob, avg_prob),
        (sq_fp, sq_min_prob, sq_avg_prob),
    ):
        if metric == "false positive(s)":
            if new <= sq:
                good_outcome = True
            else:
                good_outcome = False
        elif new >= sq:
            good_outcome = True
        else:
            good_outcome = False
        if good_outcome:
            logger.info(
                f"New model produced {new} {metric}, "
                f"which is better or equal to status quo of {sq}."
            )
        else:
            logger.warning(
                f"Might want to investigate new model - new model produced "
                f"{new} {metric}, compared to status quo of {sq}"
            )

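A minimal invocation sketch for validate_model, assuming the configuration file and the freshly trained new_rf_model.pkl / new_rf_features.pkl artifacts referenced above are already in place; the 0.7 threshold is illustrative only.

# Run the validation in test mode so emails are muted and no model files are moved.
validate_model(prob_thresh=0.7, test=True)

# In production the threshold comes from the config, as in the default argument above:
# validate_model(test=False)
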
def do(row):
    a, b = row
    if a[1] and b[1]:
        if match(a[1], b[1]) > 0:
            return a[0], b[0]

def test_truth_table(self):
    build_train_set()
    train_model(prob_thresh=prob_thresh)
    match_query = """
        SELECT
            company_projects.*,
            web_certificates.url_key
        FROM web_certificates
        LEFT JOIN attempted_matches
            ON web_certificates.cert_id = attempted_matches.cert_id
        LEFT JOIN company_projects
            ON attempted_matches.project_id = company_projects.project_id
        LEFT JOIN base_urls
            ON base_urls.source = web_certificates.source
        WHERE
            company_projects.closed=1
            AND attempted_matches.ground_truth=1
            AND attempted_matches.multi_phase=0
            AND attempted_matches.validate=0
    """
    corr_web_certs_query = """
        SELECT web_certificates.*
        FROM web_certificates
        LEFT JOIN attempted_matches
            ON web_certificates.cert_id = attempted_matches.cert_id
        LEFT JOIN company_projects
            ON attempted_matches.project_id = company_projects.project_id
        LEFT JOIN base_urls
            ON base_urls.source = web_certificates.source
        WHERE
            company_projects.closed=1
            AND attempted_matches.ground_truth=1
            AND attempted_matches.multi_phase=0
            AND attempted_matches.validate=0
    """
    with create_connection() as conn:
        test_company_projects = pd.read_sql(match_query, conn)
        test_web_df = pd.read_sql(corr_web_certs_query, conn)
    test_web_df = wrangle(test_web_df)
    results = match(
        company_projects=test_company_projects,
        df_web=test_web_df,
        test=True,
        prob_thresh=prob_thresh,
        version="new",
    )
    # confirm 100% recall with below assert
    qty_actual_matches = int(len(results)**0.5)
    qty_found_matches = results[results.pred_match == 1].title.nunique()
    self.assertTrue(
        qty_found_matches == qty_actual_matches,
        msg=f"qty_found_matches({qty_found_matches}) not equal qty_actual_matches({qty_actual_matches})",
    )
    # make sure not more than 25% false positives with below assert
    false_positives = len(results[results.pred_match == 1]) - qty_found_matches
    self.assertTrue(
        false_positives <= round(qty_actual_matches * 0.25, 1),
        msg=f"found too many false positives ({false_positives}) out of total test projects ({qty_actual_matches})",
    )
    # test single sample
    sample_company = pd.DataFrame(
        {
            "cert_id": "99999",
            "project_id": "99999",
            "job_number": "2387",
            "city": "Ottawa",
            "address": "2562 Del Zotto Ave., Ottawa, Ontario",
            "title": "DWS Building Expansion",
            "owner": "Douglas Stalker",
            "contractor": "GNC",
            "engineer": "Goodkey",
            "address_lat": 45.312234,
            "address_lng": -75.623789,
            "receiver_emails_dump": "{'alex': '*****@*****.**'}",
            "closed": "0",
        },
        index=range(1),
    )
    sample_web = pd.DataFrame(
        {
            "cert_id": "99998",
            "pub_date": "2019-03-06",
            "city": "Ottawa-Carleton",
            "address": "2562 Del Zotto Avenue, Gloucester, Ontario",
            "title": "Construct a 1 storey storage addition to a 2 storey office/industrial building",
            "owner": "Doug Stalker, DWS Roofing",
            "contractor": "GNC Constructors Inc.",
            "engineer": None,
            "address_lat": 45.312234,
            "address_lng": -75.623789,
            "url_key": "B0046A36-3F1C-11E9-9A87-005056AA6F02",
            "source": "dcn",
        },
        index=range(1),
    )
    is_match, prob = match(
        company_projects=sample_company,
        df_web=sample_web,
        test=True,
        version="new").iloc[0][["pred_match", "pred_prob"]]
    self.assertTrue(
        is_match,
        msg=f"Project #{sample_company.job_number} did not match successfully. Match probability returned was {prob}.",
    )
    # test same sample but using db retrieval
    results = match(
        company_projects=sample_company,
        since="2019-03-05",
        until="2019-03-07",
        test=True,
        version="new",
    )
    prob_from_db_cert = (
        results[results.contractor == "gnc"].iloc[0].pred_prob
    )  # 'gnc' is what is returned from the wrangling funcs
    self.assertTrue(round(prob, 2) == round(prob_from_db_cert, 2))
    # make sure validation runs
    validate_model(prob_thresh=prob_thresh, test=True)

def match_results(content_regex, object_list):
    matched_results = []
    for list_member in object_list:
        matched_results.append(matcher.match(content_regex, list_member))
    return matched_results

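A hedged usage sketch for match_results, mirroring the paragraph-content tests above; the content items below are hypothetical, and matcher.match is assumed to return a truthy result object when the regex matches.

contents = ['some phrase text', 'another phrase']        # hypothetical content items
results = match_results(in_para_phrase.content_regex, contents)
assert len(results) == len(contents)                      # one result per input item
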
def get_opbot_matches(dataset, model_pairs=None):
    # define model pairs: model1--model2
    # get each model's activities
    # for each model1--model2 pair, add activity pairs: model1_task1--model2_task1, etc.
    models = dataset  # the models are assumed to be loaded already
    if model_pairs is None:
        model_pairs = get_model_pairs(models)
    model_activity = {}  # model_id: {node_id: node}
    for model_name in models.keys():
        model_activity[model_name] = cf.get_hashable_nodes(
            models[model_name], active_components)
    final_matches = dict()
    for pair in model_pairs:
        final_matches[pair] = []
    # make a copy of the node keys; later delete keys of nodes that have been filtered out
    nodes_keys = {}
    for pair in model_pairs:
        nodes_keys[pair] = {}
        nodes_keys[pair][pair.model1] = model_activity[pair.model1].keys()  # .copy()
        nodes_keys[pair][pair.model2] = model_activity[pair.model2].keys()  # .copy()
    # create a list of all tokenised labels from all models,
    # used to create the co-occurrence dictionary
    all_labels = []
    for model_id in models.keys():
        for node_id in model_activity[model_id].keys():
            all_labels.append(
                cf.get_tokens_without_stop_words(
                    model_activity[model_id][node_id]['node_name'].lower()))
    all_words_set = set()  # used later to create the co-occurrence dict
    for m_pair in model_pairs:
        model1 = m_pair.model1
        model2 = m_pair.model2
        for node1 in model_activity[model1].keys():
            tokens = cf.get_tokens_without_stop_words(
                model_activity[model1][node1]['node_name'])
            for node2 in model_activity[model2].keys():
                for a_word in tokens + cf.get_tokens_without_stop_words(
                        model_activity[model2][node2]['node_name']):
                    all_words_set.add(a_word.lower())
                # Filtering step
                if model_activity[model1][node1]['node_name'] == \
                        model_activity[model2][node2]['node_name']:
                    # nodes with identical labels are matched
                    final_matches[m_pair].append(
                        matcher.match(model_activity[model1][node1],
                                      model_activity[model2][node2], 1.0))
                    if node1 in nodes_keys[m_pair][model1]:
                        nodes_keys[m_pair][model1].remove(node1)
                    if node2 in nodes_keys[m_pair][model2]:
                        nodes_keys[m_pair][model2].remove(node2)
    # normalise labels for the remaining nodes
    # create variable: {model: {node_id: normalised_label}}
    labels = {}
    for model_name in models:
        labels[model_name] = {}
        for node in model_activity[model_name].keys():
            labels[model_name][node] = cf.stemming(
                cf.get_tokens_without_stop_words(
                    model_activity[model_name][node]['node_name'].lower()))
    # Extract activity pairs
    activity_pairs_all = []
    for pair in model_pairs:
        nodes1_keys = nodes_keys[pair][pair.model1]
        nodes2_keys = nodes_keys[pair][pair.model2]
        for node1 in nodes1_keys:
            for node2 in nodes2_keys:
                graph1 = cf.get_graph_with_id_nodes(pair.bpmn1)
                graph2 = cf.get_graph_with_id_nodes(pair.bpmn2)
                n1 = model_activity[pair.model1][node1]
                n2 = model_activity[pair.model2][node2]
                tokens1 = cf.get_tokens_without_stop_words(
                    model_activity[pair.model1][node1]['node_name'].lower())
                tokens2 = cf.get_tokens_without_stop_words(
                    model_activity[pair.model2][node2]['node_name'].lower())
                st1 = cf.get_current_to_start_node(node=n1, graph=graph1, diagram=pair.bpmn1)
                st2 = cf.get_current_to_start_node(node=n2, graph=graph2, diagram=pair.bpmn2)
                new_activity_pair = activity_pair(
                    pair=pair,
                    node1=node1, label1=labels[pair.model1][node1], st1=st1, tokens1=tokens1,
                    node2=node2, label2=labels[pair.model2][node2], st2=st2, tokens2=tokens2)
                activity_pairs_all.append(new_activity_pair)
                pair.activities_pairs.append(new_activity_pair)
    coocurance = cco.get_coocccurance_dict(all_words_set, all_labels)
    cco_matcher = cco.cco_occurance_similarity_calculator(coocurance, 2)
    # calculate similarities
    bots_results = {}
    bots_results['B1'] = get_BOT_results(activity_pairs_all, models, 'LIN', False)
    bots_results['B2'] = get_BOT_results(activity_pairs_all, models, 'LIN', True)
    bots_results['B3'] = get_BOT_results(activity_pairs_all, models, 'LEV', False)
    bots_results['B4'] = get_BOT_results(activity_pairs_all, models, 'LEV', True)
    bots_results['B5'] = get_BOT_results(activity_pairs_all, models, 'CCO', False, cco_matcher)
    bots_results['B6'] = get_BOT_results(activity_pairs_all, models, 'CCO', True, cco_matcher)
    # determine thresholds
    bot_thresholds = {}
    bot_thresholds['B1'] = determine_thresholds(bot_results=bots_results['B1'],
                                                models=model_pairs, pre_t_min=0.6, pre_t_max=1)
    bot_thresholds['B2'] = determine_thresholds(bot_results=bots_results['B2'],
                                                models=model_pairs, pre_t_min=0.6, pre_t_max=1)
    bot_thresholds['B3'] = determine_thresholds(bot_results=bots_results['B3'],
                                                models=model_pairs, pre_t_min=0.6, pre_t_max=1)
    bot_thresholds['B4'] = determine_thresholds(bot_results=bots_results['B4'],
                                                models=model_pairs, pre_t_min=0.6, pre_t_max=1)
    bot_thresholds['B5'] = determine_thresholds(bot_results=bots_results['B5'],
                                                models=model_pairs, pre_t_min=0.7, pre_t_max=1)
    bot_thresholds['B6'] = determine_thresholds(bot_results=bots_results['B6'],
                                                models=model_pairs, pre_t_min=0.7, pre_t_max=1)
    # rank BOT results
    best, second_best = get_two_best_thresholds(bot_thresholds, bots_results)
    # determine alignments
    f_matches = get_matches_by_combine(activity_pairs_all, bots_results,
                                       bot_thresholds, best, second_best, model_pairs)
    # combine and return {bpmn_models_pair: list_of_matches}
    to_return = {}
    for k in final_matches.keys():
        to_return[k] = final_matches[k] + f_matches[k]
    return to_return

def add(text):
    data = []
    textlist = text.split(" ")
    # task type, course, and task name
    for kata in bd.getList_Kata_Penting():
        for i in range(len(textlist)):
            index = matcher.match(textlist[i].lower(), kata.lower())
            if (index):
                data.append(textlist[i + 1])
                data.append(str(textlist[i]).lower())
                nama = textlist[i + 2]
                pada_tgl = ["pada", "tanggal"]
                for kata2 in pada_tgl:
                    for k in range(len(textlist[i + 2 + 1:])):
                        index2 = matcher.match(textlist[i + 2 + k].lower(), kata2.lower())
                        if (index2):
                            for j in range(k - 1):
                                nama += " " + textlist[i + 2 + j + 1]
                            break
                    if (index2):
                        break
                data.append(nama)
                break
        if (index):
            break
    # task date
    for kata in bd.getList_Kata_Tampil_Deadline():
        for i in range(len(textlist)):
            index = matcher.match(textlist[i].lower(), kata.lower())
            if (index):
                if (len(textlist[i + 1]) > 2):
                    data.insert(0, textlist[i + 1])
                else:
                    bulan_int = bulan.get(textlist[i + 2].lower())
                    tanggal = textlist[i + 1] + "/" + bulan_int + "/" + textlist[i + 3]
                    data.insert(0, tanggal)
                break
        if (index is not False):
            break
    if (len(data) == 4):
        (tgl, bln, th) = re.split("/", data[0])
        date = datetime.date(int(th), int(bln), int(tgl))
        N = len(bd.getList_Daftar_Tugas()) + 1
        if (bd.IsInputValid(date, data[1], data[2], data[3], False)):
            bd.upsert_Daftar_Tugas(N, date, data[1], data[2], data[3], False)
            output = "[ ===== Berhasil Ditambahkan =====]<br>"
            output += "(ID: " + str(N) + ") " + str(date) + " " + data[1] + " " + data[2] + " " + data[3] + "<br>"
            return output
        else:
            return "Data yang sama telah digunakan"
    else:
        return "-1"

def ValidasiInput(text):
    data = daftar_katakunci(text)
    if (matcher.match(str(text).lower(), "undur")
            or matcher.match(str(text).lower(), "ubah")):
        if ("pada" not in data):
            return """Gunakan kata "pada" sebelum tanggal"""
        else:
            return diundurTask(text)
    else:
        if len(data) <= 2 and "antara" not in data and "depan" not in data and "deadline" not in data:
            if (len(data) == 1):
                if ("tanggal" not in data and "pada" not in data):
                    return """Gunakan kata ["tangal", "pada"] sebelum tanggal"""
                else:
                    return """Gunakan kata ["tubes", "tucil", "kuis", "ujian", "pr"]"""
            else:
                return add(text)
        elif ("antara" in data):
            if ("tubes" in data):
                return antaraTanggal_Jenis(text, "tubes")
            elif ("tucil" in data):
                return antaraTanggal_Jenis(text, "tucil")
            elif ("kuis" in data):
                return antaraTanggal_Jenis(text, "kuis")
            elif ("ujian" in data):
                return antaraTanggal_Jenis(text, "ujian")
            elif ("pr" in data):
                return antaraTanggal_Jenis(text, "pr")
            else:
                return antaraTanggal(text)
        elif ("depan" in data):
            if ("hari" in data):
                if ("tubes" in data):
                    return haritask_Jenis(text, "tubes")
                elif ("tucil" in data):
                    return haritask_Jenis(text, "tucil")
                elif ("kuis" in data):
                    return haritask_Jenis(text, "kuis")
                elif ("ujian" in data):
                    return haritask_Jenis(text, "ujian")
                elif ("pr" in data):
                    return haritask_Jenis(text, "pr")
                else:
                    return haritask(text)
            elif ("minggu" in data):
                if ("tubes" in data):
                    return minggutask_Jenis(text, "tubes")
                elif ("tucil" in data):
                    return minggutask_Jenis(text, "tucil")
                elif ("kuis" in data):
                    return minggutask_Jenis(text, "kuis")
                elif ("ujian" in data):
                    return minggutask_Jenis(text, "ujian")
                elif ("pr" in data):
                    return minggutask_Jenis(text, "pr")
                else:
                    return minggutask(text)
            else:
                return "-1"
        elif ("deadline" in data):
            if ("hari" in data):
                if ("tubes" in data):
                    return hariIni_Jenis("tubes")
                elif ("tucil" in data):
                    return hariIni_Jenis("tucil")
                elif ("kuis" in data):
                    return hariIni_Jenis("kuis")
                elif ("ujian" in data):
                    return hariIni_Jenis("ujian")
                elif ("pr" in data):
                    return hariIni_Jenis("pr")
                else:
                    return hariIni()
            elif ("sejauh" in data):
                if ("tubes" in data):
                    return sejauhIni_Jenis("tubes")
                elif ("tucil" in data):
                    return sejauhIni_Jenis("tucil")
                elif ("kuis" in data):
                    return sejauhIni_Jenis("kuis")
                elif ("ujian" in data):
                    return sejauhIni_Jenis("ujian")
                elif ("pr" in data):
                    return sejauhIni_Jenis("pr")
                else:
                    return sejauhIni()
            else:
                return "-1"
        else:
            return "-1"