def __init__(self, atom_type, fingerprint_method, n, k, hash_len, confidence_method,
             suspect_file_list, source_file_list, search_method, search_n=1):
    self.suspicious_path_start = ExtrinsicUtility.CORPUS_SUSPECT_LOC
    self.corpus_path_start = ExtrinsicUtility.CORPUS_SRC_LOC
    source_dirs = os.listdir(self.corpus_path_start)
    self.mid = fingerprintstorage.get_mid(fingerprint_method, n, k, atom_type, hash_len)
    self.base_atom_type = atom_type
    self.fingerprint_method = fingerprint_method
    self.n = n
    self.k = k
    self.hash_len = hash_len
    self.confidence_method = confidence_method
    self.suspect_file_list = suspect_file_list
    self.source_file_list = source_file_list
    self.evaluator = fingerprint_extraction.FingerprintEvaluator(source_file_list, fingerprint_method, self.n, self.k)
    self.search_method = search_method
    self.search_n = search_n
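# A minimal construction sketch (every parameter value below is hypothetical,
# chosen only for illustration -- they are not the project's canonical settings;
# `suspects` and `sources` stand in for lists returned by
# ExtrinsicUtility().get_corpus_files()):
#
#   tester = ExtrinsicTester("paragraph", "kth_in_sent", n=5, k=3, hash_len=10000000,
#                            confidence_method="containment",
#                            suspect_file_list=suspects, source_file_list=sources,
#                            search_method="normal", search_n=1)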
def test(method, n, k, atom_type, hash_size, confidence_method, num_files="all",
         search_method='normal', search_n=5, save_to_db=True, ignore_high_obfuscation=False,
         show_false_negpos_info=False, get_best_of=False):
    session = Session()

    # Get the list of suspect files to test on
    source_file_list, suspect_file_list = ExtrinsicUtility().get_corpus_files(n=num_files, include_txt_extension=False)

    # Confirm that these suspects and enough source documents have been populated
    num_suspect_documents = len(suspect_file_list)
    num_source_documents = len(source_file_list)
    mid = fingerprintstorage.get_mid(method, n, k, atom_type, hash_size)
    num_populated_suspects = fingerprintstorage.get_number_suspects(mid)
    num_populated_sources = fingerprintstorage.get_number_sources(mid)
    if num_populated_suspects < num_suspect_documents or num_populated_sources < num_source_documents:
        raise ValueError("Not all of the documents used in this test have been populated "
                         "(only %d sources, %d suspects have been populated). "
                         "Populate them first with fingerprintstorage."
                         % (num_populated_sources, num_populated_suspects))

    # If the search method is two-level, fingerprints at both the "full" and
    # "paragraph" atom types must also be in the database
    if search_method in ("two_level_ff", "two_level_pf"):
        full_mid = fingerprintstorage.get_mid(method, n, k, "full", hash_size)
        para_mid = fingerprintstorage.get_mid(method, n, k, "paragraph", hash_size)
        num_populated_full_suspects = fingerprintstorage.get_number_suspects(full_mid)
        num_populated_para_suspects = fingerprintstorage.get_number_suspects(para_mid)
        num_populated_full_sources = fingerprintstorage.get_number_sources(full_mid)
        num_populated_para_sources = fingerprintstorage.get_number_sources(para_mid)
        num_populated_sources = num_populated_full_sources
        num_populated_suspects = num_populated_full_suspects
        if (num_populated_full_suspects < num_suspect_documents
                or num_populated_para_suspects < num_suspect_documents
                or num_populated_full_sources < num_source_documents
                or num_populated_para_sources < num_source_documents
                or num_populated_para_sources < num_populated_full_sources
                or num_populated_para_suspects < num_populated_full_suspects):
            raise ValueError("Not all of the documents used in this test have been populated "
                             "(only %d sources, %d suspects have been populated). "
                             "Populate them first with fingerprintstorage."
                             % (num_populated_sources, num_populated_suspects))

    print "Testing", len(suspect_file_list), "suspect files against", num_populated_sources, "source documents."
    tester = ExtrinsicTester(atom_type, method, n, k, hash_size, confidence_method,
                             suspect_file_list, source_file_list, search_method, search_n)
    roc_auc, source_accuracy, true_source_accuracy, roc_path, prf_path, \
        thresholds, precisions, recalls, fmeasures = \
        tester.evaluate(session, ignore_high_obfuscation, show_false_negpos_info, get_best_of)

    # Save the result
    if save_to_db:
        with psycopg2.connect(user=username, password=password, database=dbname.split("/")[1],
                              host="localhost", port=5432) as conn:
            conn.autocommit = True
            with conn.cursor() as cur:
                for threshold, prec, recall, fmeasure in zip(thresholds, precisions, recalls, fmeasures):
                    # "simmilarity_method" is spelled as in the existing table schema
                    query = ("INSERT INTO extrinsic_results (method_name, n, k, atom_type, hash_size, "
                             "simmilarity_method, suspect_files, source_files, auc, true_source_accuracy, "
                             "source_accuracy, search_method, search_n, ignore_high_obfuscation, roc_path, "
                             "prf_fig_path, threshold, precision, recall, fmeasure) "
                             "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);")
                    args = (method, n, k, atom_type, hash_size, confidence_method, num_files,
                            num_populated_sources, roc_auc, true_source_accuracy, source_accuracy,
                            search_method, search_n, ignore_high_obfuscation, roc_path, prf_path,
                            threshold, prec, recall, fmeasure)
                    cur.execute(query, args)

    print 'ROC auc:', roc_auc
    print 'Source Accuracy:', source_accuracy
    print 'True Source Accuracy:', true_source_accuracy
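# Example invocation (a sketch only; the method name, n/k/hash sizes, and
# num_files below are assumptions chosen for illustration, and the matching
# fingerprints must already be populated via fingerprintstorage):
#
#   test("kth_in_sent", n=5, k=3, atom_type="paragraph", hash_size=10000000,
#        confidence_method="containment", num_files=10,
#        search_method="normal", search_n=5, save_to_db=False)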
def get_trials(self, session, fingerprint_m=None):
    '''
    For each suspect document, split the document into atoms and classify each
    atom as plagiarized or not-plagiarized. Build a list of classifications and
    a list of the ground truths for each atom of each document.
    '''
    classifications = []
    actuals = []
    classifications_dict = {}
    actuals_dict = {}
    if fingerprint_m is None:
        fingerprint_m = self.fingerprint_method
    outer_search_level_mid = fingerprintstorage.get_mid(fingerprint_m, self.n, self.k, "full", self.hash_len)

    for fi, f in enumerate(self.suspect_file_list, 1):
        print
        doc_name = f.replace(self.suspicious_path_start, "")
        if ".txt" in doc_name:
            doc_name = doc_name.replace(".txt", "")

        if self.search_method == 'two_level_ff':
            print '%d/%d Classifying %s (%s)' % (fi, len(self.suspect_file_list), doc_name, self.search_method)
            acts = ground_truth._query_ground_truth(doc_name, self.base_atom_type, session, self.suspicious_path_start).get_ground_truth(session)
            actuals += acts
            actuals_dict[f] = acts
            doc_classifications = []

            # First, get a list of the most similar full documents to this document
            full_atom_classifications = self.evaluator.classify_passage(
                doc_name, "full", 0, fingerprint_m, self.n, self.k, self.hash_len,
                "containment", outer_search_level_mid)
            top_docs = full_atom_classifications[:self.search_n]
            dids = [x[0][2] for x in top_docs]

            # Now compare all paragraphs in the most similar documents to each paragraph
            for atom_index in xrange(len(acts)):
                atom_classifications = self.evaluator.classify_passage(
                    doc_name, "paragraph", atom_index, fingerprint_m, self.n, self.k,
                    self.hash_len, self.confidence_method, self.mid, dids=dids)
                # top_source is a tuple of the form
                # ((source_doc_name, source_atom_index, did, suspect_filename, suspect_atom_index), confidence)
                top_source = atom_classifications[0]
                # Unpack into suspect_atom_index rather than atom_index so the
                # loop variable is not shadowed
                source_filename, source_atom_index, did, suspect_filename, suspect_atom_index = top_source[0]
                confidence = top_source[1]
                classifications.append(top_source)
                doc_classifications.append(top_source)
                print 'atom index:', str(atom_index + 1) + '/' + str(len(acts))
                print 'confidence (actual, guess):', acts[atom_index], (confidence, source_filename, source_atom_index)
            classifications_dict[f] = doc_classifications

        elif self.search_method == 'two_level_pf':
            print '%d/%d Classifying %s (%s)' % (fi, len(self.suspect_file_list), doc_name, self.search_method)
            acts = ground_truth._query_ground_truth(doc_name, self.base_atom_type, session, self.suspicious_path_start).get_ground_truth(session)
            actuals += acts
            actuals_dict[f] = acts
            doc_classifications = []

            for atom_index in xrange(len(acts)):
                # First, find the most similar full documents to this paragraph
                full_atom_classifications = self.evaluator.classify_passage(
                    doc_name, "full", atom_index, fingerprint_m, self.n, self.k,
                    self.hash_len, "containment",
                    fingerprintstorage.get_mid(fingerprint_m, self.n, self.k, "full", self.hash_len),
                    passage_atom_type="paragraph",
                    passage_mid=fingerprintstorage.get_mid(fingerprint_m, self.n, self.k, "paragraph", self.hash_len))
                top_docs = full_atom_classifications[:self.search_n]
                dids = [x[0][2] for x in top_docs]

                if top_docs[0][1] == 0:
                    # Don't compare at the paragraph level if no full document
                    # had any similarity to the paragraph
                    top_source = top_docs[0]
                else:
                    # Now compare this paragraph to all paragraphs in <top_docs>
                    atom_classifications = self.evaluator.classify_passage(
                        doc_name, "paragraph", atom_index, fingerprint_m, self.n, self.k,
                        self.hash_len, self.confidence_method, self.mid, dids=dids)
                    top_source = atom_classifications[0]

                # top_source is a tuple of the form
                # ((source_doc_name, source_atom_index, did, suspect_filename, suspect_atom_index), confidence)
                source_filename, source_atom_index, did, suspect_filename, suspect_atom_index = top_source[0]
                confidence = top_source[1]
                classifications.append(top_source)
                doc_classifications.append(top_source)
                print 'atom index:', str(atom_index + 1) + '/' + str(len(acts))
                print 'confidence (actual, guess):', acts[atom_index], (confidence, source_filename, source_atom_index)
            classifications_dict[f] = doc_classifications

        else:
            acts = ground_truth._query_ground_truth(f, self.base_atom_type, session, self.suspicious_path_start).get_ground_truth(session)
            actuals += acts
            actuals_dict[f] = acts
            doc_classifications = []
            print '%d/%d Classifying %s' % (fi, len(self.suspect_file_list), doc_name)

            for atom_index in xrange(len(acts)):
                atom_classifications = self.evaluator.classify_passage(
                    doc_name, self.base_atom_type, atom_index, fingerprint_m, self.n,
                    self.k, self.hash_len, self.confidence_method, self.mid)
                # top_source is a tuple of the form
                # ((source_doc_name, source_atom_index, did, suspect_filename, suspect_atom_index), confidence)
                top_source = atom_classifications[0]
                source_filename, source_atom_index, did, suspect_filename, suspect_atom_index = top_source[0]
                confidence = top_source[1]
                classifications.append(top_source)
                doc_classifications.append(top_source)
                print 'atom index:', str(atom_index + 1) + '/' + str(len(acts))
                print 'confidence (actual, guess):', acts[atom_index][0], (confidence, source_filename, source_atom_index)
            classifications_dict[f] = doc_classifications

    return classifications, actuals, classifications_dict, actuals_dict
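# Shapes of the values returned by get_trials (inferred from the unpacking
# above; the exact ground-truth label format depends on what
# ground_truth.get_ground_truth returns):
#
#   classifications      -- [((source_doc, source_atom_idx, did, suspect_file, suspect_atom_idx), confidence), ...]
#   actuals              -- flat list of per-atom ground-truth labels, parallel to classifications
#   classifications_dict -- {suspect_path: [top_source, ...]}, one list per suspect file
#   actuals_dict         -- {suspect_path: [ground_truth, ...]}, one list per suspect file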