import os
import random
import sys
import xml.etree.ElementTree as ET

# IntrinsicUtility and tokenize are project-local helpers (PAN-corpus
# utilities and the paragraph tokenizer); their import paths depend on this
# repo's layout, so they are assumed to be in scope here.


def explore_training_corpus(n=1000):
    '''Gather length, plagiarism-percentage, and paragraph-count statistics
    for the first <n> training files, write them to training_lengths.csv,
    and return a list of (length, pct_plag) pairs.'''
    util = IntrinsicUtility()
    training_texts = util.get_n_training_files(n)
    # Each .txt document has a companion .xml file of plagiarism annotations.
    training_xmls = [s.replace('.txt', '.xml') for s in training_texts]

    file_lengths = []
    pct_plags = []
    total_paragraphs = []

    for text_file, xml_file in zip(training_texts, training_xmls):
        with open(text_file) as f:
            text = f.read()
        paragraph_spans = tokenize(text, 'paragraph')
        num_paragraphs = len(paragraph_spans)
        text_len = len(text)

        plag_spans = util.get_plagiarized_spans(xml_file)
        plag_len = sum(end - start for start, end in plag_spans)
        plag_pct = float(plag_len) / text_len

        file_lengths.append(text_len)
        pct_plags.append(plag_pct)
        total_paragraphs.append(num_paragraphs)

    # outfile = os.path.join(os.path.dirname(__file__), 'training_lengths.csv')
    outfile = 'training_lengths.csv'
    with open(outfile, 'w') as f:
        f.write('file_num, length, pct_plag, num_paragraphs\n')
        for i in xrange(len(file_lengths)):
            f.write('%i, %i, %f, %i\n' %
                    (i, file_lengths[i], pct_plags[i], total_paragraphs[i]))

    return zip(file_lengths, pct_plags)
def explore_training_corpus(n=1000): """ """ util = IntrinsicUtility() training_texts = util.get_n_training_files(n) training_xmls = [s.replace("txt", "xml") for s in training_texts] file_lengths = [] pct_plags = [] total_paragraphs = [] for text_file, xml_file in zip(training_texts, training_xmls): with file(text_file) as f: text = f.read() paragraphs_spans = tokenize(text, "paragraph") num_paragraphs = len(paragraphs_spans) text_len = len(text) plag_spans = util.get_plagiarized_spans(xml_file) plag_len = sum([end - start for start, end in plag_spans]) plag_pct = float(plag_len) / text_len file_lengths.append(text_len) pct_plags.append(plag_pct) total_paragraphs.append(num_paragraphs) # outfile = os.path.join(os.path.dirname(__file__), 'training_lengths.csv') outfile = "training_lengths.csv" f = file(outfile, "wb") f.write("file_num, length, pct_plag, num_paragraphs\n") for i in xrange(len(file_lengths)): line = "%i, %i, %f, %i\n" % (i, file_lengths[i], pct_plags[i], total_paragraphs[i]) f.write(line) f.close() return zip(file_lengths, pct_plags)
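# The helper below is a small sketch, not part of the original module: it
# shows how the CSV written by explore_training_corpus can be read back and
# summarized. It assumes training_lengths.csv exists in the working
# directory with the header written above.
def summarize_training_lengths(csv_path='training_lengths.csv'):
    '''Print mean document length and mean plagiarism percentage from the
    CSV produced by explore_training_corpus.'''
    lengths = []
    pct_plags = []
    with open(csv_path) as f:
        f.readline()  # skip the 'file_num, length, pct_plag, ...' header
        for line in f:
            fields = [field.strip() for field in line.split(',')]
            lengths.append(int(fields[1]))
            pct_plags.append(float(fields[2]))
    if lengths:
        print 'mean length:', sum(lengths) / float(len(lengths))
        print 'mean pct plagiarized:', sum(pct_plags) / len(pct_plags)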
def main(m, training_percent=0.7):
    # NOTE: training_percent is currently unused.
    random.seed(1337)

    suspects_base_path = "/copyCats/pan-plagiarism-corpus-2009/external-detection-corpus/suspicious-documents/"
    suspects_dirs = ["part1/", "part2/", "part3/", "part4/", "part5/", "part6/", "part7/", "part8/"]
    sources_base_path = "/copyCats/pan-plagiarism-corpus-2009/external-detection-corpus/source-documents/"
    sources_dirs = ["part1/", "part2/", "part3/", "part4/", "part5/", "part6/", "part7/", "part8/"]

    all_base_files = []  # file names without extensions
    # (text_path, xml_path) tuples: the absolute path of each suspect
    # document and the absolute path of its annotation xml file
    all_files = []

    # Put all the suspect files in a list
    for d in suspects_dirs:
        p = os.path.join(suspects_base_path, d)
        for f in os.listdir(p):
            all_base_files.append(os.path.splitext(f)[0])
            if f.endswith('.txt'):
                all_files.append((p + f, (p + f)[:-4] + '.xml'))

    # Make sure all of these files actually exist
    worked = True
    for suspect in all_files:
        if not os.path.exists(suspect[0]):
            worked = False
            print '.txt file does not exist:', suspect[0]
        if not os.path.exists(suspect[1]):
            worked = False
            print '.xml file does not exist:', suspect[1]
    assert worked

    # Shuffle and take files from the front of the list
    print 'Shuffling', len(all_files), 'suspect files...'
    random.shuffle(all_files)

    print 'Grabbing all valid suspects...'
    # Grab the files that contain plagiarism and are at most <m> paragraphs long
    training_suspect_partition = []
    for filepaths in all_files:
        plag_spans = IntrinsicUtility.get_plagiarized_spans(filepaths[1])
        if len(plag_spans) > 0:
            with open(filepaths[0]) as f:
                text = f.read()
            paragraphs = tokenize(text, 'paragraph')
            # Skip documents longer than <m> paragraphs
            if len(paragraphs) > m:
                continue
            training_suspect_partition.append(filepaths)
            if len(training_suspect_partition) % 10 == 0:
                print len(training_suspect_partition)
    print len(training_suspect_partition)

    # print 'Writing partitions to disk...'
    # suspect_training_file = open("crisp_extrinsic_training_suspect_files.txt", 'w')
    # for suspect in training_suspect_partition:
    #     rel_path_start = suspect[0].index('/part')
    #     suspect_training_file.write(suspect[0][rel_path_start:-4] + '\n')
    # suspect_training_file.close()

    print 'Determining source documents for training partition...'
    training_sources = set()  # relative, extensionless source names
    # Maps a suspect's xml path to the absolute paths of its source documents
    training_sources_suspects = {}
    num_files = 0
    for filenames in training_suspect_partition:
        tree = ET.parse(filenames[1])
        for feature in tree.iter('feature'):
            src_ref = feature.get('source_reference')
            if feature.get('name') != 'artificial-plagiarism' or not src_ref:
                continue
            # Figure out which partX directory the source document lives in
            # (if no partX matches, the reference is skipped)
            for p in sources_dirs:
                if os.path.exists(sources_base_path + p + src_ref):
                    short_name = '/' + p + os.path.splitext(src_ref)[0]
                    long_name = sources_base_path + p + src_ref
                    training_sources.add(short_name)
                    training_sources_suspects.setdefault(filenames[1], []).append(long_name)
                    break
        num_files += 1
        if num_files % 100 == 0:
            print num_files,
            sys.stdout.flush()
    print
    print len(training_sources), 'sources for the training partition were found...'

    print 'Removing suspects whose sources are too long...'
    # Keep only the suspects whose sources are all at most <m> paragraphs
    final_training_suspect_partition = []
    for _, xml in training_suspect_partition:
        short_enough = True
        for source_filename in training_sources_suspects.get(xml, []):
            with open(source_filename) as f:
                text = f.read()
            paragraphs = tokenize(text, 'paragraph')
            if len(paragraphs) > m:
                short_enough = False
                break
        if short_enough:
            final_training_suspect_partition.append(xml)

    print 'Constructing final source partition...'
    final_training_source_partition = []
    for suspect in final_training_suspect_partition:
        for long_name in training_sources_suspects.get(suspect, []):
            # Relative, extensionless name, e.g. '/part1/source-document00123'
            short_name = '/' + os.path.splitext(long_name.replace(sources_base_path, ''))[0]
            if short_name not in final_training_source_partition:
                final_training_source_partition.append(short_name)

    print 'Converting suspect names...'
    # Strip the base path and extension, e.g. '/part1/suspicious-document00042'
    final_training_suspect_partition = [
        '/' + os.path.splitext(xml.replace(suspects_base_path, ''))[0]
        for xml in final_training_suspect_partition]
    print len(final_training_suspect_partition), final_training_suspect_partition
    print len(final_training_source_partition), final_training_source_partition

    print 'Writing suspect documents to disk...'
    suspects_training_file = open('crisp_corpus_suspect_files.txt', 'w')
    for filename in final_training_suspect_partition:
        suspects_training_file.write(filename + '\n')
    suspects_training_file.close()

    print 'Writing source documents to disk...'
    sources_training_file = open('crisp_corpus_source_files.txt', 'w')
    for filename in final_training_source_partition:
        sources_training_file.write(filename + '\n')
    sources_training_file.close()
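# Entry-point sketch: the module as given never shows how main is invoked,
# so the paragraph cap of 50 used here is an illustrative assumption rather
# than a value taken from the original code.
if __name__ == '__main__':
    main(50)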