def solution_similarity_stats(dataset='air/solutions_preprocessed'): """ Plots histogram of solution-solution similarity distribution of a dataset. """ print '> Reading data..', dataset corpus_path = '../data/'+dataset (documents, labels) = data.read_files(corpus_path) print '> Creating vector representations..' vectors = freq_representation.text_to_vector(documents, freq_representation.FrequencyMetrics.TF_IDF) print '> Calculating similarities..' distances = scipy.spatial.distance.cdist(vectors.T, vectors.T, 'cosine') diag = numpy.diag([2.0]*len(distances),0) # move similarities of "self" to -1 distances = distances + diag similarities = 1.0 - distances similarities = similarities.ravel() similarities = [s for s in similarities if s >= 0] print plotter.histogram(similarities,'similarity','# matches','',bins=150) print print max(similarities) print min(similarities) print float(sum(similarities))/len(similarities) num = len([sim for sim in similarities if sim < 0.23]) print 'fraction sims < .23:', float(num)/len(similarities)
def solution_similarity_stats(dataset='air/solutions_preprocessed'):
    """
    Plots histogram of solution-solution similarity distribution of a dataset.

    Also prints max/min/mean similarity and the fraction of document pairs
    whose similarity is below 0.23.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    (documents, labels) = data.read_files(corpus_path)
    print '> Creating vector representations..'
    vectors = freq_representation.text_to_vector(
        documents, freq_representation.FrequencyMetrics.TF_IDF)
    print '> Calculating similarities..'
    # Pairwise cosine distances between document vectors (one per column).
    distances = scipy.spatial.distance.cdist(vectors.T, vectors.T, 'cosine')
    diag = numpy.diag([2.0] * len(distances), 0)
    # move similarities of "self" to -1 so the >= 0 filter drops them
    distances = distances + diag
    similarities = 1.0 - distances
    similarities = similarities.ravel()
    # Keep only non-negative similarities (self-pairs are now -1).
    similarities = [s for s in similarities if s >= 0]
    print plotter.histogram(similarities, 'similarity', '# matches', '',
                            bins=150)
    print
    print max(similarities)
    print min(similarities)
    print float(sum(similarities)) / len(similarities)
    num = len([sim for sim in similarities if sim < 0.23])
    print 'fraction sims < .23:', float(num) / len(similarities)
def dataset_stats(dataset): """ Print and plot statistics for a given dataset. A histogram is plotted with the document length distribution of the data. """ print '> Reading data..', dataset corpus_path = '../data/'+dataset (documents, labels) = data.read_files(corpus_path) file_names = data.get_file_names(corpus_path) lengths = [] empty = 0 for i,d in enumerate(documents): d = preprocess.tokenize_tokens(d) lengths.append(len(d)) if len(d)==0: print file_names[i],'is empty' empty += 1 lengths = numpy.array(lengths) print '# documents:',len(documents) print '# empty documents:',empty print '# words:',sum(lengths) print 'length avg:',lengths.mean() print 'length stddev:',lengths.std() print print 'document lengths (sorted):',sorted(lengths) plotter.histogram(lengths,'# tokens','# documents','',bins=80)
def dataset_stats(dataset):
    """
    Print and plot statistics for a given dataset.

    A histogram is plotted with the document length distribution of the data.
    Empty documents are reported by file name and counted separately.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    (documents, labels) = data.read_files(corpus_path)
    file_names = data.get_file_names(corpus_path)
    lengths = []
    empty = 0
    for i, d in enumerate(documents):
        # Length is measured in tokens, not characters.
        d = preprocess.tokenize_tokens(d)
        lengths.append(len(d))
        if len(d) == 0:
            print file_names[i], 'is empty'
            empty += 1
    lengths = numpy.array(lengths)
    print '# documents:', len(documents)
    print '# empty documents:', empty
    print '# words:', sum(lengths)
    print 'length avg:', lengths.mean()
    print 'length stddev:', lengths.std()
    print
    print 'document lengths (sorted):', sorted(lengths)
    plotter.histogram(lengths, '# tokens', '# documents', '', bins=80)
def plot_lengths(dataset, plot_type, **plargs): import plotter import numpy as np lengths = pickle_from_file('output/'+plot_type+'-lengths/'+dataset) plotter.histogram(lengths, **plargs) print 'tot', len(lengths) print 'max', max(lengths) lengths = np.array(lengths) num = len([x for x in lengths if x > 1000]) print '# > 1000:', num num = len([x for x in lengths if x > 150]) print '# > 150:', num
def main():
    """
    Drive a 'front' particle simulation: configure a Runner, evolve it in
    10 ns steps, estimate the particle-count growth rate after each step,
    feed it back into the front, and plot phases/front/field as it runs.
    """
    runner = Runner()
    # Field and geometry configuration (E0, EB, BETA are module-level
    # constants; co provides unit factors).
    runner.E0 = E0
    runner.EB = EB
    runner.B0 = 20 * co.micro
    runner.L = 30
    runner.U0 = BETA * co.c
    runner.THETA = 0.0
    runner.list_clear()
    # Seed the particle list; argument semantics are defined by Runner
    # (not visible here) -- presumably (range start, range end, energy,
    # particle count). TODO confirm.
    runner.init_list(0, 10, 10000 * co.kilo * co.eV, 2500)
    runner.set_emfield_func('front')
    init_front(runner)
    # Counter hook records particle counts over time (counter.t, counter.n).
    counter = Counter(runner)
    runner.inner_hooks.append(counter)
    n = 10  # number of outer simulation steps
    runner.prepare_data(tfraction=0.0)
    plotter.phases(runner)
    plotter.front(runner)
    plotter.field(runner)
    runner.output_n = 1000
    runner.max_particles = 5000
    for i in range(n):
        runner(10 * co.nano)  # advance the simulation by 10 ns
        # Linear fit of log(n) vs t over the last 10 samples estimates the
        # exponential growth rate of the particle population.
        growth, b = simple_regression(counter.t[-10:], log(counter.n[-10:]))
        logging.info("growth rate = {:g} /ns".format(growth * co.nano))
        update_front(runner, growth)
        tfraction = float(i + 1) / n
        runner.prepare_data(tfraction)
        plotter.phases(runner)
        plotter.front(runner)
        plotter.field(runner)
    plotter.save_all()
    pylab.show()
def plot_sentence_lengths(datafile=None): """ Function for plotting histogram of sentence lengths within a given dataset. """ if datafile is None: import preprocess print '> reading data..' path = '../data/tasa/TASA900_text' texts, labels = data.read_files(path) sentence_lengths = [] print '> counting lengths..' for text in texts: sentences = preprocess.tokenize_sentences(text) for sentence in sentences: tokens = preprocess.tokenize_tokens(sentence) sentence_lengths.append(len(tokens)) data.pickle_to_file(sentence_lengths, 'output/tasa_sentence_lengths.pkl') else: sentence_lengths = data.pickle_from_file(datafile) plotter.histogram(sentence_lengths, 'sentence length (tokens)', '# sentences', bins=70)
def main():
    """
    Run a constant-field particle simulation: configure a Runner, save
    snapshots to the file named by the first command-line argument, and
    plot phases/histogram output after each 50 ns step.
    """
    runner = Runner()
    # Field and geometry configuration (B0, E0, EB, L are module-level
    # constants; co provides unit factors).
    runner.B0 = B0
    runner.E0 = E0
    runner.EB = EB
    runner.U0 = co.c
    runner.L = L
    runner.list_clear()
    runner.particle_weight(1e9)
    # Seed the particle list; argument semantics are defined by Runner
    # (not visible here). TODO confirm.
    runner.init_list(0, 10, 10000 * co.kilo * co.eV, 1000)
    runner.set_emfield_func('const')
    counter = Counter(runner)
    # Snapshot output path comes from the command line (argv[1]).
    runner.save_to(sys.argv[1])
    runner.inner_hooks.append(counter)
    runner.prepare_data(tfraction=0.0)
    plotter.phases(runner)
    runner.output_n = 4000
    runner.max_particles = 5000
    n = 250  # number of outer simulation steps
    for i in range(n):
        runner(50 * co.nano)  # advance the simulation by 50 ns
        tfraction = float(i + 1) / n
        runner.prepare_data(tfraction)
        runner.save()
        plotter.phases(runner)
        plotter.histogram(runner)
    plotter.save_all()
    pylab.show()
def main():
    """
    Constant-field simulation run: configure a Runner, save snapshots to
    the file given as argv[1], and plot after every 50 ns step.
    """
    sim = Runner()
    sim.B0 = B0
    sim.E0 = E0
    sim.EB = EB
    sim.U0 = co.c
    sim.L = L

    sim.list_clear()
    sim.particle_weight(1e9)
    sim.init_list(0, 10, 10000 * co.kilo * co.eV, 1000)
    sim.set_emfield_func('const')

    hook = Counter(sim)
    sim.save_to(sys.argv[1])
    sim.inner_hooks.append(hook)

    sim.prepare_data(tfraction=0.0)
    plotter.phases(sim)

    sim.output_n = 4000
    sim.max_particles = 5000

    n_steps = 250
    for step in range(n_steps):
        sim(50 * co.nano)
        sim.prepare_data(float(step + 1) / n_steps)
        sim.save()
        plotter.phases(sim)
        plotter.histogram(sim)

    plotter.save_all()
    pylab.show()
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
       genome_stats_dirpath):
    """
    Run the genome analyzer over all aligned assemblies.

    For each assembly computes genome fraction, duplication ratio, gap
    count and the number of fully/partially covered genes and operons;
    writes 'genome_info.txt', fills the global `reporting` tables and
    emits optional json/html/plot output.
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    # Collect reference chromosome lengths and the total genome size.
    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')

    # Load gene/operon annotations, if any were provided.
    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() + ' option '
                          'if you want to specify it.', indent=' ')
            continue
        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(
                fpath, container.kind)
        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.', indent=' ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info(' Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
            res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(
                container.kind, container.region_list, reference_chromosomes.keys())

    # Pre-register reference gene/operon counts in every assembly's report.
    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # Process all contig files in parallel, one job per assembly.
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    process_results = Parallel(n_jobs=n_jobs)(
        delayed(process_single_file)(
            contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
            reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    # A None result marks an assembly the analyzer could not process.
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    process_results = [res for res in process_results if res]
    if not process_results:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    # Each result is (ref_lengths, (results, genes_in_contigs, operons_in_contigs)).
    ref_lengths = [process_results[i][0] for i in range(len(process_results))]
    results_genes_operons_tuples = [
        process_results[i][1] for i in range(len(process_results))
    ]
    # NOTE(review): ref_lengths_by_contigs is never initialized in this
    # function -- unless it is a module-level dict, this raises NameError.
    # Verify against the enclosing module.
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [
            ref_lengths[i][ref] for i in range(len(ref_lengths))
        ]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        # Best coverage of this chromosome across all assemblies.
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) +
                       ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    # header of the per-assembly table
    res_file.write('\n\n')
    res_file.write(
        '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
        ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial',
         'operons', 'partial'))
    res_file.write(
        '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
        ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write(
        '================================================================================================================\n'
    )

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
            aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        # Duplication ratio: aligned assembly bases (total minus unaligned,
        # plus extra copies from overlaps/ambiguity) over covered reference bases.
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
                       % (assembly_name[:24], '%3.5f%%' % genome_fraction,
                          '%1.5f' % duplication_ratio, gaps_count))
        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                    (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')

    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None
    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath,
                                                aligned_contigs_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath,
                                                aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(
                len(genes_container.region_list), aligned_contigs_fpaths,
                files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(
                aligned_contigs_fpaths, full_found_genes,
                genome_stats_dirpath + '/complete_genes_histogram',
                '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(
                len(operons_container.region_list), aligned_contigs_fpaths,
                files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(
                aligned_contigs_fpaths, full_found_operons,
                genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
       genome_stats_dirpath):
    """
    Run the genome analyzer over all aligned assemblies.

    For each assembly computes genome fraction, duplication ratio, gap
    count and the number of fully/partially covered genes and operons;
    writes 'genome_info.txt', fills the global `reporting` tables and
    emits optional json/html/plot output.
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')

    logger.print_timestamp()
    logger.info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    # Collect reference chromosome lengths and the total genome size.
    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        res_file.write('\t' + chr_name + ' (' + str(chr_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    # Load gene/operon annotations, if any were provided.
    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() + ' option '
                          'if you want to specify it.', indent=' ')
            continue
        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)
        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.', indent=' ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info(' Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
            res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind,
                                                              container.region_list,
                                                              reference_chromosomes.keys())

    # Pre-register reference gene/operon counts in every assembly's report.
    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

    # header of the per-assembly table
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('assembly', 'genome', 'duplication', 'gaps', 'genes',
                      'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                   % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('================================================================================================================\n')

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # Process all contig files in parallel, one job per assembly.
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    results_genes_operons_tuples = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
        contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
        reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
            aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        # Duplication ratio: aligned assembly bases (total minus unaligned,
        # plus extra copies from overlaps/ambiguity) over covered reference bases.
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
                       % (assembly_name[:24], '%3.5f%%' % genome_fraction,
                          '%1.5f' % duplication_ratio, gaps_count))
        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                    (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')

    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None
    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath,
                                                aligned_contigs_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath,
                                                aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list),
                                       aligned_contigs_fpaths, files_genes_in_contigs,
                                       genome_stats_dirpath + '/genes_cumulative_plot',
                                       'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_genes_histogram',
                              '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list),
                                       aligned_contigs_fpaths, files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot',
                                       'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram',
                              '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)

    logger.info('Done.')
jacexact_match = jaccdistances.count(0) #UPDATING OVERALL STATS all_distances.extend(levdistances) all_jac_distances.extend(jaccdistances) class_counter += 1 mean_counter += distr[0] pairs_counter += len(profilePairs) match_counter += exact_match jacmatch_counter += jacexact_match jacmean_counter += jaccdistr[0] # #PLOTTING JACCARD CURRENT CLASS HISTOGRAM # title = str(sns)[1:-1].replace("'","").replace(",","-").replace(" ","") histogram(jaccdistances, 10, 'Distance', 'Username pairs', title, 'jac_plot/', range = ([0,1])) # #PLOTTING LEVENSTHEIN CURRENT CLASS HISTOGRAM histogram(levdistances, max(levdistances), 'Distance', 'Username pairs', title, 'lev_plot/' ) # #PRINT STATS # print("SNSs: {0} - #username pairs: {1}".format(sns,nPairs)) # print("Distribution (Levenshtein distance): Mean, StandardDeviation, Median, Min, Max") # print(distr) # print("Exact match (Levenshtein distance = 0):") # print("#n: {0} - Percentage : {1}".format(exact_match, exact_match/nPairs)) # print("Distance distribution:") # print(distanceDistribution) # print("Exact match (Jaccard distance = 0):") # print("#n: {0} - Percentage : {1}".format(jacexact_match, jacexact_match/nPairs)) # print("\r\n")