Example #1
0
def solution_similarity_stats(dataset='air/solutions_preprocessed'):
    """
    Plots histogram of solution-solution similarity distribution of a dataset.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset
    (documents, labels) = data.read_files(corpus_path)

    print '> Creating vector representations..'
    vectors = freq_representation.text_to_vector(documents, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Calculating similarities..'
    distances = scipy.spatial.distance.cdist(vectors.T, vectors.T, 'cosine')
    diag = numpy.diag([2.0]*len(distances),0) # move similarities of "self" to -1
    distances = distances + diag
    similarities = 1.0 - distances
    similarities = similarities.ravel()
    similarities = [s for s in similarities if s >= 0]
    print plotter.histogram(similarities,'similarity','# matches','',bins=150)
    print
    print max(similarities)
    print min(similarities)
    print float(sum(similarities))/len(similarities)
    num = len([sim for sim in similarities if sim < 0.23])
    print 'fraction sims < .23:', float(num)/len(similarities)
Example #2
0
def solution_similarity_stats(dataset='air/solutions_preprocessed'):
    """
    Plots histogram of solution-solution similarity distribution of a dataset.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    (documents, labels) = data.read_files(corpus_path)

    print '> Creating vector representations..'
    vectors = freq_representation.text_to_vector(
        documents, freq_representation.FrequencyMetrics.TF_IDF)

    print '> Calculating similarities..'
    distances = scipy.spatial.distance.cdist(vectors.T, vectors.T, 'cosine')
    diag = numpy.diag([2.0] * len(distances),
                      0)  # move similarities of "self" to -1
    distances = distances + diag
    similarities = 1.0 - distances
    similarities = similarities.ravel()
    similarities = [s for s in similarities if s >= 0]
    print plotter.histogram(similarities,
                            'similarity',
                            '# matches',
                            '',
                            bins=150)
    print
    print max(similarities)
    print min(similarities)
    print float(sum(similarities)) / len(similarities)
    num = len([sim for sim in similarities if sim < 0.23])
    print 'fraction sims < .23:', float(num) / len(similarities)
Example #3
0
def dataset_stats(dataset):
    """
    Print and plot statistics for a given dataset.
    A histogram is plotted with the document length distribution of the data.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/'+dataset
    (documents, labels) = data.read_files(corpus_path)
    file_names = data.get_file_names(corpus_path)
    lengths = []
    empty = 0
    for i,d in enumerate(documents):
        d = preprocess.tokenize_tokens(d)
        lengths.append(len(d))
        if len(d)==0:
            print file_names[i],'is empty'
            empty += 1
    lengths = numpy.array(lengths)
    print '# documents:',len(documents)
    print '# empty documents:',empty
    print '# words:',sum(lengths)
    print 'length avg:',lengths.mean()
    print 'length stddev:',lengths.std()
    print
    print 'document lengths (sorted):',sorted(lengths)
    plotter.histogram(lengths,'# tokens','# documents','',bins=80)
Example #4
0
def dataset_stats(dataset):
    """
    Print and plot statistics for a given dataset.
    A histogram is plotted with the document length distribution of the data.
    """
    print '> Reading data..', dataset
    corpus_path = '../data/' + dataset
    (documents, labels) = data.read_files(corpus_path)
    file_names = data.get_file_names(corpus_path)
    lengths = []
    empty = 0
    for i, d in enumerate(documents):
        d = preprocess.tokenize_tokens(d)
        lengths.append(len(d))
        if len(d) == 0:
            print file_names[i], 'is empty'
            empty += 1
    lengths = numpy.array(lengths)
    print '# documents:', len(documents)
    print '# empty documents:', empty
    print '# words:', sum(lengths)
    print 'length avg:', lengths.mean()
    print 'length stddev:', lengths.std()
    print
    print 'document lengths (sorted):', sorted(lengths)
    plotter.histogram(lengths, '# tokens', '# documents', '', bins=80)
Example #5
0
def plot_lengths(dataset, plot_type, **plargs):
    import plotter
    import numpy as np
    lengths = pickle_from_file('output/'+plot_type+'-lengths/'+dataset)
    plotter.histogram(lengths, **plargs)
    print 'tot', len(lengths)
    print 'max', max(lengths)
    lengths = np.array(lengths)
    num = len([x for x in lengths if x > 1000])
    print '# > 1000:', num
    num = len([x for x in lengths if x > 150])
    print '# > 150:', num
Example #6
0
def main():
    runner = Runner()

    runner.E0 = E0
    runner.EB = EB
    runner.B0 = 20 * co.micro
    runner.L = 30
    runner.U0 = BETA * co.c
    runner.THETA = 0.0

    runner.list_clear()
    runner.init_list(0, 10, 10000 * co.kilo * co.eV, 2500)
    runner.set_emfield_func('front')

    init_front(runner)

    counter = Counter(runner)

    runner.inner_hooks.append(counter)
    n = 10

    runner.prepare_data(tfraction=0.0)
    plotter.phases(runner)
    plotter.front(runner)
    plotter.field(runner)

    runner.output_n = 1000
    runner.max_particles = 5000

    for i in range(n):
        runner(10 * co.nano)

        growth, b = simple_regression(counter.t[-10:], log(counter.n[-10:]))
        logging.info("growth rate = {:g} /ns".format(growth * co.nano))

        update_front(runner, growth)
        tfraction = float(i + 1) / n

        runner.prepare_data(tfraction)
        plotter.phases(runner)
        plotter.histogram(runner)
        plotter.front(runner)
        plotter.field(runner)

    plotter.save_all()
    pylab.show()
Example #7
0
def plot_sentence_lengths(datafile=None):
    """
    Function for plotting histogram of sentence lengths within a given dataset.
    """
    if datafile is None:
        import preprocess
        print '> reading data..'
        path = '../data/tasa/TASA900_text'
        texts, labels = data.read_files(path)
        sentence_lengths = []
        print '> counting lengths..'
        for text in texts:
            sentences = preprocess.tokenize_sentences(text)
            for sentence in sentences:
                tokens = preprocess.tokenize_tokens(sentence)
                sentence_lengths.append(len(tokens))
        data.pickle_to_file(sentence_lengths, 'output/tasa_sentence_lengths.pkl')
    else:
        sentence_lengths = data.pickle_from_file(datafile)
    plotter.histogram(sentence_lengths, 'sentence length (tokens)', '# sentences', bins=70)
Example #8
0
def main():
    runner = Runner()

    runner.B0 = B0
    runner.E0 = E0
    runner.EB = EB
    runner.U0 = co.c
    runner.L = L

    runner.list_clear()
    runner.particle_weight(1e9)
    runner.init_list(0, 10, 10000 * co.kilo * co.eV, 1000)
    runner.set_emfield_func('const')

    counter = Counter(runner)

    runner.save_to(sys.argv[1])

    runner.inner_hooks.append(counter)

    runner.prepare_data(tfraction=0.0)
    plotter.phases(runner)

    runner.output_n = 4000
    runner.max_particles = 5000

    n = 250
    for i in range(n):
        runner(50 * co.nano)

        tfraction = float(i + 1) / n

        runner.prepare_data(tfraction)
        runner.save()

        plotter.phases(runner)
        plotter.histogram(runner)

    plotter.save_all()
    pylab.show()
Example #9
0
def main():
    runner = Runner()

    runner.B0    = B0
    runner.E0    = E0
    runner.EB    = EB
    runner.U0    = co.c
    runner.L     = L

    runner.list_clear()
    runner.particle_weight(1e9);
    runner.init_list(0, 10, 10000 * co.kilo * co.eV, 1000)
    runner.set_emfield_func('const')

    counter = Counter(runner)

    runner.save_to(sys.argv[1])

    runner.inner_hooks.append(counter)

    runner.prepare_data(tfraction=0.0)
    plotter.phases(runner)

    runner.output_n = 4000
    runner.max_particles = 5000

    n = 250
    for i in range(n):
        runner(50 * co.nano)

        tfraction = float(i + 1) / n

        runner.prepare_data(tfraction)
        runner.save()

        plotter.phases(runner)
        plotter.histogram(runner)

    plotter.save_all()
    pylab.show()
Example #10
0
def plot_sentence_lengths(datafile=None):
    """
    Function for plotting histogram of sentence lengths within a given dataset.
    """
    if datafile is None:
        import preprocess
        print '> reading data..'
        path = '../data/tasa/TASA900_text'
        texts, labels = data.read_files(path)
        sentence_lengths = []
        print '> counting lengths..'
        for text in texts:
            sentences = preprocess.tokenize_sentences(text)
            for sentence in sentences:
                tokens = preprocess.tokenize_tokens(sentence)
                sentence_lengths.append(len(tokens))
        data.pickle_to_file(sentence_lengths,
                            'output/tasa_sentence_lengths.pkl')
    else:
        sentence_lengths = data.pickle_from_file(datafile)
    plotter.histogram(sentence_lengths,
                      'sentence length (tokens)',
                      '# sentences',
                      bins=70)
Example #11
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
       genome_stats_dirpath):

    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath,
                                       'nucmer_output')
    from libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')

    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() +
                          ' option '
                          'if you want to specify it.',
                          indent='  ')
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(
                fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.',
                           indent='  ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) + ' ' +
                        container.kind + 's')
            res_file.write(container.kind + 's loaded: ' +
                           str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(
                container.kind, container.region_list,
                reference_chromosomes.keys())

    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES,
                             len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS,
                             len(operons_container.region_list))

    # for cumulative plots:
    files_genes_in_contigs = {
    }  #  "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    num_nf_errors = logger._num_nf_errors
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    process_results = Parallel(n_jobs=n_jobs)(
        delayed(process_single_file)(
            contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
            reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
    num_nf_errors += len([res for res in process_results if res is None])
    logger._num_nf_errors = num_nf_errors
    process_results = [res for res in process_results if res]
    if not process_results:
        logger.main_info('Genome analyzer failed for all the assemblies.')
        res_file.close()
        return

    ref_lengths = [process_results[i][0] for i in range(len(process_results))]
    results_genes_operons_tuples = [
        process_results[i][1] for i in range(len(process_results))
    ]
    for ref in reference_chromosomes:
        ref_lengths_by_contigs[ref] = [
            ref_lengths[i][ref] for i in range(len(ref_lengths))
        ]
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        aligned_len = max(ref_lengths_by_contigs[chr_name])
        res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) +
                       ' bp, maximal covered length: ' + str(aligned_len) +
                       ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' +
                   str(qconfig.min_gene_overlap) + '\n\n')
    # header
    # header
    res_file.write('\n\n')
    res_file.write(
        '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
        ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial',
         'operons', 'partial'))
    res_file.write(
        '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
        ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write(
        '================================================================================================================\n'
    )

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
            aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|' %
                       (assembly_name[:24], '%3.5f%%' % genome_fraction,
                        '%1.5f' % duplication_ratio, gaps_count))

        report.add_field(reporting.Fields.MAPPEDGENOME,
                         '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO,
                         '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full,
             part) in [(reporting.Fields.GENES, genes_full, genes_part),
                       (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath,
                                                aligned_contigs_fpaths,
                                                'genes',
                                                files_genes_in_contigs,
                                                ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath,
                                                aligned_contigs_fpaths,
                                                'operons',
                                                files_operons_in_contigs,
                                                ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths,
                                                'genes',
                                                files_genes_in_contigs,
                                                ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath,
                                                aligned_contigs_fpaths,
                                                'operons',
                                                files_operons_in_contigs,
                                                ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(
                len(genes_container.region_list), aligned_contigs_fpaths,
                files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(
                aligned_contigs_fpaths, full_found_genes,
                genome_stats_dirpath + '/complete_genes_histogram',
                '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(
                len(operons_container.region_list), aligned_contigs_fpaths,
                files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(
                aligned_contigs_fpaths, full_found_operons,
                genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(aligned_contigs_fpaths,
                          genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %',
                          top_value=100)

    logger.main_info('Done.')
Example #12
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):

    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')

    logger.print_timestamp()
    logger.info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # reading genome size
    # genome_size = fastaparser.get_lengths_from_fastafile(reference)[0]
    # reading reference name
    # >gi|48994873|gb|U00096.2| Escherichia coli str. K-12 substr. MG1655, complete genome
    # ref_file = open(reference, 'r')
    # reference_name = ref_file.readline().split()[0][1:]
    # ref_file.close()

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')
    res_file.write('reference chromosomes:\n')
    for chr_name, chr_len in reference_chromosomes.iteritems():
        res_file.write('\t' + chr_name + ' (' + str(chr_len) + ' bp)\n')
    res_file.write('\n')
    res_file.write('total genome size: ' + str(genome_size) + '\n\n')
    res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
    res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

    genes_container = FeatureContainer(genes_fpaths, 'gene')
    operons_container = FeatureContainer(operons_fpaths, 'operon')
    for container in [genes_container, operons_container]:
        if not container.fpaths:
            logger.notice('No file with ' + container.kind + 's provided. '
                          'Use the -' + container.kind[0].capitalize() + ' option '
                          'if you want to specify it.', indent='  ')
            continue

        for fpath in container.fpaths:
            container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

        if len(container.region_list) == 0:
            logger.warning('No ' + container.kind + 's were loaded.', indent='  ')
            res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
        else:
            logger.info('  Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
            res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
            container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, reference_chromosomes.keys())

    for contigs_fpath in aligned_contigs_fpaths:
        report = reporting.get(contigs_fpath)
        if genes_container.fpaths:
            report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
        if operons_container.fpaths:
            report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

    # header
    res_file.write('\n\n')
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
    res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
        % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
    res_file.write('================================================================================================================\n')

    # for cumulative plots:
    files_genes_in_contigs = {}   #  "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # process all contig files
    n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    results_genes_operons_tuples = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
        contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
        reference_chromosomes, genes_container, operons_container)
        for index, contigs_fpath in enumerate(aligned_contigs_fpaths))

    for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
        assembly_name = qutils.name_from_fpath(contigs_fpath)

        files_genes_in_contigs[contigs_fpath] = genes_in_contigs
        files_operons_in_contigs[contigs_fpath] = operons_in_contigs
        full_found_genes.append(sum(genes_in_contigs))
        full_found_operons.append(sum(operons_in_contigs))

        covered_bp = results["covered_bp"]
        gaps_count = results["gaps_count"]
        genes_full = results[reporting.Fields.GENES + "_full"]
        genes_part = results[reporting.Fields.GENES + "_partial"]
        operons_full = results[reporting.Fields.OPERONS + "_full"]
        operons_part = results[reporting.Fields.OPERONS + "_partial"]

        report = reporting.get(contigs_fpath)
        genome_fraction = float(covered_bp) * 100 / float(genome_size)
        duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                             report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                             report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                             report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                            ((genome_fraction / 100.0) * float(genome_size))

        res_file.write('%-25s| %-10s| %-12s| %-10s|'
        % (assembly_name[:24], '%3.5f%%' % genome_fraction, '%1.5f' % duplication_ratio, gaps_count))

        report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
        report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
        genome_mapped.append(genome_fraction)

        for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
            (reporting.Fields.OPERONS, operons_full, operons_part)]:
            if full is None and part is None:
                res_file.write(' %-10s| %-10s|' % ('-', '-'))
            else:
                res_file.write(' %-10s| %-10s|' % (full, part))
                report.add_field(field, '%s + %s part' % (full, part))
        res_file.write('\n')
    res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths, files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_genes_histogram',
                '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths, files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram',
            'Genome fraction, %', top_value=100)

    logger.info('Done.')
Example #13
0
      jacexact_match = jaccdistances.count(0)

      #UPDATING OVERALL STATS
      all_distances.extend(levdistances)
      all_jac_distances.extend(jaccdistances)
      class_counter += 1
      mean_counter += distr[0]
      pairs_counter += len(profilePairs)
      match_counter += exact_match
      jacmatch_counter += jacexact_match
      jacmean_counter += jaccdistr[0]


      # #PLOTTING JACCARD CURRENT CLASS HISTOGRAM
      # title = str(sns)[1:-1].replace("'","").replace(",","-").replace(" ","")
      histogram(jaccdistances, 10, 'Distance', 'Username pairs', title, 'jac_plot/', range = ([0,1]))

      # #PLOTTING LEVENSTHEIN CURRENT  CLASS HISTOGRAM
      histogram(levdistances, max(levdistances), 'Distance', 'Username pairs', title, 'lev_plot/' )

      # #PRINT STATS
      # print("SNSs: {0} - #username pairs: {1}".format(sns,nPairs))
      # print("Distribution (Levenshtein distance): Mean, StandardDeviation, Median, Min, Max")
      # print(distr)
      # print("Exact match (Levenshtein distance = 0):")
      # print("#n: {0} - Percentage : {1}".format(exact_match, exact_match/nPairs))
      # print("Distance distribution:")
      # print(distanceDistribution)
      # print("Exact match (Jaccard distance = 0):")
      # print("#n: {0} - Percentage : {1}".format(jacexact_match, jacexact_match/nPairs))
      # print("\r\n")