def pe_functional_profiling_pipeline(params):
    """Run Fama functional profiling for FASTQ reads and package the results.

    Despite the name, both paired-end and single-end input are handled; the
    branch is chosen by params['is_paired_end'].

    Args:
        params (dict): keys read here are 'work_dir', 'input_reads',
            'reference', 'is_paired_end' ("1" for paired-end, "0" for
            single-end) and 'name2ref'. The whole dict is also handed to
            write_trait_matrix and write_functional_profile.

    Returns:
        dict: keys 'krona_charts' (zip path -> (chart file name, label)),
        'fwd_reads', 'rev_reads' (paired-end only), 'html_report',
        'trait_matrix_ref', 'functional_profile_ref' and 'report_files'
        (one-element list describing Fama_result.zip).

    Raises:
        ValueError: if params['is_paired_end'] is neither "1" nor "0".
    """
    # Every run gets its own uniquely named working directory.
    run_dir = os.path.join(params['work_dir'], str(uuid.uuid4()))
    os.mkdir(run_dir)

    config_path = write_config_file(run_dir)
    project_path = write_project_file(params['input_reads'],
                                      params['reference'], run_dir,
                                      params['is_paired_end'])
    project = Project(config_file=config_path, project_file=project_path)

    mode = params['is_paired_end']
    if mode != "1" and mode != "0":
        raise ValueError('Wrong values of is_paired_end parameter',
                         params['is_paired_end'])
    is_paired = mode == "1"
    run_pipeline = fastq_pe_pipeline if is_paired else fastq_se_pipeline
    project = run_pipeline(project)

    charts_dir = os.path.join(run_dir, 'out')
    os.mkdir(charts_dir)
    reports_dir = project.options.work_dir

    output = {'krona_charts': {}}

    # Export filtered reads
    fwd_fastq = os.path.join(run_dir, 'out_fwd.fastq')
    # Not used below; kept so that the first-sample lookup still happens
    # before the reads are exported (presumably fails for an empty project).
    _first_sample = project.list_samples()[0]
    rev_fastq = os.path.join(run_dir, 'out_rev.fastq') if is_paired else ''
    write_filtered_fastq(fwd_fastq, rev_fastq, project)
    output['fwd_reads'] = fwd_fastq
    if is_paired:
        output['rev_reads'] = rev_fastq

    # HTML report plus a zipped copy of it
    html_report = os.path.join(charts_dir, 'fama_report.html')
    generate_html_report(html_report, project, params['name2ref'])
    with zipfile.ZipFile(html_report + '.zip', 'w', zipfile.ZIP_DEFLATED,
                         allowZip64=True) as archive:
        archive.write(html_report, 'fama_report.html')
    output['html_report'] = html_report + '.zip'

    # TODO: Krona charts generate_functions_chart(parser_fwd)
    # Fall back to raw counts as soon as one sample has a zero RPKG scaling
    # factor; the list is built in full so every sample is inspected.
    unscaled = any([
        project.samples[sample_id].rpkg_scaling_factor == 0.0
        for sample_id in project.list_samples()
    ])
    if is_paired:
        metric = 'fragmentcount' if unscaled else 'efpkg'
    else:
        metric = 'readcount' if unscaled else 'erpkg'

    # Create TraitMatrix object
    output['trait_matrix_ref'] = write_trait_matrix(project, params)
    output['functional_profile_ref'] = write_functional_profile(
        project, params, output['trait_matrix_ref'])

    # report_files maps a path on disk to its name inside Fama_result.zip
    report_files = {html_report: 'fama_report.html'}

    project_reports = (
        ('_functions.xlsx', 'Functional_profiles_combined.xlsx'),
        ('_functions_taxonomy.xlsx',
         'Function_taxonomy_profiles_combined.xlsx'),
    )
    for suffix, archive_name in project_reports:
        project_xlsx = sanitize_file_name(
            os.path.join(reports_dir,
                         project.options.project_name + '_' + metric + suffix))
        if not os.path.exists(project_xlsx):
            print('Project XLSX file not found:', project_xlsx)
        else:
            report_files[project_xlsx] = archive_name

    for sample_id in project.list_samples():
        sample_xlsx = sanitize_file_name(
            os.path.join(
                reports_dir,
                sample_id + '_' + metric + '_functions_taxonomy.xlsx'))
        if not os.path.exists(sample_xlsx):
            print('Sample XLSX file not found:', sample_xlsx)
        else:
            report_files[sample_xlsx] = sanitize_file_name(
                sample_id + ' function taxonomy profile long.xlsx')

        krona_source = sanitize_file_name(
            os.path.join(
                reports_dir, sample_id + '_' + metric +
                '_functional_taxonomy_profile.xml.html'))
        if not os.path.exists(krona_source):
            print('Krona diagram file not found:', krona_source)
            continue

        chart_name = sanitize_file_name(
            sample_id + '_function_taxonomy_profile_chart.html')
        chart_path = sanitize_file_name(
            os.path.join(charts_dir,
                         sample_id + '_function_taxonomy_profile_chart.html'))
        shutil.copy2(krona_source, chart_path)
        with zipfile.ZipFile(chart_path + '.zip', 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as archive:
            archive.write(chart_path, chart_name)
        report_files[chart_path] = chart_name
        output['krona_charts'][chart_path + '.zip'] = (
            chart_name, sample_id + ' function taxonomy chart')

    # Bundle every collected report into a single downloadable archive
    result_file = os.path.join(reports_dir, 'Fama_result.zip')
    with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED,
                         allowZip64=True) as archive:
        for source_path, archive_name in report_files.items():
            archive.write(source_path, archive_name)
    result_name = os.path.basename(result_file)
    output['report_files'] = [{
        'path': result_file,
        'name': result_name,
        'label': result_name,
        'description': 'Files generated by Fama App'
    }]
    return output
def protein_functional_profiling_pipeline(params):
    """Run Fama functional profiling for protein FASTA input and package results.

    Args:
        params (dict): keys read here are
            'input_proteins': passed to write_project_file,
            'work_dir': parent directory for the per-run working directory,
            'reference': Fama reference, passed to write_project_file and
                get_dms,
            'ws_name': workspace name,
            'ws_client': workspace client,
            'annotation_prefix': passed to save_domain_annotations,
            'name2ref': mapping indexed by sample identifier.

    Returns:
        dict: keys 'krona_charts' (zip path -> (chart file name, label)),
        'html_report', 'report_files' (one-element list describing
        Fama_result.zip), 'project', 'feature_set_data' and
        'objects_created'.
    """
    # Every run gets its own uniquely named working directory.
    run_dir = os.path.join(params['work_dir'], str(uuid.uuid4()))
    os.mkdir(run_dir)

    config_path = write_config_file(run_dir)
    project_path = write_project_file(params['input_proteins'],
                                      params['reference'], run_dir)
    # Run Fama
    project = protein_pipeline(
        Project(config_file=config_path, project_file=project_path))

    charts_dir = os.path.join(run_dir, 'out')
    os.mkdir(charts_dir)
    reports_dir = project.options.work_dir

    output = {'krona_charts': {}}

    # report_files maps a path on disk to its name inside Fama_result.zip
    report_files = {}
    metric = 'proteincount'

    project_reports = (
        ('_functions.xlsx', 'Functional_profiles_combined.xlsx'),
        ('_functions_taxonomy.xlsx',
         'Function_taxonomy_profiles_combined.xlsx'),
    )
    for suffix, archive_name in project_reports:
        project_xlsx = sanitize_file_name(
            os.path.join(reports_dir,
                         project.options.project_name + '_' + metric + suffix))
        if not os.path.exists(project_xlsx):
            print('Project XLSX file not found:', project_xlsx)
        else:
            report_files[project_xlsx] = archive_name

    proteins_list = sanitize_file_name(
        os.path.join(reports_dir, 'all_proteins.list.txt'))
    if not os.path.exists(proteins_list):
        print('Proteins list not found:', proteins_list)
    else:
        report_files[proteins_list] = 'proteins_list.txt'

    featureset_elements = {}
    featureset_element_ordering = []
    objects_created = []
    genome_names = {}
    # Get Domain Model Set reference
    dms_ref = get_dms(params['reference'],
                      project.config.get_functions_file(project.collection),
                      params['ws_name'], params['ws_client'])

    for sample_id in project.list_samples():
        genome_ref = params['name2ref'][sample_id]
        annotation_obj_ref, feature_ids, genome_name = save_domain_annotations(
            project, dms_ref, params['ws_name'], params['ws_client'],
            params['annotation_prefix'], sample_id, genome_ref)
        genome_names[sample_id] = genome_name
        objects_created.append({
            'ref': annotation_obj_ref,
            'description': ('Functional annotations for genome ' +
                            project.samples[sample_id].sample_name)
        })
        # Every feature is appended to the ordering, one entry per sample
        # it was reported for.
        for feature_id in feature_ids:
            featureset_elements.setdefault(feature_id, []).append(genome_ref)
            featureset_element_ordering.append(feature_id)

        sample_xlsx = sanitize_file_name(
            os.path.join(
                reports_dir,
                sample_id + '_' + metric + '_functions_taxonomy.xlsx'))
        if not os.path.exists(sample_xlsx):
            print('Sample XLSX file not found:', sample_xlsx)
        else:
            report_files[sample_xlsx] = sanitize_file_name(
                genome_name + '_function_taxonomy_profile_long.xlsx')

        krona_source = sanitize_file_name(
            os.path.join(
                reports_dir, sample_id + '_' + metric +
                '_functional_taxonomy_profile.xml.html'))
        if not os.path.exists(krona_source):
            print('Krona diagram file not found:', krona_source)
            continue

        chart_name = sanitize_file_name(
            genome_name + '_function_taxonomy_profile_chart.html')
        chart_path = sanitize_file_name(
            os.path.join(charts_dir,
                         genome_name + '_function_taxonomy_profile_chart.html'))
        shutil.copy2(krona_source, chart_path)
        with zipfile.ZipFile(chart_path + '.zip', 'w', zipfile.ZIP_DEFLATED,
                             allowZip64=True) as archive:
            archive.write(chart_path, chart_name)
        report_files[chart_path] = chart_name
        output['krona_charts'][chart_path + '.zip'] = (
            chart_name, sample_id + ' function taxonomy chart')

    feature_set_data = {
        'description': 'FeatureSet generated by Fama protein profiling',
        'element_ordering': featureset_element_ordering,
        'elements': featureset_elements
    }

    # HTML report plus a zipped copy of it
    html_report = os.path.join(charts_dir, 'fama_report.html')
    generate_protein_html_report(html_report, project, params['name2ref'])
    with zipfile.ZipFile(html_report + '.zip', 'w', zipfile.ZIP_DEFLATED,
                         allowZip64=True) as archive:
        archive.write(html_report, 'fama_report.html')
    output['html_report'] = html_report + '.zip'
    report_files[html_report] = 'fama_report.html'

    # Bundle every collected report into a single downloadable archive
    result_file = os.path.join(reports_dir, 'Fama_result.zip')
    with zipfile.ZipFile(result_file, 'w', zipfile.ZIP_DEFLATED,
                         allowZip64=True) as archive:
        for source_path, archive_name in report_files.items():
            archive.write(source_path, archive_name)
    result_name = os.path.basename(result_file)
    output['report_files'] = [{
        'path': result_file,
        'name': result_name,
        'label': result_name,
        'description': 'Files generated by Fama App'
    }]
    output['project'] = project
    output['feature_set_data'] = feature_set_data
    output['objects_created'] = objects_created
    return output
# Example #3
# 0
def make_sample_tax_func_xlsx(project,
                              scores,
                              metric,
                              function_id=None,
                              rank=None):
    """Generates XLSX file for taxa scores for one or all functions in all samples.

    One worksheet is written per function. When function_id is None, an
    additional 'Average' worksheet is written in which the metric value is the
    sum over all functions divided by the number of functions in
    project.ref_data.functions_dict.

    The file is created in project.options.work_dir and is named
    <prefix>_<metric>_samples[_<rank>]_taxonomy.xlsx, where prefix is the
    project name (all functions) or function_id (single function).

    Args:
        project (:obj:'Project'): Project object that stores all annotated reads
        scores (dict[str, dict[str, dict[str, dict[str, float]]]]): outer key
            is taxonomy identifier, second-level key is function identifier,
            third-level key is sample identifier, inner key is metric,
            value is float
        metric (str): acceptable values are 'readcount', 'erpk', 'rpkm',
            'fragmentcount', 'fpk', 'efpk', 'fpkm', 'erpkm', 'efpkm',
            'fpkg', 'rpkg', 'erpkg', 'efpkg', 'proteincount'
        function_id (str, optional): function identifier. If function_id is None, all
            functions will be included into workbook.
        rank (str, optional): taxonomic rank. if rank parameter is not None, the
            resulting XLSX file will contain only entries for this rank.
    """
    name_prefix = (project.options.project_name
                   if function_id is None else function_id)
    rank_infix = '' if rank is None else rank + '_'
    xlsxfile = sanitize_file_name(
        os.path.join(
            project.options.work_dir, name_prefix + '_' + metric +
            '_samples_' + rank_infix + 'taxonomy.xlsx'))

    print('Writing', xlsxfile)

    functions = sorted(project.ref_data.functions_dict.keys())
    samples = project.list_samples()

    def write_sheet(writer, sample_scores, sheet_name):
        """Converts sample_scores into a taxonomy table and writes one sheet."""
        tax_profile = TaxonomyProfile()
        tax_profile.make_function_taxonomy_profile(project.taxonomy_data,
                                                   sample_scores)
        taxonomy_df = tax_profile.convert_profile_into_score_df(metric=metric)
        if rank is not None:
            # Keep only the rows of the requested taxonomic rank
            taxonomy_df = taxonomy_df[taxonomy_df[('', 'Rank')] == rank]
        taxonomy_df.to_excel(writer, sheet_name=sheet_name, merge_cells=False)
        format_taxonomy_worksheet(writer, sheet_name)

    # ExcelWriter.save() no longer exists in pandas >= 2.0; leaving the
    # with-block closes the writer, which writes the workbook to disk.
    with pd.ExcelWriter(xlsxfile, engine='xlsxwriter') as writer:
        for function in functions:
            if function_id is not None and function != function_id:
                continue

            # Subsetting scores: sample_scores[taxonomy_id][sample][metric]
            sample_scores = autovivify(3, float)
            for taxonomy_id in scores:
                if function not in scores[taxonomy_id]:
                    continue
                function_scores = scores[taxonomy_id][function]
                for sample in samples:
                    if sample in function_scores:
                        for key, val in function_scores[sample].items():
                            sample_scores[taxonomy_id][sample][key] = val
                    else:
                        # Samples without a score still get an explicit zero
                        sample_scores[taxonomy_id][sample][metric] = 0.0

            write_sheet(writer, sample_scores, function)

        # Make 'Average' sheet
        if function_id is None:
            sample_scores = autovivify(3, float)
            for taxonomy_id in scores:
                for function in functions:
                    if function not in scores[taxonomy_id]:
                        continue
                    function_scores = scores[taxonomy_id][function]
                    for sample in samples:
                        if sample in function_scores:
                            for key, val in function_scores[sample].items():
                                sample_scores[taxonomy_id][sample][key] += val
                        else:
                            sample_scores[taxonomy_id][sample][metric] += 0.0
            # Only the requested metric is averaged; other keys stay as sums
            for taxonomy_id in sample_scores:
                for sample in sample_scores[taxonomy_id]:
                    sample_scores[taxonomy_id][sample][metric] = \
                        sample_scores[taxonomy_id][sample][metric] \
                        / len(functions)

            write_sheet(writer, sample_scores, 'Average')
# Example #4
# 0
def make_assembly_xlsx(assembler):
    """Generates XLSX file for assembly.

    The workbook is written to
    <assembly_dir>/out/<project_name>_assembly.xlsx (after
    sanitize_file_name, with spaces replaced by underscores and quote
    characters removed) and contains three worksheets: 'Functions read count',
    'Contigs' and 'Genes'.

    Args:
        assembler (:obj:'GeneAssembler'): gene assembler object
    """
    project = assembler.project
    xlsxfile = sanitize_file_name(
        os.path.join(project.options.assembly_dir, 'out',
                     project.options.project_name + '_assembly.xlsx'))
    for unwanted, replacement in ((' ', '_'), ("'", ''), ('"', '')):
        xlsxfile = xlsxfile.replace(unwanted, replacement)

    samples_list = sorted(project.list_samples())

    # count reads per function, per sample:
    # function_read_counts[function][sample]
    functions_list = set()
    function_read_counts = autovivify(2, float)
    for function in assembler.assembly.reads:
        functions_list.add(function)
        for read in assembler.assembly.reads[function]:
            # the value stored for a read is used as the sample key
            function_read_counts[function][assembler.assembly.reads[function]
                                           [read]] += 1

    # calculate total read count (both FASTQ files of every sample)
    total_read_count = 0
    for sample in samples_list:
        total_read_count += project.options.get_fastq1_readcount(sample)
        total_read_count += project.options.get_fastq2_readcount(sample)

    # The context manager closes the workbook even if a sheet writer raises.
    with xlsxwriter.Workbook(xlsxfile) as workbook:
        formats = {
            'bold': workbook.add_format({'bold': True}),
            'num0': workbook.add_format({'num_format': '0'}),
            'num1': workbook.add_format({'num_format': '0.0'}),
            'num5': workbook.add_format({'num_format': '0.00000'}),
        }
        _write_assembly_reads_sheet(workbook, assembler, samples_list,
                                    functions_list, function_read_counts,
                                    formats)
        _write_assembly_contigs_sheet(workbook, assembler, samples_list,
                                      functions_list, total_read_count,
                                      formats)
        _write_assembly_genes_sheet(workbook, assembler, samples_list,
                                    functions_list, total_read_count, formats)


def _write_assembly_cells(worksheet, row, first_col, cells):
    """Writes (value, cell format or None) pairs into consecutive cells of a row."""
    for offset, (value, cell_format) in enumerate(cells):
        if cell_format is None:
            worksheet.write(row, first_col + offset, value)
        else:
            worksheet.write(row, first_col + offset, value, cell_format)


def _write_assembly_headers(worksheet, fixed_titles, samples_list, bold):
    """Writes the two header rows of a per-sample sheet; returns the 'Definition' column."""
    top_titles = list(fixed_titles)
    for sample in samples_list:
        # each sample spans three columns: read count, RPKM, coverage
        top_titles.extend([sample] * 3)
    top_titles.append('Definition')
    _write_assembly_cells(worksheet, 0, 0,
                          [(title, bold) for title in top_titles])
    sub_titles = ['Read count', 'RPKM', 'Coverage'] * len(samples_list)
    _write_assembly_cells(worksheet, 1, len(fixed_titles),
                          [(title, bold) for title in sub_titles])
    return len(top_titles) - 1


def _write_assembly_reads_sheet(workbook, assembler, samples_list,
                                functions_list, function_read_counts,
                                formats):
    """Writes the 'Functions read count' worksheet (one row per function)."""
    bold = formats['bold']
    num0 = formats['num0']
    worksheet = workbook.add_worksheet('Functions read count')

    titles = ['Function'] + list(samples_list) + [
        'All samples', 'Assembled reads', 'Unassembled reads', 'Definition'
    ]
    _write_assembly_cells(worksheet, 0, 0, [(title, bold) for title in titles])
    definition_col = len(titles) - 1

    for row, function in enumerate(sorted(functions_list), start=1):
        counts = function_read_counts[function]
        cells = [(function, bold)]
        for sample in samples_list:
            # stored counts are doubled before being reported
            cells.append((counts[sample] * 2 if sample in counts else 0, num0))
        all_reads = sum(counts.values()) * 2
        assembled_reads = 0
        if function in assembler.assembly.contigs:
            assembled_reads = sum(
                len(c.reads)
                for c in assembler.assembly.contigs[function].values())
        cells.append((all_reads, num0))
        cells.append((assembled_reads, num0))
        cells.append((all_reads - assembled_reads, num0))
        cells.append(
            (assembler.project.ref_data.lookup_function_name(function), None))
        _write_assembly_cells(worksheet, row, 0, cells)

    # adjust column width
    worksheet.set_column(0, 0, 10)
    worksheet.set_column(definition_col, definition_col, 50)


def _write_assembly_contigs_sheet(workbook, assembler, samples_list,
                                  functions_list, total_read_count, formats):
    """Writes the 'Contigs' worksheet (one row per contig)."""
    bold = formats['bold']
    num1 = formats['num1']
    num5 = formats['num5']
    options = assembler.project.options
    worksheet = workbook.add_worksheet('Contigs')

    definition_col = _write_assembly_headers(
        worksheet,
        ('Contig', 'Function', 'Length', 'Read count', 'RPKM', 'Coverage',
         'Number of genes'), samples_list, bold)

    row = 1  # rows 0 and 1 hold the headers
    for function in sorted(functions_list):
        if function not in assembler.assembly.contigs:
            continue
        contigs = assembler.assembly.contigs[function]
        for contig_id in sorted(contigs.keys()):
            contig = contigs[contig_id]
            row += 1
            cells = [
                (contig_id, bold),
                (function, None),
                (len(contig.sequence), None),
                (contig.get_read_count(), None),
                (contig.get_rpkm(total_read_count), num5),
                (contig.get_coverage(), num1),
                (len(contig.genes), None),
            ]
            for sample in samples_list:
                cells.append((contig.get_read_count(sample), None))
                cells.append((contig.get_rpkm(
                    options.get_fastq1_readcount(sample), sample), num5))
                cells.append((contig.get_coverage(sample), num1))
            cells.append(
                (assembler.project.ref_data.lookup_function_name(function),
                 None))
            _write_assembly_cells(worksheet, row, 0, cells)

    # adjust column width; the 'Definition' column index is computed from the
    # header so it is correct even when no contig rows were written
    worksheet.set_column(0, 1, 10)
    worksheet.set_column(definition_col, definition_col, 50)


def _write_assembly_genes_sheet(workbook, assembler, samples_list,
                                functions_list, total_read_count, formats):
    """Writes the 'Genes' worksheet (one row per gene predicted in a contig)."""
    bold = formats['bold']
    num1 = formats['num1']
    num5 = formats['num5']
    options = assembler.project.options
    ref_data = assembler.project.ref_data
    taxonomy_data = assembler.project.taxonomy_data
    worksheet = workbook.add_worksheet('Genes')

    definition_col = _write_assembly_headers(
        worksheet,
        ('Gene', 'Reads function', 'Contig', 'Gene start', 'Gene end',
         'Gene length', 'Gene strand', 'Read count', 'RPKM', 'Coverage',
         'Fama gene status', 'Fama function', 'Fama identity',
         'CDS completeness', 'Fama best hit', 'Fama best hit taxonomy ID',
         'Fama best hit organism', 'Fama best hit taxonomy',
         'Fama LCA taxonomy ID', 'Fama LCA organism', 'Fama LCA taxonomy'),
        samples_list, bold)

    row = 1  # rows 0 and 1 hold the headers
    for function in sorted(functions_list):
        if function not in assembler.assembly.contigs:
            continue
        contigs = assembler.assembly.contigs[function]
        for contig_id in sorted(contigs.keys()):
            contig = contigs[contig_id]
            contig_length = len(contig.sequence)
            for gene_id in sorted(contig.genes.keys()):
                gene = contig.genes[gene_id]
                row += 1
                gene_length = int(gene.end) - int(gene.start) + 1
                cells = [
                    (gene_id, None),
                    # gene function from read mapping
                    (function, None),
                    (contig_id, None),
                    (int(gene.start), None),
                    (int(gene.end), None),
                    (gene_length, None),
                    (gene.strand, None),
                    # read count of the contig, adjusted by gene length
                    (contig.get_read_count() * gene_length / contig_length,
                     num1),
                    (contig.get_rpkm(total_read_count), num5),
                    (contig.get_coverage(), num1),
                    (gene.status, None),
                ]

                if gene.status == STATUS_GOOD:
                    hits = gene.hit_list.hits
                    # sorted so that the cell content is reproducible
                    gene_functions = sorted(
                        {y for x in hits for y in x.functions})
                    gene_identity = [x.identity for x in hits]
                    ref_lengths = [x.s_len for x in hits]
                    fama_hits = [cleanup_protein_id(x.subject_id) for x in hits]
                    gene_taxonomy = [
                        ref_data.lookup_protein_tax(x) for x in fama_hits
                    ]
                    lca_taxonomy_id = gene.taxonomy
                    cells.extend([
                        (','.join(gene_functions), None),
                        (sum(gene_identity) / len(gene_identity), num1),
                        # CDS completeness: protein length relative to the
                        # mean length of the reference hits, in percent
                        (len(gene.protein_sequence) * 100 * len(ref_lengths) /
                         sum(ref_lengths), num1),
                        (','.join(fama_hits), None),
                        (','.join(gene_taxonomy), None),
                        (','.join(
                            [taxonomy_data.get_name(x)
                             for x in gene_taxonomy]), None),
                        ('|'.join([
                            taxonomy_data.get_taxonomy_lineage(x)
                            for x in gene_taxonomy
                        ]), None),
                        (lca_taxonomy_id, None),
                        (taxonomy_data.get_name(lca_taxonomy_id), None),
                        (taxonomy_data.get_taxonomy_lineage(lca_taxonomy_id),
                         None),
                    ])
                else:
                    # Exactly ten annotation columns, so the per-sample values
                    # below stay aligned with their headers.
                    cells.extend([('N/A', None)] * 10)

                for sample in samples_list:
                    cells.append((contig.get_read_count(sample) *
                                  len(gene.protein_sequence) * 3 /
                                  contig_length, num1))
                    cells.append((contig.get_rpkm(
                        options.get_fastq1_readcount(sample), sample), num5))
                    cells.append((contig.get_coverage(sample), num1))

                cells.append((ref_data.lookup_function_name(function), None))
                _write_assembly_cells(worksheet, row, 0, cells)

    # adjust column width; the 'Definition' column index is computed from the
    # header so it is correct even when no gene rows were written
    worksheet.set_column(0, 0, 20)
    worksheet.set_column(1, 1, 10)
    worksheet.set_column(7, 9, 15)
    worksheet.set_column(definition_col, definition_col, 50)
# Example #5
# 0
def make_function_sample_xlsx(project, scores, metric, sample_id=None):
    """Generates XLSX file for function scores for one or more samples.

    The workbook is created in project.options.work_dir and contains two
    worksheets: 'Functions <metric>' with one row per function and one
    column per sample (followed by a 'Definition' column), and
    'Categories <metric>' with the same scores summed up by function group.
    Cells for function/sample pairs missing from scores are filled with 0.0.

    Args:
        project (:obj:'Project'): Project object that stores all annotated reads
        scores (dict[str, dict[str, dict[str, float]]]): outer key is function
            identifier, middle-level key is sample identifier,
            inner key is metric, value is float
        metric (str): acceptable values are 'readcount', 'erpk', 'rpkm',
            'fragmentcount', 'fpk', 'efpk', 'fpkm', 'erpkm', 'efpkm',
            'fpkg', 'rpkg', 'erpkg', 'efpkg', 'proteincount'
        sample_id (str, optional): sample identifier. If None, all samples
            of the project are reported and the file name starts with the
            project name; otherwise only that sample is reported and the
            file name starts with the sample identifier.
    """
    if sample_id is None:
        file_prefix = project.options.project_name
    else:
        file_prefix = sample_id
    xlsxfile = sanitize_file_name(
        os.path.join(project.options.work_dir,
                     file_prefix + '_' + metric + '_functions.xlsx'))

    print('Writing', xlsxfile)
    workbook = xlsxwriter.Workbook(xlsxfile)
    bold = workbook.add_format({'bold': True})

    functions_dict = project.ref_data.functions_dict
    functions_list = sorted(functions_dict.keys())
    categories_list = sorted(
        {functions_dict[function]['group'] for function in functions_list})

    # The set of reported samples is the same for every row of both
    # worksheets, so it is filtered once instead of inside each loop.
    samples = [
        sample for sample in project.list_samples()
        if sample_id is None or sample == sample_id
    ]
    # Column 0 holds row labels, columns 1..len(samples) hold sample scores,
    # and the function definition goes right after the last sample.
    definition_col = len(samples) + 1

    # Per-category totals, accumulated while the functions table is written
    scores_cat = autovivify(2, float)

    # generate tables for functions
    scores_worksheet = workbook.add_worksheet('Functions ' + metric)
    scores_worksheet.write(0, 0, 'Function', bold)
    for col, sample in enumerate(samples, start=1):
        scores_worksheet.write(0, col, sample, bold)
    scores_worksheet.write(0, definition_col, 'Definition', bold)

    for row, function in enumerate(functions_list, start=1):
        category = project.ref_data.lookup_function_group(function)
        scores_worksheet.write(row, 0, function, bold)
        for col, sample in enumerate(samples, start=1):
            if function in scores and sample in scores[function]:
                score = scores[function][sample][metric]
                scores_worksheet.write(row, col, score)
                scores_cat[category][sample] += score
            else:
                scores_worksheet.write(row, col, 0.0)
        scores_worksheet.write(row, definition_col,
                               project.ref_data.lookup_function_name(function))

    # adjust column width
    scores_worksheet.set_column(0, 0, 10)
    scores_worksheet.set_column(definition_col, definition_col, 50)

    # Write worksheet for categories
    scores_cat_worksheet = workbook.add_worksheet('Categories ' + metric)
    scores_cat_worksheet.write(0, 0, 'Categories', bold)
    for col, sample in enumerate(samples, start=1):
        scores_cat_worksheet.write(0, col, sample, bold)

    for row, category in enumerate(categories_list, start=1):
        scores_cat_worksheet.write(row, 0, category, bold)
        for col, sample in enumerate(samples, start=1):
            # Membership tests keep categories/samples without any score
            # from being created in scores_cat just by reading them.
            if category in scores_cat and sample in scores_cat[category]:
                scores_cat_worksheet.write(row, col,
                                           scores_cat[category][sample])
            else:
                scores_cat_worksheet.write(row, col, 0.0)
    # adjust column width
    scores_cat_worksheet.set_column(0, 0, 50)

    workbook.close()