Exemple #1
0
def summarize_vgfs(input_file,
                   output_dir,
                   groupby_column='scaffold',
                   max_auxiliary_score=3,
                   remove_transposons=False,
                   remove_fs=False):
    """Distill viral annotations into summary tables and a product heatmap.

    Reads the annotations table, filters it to potential AMGs and writes
    ``vMAG_stats.tsv``, ``amg_summary.tsv`` and ``product.html`` into
    ``output_dir``. Progress messages prefixed with the elapsed time are
    printed after each stage.

    Args:
        input_file: Tab-separated annotations file; its first column is used
            as the index.
        output_dir: Directory that receives the output files (created with
            ``mkdir``).
        groupby_column: Column name handed to ``make_viral_stats_table`` and
            ``make_viral_functional_df``.
        max_auxiliary_score: Passed to ``filter_to_amgs`` as ``max_aux``.
        remove_transposons: Passed to ``filter_to_amgs``.
        remove_fs: Passed to ``filter_to_amgs``.

    Raises:
        ValueError: If no genome summary form location is configured.
    """
    start_time = datetime.now()

    # set up
    annotations = pd.read_csv(input_file, sep='\t', index_col=0)
    db_locs = get_database_locs()
    # An unset location can be present in the dict with a value of None, so
    # test the value rather than only the presence of the key.
    if db_locs.get('genome_summary_form') is None:
        raise ValueError(
            'Genome summary form location must be set in order to summarize genomes'
        )
    mkdir(output_dir)
    genome_summary_form = pd.read_csv(db_locs['genome_summary_form'],
                                      sep='\t',
                                      index_col=0)
    print('%s: Retrieved database locations and descriptions' %
          (str(datetime.now() - start_time)))

    # get potential AMGs
    potential_amgs = filter_to_amgs(annotations.fillna(''),
                                    max_aux=max_auxiliary_score,
                                    remove_transposons=remove_transposons,
                                    remove_fs=remove_fs)
    print('%s: Determined potential amgs' % (str(datetime.now() - start_time)))

    # make distillate
    viral_genome_stats = make_viral_stats_table(annotations, potential_amgs,
                                                groupby_column)
    viral_genome_stats.to_csv(path.join(output_dir, 'vMAG_stats.tsv'),
                              sep='\t')
    print('%s: Calculated viral genome statistics' %
          (str(datetime.now() - start_time)))

    viral_distillate = make_viral_distillate(potential_amgs,
                                             genome_summary_form)
    viral_distillate.to_csv(path.join(output_dir, 'amg_summary.tsv'),
                            sep='\t',
                            index=False)
    print('%s: Generated AMG summary' % (str(datetime.now() - start_time)))

    # make liquor
    vgf_order = make_vgf_order(potential_amgs)
    amg_column = make_amg_count_column(potential_amgs, vgf_order)
    viral_function_df = make_viral_functional_df(potential_amgs,
                                                 genome_summary_form,
                                                 groupby_column=groupby_column)
    viral_functional_heatmap = make_viral_functional_heatmap(
        viral_function_df, vgf_order)
    # the AMG count column and the heatmap are saved side by side in one page
    alt.hconcat(amg_column, viral_functional_heatmap,
                spacing=5).save(path.join(output_dir, 'product.html'))
    print('%s: Generated product heatmap' % (str(datetime.now() - start_time)))
    print("%s: Completed distillation" % str(datetime.now() - start_time))
Exemple #2
0
def populate_description_db(output_loc=None, db_dict=None, start_time=None):
    """Rebuild the description database from the configured source databases.

    Args:
        output_loc: Location for the description database; only used when
            ``db_dict`` has no ``'description_db'`` location of its own.
        db_dict: Mapping of database names to locations. Defaults to the
            result of ``get_database_locs()``. Its ``'description_db'`` entry
            is filled in from ``output_loc`` when it is missing.
        start_time: Reference time for the elapsed-time prefix of progress
            messages. When omitted the current time is used and an opening
            "Populating description database" message is printed.

    Raises:
        ValueError: If neither ``db_dict`` nor ``output_loc`` supplies a
            description database location.
    """
    if start_time is None:
        start_time = datetime.now()
        print('%s: Populating description database' %
              str(datetime.now() - start_time))
    # setup
    if db_dict is None:
        db_dict = get_database_locs()

    configured_loc = db_dict.get('description_db')
    if configured_loc is None:
        if output_loc is None:
            raise ValueError(
                'Must provide output location if description db location is not set in configuration'
            )
        db_dict['description_db'] = output_loc
    elif path.exists(configured_loc):
        # an already configured database is deleted before being recreated
        remove(configured_loc)

    description_db_loc = db_dict['description_db']
    create_description_db(description_db_loc)
    db_handler = DatabaseHandler(description_db_loc)
    print('%s: Database connection established' %
          str(datetime.now() - start_time))

    # fill database
    # (key in db_dict, description table, description parser, progress message)
    description_sources = (
        ('kegg', 'kegg_description', make_header_dict_from_mmseqs_db,
         '%s: KEGG descriptions added to description database'),
        ('uniref', 'uniref_description', make_header_dict_from_mmseqs_db,
         '%s: UniRef descriptions added to description database'),
        ('pfam_hmm_dat', 'pfam_description', process_pfam_descriptions,
         '%s: PFAM descriptions added to description database'),
        ('dbcan_fam_activities', 'dbcan_description',
         process_dbcan_descriptions,
         '%s: dbCAN descriptions added to description database'),
        ('viral', 'viral_description', make_header_dict_from_mmseqs_db,
         '%s: RefSeq viral descriptions added to description database'),
        ('peptidase', 'peptidase_description',
         make_header_dict_from_mmseqs_db,
         '%s: MEROPS descriptions added to description database'),
        ('vog_annotations', 'vogdb_description', process_vogdb_descriptions,
         '%s: VOGdb descriptions added to description database'),
    )
    for db_key, table_name, description_parser, message in description_sources:
        add_to_description_db(db_dict[db_key], table_name, description_parser,
                              db_handler)
        print(message % str(datetime.now() - start_time))
    print('%s: Description database populated' %
          str(datetime.now() - start_time))
def test_set_database_paths(tmpdir):
    """Exercise set_database_paths against a CONFIG file in a temp directory.

    Every call passes ``config_loc`` so that nothing is ever written to the
    default CONFIG, even if a call unexpectedly succeeds.
    """
    test_config_dir = tmpdir.mkdir('test_config')
    # first test that adding nothing doesn't change CONFIG
    test_config = os.path.join(test_config_dir, 'CONFIG')
    pretest_db_dict = get_database_locs()
    set_database_paths(config_loc=test_config)
    test_db_dict = get_database_locs(test_config)
    assert type(test_db_dict) is dict
    assert pretest_db_dict == test_db_dict
    # test that adding something that doesn't exist throws error
    test_fake_database = os.path.join(test_config_dir, 'fake_database.mmsdb')
    with pytest.raises(ValueError):
        # config_loc is given here too: without it, a regression that stops
        # raising would overwrite the default CONFIG with a bogus location
        set_database_paths(kegg_db_loc=test_fake_database,
                           config_loc=test_config)
    # test that adding something real is really added
    kegg_loc = os.path.join('tests', 'data', 'fake_gff.gff')
    set_database_paths(kegg_db_loc=kegg_loc, config_loc=test_config)
    test_db_dict = get_database_locs(test_config)
    assert test_db_dict['kegg'] == os.path.realpath(kegg_loc)
    # test that adding something with use_current_locs False works
    set_database_paths(kegg_db_loc=kegg_loc,
                       config_loc=test_config,
                       use_current_locs=False)
    test_db_dict = get_database_locs(test_config)
    assert test_db_dict['kegg'] == os.path.realpath(kegg_loc)
    assert test_db_dict['description_db'] is None
Exemple #4
0
def print_database_locations(db_locs=None):
    """Print the configured location of every known database and form.

    One line is printed per entry, in a fixed order; entries missing from
    ``db_locs`` are shown as ``None``.

    Args:
        db_locs: Mapping of database names to locations. Defaults to the
            result of ``get_database_locs()``.
    """
    if db_locs is None:
        db_locs = get_database_locs()

    # (line template, key in db_locs), listed in output order
    report_lines = (
        ('KEGG db: %s', 'kegg'),
        ('KOfam db: %s', 'kofam'),
        ('KOfam KO list: %s', 'kofam_ko_list'),
        ('UniRef db: %s', 'uniref'),
        ('Pfam db: %s', 'pfam'),
        ('Pfam hmm dat: %s', 'pfam_hmm_dat'),
        ('dbCAN db: %s', 'dbcan'),
        ('dbCAN family activities: %s', 'dbcan_fam_activities'),
        ('RefSeq Viral db: %s', 'viral'),
        ('MEROPS peptidase db: %s', 'peptidase'),
        ('VOGDB db: %s', 'vogdb'),
        ('VOG annotations: %s', 'vog_annotations'),
        ('Description db: %s', 'description_db'),
        ('Genome summary form: %s', 'genome_summary_form'),
        ('Module step form: %s', 'module_step_form'),
        ('ETC module database: %s', 'etc_module_database'),
        ('Function heatmap form: %s', 'function_heatmap_form'),
        ('AMG database: %s', 'amg_database'),
    )
    for template, key in report_lines:
        print(template % db_locs.get(key))
Exemple #5
0
def annotate_vgfs(input_fasta,
                  virsorter_affi_contigs=None,
                  output_dir='.',
                  min_contig_size=2500,
                  prodigal_mode='meta',
                  trans_table='11',
                  bit_score_threshold=60,
                  rbh_bit_score_threshold=350,
                  custom_db_name=(),
                  custom_fasta_loc=(),
                  use_uniref=False,
                  low_mem_mode=False,
                  skip_trnascan=False,
                  keep_tmp_dir=True,
                  threads=10,
                  verbose=True):
    """Annotate viral contigs and add auxiliary scores and AMG flags.

    Each sequence of at least ``min_contig_size`` is written to its own fasta
    under ``<output_dir>/vMAGs``, the fastas are annotated with
    ``annotate_fastas``, transposon and AMG flag columns are added, and the
    table is written to ``<output_dir>/annotations.tsv``.

    Args:
        input_fasta: Fasta input read with ``read_sequence``.
        virsorter_affi_contigs: Passed to ``get_virsorter_hits``. When None,
            no ``virsorter_category`` / ``auxiliary_score`` columns are added.
        output_dir: Directory that receives all output.
        min_contig_size: Shorter sequences are skipped; also passed on to
            ``annotate_fastas``.
        prodigal_mode: One of ``'train'``, ``'meta'`` or ``'single'``.
        trans_table, bit_score_threshold, rbh_bit_score_threshold,
        custom_db_name, custom_fasta_loc, skip_trnascan, keep_tmp_dir,
        threads, verbose: Passed through to ``annotate_fastas``.
        use_uniref, low_mem_mode: Passed to ``filter_db_locs``.

    Raises:
        ValueError: If ``prodigal_mode`` is not a supported mode, if a
            sequence id contains ``=`` or ``;``, or if VirSorter results were
            given but contain no entry for a sequence.
    """
    # set up
    start_time = datetime.now()
    print('%s: Viral annotation started' % str(datetime.now()))

    # check inputs
    prodigal_modes = ['train', 'meta', 'single']
    if prodigal_mode not in prodigal_modes:
        raise ValueError('Prodigal mode must be one of %s.' %
                         ', '.join(prodigal_modes))
    elif prodigal_mode == 'single':
        # 'normal' used to be tested here as well, but it is not an accepted
        # mode and so could never reach this branch
        warnings.warn(
            'When running prodigal in single mode your bins must have long contigs (average length >3 Kbp), '
            'be long enough (total length > 500 Kbp) and have very low contamination in order for prodigal '
            'training to work well.')

    # get database locations
    db_locs = get_database_locs()
    db_handler = DatabaseHandler(db_locs['description_db'])
    db_locs_anno = filter_db_locs(db_locs, low_mem_mode, use_uniref,
                                  VMAG_DBS_TO_ANNOTATE)

    if virsorter_affi_contigs is not None:
        virsorter_hits = get_virsorter_hits(virsorter_affi_contigs)
    else:
        virsorter_hits = None
    # build the lookup once instead of scanning the name column per contig
    if virsorter_hits is not None:
        virsorter_names = set(virsorter_hits['name'])
    else:
        virsorter_names = None

    # split sequences into separate fastas
    mkdir(output_dir)
    contig_dir = path.join(output_dir, 'vMAGs')
    mkdir(contig_dir)
    contig_locs = list()
    for seq in read_sequence(input_fasta, format='fasta'):
        if len(seq) < min_contig_size:
            continue
        seq_id = seq.metadata['id']
        if '=' in seq_id or ';' in seq_id:
            raise ValueError(
                'FASTA headers must not have = or ; before the first space (%s). To run DRAM-v you '
                'must rerun VIRSorter with = and ; removed from the headers or run DRAM-v.py '
                'remove_bad_characters and then rerun DRAM-v' % seq_id)
        if (virsorter_names is not None and
                get_virsorter_affi_contigs_name(seq_id) not in virsorter_names):
            raise ValueError(
                "No virsorter calls found in %s for scaffold %s from input fasta"
                % (virsorter_affi_contigs, seq_id))
        contig_loc = path.join(contig_dir, '%s.fasta' % seq_id)
        # NOTE(review): the single sequence is wrapped in a generator as in
        # the original call; presumably the fasta writer requires one - confirm
        write_sequence((i for i in [seq]), format='fasta', into=contig_loc)
        contig_locs.append(contig_loc)

    # annotate vMAGs
    rename_bins = False
    annotations = annotate_fastas(contig_locs, output_dir, db_locs_anno,
                                  db_handler, min_contig_size, prodigal_mode,
                                  trans_table, bit_score_threshold,
                                  rbh_bit_score_threshold, custom_db_name,
                                  custom_fasta_loc, skip_trnascan, rename_bins,
                                  keep_tmp_dir, start_time, threads, verbose)
    print('%s: Annotations complete, processing annotations' %
          str(datetime.now() - start_time))

    # setting up scoring viral genes
    amg_database_frame = pd.read_csv(db_locs['amg_database'], sep='\t')
    genome_summary_form = pd.read_csv(db_locs['genome_summary_form'],
                                      sep='\t',
                                      index_col=0)
    # keep only the rows marked as potential AMGs
    genome_summary_form = genome_summary_form.loc[
        genome_summary_form.potential_amg]

    # add auxiliary score
    if virsorter_hits is not None:
        gene_virsorter_category_dict = dict()
        gene_auxiliary_score_dict = dict()
        for scaffold, dram_frame in annotations.groupby('scaffold'):
            virsorter_scaffold_name = get_virsorter_affi_contigs_name(scaffold)
            virsorter_frame = virsorter_hits.loc[virsorter_hits.name ==
                                                 virsorter_scaffold_name]
            gene_order = get_gene_order(dram_frame, virsorter_frame)
            gene_virsorter_category_dict.update({
                dram_gene: virsorter_category
                for dram_gene, _, virsorter_category in gene_order
                if dram_gene is not None
            })
            gene_auxiliary_score_dict.update(
                calculate_auxiliary_scores(gene_order))
        annotations['virsorter_category'] = pd.Series(
            gene_virsorter_category_dict)
        annotations['auxiliary_score'] = pd.Series(gene_auxiliary_score_dict)

    # get metabolic flags
    # lengths cover every sequence in the input, including skipped short ones
    scaffold_length_dict = {
        seq.metadata['id']: len(seq)
        for seq in read_sequence(input_fasta, format='fasta')
    }
    metabolic_genes = set(genome_summary_form.index)
    if 'pfam_hits' in annotations:
        annotations['is_transposon'] = [
            is_transposon(i) for i in annotations['pfam_hits']
        ]
    else:
        annotations['is_transposon'] = False

    amgs = get_amg_ids(amg_database_frame)
    verified_amgs = get_amg_ids(
        amg_database_frame.loc[amg_database_frame.verified])
    annotations['amg_flags'] = pd.Series(
        get_metabolic_flags(annotations, metabolic_genes, amgs, verified_amgs,
                            scaffold_length_dict))

    # downgrade B flag auxiliary scores
    # guarded on the same condition that created the auxiliary_score column
    if virsorter_hits is not None:
        annotations['auxiliary_score'] = pd.Series({
            gene: (4 if 'B' in row['amg_flags'] and row['auxiliary_score'] < 4
                   else row['auxiliary_score'])
            for gene, row in annotations.iterrows()
        })

    # write annotations
    annotations.to_csv(path.join(output_dir, 'annotations.tsv'), sep='\t')

    print("%s: Completed annotations" % str(datetime.now() - start_time))
Exemple #6
0
def annotate_vgfs(input_fasta,
                  virsorter_affi_contigs=None,
                  output_dir='.',
                  min_contig_size=2500,
                  prodigal_mode='meta',
                  trans_table='11',
                  bit_score_threshold=60,
                  rbh_bit_score_threshold=350,
                  custom_db_name=(),
                  custom_fasta_loc=(),
                  use_uniref=False,
                  low_mem_mode=False,
                  skip_trnascan=False,
                  keep_tmp_dir=True,
                  threads=10,
                  verbose=True):
    """Annotate viral contigs and write the scored annotations table.

    Each sequence of at least ``min_contig_size`` is written to its own fasta
    under ``<output_dir>/vMAGs``, the fastas are annotated with
    ``annotate_fastas``, scores and flags are added by
    ``add_dramv_scores_and_flags``, and the table is written to
    ``<output_dir>/annotations.tsv``.

    Args:
        input_fasta: Fasta input read with ``read_sequence``; also passed to
            ``add_dramv_scores_and_flags``.
        virsorter_affi_contigs: Passed to ``get_virsorter_hits``. When None,
            ``add_dramv_scores_and_flags`` receives None for the hits.
        output_dir: Directory that receives all output.
        min_contig_size: Shorter sequences are skipped; also passed on to
            ``annotate_fastas``.
        prodigal_mode: One of ``'train'``, ``'meta'`` or ``'single'``.
        trans_table, bit_score_threshold, rbh_bit_score_threshold,
        custom_db_name, custom_fasta_loc, skip_trnascan, keep_tmp_dir,
        threads, verbose: Passed through to ``annotate_fastas``.
        use_uniref, low_mem_mode: Passed to ``filter_db_locs``.

    Raises:
        ValueError: If ``prodigal_mode`` is not a supported mode, if a
            sequence id contains ``=`` or ``;``, or if VirSorter results were
            given but contain no entry for a sequence.
    """
    # set up
    start_time = datetime.now()
    print('%s: Viral annotation started' % str(datetime.now()))

    # check inputs
    prodigal_modes = ['train', 'meta', 'single']
    if prodigal_mode not in prodigal_modes:
        raise ValueError('Prodigal mode must be one of %s.' %
                         ', '.join(prodigal_modes))
    elif prodigal_mode == 'single':
        # 'normal' used to be tested here as well, but it is not an accepted
        # mode and so could never reach this branch
        warnings.warn(
            'When running prodigal in single mode your bins must have long contigs (average length >3 Kbp), '
            'be long enough (total length > 500 Kbp) and have very low contamination in order for prodigal '
            'training to work well.')

    # get database locations
    db_locs = get_database_locs()
    db_handler = DatabaseHandler(db_locs['description_db'])
    db_locs_anno = filter_db_locs(db_locs, low_mem_mode, use_uniref,
                                  VMAG_DBS_TO_ANNOTATE)

    if virsorter_affi_contigs is not None:
        virsorter_hits = get_virsorter_hits(virsorter_affi_contigs)
    else:
        virsorter_hits = None
    # build the lookup once instead of scanning the name column per contig
    if virsorter_hits is not None:
        virsorter_names = set(virsorter_hits['name'])
    else:
        virsorter_names = None

    # split sequences into separate fastas
    mkdir(output_dir)
    contig_dir = path.join(output_dir, 'vMAGs')
    mkdir(contig_dir)
    contig_locs = list()
    for seq in read_sequence(input_fasta, format='fasta'):
        if len(seq) < min_contig_size:
            continue
        seq_id = seq.metadata['id']
        if '=' in seq_id or ';' in seq_id:
            raise ValueError(
                'FASTA headers must not have = or ; before the first space (%s). To run DRAM-v you '
                'must rerun VIRSorter with = and ; removed from the headers or run DRAM-v.py '
                'remove_bad_characters and then rerun DRAM-v' % seq_id)
        if (virsorter_names is not None and
                get_virsorter_affi_contigs_name(seq_id) not in virsorter_names):
            raise ValueError(
                "No virsorter calls found in %s for scaffold %s from input fasta"
                % (virsorter_affi_contigs, seq_id))
        contig_loc = path.join(contig_dir, '%s.fasta' % seq_id)
        # NOTE(review): the single sequence is wrapped in a generator as in
        # the original call; presumably the fasta writer requires one - confirm
        write_sequence((i for i in [seq]), format='fasta', into=contig_loc)
        contig_locs.append(contig_loc)

    # annotate vMAGs
    rename_bins = False
    annotations = annotate_fastas(contig_locs, output_dir, db_locs_anno,
                                  db_handler, min_contig_size, prodigal_mode,
                                  trans_table, bit_score_threshold,
                                  rbh_bit_score_threshold, custom_db_name,
                                  custom_fasta_loc, skip_trnascan, rename_bins,
                                  keep_tmp_dir, start_time, threads, verbose)
    print('%s: Annotations complete, assigning auxiliary scores and flags' %
          str(datetime.now() - start_time))

    annotations = add_dramv_scores_and_flags(annotations, db_locs,
                                             virsorter_hits, input_fasta)

    # write annotations
    annotations.to_csv(path.join(output_dir, 'annotations.tsv'), sep='\t')

    print("%s: Completed annotations" % str(datetime.now() - start_time))
Exemple #7
0
def test_get_database_locs():
    """The default locations load as a dict that has a 'description_db' key."""
    db_locs = get_database_locs()
    assert type(db_locs) is dict
    assert 'description_db' in db_locs
Exemple #8
0
def summarize_genomes(input_file,
                      trna_path=None,
                      rrna_path=None,
                      output_dir='.',
                      groupby_column='fasta',
                      custom_distillate=None,
                      distillate_gene_names=False):
    """Distill genome annotations into statistics, a summary and heatmaps.

    Writes ``genome_stats.tsv``, ``metabolism_summary.xlsx``, ``product.tsv``
    and one or more product heatmaps (``product.html``, or
    ``product_<i>.html`` when there are more than ``GENOMES_PER_LIQUOR``
    genomes) into ``output_dir``.

    Args:
        input_file: Tab-separated annotations file; its first column is used
            as the index.
        trna_path: Optional tab-separated tRNA table.
        rrna_path: Optional tab-separated rRNA table.
        output_dir: Directory that receives the output files.
        groupby_column: Annotations column that identifies a genome.
        custom_distillate: Optional tab-separated table appended to the
            genome summary form.
        distillate_gene_names: When true the summary is built with
            ``fill_genome_summary_frame_gene_names`` instead of
            ``make_genome_summary``.

    Raises:
        ValueError: If a required form or database location is not
            configured.
    """
    start_time = datetime.now()

    # read in data
    annotations = pd.read_csv(input_file, sep='\t', index_col=0)
    if 'bin_taxonomy' in annotations:
        # a stable sort keeps the original row order within each taxonomy
        annotations = annotations.sort_values('bin_taxonomy',
                                              kind='mergesort')

    if trna_path is None:
        trna_frame = None
    else:
        trna_frame = pd.read_csv(trna_path, sep='\t')
    if rrna_path is None:
        rrna_frame = None
    else:
        rrna_frame = pd.read_csv(rrna_path, sep='\t')

    # get db_locs and read in dbs
    db_locs = get_database_locs()
    # An unset location can be present with a value of None, so the value is
    # tested rather than only the presence of the key.
    required_locations = (
        ('genome_summary_form',
         'Genome summary form location must be set in order to summarize genomes'
         ),
        ('module_step_form',
         'Module step form location must be set in order to summarize genomes'
         ),
        ('function_heatmap_form',
         'Functional heat map form location must be set in order to summarize genomes'
         ),
        ('etc_module_database',
         'ETC module database location must be set in order to summarize genomes'
         ),
    )
    for location_key, message in required_locations:
        if db_locs.get(location_key) is None:
            raise ValueError(message)

    # read in dbs
    genome_summary_form = pd.read_csv(db_locs['genome_summary_form'], sep='\t')
    if custom_distillate is not None:
        genome_summary_form = pd.concat(
            [genome_summary_form,
             pd.read_csv(custom_distillate, sep='\t')])
    genome_summary_form = genome_summary_form.drop('potential_amg', axis=1)
    module_steps_form = pd.read_csv(db_locs['module_step_form'], sep='\t')
    function_heatmap_form = pd.read_csv(db_locs['function_heatmap_form'],
                                        sep='\t')
    etc_module_df = pd.read_csv(db_locs['etc_module_database'], sep='\t')
    print('%s: Retrieved database locations and descriptions' %
          (str(datetime.now() - start_time)))

    # make output folder
    mkdir(output_dir)

    # make genome stats
    genome_stats = make_genome_stats(annotations,
                                     rrna_frame,
                                     trna_frame,
                                     groupby_column=groupby_column)
    genome_stats.to_csv(path.join(output_dir, 'genome_stats.tsv'),
                        sep='\t',
                        index=False)
    print('%s: Calculated genome statistics' %
          (str(datetime.now() - start_time)))

    # make genome metabolism summary
    genome_summary = path.join(output_dir, 'metabolism_summary.xlsx')
    if distillate_gene_names:
        summarized_genomes = fill_genome_summary_frame_gene_names(
            annotations, genome_summary_form, groupby_column)
    else:
        summarized_genomes = make_genome_summary(annotations,
                                                 genome_summary_form,
                                                 trna_frame, rrna_frame,
                                                 groupby_column)
    write_summarized_genomes_to_xlsx(summarized_genomes, genome_summary)
    print('%s: Generated genome metabolism summary' %
          (str(datetime.now() - start_time)))

    # make liquor
    if 'bin_taxonomy' in annotations:
        genome_order = get_ordered_uniques(
            annotations.sort_values('bin_taxonomy',
                                    kind='mergesort')[groupby_column])
        # if gtdb format then get phylum and most specific
        if all(i[:3] == 'd__' and len(i.split(';')) == 7
               for i in annotations['bin_taxonomy']):
            taxa_str_parser = get_phylum_and_most_specific
        # else just throw in what is there
        else:
            taxa_str_parser = lambda x: x
        # later rows overwrite earlier ones, so the last taxonomy seen for a
        # genome is the one that gets used as its label
        labels = make_strings_no_repeats({
            genome: taxa_str_parser(taxonomy)
            for genome, taxonomy in zip(annotations[groupby_column],
                                        annotations['bin_taxonomy'])
        })
    else:
        genome_order = get_ordered_uniques(
            annotations.sort_values(groupby_column)[groupby_column])
        labels = None

    # make module coverage frame
    module_nets = {
        module: build_module_net(module_df)
        for module, module_df in module_steps_form.groupby('module')
        if module in HEATMAP_MODULES
    }

    if len(genome_order) > GENOMES_PER_LIQUOR:
        module_coverage_dfs = list()
        etc_coverage_dfs = list()
        function_dfs = list()
        # generates slice start and slice end to grab from genomes and labels from 0 to end of genome order
        pairwise_iter = pairwise(
            list(range(0, len(genome_order), GENOMES_PER_LIQUOR)) +
            [len(genome_order)])
        for i, (start, end) in enumerate(pairwise_iter):
            genomes = genome_order[start:end]
            # set lookup avoids rescanning the chunk for every annotation row
            genomes_in_chunk = set(genomes)
            annotations_subset = annotations.loc[[
                genome in genomes_in_chunk
                for genome in annotations[groupby_column]
            ]]
            # group by the caller's column, as the single-heatmap branch does
            dfs = fill_liquor_dfs(annotations_subset,
                                  module_nets,
                                  etc_module_df,
                                  function_heatmap_form,
                                  groupby_column=groupby_column)
            module_coverage_df_subset, etc_coverage_df_subset, function_df_subset = dfs
            module_coverage_dfs.append(module_coverage_df_subset)
            etc_coverage_dfs.append(etc_coverage_df_subset)
            function_dfs.append(function_df_subset)
            liquor = make_liquor_heatmap(module_coverage_df_subset,
                                         etc_coverage_df_subset,
                                         function_df_subset, genomes, labels)
            liquor.save(path.join(output_dir, 'product_%s.html' % i))
        liquor_df = make_liquor_df(pd.concat(module_coverage_dfs),
                                   pd.concat(etc_coverage_dfs),
                                   pd.concat(function_dfs))
        liquor_df.to_csv(path.join(output_dir, 'product.tsv'), sep='\t')
    else:
        module_coverage_df, etc_coverage_df, function_df = fill_liquor_dfs(
            annotations,
            module_nets,
            etc_module_df,
            function_heatmap_form,
            groupby_column=groupby_column)
        liquor_df = make_liquor_df(module_coverage_df, etc_coverage_df,
                                   function_df)
        liquor_df.to_csv(path.join(output_dir, 'product.tsv'), sep='\t')
        liquor = make_liquor_heatmap(module_coverage_df, etc_coverage_df,
                                     function_df, genome_order, labels)
        liquor.save(path.join(output_dir, 'product.html'))
    print('%s: Generated product heatmap and table' %
          (str(datetime.now() - start_time)))
    print("%s: Completed distillation" % str(datetime.now() - start_time))
Exemple #9
0
def set_database_paths(kegg_db_loc=None,
                       kofam_hmm_loc=None,
                       kofam_ko_list_loc=None,
                       uniref_db_loc=None,
                       pfam_db_loc=None,
                       pfam_hmm_dat=None,
                       dbcan_db_loc=None,
                       dbcan_fam_activities=None,
                       viral_db_loc=None,
                       peptidase_db_loc=None,
                       vogdb_db_loc=None,
                       vog_annotations=None,
                       description_db_loc=None,
                       genome_summary_form_loc=None,
                       module_step_form_loc=None,
                       etc_module_database_loc=None,
                       function_heatmap_form_loc=None,
                       amg_database_loc=None,
                       start_time=None,
                       config_loc=None,
                       use_current_locs=True,
                       update_description_db=False):
    """Record database and form locations in the CONFIG file.

    Every ``*_loc`` / database argument is handed to
    ``check_exists_and_add_to_location_dict`` under its key, and the
    resulting mapping is written as JSON to ``config_loc``.

    Args:
        kegg_db_loc ... amg_database_loc: Locations to record, one per
            database or form.
        start_time: Reference time for the elapsed-time prefix of progress
            messages; defaults to now.
        config_loc: File to write; defaults to ``get_config_loc()``.
        use_current_locs: Start from ``get_database_locs()`` when true,
            otherwise from an empty mapping.
        update_description_db: When true, ``populate_description_db`` is run
            before the description database location is recorded.
    """
    if start_time is None:
        start_time = datetime.now()
    print('%s: Setting database paths' % str(datetime.now() - start_time))
    db_dict = get_database_locs() if use_current_locs else {}

    # (new location, key in the location dict), in registration order
    locations_to_register = (
        (kegg_db_loc, 'kegg'),
        (kofam_hmm_loc, 'kofam'),
        (kofam_ko_list_loc, 'kofam_ko_list'),
        (uniref_db_loc, 'uniref'),
        (pfam_db_loc, 'pfam'),
        (pfam_hmm_dat, 'pfam_hmm_dat'),
        (dbcan_db_loc, 'dbcan'),
        (dbcan_fam_activities, 'dbcan_fam_activities'),
        (viral_db_loc, 'viral'),
        (peptidase_db_loc, 'peptidase'),
        (vogdb_db_loc, 'vogdb'),
        (vog_annotations, 'vog_annotations'),
        (genome_summary_form_loc, 'genome_summary_form'),
        (module_step_form_loc, 'module_step_form'),
        (etc_module_database_loc, 'etc_module_database'),
        (function_heatmap_form_loc, 'function_heatmap_form'),
        (amg_database_loc, 'amg_database'),
    )
    for new_loc, location_key in locations_to_register:
        db_dict = check_exists_and_add_to_location_dict(
            new_loc, location_key, db_dict)
    print('%s: Database locations added to CONFIG' %
          str(datetime.now() - start_time))

    # the description database is registered last, after it may have been
    # rebuilt below
    if update_description_db:
        if description_db_loc is None:
            description_db_loc = db_dict['description_db']
        populate_description_db(description_db_loc, db_dict, start_time)
        print('%s: Database descriptions updated' %
              str(datetime.now() - start_time))
    db_dict = check_exists_and_add_to_location_dict(description_db_loc,
                                                    'description_db', db_dict)

    # change data paths
    if config_loc is None:
        config_loc = get_config_loc()
    with open(config_loc, 'w') as config_file:
        config_file.write(json.dumps(db_dict))
    print('%s: Database locations set' % str(datetime.now() - start_time))