def get_Newick_tree(cluster_hierachy, DataMatrix, labeltext, output):
	"""
	Save a clustering as a Newick file together with the list of its leaf names.

	The clustering is converted into a tree with ``sch.to_tree``, serialised by
	``generateNewick`` and written to ``<output>.nwk``. The Newick string is then
	parsed back with ``Phylo.read`` and the names of its terminal nodes are
	written to ``<output>.leaves.txt``.

	:param cluster_hierachy: Clustering result handed to ``sch.to_tree``.
	:param DataMatrix: Forwarded unchanged to ``generateNewick``.
	:param labeltext: Forwarded unchanged to ``generateNewick``.
	:param output: Prefix (path without extension) of the two files generated.

	:returns: Nothing; the results are only written to disk.

	.. seealso:: This function depends on other functions called:

		- ``HCGB_main.printList2file``

		- ``generateNewick``

	"""

	## hierarchical clustering -> tree object -> Newick string
	root_node = sch.to_tree(cluster_hierachy, False)
	newick_string = generateNewick(root_node, "", DataMatrix, labeltext)

	## Newick tree into <output>.nwk
	HCGB_main.printList2file(output + '.nwk', [newick_string])

	## read the Newick string back and collect the name of every terminal node
	parsed_tree = Phylo.read(StringIO(newick_string), "newick")
	leaf_names = [terminal.name for terminal in parsed_tree.get_terminals()]

	## NOTE: flagged by the original author as a known issue:
	## the list is printed alphabetically ordered
	HCGB_main.printList2file(output + '.leaves.txt', leaf_names)
Beispiel #2
0
def generate_sketch(folder, assembly, entry, ksize, n_sketch, Debug):
    """
    Sketch a single assembly and record how the sketch was produced.

    ``min_hash_caller.sketch_database`` is called for the one-entry dictionary
    ``{entry: assembly}``. A hidden file ``<folder>/.original`` is then written
    containing the relative assembly path, the kmer size and the sketch number.

    :param folder: Folder given to ``sketch_database``; also where ``.original`` is written.
    :param assembly: Path to the assembly file.
    :param entry: Key used for the assembly in the dictionary that is sketched.
    :param ksize: Kmer size, forwarded to ``sketch_database``.
    :param n_sketch: Number of sketches, forwarded to ``sketch_database``.
    :param Debug: Debug flag, forwarded to ``sketch_database``.

    :returns: Tuple with the first element of each of the two collections
        returned by ``sketch_database``.
    """
    sketch_result = min_hash_caller.sketch_database({entry: assembly},
                                                    folder, Debug, ksize,
                                                    n_sketch)
    (sigfile, siglist) = sketch_result

    ## hidden file recording the origin of the sketch
    origin_file = folder + '/.original'

    ## Only a relative path is stored because the database can be moved:
    ## as long as the folder organization is maintained it keeps working.
    origin_details = [
        "../assembly/" + os.path.basename(assembly),
        str(ksize),
        str(n_sketch),
    ]
    HCGB_main.printList2file(origin_file, origin_details)

    return (sigfile[0], siglist[0])
Beispiel #3
0
def install_R_packages(package, source, install_path, extra):
	"""
	Install an R package into ``install_path`` by running an installer script with Rscript.

	The installer script is chosen from the pair returned by
	``get_install_R_files()``: the second one when ``source`` is ``'github'``
	(the package name is then prefixed with ``extra + '/'``), the first one
	otherwise.

	After the system call, the function checks for a folder named ``MLSTar``
	inside ``install_path``. This check is hard-coded to ``MLSTar`` whatever
	``package`` is. If the folder exists, ``install_path`` is written to
	``R/R_package.info.txt`` next to this module; otherwise an error message
	is printed.

	:param package: Name of the R package to install.
	:param source: Origin of the package; only ``'github'`` is treated specially.
	:param install_path: Folder where the package is installed (created if needed).
	:param extra: Prefix added to ``package`` for github sources.

	:returns: Nothing.
	"""
	(default_installer, github_installer) = get_install_R_files()

	HCGB_files.create_folder(install_path)
	rscript_bin = set_config.get_exe('Rscript')
	print("+ Installing %s package..." %package)

	## choose installer script; github packages are given as <extra>/<package>
	if source == 'github':
		installer_script = github_installer
		package = extra + '/' + package
	else:
		installer_script = default_installer

	HCGB_sys.system_call('%s %s -l %s -p %s' %(rscript_bin, installer_script, package, install_path))

	## check if exists or ask the user to install it manually
	mlstar_dir = os.path.join(install_path, 'MLSTar')
	if not os.path.exists(mlstar_dir):
		print_error_message(package, "No R package found", 'package')
		print ('Please install manually to proceed...')
		return

	## record where the package was installed
	module_dir = os.path.dirname(os.path.realpath(__file__))
	info_file = os.path.join(module_dir, 'R', 'R_package.info.txt')
	HCGB_main.printList2file(info_file, [install_path])
def multiQC_module_call(givenList, name, path, option):
    """
    Write the sample list to disk and launch the multiQC report on it.

    The entries of ``givenList`` are saved in ``<path>/samples.txt`` and that
    file is handed to ``multiQC_call`` together with the remaining arguments.

    :param givenList: List of folders to search for the multiQC report.
    :param name: Name to include in the html report.
    :param path: Absolute path of the output folder.
    :param option: Options forwarded to ``multiQC_call``.

    :type givenList: list
    :type name: string
    :type path: string
    :type option: string

    .. seealso:: This function depends on other functions called:

        - ``HCGB_main.printList2file``

        - ``multiQC_call``

    """
    samples_file = '/'.join([path, 'samples.txt'])
    HCGB_main.printList2file(samples_file, givenList)
    multiQC_call(samples_file, name, path, option)
Beispiel #5
0
def parse_options(arg_dict):
    """Build the ``df_accID`` table for the input option selected by the user.

    The input options are tested in this order and only the first one set is
    processed:

    1. ``arg_dict.annot_file``: a single annotation file, or (with
       ``arg_dict.batch``) a comma-separated file listing several of them.
       One row is added per entry; for GenBank files taxonomy and organism
       are read from the file, for GFF files the genome is taken from
       ``arg_dict.ref_file`` when given.
    2. ``arg_dict.GenBank_id``: a single NCBI GenBank ID, or (with
       ``arg_dict.batch``) a file listing several IDs. Entries are downloaded
       through ``BacDup.scripts.NCBI_downloader``.
    3. ``arg_dict.tax_id``: a single NCBI taxonomy ID, or (with
       ``arg_dict.batch``) a file listing several of them. The entries
       selected are written to ``<output_folder>/info/input`` and, unless
       ``arg_dict.dry_run`` is set, downloaded.
    4. ``arg_dict.project``: previous analysis folder. Not implemented yet
       (TODO): an empty table is returned.

    :param arg_dict: Parsed command line options. Attributes read here:
        ``output_folder``, ``annot_file``, ``batch``, ``debug``,
        ``sample_name``, ``ref_file``, ``GenBank_id``, ``db_folder``,
        ``assembly_level``, ``tax_id``, ``k_random``, ``section``,
        ``dry_run`` and ``project``. ``annot_file``, ``ref_file`` and
        ``GenBank_id`` (batch mode) are replaced in place by their absolute
        paths.

    :returns: Dataframe with the information retrieved, indexed by ``new_name``.
    :raises SystemExit: In dry-run mode, after listing the taxonomy entries.
    """
    ## local import: only required to stop the process in dry-run mode
    import sys

    outdir = os.path.abspath(arg_dict.output_folder)

    ## TODO: Now set as mutually_exclusive group. It might be Set to multiple options
    ## ATTENTION: df_accID merge generated dataframe

    ## Start from an empty table so that df_accID is always defined when it is
    ## indexed at the end, even for options that do not fill it yet (project).
    ## Same columns as the table filled for annotation files, which is the one
    ## indexed by 'new_name' below.
    df_accID = pd.DataFrame(
        columns=(BacDup_functions.columns_accID_table()))

    ## --------------------------------------- ##
    ## GFF or GBF file
    ## --------------------------------------- ##
    if (arg_dict.annot_file):
        arg_dict.annot_file = os.path.abspath(arg_dict.annot_file)

        # *************************** ##
        ## multiple files provided
        # *************************** ##
        if (arg_dict.batch):
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple annotation file provided option:',
                              'yellow')
                debug_message('arg_dict.annot_file: ' + arg_dict.annot_file,
                              'yellow')

            ## check if ok
            BacDup_functions.file_readable_check(arg_dict.annot_file)

            print(
                colored('\t* Multiple annotation files provided .......[OK]',
                        'green'))
            dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')

            ## debug messages
            if (arg_dict.debug):
                debug_message('dict_entries: ', 'yellow')
                debug_message(dict_entries, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++\n\n')

        # *************************** ##
        ## single file provided
        # *************************** ##
        else:
            dict_entries = {}
            print(colored('\t* Annotation file:.......[OK]', 'green'))
            if (arg_dict.sample_name):
                sample_name = arg_dict.sample_name
            else:
                sample_name = "sample"

            ##
            dict_entries[sample_name] = arg_dict.annot_file

        ## batch table of reference genomes: read from disk only once,
        ## the first time a GFF entry requires it
        ref_entries = None

        for name, file_annot in dict_entries.items():
            file_annot = os.path.abspath(file_annot)

            ## init all
            genome = ""
            prot = ""
            plasmid_count = ""
            plasmid_id = ""

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message(
                    'dict_entries check annotation files provided option:',
                    'yellow')
                debug_message('name: ' + name, 'yellow')
                debug_message('file_annot: ' + file_annot, 'yellow')

            ## check file is valid
            BacDup_functions.file_readable_check(file_annot)

            ## get format
            file_format = format_checker.is_format(file_annot, arg_dict.debug)

            if (arg_dict.debug):
                debug_message('format: ' + file_format, 'yellow')

            ## parse accordingly
            taxonomy = ""
            organism = ""
            taxonomy_string = ""
            genus = ""
            if (file_format == 'gbk'):
                ## get information from each sample
                (taxonomy,
                 organism) = BacDup.scripts.functions.get_gbk_information(
                     file_annot, arg_dict.debug)
                ## plasmid_count, plasmid_id not available

            elif (file_format == 'gff'):
                if (arg_dict.ref_file):
                    arg_dict.ref_file = os.path.abspath(arg_dict.ref_file)
                    BacDup_functions.file_readable_check(arg_dict.ref_file)

                    if (arg_dict.batch):
                        if ref_entries is None:
                            ref_entries = HCGB_main.file2dictionary(
                                arg_dict.ref_file, ',')
                        genome = ref_entries[name]
                    else:
                        genome = arg_dict.ref_file

            ## save into dataframe
            if len(taxonomy) > 1:
                ## last taxonomy entry is used as genus
                genus = taxonomy[-1]
                taxonomy_string = ";".join(taxonomy)

            dir_path = os.path.abspath(os.path.dirname(file_annot))
            df_accID.loc[len(df_accID)] = (name, dir_path, genus, organism,
                                           taxonomy_string, genome, file_annot,
                                           file_format, prot, plasmid_count,
                                           ";".join(plasmid_id))

    ## --------------------------------------- ##
    ## NCBI RefSeq/Genbank IDs: GCA_XXXXXXXX.1; GCF_XXXXXXXXX.1
    ## --------------------------------------- ##
    elif (arg_dict.GenBank_id):
        ## get database path
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", outdir)

        ## debug messages
        if (arg_dict.debug):
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message('GenBank ID option:', 'yellow')
            debug_message('db_folder: ' + db_folder, 'yellow')

        # *************************** ##
        ## batch file
        # *************************** ##
        if (arg_dict.batch):
            arg_dict.GenBank_id = os.path.abspath(arg_dict.GenBank_id)

            ## debug messages
            if (arg_dict.debug):
                debug_message('GenBank ID batch file provided:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.GenBank_id)

            print(
                colored('\t* Multiple NCBI GenBank IDs in a file .......[OK]',
                        'green'))
            print()

            ## call IDs into a list (empty entries discarded)
            strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
            strains2get = list(filter(None, strains2get))

            ## debug messages
            if (arg_dict.debug):
                debug_message('strains2get: ' + str(strains2get), 'yellow')

            ## call NCBI_downloader
            df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
                strains2get, db_folder, arg_dict.debug,
                arg_dict.assembly_level)

        # *************************** ##
        ## single GenBank ID
        # *************************** ##
        else:
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Single NCBI GenBank IDs provided option:',
                              'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id,
                              'yellow')
                debug_message('db_folder: ' + db_folder, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++')

            ## download
            print(colored('\t* A NCBI GenBank ID:.......[OK]', 'green'))
            print()
            HCGB_aes.print_sepLine("+", 75, False)
            df_accID = BacDup.scripts.NCBI_downloader.NCBIdownload(
                arg_dict.GenBank_id, db_folder, arg_dict.debug)

    ## --------------------------------------- ##
    ## NCBI Taxonomy ID:
    ## --------------------------------------- ##
    elif (arg_dict.tax_id):
        #################
        ## get tax ids
        #################
        if (arg_dict.batch):
            print(
                colored('\t* Multiple NCBI Taxonomy IDs in a file .......[OK]',
                        'green'))

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple NCBI Taxonomy IDs provided option:',
                              'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.tax_id)

            ## get IDs into a list
            taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)

        else:
            print(colored('\t* A NCBI Taxonomy ID:.......[OK]', 'green'))
            taxIDs2get = [arg_dict.tax_id]

        print()

        ##################################
        ## init ete NCBI taxonomy database
        ##################################
        print('+ Initiate NCBI taxonomy database...')
        ncbi = taxonomy_retrieval.init_db_object(arg_dict.debug)

        string_info_total = []
        for taxid in taxIDs2get:
            ## parse
            info = taxonomy_retrieval.parse_taxid(taxid, ncbi, 'unravel',
                                                  arg_dict.debug)
            print()

            ## debug messages
            if arg_dict.debug:
                debug_message(
                    "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
                )
                debug_message('info\n', "yellow")
                print(info)

            ## append if more
            string_info_total.extend(info)

        ## convert to list of strings
        string_info_total = [str(tax_item) for tax_item in string_info_total]

        ## assume all belong to same superkingdom if children of same tax_id
        group_obtained = taxonomy_retrieval.get_superKingdom(
            string_info_total[0], ncbi, arg_dict.debug)

        #################
        ## get database path
        #################
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(
                os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", outdir)

        ## debug messages
        if arg_dict.debug:
            debug_message(
                "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
            )
            debug_message('group_obtained: ' + group_obtained, "yellow")
            debug_message('db_folder: ' + db_folder, "yellow")
            debug_message(
                'arg_dict.assembly_level: ' + arg_dict.assembly_level,
                "yellow")
            debug_message('arg_dict.section: ' + arg_dict.section, "yellow")

        ##################################
        ## get GenBank entries selected
        ##################################
        (strains2get,
         allstrains_available) = taxonomy_retrieval.get_GenBank_ids(
             db_folder,
             string_info_total,
             int(arg_dict.k_random),
             arg_dict.debug,
             assembly_level_given=arg_dict.assembly_level,
             group_given=group_obtained,
             section_given=arg_dict.section)

        ## print list and dictionary of possible and selected taxIDs
        info_dir = HCGB_files.create_subfolder("info", outdir)
        input_info_dir = HCGB_files.create_subfolder("input", info_dir)
        HCGB_main.printList2file(
            os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
        HCGB_main.printList2file(
            os.path.join(input_info_dir, 'all_entries.txt'),
            allstrains_available)

        ## stop here if dry_run
        if arg_dict.dry_run:
            print()
            HCGB_aes.print_sepLine("*", 75, False)
            print(
                "ATTENTION: Dry run mode selected. Stopping the process here.")
            HCGB_aes.print_sepLine("*", 75, False)
            print("+ All available entries listed and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'all_entries.txt'))
            print("+ Subset of entries generated and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'Downloaded.txt'))
            print(
                "\n\nIf random numbers selected, take into account re-running this process might produce different results.\n"
            )
            HCGB_aes.print_sepLine("*", 75, False)
            print()
            ## sys.exit() instead of the interactive helper exit(),
            ## which is only available when the site module is loaded
            sys.exit()

        #################
        ## call NCBI_downloader
        #################
        df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(
            strains2get, db_folder, arg_dict.debug, arg_dict.assembly_level)

    ## --------------------------------------- ##
    ## Previous BacDup analysis folder
    ## --------------------------------------- ##
    ## TODO
    elif (arg_dict.project):
        print(
            colored(
                '\t* A previous BacDup analysis project folder:.......[OK]',
                'green'))
        ## TODO: fill df_accID with the data stored in the project folder.
        ## Until then the empty table created above is returned.

    ## Returns dataframe with information
    df_accID = df_accID.set_index('new_name')
    return (df_accID)
def generate_db(file_abs_paths, name, fold_name, option, type_option, Debug, kma_bin):
	"""Generate a call to create or update index KMA databases for later kmer identification.

	When several files are provided, the ones not yet listed in
	``<fold_name>/<name>.db`` are written to a batch file and indexed together;
	the ``.db`` list is then updated. When a single file is provided, it is
	indexed unless ``check_db_indexed`` reports it as already indexed.

	:param file_abs_paths: List of absolute paths fasta genome files to include in the database. It must contain at least one entry.
	:param name: Database name.
	:param fold_name: Directory path to store database generated.
	:param option: Generate a new database (option = 'new') or add to pre-existing database (option = 'add'). If the database list file exists, 'add' is used automatically.
	:param type_option: Index genome fasta files one by one (type_option='single') or using a batch file containing multiple entries (type_option='batch').
	:param Debug: True/False for debugging messages.
	:param kma_bin: Binary executable for KMA software.

	:type file_abs_paths: list
	:type name: string
	:type fold_name: string
	:type option: string
	:type type_option: string
	:type Debug: bool
	:type kma_bin: string

	:returns: Path to the database: ``<fold_name>/<name>`` when several files
		are provided, the file itself when a single file is provided.
		``False`` if the indexing call reports a failure.

	.. seealso:: This function depends on other functions called:

		- ``HCGB_main.readList_fromFile``

		- ``HCGB_main.printList2file``

		- ``check_db_indexed``

		- ``index_database``

	"""

	print ('+ Updating the KMA database: ', name)

	## check
	if len(file_abs_paths) > 1:
		## Path returned for the multi-file database. It used to be left
		## undefined on this branch: the final 'return (file_name)' failed
		## right after a successful batch indexing.
		db_path = fold_name + '/' + name

		## read db in fold_name and get index files
		info = db_path + '.db'

		##
		lineList = []
		toIndexList = []
		indexedList = []

		###
		if os.path.exists(info):
			lineList = HCGB_main.readList_fromFile(info)
			option = 'add'

		known_names = set(lineList)
		for f in file_abs_paths:
			baseName = os.path.basename(f)

			## check if already index
			if baseName in known_names:
				print (colored('\t+ File %s is already available in database %s' %(baseName, name), 'green'))
				indexedList.append(f)
			else:
				toIndexList.append(f)

		if not toIndexList:
			print ('\n+ No new sequences were added to the database.')
			return (db_path)

		## generate batch and call
		info2 = fold_name + '/.batch_entries.txt'
		HCGB_main.printList2file(info2, toIndexList)
		status = index_database(info2, kma_bin, name, option, fold_name, type_option)

		## Update the list of indexed files. Duplicates are removed on the
		## base names (the list read from disk holds base names, the other
		## two hold full paths) and sorted to get a reproducible file.
		## NOTE(review): the list is updated even if 'status' reports a
		## failure; confirm whether that is intended.
		all_entries = list(lineList) + toIndexList + indexedList
		final_list_name = sorted({os.path.basename(f) for f in all_entries})
		HCGB_main.printList2file(info, final_list_name)
		count_files = len(toIndexList)
		print ('+ %s samples have been added to the database' %count_files)

	else:
		file_name = file_abs_paths[0]
		db_path = file_name

		## check if previously indexed
		status = check_db_indexed(file_name, fold_name)
		if (status): #true
			## debug message
			if (Debug):
				print (colored("**DEBUG: Database (%s) is indexed" %file_name + " **", 'yellow'))
			return (db_path)

		## not indexed yet
		## debug message
		if (Debug):
			print (colored("**DEBUG: Database (%s) is not indexed" %file_name + " **", 'yellow'))
		status = index_database(file_name, kma_bin, file_name, option, fold_name, type_option)

	## return
	if (status): #true
		return (db_path)
	else:
		return False