Esempio n. 1
0
def get_qmatrix_data(qmatrix_path):
    """Split a q-matrix file into its comment lines and its data lines.

    Every line is stripped of surrounding whitespace.  A line whose first
    tab-separated column is exactly '*' is a data line; every other line
    (blank lines included) is kept as a comment line.

    Args:
        qmatrix_path: path of the q-matrix file to read.

    Returns:
        A tuple ``(qmatrix_comments_list, qmatrix_data_list)`` holding the
        stripped lines in file order.

    If the path does not exist, a message is written to stderr and
    ``utils.die()`` is called (presumably terminating the program).
    """
    # check path exist
    if not utils.check_path_exist(qmatrix_path):
        sys.stderr.write("\n** Cannot open %s.\n" % qmatrix_path)
        utils.die("** Program exit!")
        return None

    qmatrix_comments_list = []
    qmatrix_data_list = []

    # The context manager guarantees the handle is closed even if a read
    # fails part-way through the file.
    with open(qmatrix_path) as qmatrix_file:
        for qmatrix_line in qmatrix_file:
            qmatrix_line = qmatrix_line.strip()
            line_columns = qmatrix_line.split('\t')

            # data rows are flagged by '*' in the first column
            if line_columns[0] == '*':
                qmatrix_data_list.append(qmatrix_line)
            else:
                qmatrix_comments_list.append(qmatrix_line)

    return (qmatrix_comments_list, qmatrix_data_list)
Esempio n. 2
0
def count_number_of_reads(reads_filepath, fasta_reads_flag):
    """Count the reads stored in a FASTA or FASTQ file.

    Args:
        reads_filepath: path of the reads file.
        fasta_reads_flag: truthy when the file is FASTA, falsy for FASTQ.

    Returns:
        The number of reads found.  For FASTA this is the number of lines
        starting with '>'.  For FASTQ this is the number of record header
        lines, i.e. lines at position 0, 4, 8, ... that start with '@'.

    If the path does not exist, a message is written to stderr and
    ``utils.die()`` is called (presumably terminating the program).
    """
    # check path exist
    if not utils.check_path_exist(reads_filepath):
        sys.stderr.write("\n** Cannot open %s.\n" % reads_filepath)
        utils.die("** Program exit!")
        return 0

    number_of_reads = 0

    with open(reads_filepath) as reads_file:
        if fasta_reads_flag:
            # fasta read id starts with >
            for read_line in reads_file:
                if read_line.strip().startswith('>'):
                    number_of_reads += 1
        else:
            # A FASTQ quality string may itself begin with '@' (Phred+33
            # score 31), so counting every '@' line over-counts reads.
            # Only the first line of each 4-line record is a header.
            # NOTE: assumes the conventional unwrapped 4-line FASTQ layout.
            for line_index, read_line in enumerate(reads_file):
                if line_index % 4 == 0 and read_line.strip().startswith('@'):
                    number_of_reads += 1

    return number_of_reads
Esempio n. 3
0
def get_bootstrap_gvector(bootstrap_gvector_path):
    """Read the per-genome chance values from a bootstrap g-vector file.

    Only lines whose first tab-separated column is '*' are used.  Column 1
    is parsed as the integer genome index and column 3 (the scaled
    percentage chance) as a float.

    Args:
        bootstrap_gvector_path: path of the bootstrap g-vector file.

    Returns:
        A dict mapping genome index (int) to genome chance (float).  When an
        index appears more than once the last occurrence wins.

    If the path does not exist, a message is written to stderr and
    ``utils.die()`` is called (presumably terminating the program).
    """
    # check path exist
    if not utils.check_path_exist(bootstrap_gvector_path):
        sys.stderr.write("\n** Cannot open %s!\n" % bootstrap_gvector_path)
        utils.die("** Program exit!")
        return None

    bootstrap_gvector_map = {}

    # The context manager closes the handle even if parsing raises.
    with open(bootstrap_gvector_path) as gvector_file:
        for gvector_line in gvector_file:
            gvector_columns = gvector_line.strip().split('\t')

            # read data
            if gvector_columns[0] == '*':
                genome_index = int(gvector_columns[1])
                # ScaledPercentageChance (column 3) is used rather than the
                # raw percentage chance in column 2.
                genome_chance = float(gvector_columns[3])
                bootstrap_gvector_map[genome_index] = genome_chance

    return bootstrap_gvector_map
Esempio n. 4
0
def check_aligned_reads(filename):
    """Confirm that the aligned-reads file is present.

    Args:
        filename: path to check with ``utils.check_path_exist``.

    Returns:
        True when the path exists.  Otherwise a message is written to
        stderr and ``utils.die()`` is called; should that call return, the
        result is None.
    """
    # Handle the failure case first so the success path is a plain return.
    if not utils.check_path_exist(filename):
        sys.stderr.write("\n** Cannot open %s.\n" % (filename))
        utils.die("** Program exit!")
        return None

    return True
Esempio n. 5
0
def get_target_genome(gvector_path, reconstruction_selection, reconstruction_cutoff_abundance, reconstruction_genome_name):
    """Select the genomes to reconstruct from a g-vector file.

    The file is read line by line, split on tabs:

    * ``@  Genome_Index  Genome_Name  ...`` rows fill ``genome_name_map``.
    * ``*  Genome_Index  ...`` rows fill ``genome_chance_map`` using column 3
      (the scaled percentage chance).

    Args:
        gvector_path: path of the g-vector file.
        reconstruction_selection: 1 selects every genome whose chance is
            >= ``reconstruction_cutoff_abundance``; any other value selects
            the genome named ``reconstruction_genome_name``.
        reconstruction_cutoff_abundance: cut-off used in mode 1.
        reconstruction_genome_name: genome name used in the other mode.

    Returns:
        A tuple ``(genome_name_map, genome_chance_map,
        target_genome_index_list)``.

    ``utils.die()`` is called (presumably terminating the program) when the
    file is missing, when the requested genome name is not found in
    single-genome mode, or when no target genome was selected.
    """
    # genome_name_map(key:genome_index, val:genome_name)
    genome_name_map = {}
    # genome_chance_map(key:genome_index, val:scaled percentage chance)
    genome_chance_map = {}
    # indexes of the genomes selected for reconstruction
    target_genome_index_list = []
    # set once reconstruction_genome_name is seen (single-genome mode only)
    reconstruction_genome_name_exist_tag = False
    select_by_cutoff = (reconstruction_selection == 1)

    # check g-vector file
    if not utils.check_path_exist(gvector_path):
        utils.die("** Error: No such file or directory: " + gvector_path)

    with open(gvector_path, 'r') as f:
        for gvector_line in f:
            gvector_columns = gvector_line.strip().split('\t')

            # '@' row: @  Genome_Index  Genome_Name  Alignment_Rate
            if gvector_columns[0] == '@':
                genome_index = int(gvector_columns[1])
                genome_name_map[genome_index] = gvector_columns[2]
            # '*' row: column 3 holds the ScaledPercentageChance
            elif gvector_columns[0] == '*':
                genome_index = int(gvector_columns[1])
                genome_chance = float(gvector_columns[3])
                genome_chance_map[genome_index] = genome_chance

                if select_by_cutoff:
                    # reconstruct all genomes >= cut-off
                    if genome_chance >= reconstruction_cutoff_abundance:
                        target_genome_index_list.append(genome_index)
                # reconstruct one target genome; an index with no '@' row
                # has no name and therefore cannot match the request
                elif genome_name_map.get(genome_index) == reconstruction_genome_name:
                    reconstruction_genome_name_exist_tag = True
                    target_genome_index_list.append(genome_index)

    # The name check only makes sense in single-genome mode; applying it in
    # cut-off mode (where the tag is never set) rejected every valid run.
    if not select_by_cutoff and not reconstruction_genome_name_exist_tag:
        utils.die("** Check the config file! Reconstruction_Genome_Name value does not match to the SIGMA gvector results!")

    if len(target_genome_index_list) == 0:
        utils.die("** No target genomes exist for reconstruction!")

    return (genome_name_map, genome_chance_map, target_genome_index_list)
Esempio n. 6
0
def get_filtering_target_genome(gvector_path, filtering_genome_name):
    """Find the genome to filter by name in a g-vector file.

    The file is read line by line, split on tabs:

    * ``@  Genome_Index  Genome_Name  ...`` rows fill ``genome_name_map``.
    * ``*  Genome_Index  Percentage_Chance  ...`` rows fill
      ``genome_chance_map`` using column 2 (the unscaled percentage chance).

    Args:
        gvector_path: path of the g-vector file.
        filtering_genome_name: name of the genome to select.

    Returns:
        A tuple ``(genome_name_map, genome_chance_map,
        target_genome_index_list)``.

    ``utils.die()`` is called (presumably terminating the program) when the
    file is missing or when ``filtering_genome_name`` matches no '*' row.
    """
    # genome_name_map(key:genome_index, val:genome_name)
    genome_name_map = {}
    # genome_chance_map(key:genome_index, val:percentage chance)
    genome_chance_map = {}
    # indexes of the genomes selected for filtering
    target_genome_index_list = []
    # set once filtering_genome_name is seen
    filtering_genome_name_exist_tag = False

    # check g-vector file
    if not utils.check_path_exist(gvector_path):
        utils.die("** Error: No such file or directory: " + gvector_path)

    # Indentation is spaces only: the original mixed tabs and spaces,
    # which Python 3 rejects with TabError.
    with open(gvector_path, 'r') as f:
        for gvector_line in f:
            gvector_columns = gvector_line.strip().split('\t')

            # '@' row: @  Genome_Index  Genome_Name  Alignment_Rate
            if gvector_columns[0] == '@':
                genome_index = int(gvector_columns[1])
                genome_name_map[genome_index] = gvector_columns[2]
            # '*' row: column 2 holds the PercentageChance
            elif gvector_columns[0] == '*':
                genome_index = int(gvector_columns[1])
                genome_chance_map[genome_index] = float(gvector_columns[2])

                # an index with no '@' row has no name and cannot match
                if genome_name_map.get(genome_index) == filtering_genome_name:
                    filtering_genome_name_exist_tag = True
                    target_genome_index_list.append(genome_index)

    if not filtering_genome_name_exist_tag:
        utils.die("** Check the config file! Filtering_Genome_Name value does not match to the SIGMA gvector results!")

    if len(target_genome_index_list) == 0:
        utils.die("** No target genomes exist for filtering!")

    return (genome_name_map, genome_chance_map, target_genome_index_list)
Esempio n. 7
0
def write_bootstrap_qmatrix(bootstrap_qmatrix_path, qmatrix_comments_list, qmatrix_data_list):
    """Write a bootstrap-resampled q-matrix file.

    All comment lines are written first, unchanged and in order.  Then
    ``len(qmatrix_data_list)`` data lines are written, each drawn uniformly
    at random (with replacement) from ``qmatrix_data_list``.

    Args:
        bootstrap_qmatrix_path: output path; an existing file is removed
            with ``rm -f`` before writing.
        qmatrix_comments_list: comment lines, written without resampling.
        qmatrix_data_list: data lines to resample from.

    Returns:
        None.
    """
    # check path exist
    if utils.check_path_exist(bootstrap_qmatrix_path):
        check_call(["rm", "-f", bootstrap_qmatrix_path], stdout = PIPE, stderr = sys.stderr)

    # get qmatrix reads count
    qmatrix_data_count = len(qmatrix_data_list)

    # Text mode accepts the str values written below on both Python 2 and 3.
    # The context manager flushes and closes the file; the original left the
    # handle open, so buffered output could be lost.
    with open(bootstrap_qmatrix_path, 'w') as bootstrap_qmatrix_out:
        for qmatrix_comments_list_item in qmatrix_comments_list:
            bootstrap_qmatrix_out.write(str(qmatrix_comments_list_item) + '\n')

        # One random draw per original data row.  Iterating the list only
        # provides the repeat count (no xrange/range version dependency).
        for _ in qmatrix_data_list:
            random_index = random.randint(0, qmatrix_data_count - 1)
            bootstrap_qmatrix_out.write(str(qmatrix_data_list[random_index]) + '\n')
Esempio n. 8
0
def get_qmatrix_reads_count(qmatrix_path):
    """Count the data rows of a q-matrix file.

    A data row is a line whose first tab-separated column, after stripping
    surrounding whitespace, is exactly '*'.

    Args:
        qmatrix_path: path of the q-matrix file.

    Returns:
        The number of data rows (int).

    If the path does not exist, a message is written to stderr and
    ``utils.die()`` is called (presumably terminating the program).
    """
    # check path exist
    if not utils.check_path_exist(qmatrix_path):
        sys.stderr.write("\n** Cannot open %s.\n" % qmatrix_path)
        utils.die("** Program exit!")
        return None

    qmatrix_reads_count = 0

    # The context manager closes the handle even if reading fails.
    with open(qmatrix_path) as qmatrix_file:
        for qmatrix_line in qmatrix_file:
            line_columns = qmatrix_line.strip().split('\t')

            # read data
            if line_columns[0] == '*':
                qmatrix_reads_count += 1

    return qmatrix_reads_count
Esempio n. 9
0
def get_gvector_genome_list(gvector_path):
    """Collect the genome description rows of a g-vector file.

    A genome row is a line whose first tab-separated column, after
    stripping surrounding whitespace, is exactly '@'
    (``@  Genome_Index  Genome_Name  Alignment_Rate``).

    Args:
        gvector_path: path of the g-vector file.

    Returns:
        A list of the stripped '@' lines in file order.

    If the path does not exist, a message is written to stderr and
    ``utils.die()`` is called (presumably terminating the program).
    """
    # check file exist
    if not utils.check_path_exist(gvector_path):
        # The original formatted this message with the undefined name
        # qmatrix_path, which raised NameError instead of reporting the path.
        sys.stderr.write("\n** Cannot open %s.\n" % gvector_path)
        utils.die("** Program exit!")
        return None

    gvector_genome_list = []

    # The context manager closes the handle, which the original never did.
    with open(gvector_path, 'r') as gvector_file:
        for gvector_line in gvector_file:
            gvector_line = gvector_line.strip()
            gvector_columns = gvector_line.split('\t')

            if gvector_columns[0] == '@':
                gvector_genome_list.append(gvector_line)

    return gvector_genome_list
Esempio n. 10
0
def check_bowtie_index_built(genome_index_base,
                             genome_fasta_path_sublist):
    """Report whether the bowtie index is at least as recent as its FASTA.

    Args:
        genome_index_base: index base name; ".1.bt2" is appended to get the
            index file that is inspected.
        genome_fasta_path_sublist: FASTA paths; only the first is inspected.

    Returns:
        False when the index file does not exist.  Otherwise True when the
        ``os.path.getctime`` of the index file is >= that of the first
        FASTA file, and False if it is older.
    """
    first_index_file = genome_index_base + ".1.bt2"
    first_fasta_file = genome_fasta_path_sublist[0]

    # no index on disk means it has not been built
    if not utils.check_path_exist(first_index_file):
        return False

    index_ctime = os.path.getctime(first_index_file)
    fasta_ctime = os.path.getctime(first_fasta_file)

    # the index counts as built when it is not older than the FASTA
    return index_ctime >= fasta_ctime
 def save(self, path=None, name='dqn_net.pkl'):
     """Save the state dict of ``self.eval_net`` with ``torch.save``.

     Args:
         path: prefix for the output file; ``self.save_path`` is used when
             this is falsy.  It is concatenated directly with ``name``, so
             it must already end with a path separator if it is a directory.
         name: file name appended to the prefix.
     """
     target_dir = path or self.save_path
     # NOTE(review): the return value is ignored; presumably this helper
     # creates the directory when it is missing -- confirm against utils.
     utils.check_path_exist(target_dir)
     weights = self.eval_net.state_dict()
     torch.save(weights, target_dir + name)