Example #1
 def write_most_abundant_seq_foreach_tpoint(
         self, output_dirpath, write_attribute_value_instead=None):
     """
     This method finds the most abundant sequence in each of the time-points, and writes them (as separate files) to disk.
     output_dirpath - This gives the path to the directory that will contain the most abundant seqs. Must end with '/'.
     write_attribute_value_instead - If defined (default, None), this must be the name of an attribute in the seq data whose value is itself a sequence of some sort; the value of that attribute is then written instead of the sequence. This is essentially a hack to allow the writing of CDR3 seq data.
     """
     for tpoint_filepath in self.sample_filepaths:
         filename = '.'.join(
             os.path.basename(tpoint_filepath).split('.')[:-1])
         output_filepath = '%s%s.fasta' % (output_dirpath, filename)
         sample = sequence_sample(
             filepath=tpoint_filepath,
             count_attribute_name=self.count_attribute_name)
         most_abun_index = sample.get_most_abundant_seq(
             output_filepath=None,
             translate_by_ref_seq=False,
             temp_dirpath=None,
             append_to_output=False,
             return_seq_index=True)
         sample.write_subset_of_seqs_to_disk(
             seq_indices=[most_abun_index],
             output_filepath=output_filepath,
             append_to_file=False,
             seq_ids=None,
             add_string_to_each_id_being_written=None,
             write_attribute_value_instead=write_attribute_value_instead)
     return
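A minimal usage sketch for this method, assuming a 'sequence_time_series' object built from a directory of per-time-point fasta files; the paths and the 'cdr3_seq' attribute name are hypothetical:

# Hypothetical usage; 'sequence_time_series' comes from the surrounding codebase.
time_series = sequence_time_series(
    dirpath='data/patient_1/', count_attribute_name='DUPCOUNT')
# Note: the method joins paths as '%s%s.fasta', so output_dirpath must end with '/'.
time_series.write_most_abundant_seq_foreach_tpoint('output/most_abundant/')
# Write an attribute's value (e.g. a CDR3 annotation) instead of the full seq:
time_series.write_most_abundant_seq_foreach_tpoint(
    'output/cdr3s/', write_attribute_value_instead='cdr3_seq')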
Example #2

def get_divergence_distrbs(input_fasta_dirpath, output_dirpath, dataset='partis'):
	"""
	dataset - gives the type of preprocessing that was done on the raw seq data. The different processing pipelines produce differently formatted fasta headers. Acceptable values are:
		'partis' - means that the processing was PRESTO->partis->mixcr
		'changeo' - means that the processing was PRESTO->IgBLAST->changeo
	"""
	if input_fasta_dirpath[-1] != '/':
		input_fasta_dirpath += '/'
	if output_dirpath[-1] != '/':
		output_dirpath += '/'
	for i in os.listdir(input_fasta_dirpath):
		if i[0] == '.' or i[:6] == 'README':
			continue

		print 'patient:', i

		if not os.path.exists(output_dirpath + i):
			os.makedirs(output_dirpath + i)
		for j in os.listdir(input_fasta_dirpath + i):
			if j[0] == '.' or j[:6] == 'README':
				continue

			print '\ttime-point:', j

			time_point = j[:-6]
			input_fasta_filepath = input_fasta_dirpath + i + '/' + j
			output_filepath = output_dirpath + i + '/' + time_point
			# print output_filepath
			# if os.path.exists(output_filepath):
			# 	print "output_filepath exists, so skipping"
			# 	continue
			fileout = open(output_filepath, "w")
			fileout.write('sequence_ID\tdivergence\n')
			if dataset == 'partis':
				sample = sequence_sample(input_fasta_filepath, count_attribute_name='count')
				for k in sample.data:
					for l in xrange(k['count']):
						fileout.write('%s\t%s\n' % (k['id'], k['other']['mut_freq']))
			elif dataset == 'changeo':
				sample = sequence_sample(input_fasta_filepath, count_attribute_name='DUPCOUNT')
				for k in sample.data:
					for l in xrange(k['count']):
						fileout.write('%s\t%s\n' % (k['id'], k['other']['total_mut_freq']))
			fileout.close()
	return
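The tables written above have one row per sequence copy ('sequence_ID' and 'divergence' columns, with each unique seq repeated by its count, per the xrange(k['count']) loop), so a divergence distribution can be summarized straight from a file. A small sketch under that assumption; the filepath is a placeholder:

# Read one divergence table and summarize it (path is hypothetical).
filein = open('divergence_distrbs/patient_1/45', "r")
filein.readline()  # skip the 'sequence_ID\tdivergence' header
divergences = [float(line[:-1].split('\t')[1]) for line in filein]
filein.close()
print 'number of seq copies:', len(divergences)
print 'mean divergence:', sum(divergences) / len(divergences)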
Example #3

def get_mean_divergence(input_fasta_dirpath, output_dirpath, dataset='partis'):
	"""
	dataset - gives the type of preprocessing that was done on the raw seq data. The different processing pipelines produce differently formatted fasta headers. Acceptable values are:
		'partis' - means that the processing was PRESTO->partis->mixcr
		'changeo' - means that the processing was PRESTO->IgBLAST->changeo
	"""
	if input_fasta_dirpath[-1] != '/':
		input_fasta_dirpath += '/'
	if output_dirpath[-1] != '/':
		output_dirpath += '/'
	if not os.path.exists(output_dirpath):
		os.makedirs(output_dirpath)
	for i in os.listdir(input_fasta_dirpath):
		if i[0] == '.' or i[:6] == 'README':
			continue

		print 'patient:', i

		fileout = open(output_dirpath + i, "w")
		fileout.write('time_point\tmean_divergence\n')
		for j in os.listdir(input_fasta_dirpath + i):
			if j[0] == '.' or j[:6] == 'README':
				continue

			print '\ttime-point:', j

			time_point = j[:-6]
			input_fasta_filepath = input_fasta_dirpath + i + '/' + j
			if dataset == 'partis':
				sample = sequence_sample(input_fasta_filepath, count_attribute_name='count')
				mean_divergence = sample.get_mean_attribute_value('mut_freq')
			elif dataset == 'changeo':
				sample = sequence_sample(input_fasta_filepath, count_attribute_name='DUPCOUNT')
				mean_divergence = sample.get_mean_attribute_value('total_mut_freq')
			fileout.write('%s\t%s\n' % (time_point, mean_divergence))
		fileout.close()
	return
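A sketch of how the per-patient output files written by this function could be gathered back up; the directory layout follows the code above, and the path is a placeholder:

# Collect the mean-divergence trajectories into a dict keyed by patient.
import os
trajectories = {}
for patient_file in os.listdir('mean_divergence/'):
    if patient_file[0] == '.' or patient_file[:6] == 'README':
        continue
    filein = open('mean_divergence/' + patient_file, "r")
    filein.readline()  # header: time_point\tmean_divergence
    trajectories[patient_file] = [line[:-1].split('\t') for line in filein]
    filein.close()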
Example #4
 def get_time_series_sample_counts(self, return_dic=False):
     """
     This gets the sample counts for each of the sequence samples. Returns a list of floats where each float gives the total seqs in a given sample (in chronological order).
     return_dic - If True (default, False), this will instead return a dictionary, where the key is the time-point (as a float) and the value is the sample count (as a float).
     """
     sample_counts = []
     sample_counts_dic = {}
     for index, i in enumerate(self.sample_filepaths):
         sample = sequence_sample(
             filepath=i, count_attribute_name=self.count_attribute_name)
         sample_counts.append(sample.total)
         sample_counts_dic[self.timepoints[index]] = sample.total
     if return_dic:
         return sample_counts_dic
     else:
         return sample_counts
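A quick usage sketch of the two return modes; the 'time_series' instance is assumed, as above:

counts_list = time_series.get_time_series_sample_counts()
counts_dic = time_series.get_time_series_sample_counts(return_dic=True)
for tpoint in sorted(counts_dic):
    print 'time-point %s: %s total seqs' % (tpoint, counts_dic[tpoint])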
Example #5
 def get_first_tpoint_most_abundant_seq(self,
                                        output_filepath=None,
                                        translate_by_ref_seq=False,
                                        temp_dirpath=None,
                                        append_to_output=False):
     """
     This method returns the most abundant seq from the first time-point.
     output_filepath - If defined (default, None) then this is the path to the output file that will be written for the most abundant seq. Will be in fasta format.
     translate_by_ref_seq - If defined (default is False) this should be the path to a fasta formatted reference sequence for the seqs in the alignment. This reference seq needs to be annotated such that the coding frame is known, i.e. it is known where to begin translating the ref seq. The annotation for the coding frame information must be in the header of the ref seq and should look like: '...|coding_frame_start=[X]|...', where 'X' gives the coding frame start position. The most abundant seq is then aligned to this ref seq so that the coding frame can be mapped from the ref seq to the most abundant seq. The alignment is done using 'needle' from the EMBOSS package, which needs to be in the PATH. The resulting translation is recorded in 'output_filepath', so this should be defined too. If 'translate_by_ref_seq' is False, then no translation occurs.
     temp_dirpath - Gives the path to a directory to which temp data can be written. If None (default), then uses the directory containing 'output_filepath'; if no output_filepath is given, then uses the current working directory.
     append_to_output - If True (default, False) then this will cause the output file to be appended to, rather than overwritten. This is only considered if 'output_filepath' is defined.
     """
     sample = sequence_sample(
         filepath=self.sample_filepaths[0],
         count_attribute_name=self.count_attribute_name)
     most_abun_seq = sample.get_most_abundant_seq(
         output_filepath=output_filepath,
         translate_by_ref_seq=translate_by_ref_seq,
         temp_dirpath=temp_dirpath,
         append_to_output=append_to_output)
     return most_abun_seq
Example #6
 def get_most_abundant_seq_from_tpoint(self,
                                       tpoint_index,
                                       output_filepath=None,
                                       translate_by_ref_seq=False,
                                       temp_dirpath=None,
                                       append_to_output=False,
                                       path_to_needle='needle'):
     """
     This method does the same thing as that of 'get_first_tpoint_most_abundant_seq', but it will get the most abundant seq for any desired time-point.
     tpoint_index - Int. This gives the index (indexed at 0) for the query time-point for which the most abundant seq is desired. For example, if one wants the most abun. seq from the 2nd time-point this would be 1.
     append_to_output - If True (default, False) then this will cause the output file to be appended to, rather than over-written. This is only considered if 'output_filepath' is defined.
     path_to_needle - String. This gives the path to the 'needle' executable. The default is simply 'needle', which means that it is already in your PATH.
     """
     sample = sequence_sample(
         filepath=self.sample_filepaths[tpoint_index],
         count_attribute_name=self.count_attribute_name)
     most_abun_seq = sample.get_most_abundant_seq(
         output_filepath=output_filepath,
         translate_by_ref_seq=translate_by_ref_seq,
         temp_dirpath=temp_dirpath,
         append_to_output=append_to_output,
         path_to_needle=path_to_needle)
     return most_abun_seq
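A hedged usage sketch: pull the most abundant seq from the 2nd time-point (index 1) and translate it against an annotated reference. All paths are placeholders:

seq = time_series.get_most_abundant_seq_from_tpoint(
    tpoint_index=1,
    output_filepath='output/tpoint2_most_abundant.fasta',
    translate_by_ref_seq='ref_seqs/patient_1.fasta',  # header must carry coding_frame_start
    path_to_needle='needle')  # assumes EMBOSS needle is in PATH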
Example #7
def get_mean_attribute_compcluster(attribute_name, output_dirpath,
                                   lineage_master_dirpath,
                                   count_attribute_name, weight_by_counts,
                                   all_timepoints, treat_NAs,
                                   full_seq_sample_dirpath, min_freq_cutoff,
                                   random_suffix, temp_dirpath):
    all_timepoints = [float(i) for i in all_timepoints.split(',')]
    if weight_by_counts == 'True':
        weight_by_counts = True
    else:
        weight_by_counts = False
    min_freq_cutoff = float(min_freq_cutoff)
    sge_task_id = int(os.environ['SGE_TASK_ID'])
    temp_filepath = '%s%s_%s' % (temp_dirpath, random_suffix, sge_task_id)
    temp_file = open(temp_filepath, "r")
    filepath_info = temp_file.readline()[:-1].split('\t')
    lineage_dirpath = filepath_info[0]
    lineage = filepath_info[1]
    temp_file.close()
    subprocess.call(['rm', temp_filepath])
    #get full sample time-series counts
    seq_time_series_sample = sequence_time_series(
        dirpath=full_seq_sample_dirpath,
        count_attribute_name=count_attribute_name)
    total_counts_dic = seq_time_series_sample.get_time_series_sample_counts(
        return_dic=True)
    #check that lineage has high enough frequency, abort if not
    seq_time_series_sample = sequence_time_series(
        dirpath=lineage_dirpath, count_attribute_name=count_attribute_name)
    lineage_counts_dic = seq_time_series_sample.get_time_series_sample_counts(
        return_dic=True)
    freqs_too_low = True
    for tpoint_float in lineage_counts_dic:
        freq = lineage_counts_dic[tpoint_float] / total_counts_dic[tpoint_float]
        if freq >= min_freq_cutoff:
            freqs_too_low = False
            break
    if freqs_too_low:
        print 'lineage from %s is composed of seq samples that are too low in frequency, so aborting.' % lineage_dirpath
        return
    tpoint_value_dic = {}
    for tpoint_sample in os.listdir(lineage_dirpath):
        if tpoint_sample[0] == '.' or tpoint_sample[:6] == 'README':
            continue
        tpoint_float = float('.'.join(tpoint_sample.split('.')[:-1]))
        sample_filepath = '%s/%s' % (lineage_dirpath, tpoint_sample)
        seq_sample = sequence_sample(filepath=sample_filepath,
                                     count_attribute_name=count_attribute_name)
        mean_attribute_value = seq_sample.get_mean_attribute_value(
            query_attribute_name=attribute_name,
            weight_by_counts=weight_by_counts,
            treat_NAs=treat_NAs)
        tpoint_value_dic[tpoint_float] = mean_attribute_value
    output_filepath = '%s%s.txt' % (output_dirpath, lineage)
    fileout = open(output_filepath, "w")
    fileout.write('timepoint\t%s\n' % attribute_name)
    for tpoint in all_timepoints:
        if tpoint in tpoint_value_dic:
            attribute_value = tpoint_value_dic[tpoint]
        else:
            attribute_value = 'NA'
        fileout.write('%s\t%s\n' % (tpoint, attribute_value))
    fileout.close()
    return
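This worker relies on an SGE array-job handshake: the submitting side writes one temp file per task, named '<temp_dirpath><random_suffix>_<task_id>' and holding 'lineage_dirpath<TAB>lineage', and each worker locates its file via the SGE_TASK_ID environment variable, then reads and removes it. A sketch of the submitting side under those assumptions; 'lineage_jobs', the paths, and the script name are illustrative:

import random
from subprocess import Popen, PIPE

temp_dirpath = '/path/to/temp/'  # placeholder, must end with '/'
lineage_jobs = [('lineages/patient_1/lineage_3/', 'lineage_3')]  # illustrative
random_suffix = str(random.random())
for task_id, (lineage_dirpath, lineage) in enumerate(lineage_jobs, 1):
    temp_file = open('%s%s_%s' % (temp_dirpath, random_suffix, task_id), "w")
    temp_file.write('%s\t%s\n' % (lineage_dirpath, lineage))
    temp_file.close()
p = Popen(['qsub', '-t', '1-%s' % len(lineage_jobs), 'worker_script.py',
           random_suffix], stdout=PIPE, stderr=PIPE)
out, err = p.communicate()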
Example #8
    def get_freq_trajectories(self,
                              full_seq_sample_dirpath,
                              output_dirpath=None,
                              min_freq_cutoff=0.0):
        """
		This method gets the total count of each of the lineages and then divides this by the total count of the sample to arrive at the relative frequency of each of the lineages. Writes the trajectory of this frequency over time.
		full_seq_sample_dirpath - This gives the directory path for the full sequence samples from which all of the lineages came. We need this to get the total count for the entire sample.
		output_dirpath - This gives the path to the directory to which the frequency trajectories for each of the lineages will be written. If this is None, then no output is written.
		"""
        if full_seq_sample_dirpath[-1] != '/':
            full_seq_sample_dirpath += '/'
        if output_dirpath:
            if output_dirpath[-1] != '/':
                output_dirpath += '/'
            if not os.path.exists(output_dirpath):
                os.makedirs(output_dirpath)
        #get full sample total counts foreach tpoint
        total_counts_dic = {}
        for i in os.listdir(full_seq_sample_dirpath):
            if i[0] == '.' or i[:6] == 'README':
                continue
            tpoint_float = float('.'.join(i.split('.')[:-1]))
            full_seq_sample_filepath = '%s%s' % (full_seq_sample_dirpath, i)
            full_seq_sample = sequence_sample(
                filepath=full_seq_sample_filepath,
                count_attribute_name=self.count_attribute_name)
            total_counts_dic[tpoint_float] = full_seq_sample.total
        lineage_freq_trajs = {}
        for lineage in self.lineages:
            print '\tlineage:', lineage
            lineage_dirpath = '%s%s/' % (self.dirpath, lineage)
            lineage_freq_traj = {}
            freqs_too_low = True
            for tpoint in self.lineages[lineage]:
                sample_filepath = '%s%s' % (lineage_dirpath, tpoint)
                seq_sample = sequence_sample(
                    filepath=sample_filepath,
                    count_attribute_name=self.count_attribute_name)
                tpoint_float = float('.'.join(tpoint.split('.')[:-1]))
                freq = seq_sample.total / total_counts_dic[tpoint_float]
                if freq >= min_freq_cutoff:
                    freqs_too_low = False
                lineage_freq_traj[tpoint_float] = freq
            if freqs_too_low:
                print 'lineage from %s is composed of seq samples that are too low in frequency, so skipping.' % lineage_dirpath
                continue
            lineage_freq_trajs[lineage] = lineage_freq_traj
            if output_dirpath:
                output_filepath = '%s%s.txt' % (output_dirpath, lineage)
                if not os.path.exists(os.path.dirname(output_filepath)):
                    os.makedirs(os.path.dirname(output_filepath))
                fileout = open(output_filepath, "w")
                fileout.write('timepoint\tfrequency\n')
                for i in self.timepoints:
                    if i in lineage_freq_traj:
                        freq = lineage_freq_traj[i]
                    else:
                        freq = 0.
                    fileout.write('%s\t%s\n' % (i, freq))
                fileout.close()
        return lineage_freq_trajs
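A hedged usage sketch; the 'lineage_time_series' instance and paths are hypothetical. Lineages that never reach 0.1% of the full sample are skipped, and the rest are written:

trajs = lineage_time_series.get_freq_trajectories(
    full_seq_sample_dirpath='data/patient_1_full_samples/',
    output_dirpath='output/freq_trajectories/patient_1/',
    min_freq_cutoff=0.001)
print '%s lineages passed the frequency cutoff' % len(trajs)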
Example #9
    def calc_diversity_pi(self,
                          full_seq_sample_dirpath,
                          output_dirpath,
                          num_alignments_per_job=100000,
                          method='needle',
                          path_to_needle='needle',
                          path_to_vsearch=None,
                          temp_dirpath=None,
                          num_parallel_cores=12,
                          try_again=False,
                          one_job_per='lineage',
                          min_freq_cutoff=0.0):
        """
        This method will calculate diversity (pi) for each of the seq time-points, for each of the lineages.
        full_seq_sample_dirpath - This gives the directory path for the full sequence samples from which all of the lineages came. We need this to get the total count for the entire sample.
        path_to_needle - This is the path to the needle program from EMBOSS that implements the Needleman-Wunsch global alignment algorithm. This is what we use to get the genetic distance between two seqs.
        method - This gives the method that will be used to get the genetic distance from the pairs of seqs. The acceptable values for this are:
            'needle' - This means the Needleman-Wunsch global alignment algorithm will be used, via the 'needle' program in the EMBOSS package
            'pairwise2' - This means the 'pairwise2' Biopython package will be used, which also implements the Needleman-Wunsch global alignment algorithm.
            'vsearch' - This means the vsearch program will be used. This program is capable of doing an all-pairwise alignment by itself, so the flow of this method changes significantly if this is selected.
        path_to_vsearch - This is the path to the vsearch executable. If this is None (default) then the path is assumed to be 'vsearch' (i.e. in $PATH). Ignored if method!='vsearch'
        temp_dirpath - This gives the path to the temporary directory that will contain the fasta files to be submitted to the aligner (if the chosen aligner needs this), and the sub_sum_of_distances values output by 'pi_calculator_compCluster'. The temp fasta files can get quite large, so make sure that there is enough space wherever this path leads. If temp_dirpath=None (default) then a temp dir in the current working directory is made for this.
        num_parallel_cores - This gives the number of parallel cores that vsearch can use. vsearch's default is to use as many cores as it can, and this slows down the cluster for others. So, we need to set this to a fixed amount, and then let the cluster know when submitting the array job. Default is 12.
        try_again - Boolean. If True (not default), then the script will keep trying to calculate pi until it works. It seems that vsearch is a bit buggy and can write uninterpretable lines of output. This setting (when set to True) will simply keep trying until it works. Only relevant if method='vsearch'.
        one_job_per - This tells the method how to split up the array jobs. Acceptable values are:
            'timepoint' - This means the method cycles through lineages, and for each lineage submits an array job where each task is one time-point. The script waits for each lineage to complete before starting the next.
            'lineage' - Default. This means there is one job per lineage, and within each job the script will cycle through each time-point. This is most likely faster.
        min_freq_cutoff - Float. This gives the frequency threshold that each lineage must pass (at least once) in order to be recorded in the output.
		"""
        if path_to_vsearch is None:
            path_to_vsearch = 'vsearch'
        if full_seq_sample_dirpath[-1] != '/':
            full_seq_sample_dirpath += '/'
        if output_dirpath[-1] != '/':
            output_dirpath += '/'
        if not os.path.exists(output_dirpath):
            os.makedirs(output_dirpath)
        #get full sample total counts foreach tpoint
        total_counts_dic = {}
        for i in os.listdir(full_seq_sample_dirpath):
            if i[0] == '.' or i[:6] == 'README':
                continue
            tpoint_float = float('.'.join(i.split('.')[:-1]))
            full_seq_sample_filepath = '%s%s' % (full_seq_sample_dirpath, i)
            full_seq_sample = sequence_sample(
                filepath=full_seq_sample_filepath,
                count_attribute_name=self.count_attribute_name)
            total_counts_dic[tpoint_float] = full_seq_sample.total
        total_counts_string = ''
        total_counts_list = []
        for tpoint in sorted(total_counts_dic):
            total_counts_string += '%s:%s,' % (tpoint,
                                               total_counts_dic[tpoint])
            total_counts_list.append(total_counts_dic[tpoint])
        total_counts_string = total_counts_string[:-1]
        if one_job_per == 'lineage':
            num_jobs = 0
            random_suffix = str(random.random())
            for i in self.lineages:
                num_jobs += 1
                temp_file = open(
                    '%s%s_%s' % (temp_dirpath, random_suffix, num_jobs), "w")
                input_dirpath = '%s%s/' % (self.dirpath, i)
                temp_file.write('%s\t%s\n' % (input_dirpath, i))
                temp_file.close()
            num_jobs = len(self.lineages)
            timepoints_string = ','.join([str(i) for i in self.timepoints])
            p = Popen([
                'qsub', '-t',
                '1-%s' % num_jobs, 'seq_lineage_time_series_class.py',
                'calc_diversity_pi_compcluster', self.dirpath,
                self.count_attribute_name,
                str(num_alignments_per_job), method, path_to_needle,
                path_to_vsearch, temp_dirpath,
                str(num_parallel_cores),
                str(try_again), output_dirpath, timepoints_string,
                total_counts_string,
                str(min_freq_cutoff), random_suffix
            ],
                      stdout=PIPE,
                      stderr=PIPE)
            out, err = p.communicate()
            print out
            print err
        elif one_job_per == 'timepoint':
            for lineage in self.lineages:
                lineage_dirpath = '%s%s/' % (self.dirpath, lineage)
                seq_samp_time_series = sequence_time_series(
                    dirpath=lineage_dirpath,
                    count_attribute_name=self.count_attribute_name)
                sample_counts = seq_samp_time_series.get_time_series_sample_counts()
                freqs_too_low = True
                for index in xrange(len(sample_counts)):
                    freq = sample_counts[index] / total_counts_list[index]
                    if freq >= min_freq_cutoff:
                        freqs_too_low = False
                        break
                if freqs_too_low:
                    print 'lineage from %s is composed of seq samples that are too low in frequency, so skipping.' % lineage_dirpath
                    continue
                diversity_pi_values = seq_samp_time_series.down_sample_and_calc_pi_compCluster(
                    downsamp_to='Nope',
                    num_downsamp_trials=1,
                    num_alignments_per_job=num_alignments_per_job,
                    method=method,
                    path_to_needle=path_to_needle,
                    path_to_vsearch=path_to_vsearch,
                    temp_dirpath=temp_dirpath,
                    num_parallel_cores=num_parallel_cores,
                    try_again=try_again,
                    no_array_job=False)
                pi_vals_dic = {}
                for i in diversity_pi_values:
                    tpoint = float('.'.join(
                        os.path.basename(i).split('.')[:-1]))
                    pi_vals_dic[tpoint] = diversity_pi_values[i][0]
                output_filepath = '%s%s.txt' % (output_dirpath, lineage)
                fileout = open(output_filepath, "w")
                fileout.write('timepoint\tdiversity\n')
                for i in self.timepoints:
                    if i in pi_vals_dic:
                        pi = pi_vals_dic[i]
                    else:
                        pi = "NA"
                    fileout.write('%s\t%s\n' % (i, pi))
                fileout.close()
        return
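The qsub call above serializes the per-time-point totals as 'tpoint:count' pairs joined by commas. A worker-side sketch for decoding that string (the format is taken from the code; the example values are illustrative):

# e.g. as built above: 'tpoint:count' pairs joined by commas
total_counts_string = '0.0:1500.0,30.0:2200.0,60.0:1800.0'  # illustrative
total_counts_dic = {}
for pair in total_counts_string.split(','):
    tpoint, count = pair.split(':')
    total_counts_dic[float(tpoint)] = float(count)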
Example #10

def run(input_seq_clusters_dirpath, output_msa_dirpath, output_newick_tree_dirpath, input_indiv_seq_fasta_dirpath, output_outliers_removed_fasta_dirpath, output_outlier_seqs_dirpath):
	"""
	This script takes fasta files of clustered HIV data, first makes an MSA of them for each patient, and then makes a phylogenetic tree for each patient.
	"""

	########### parameters ###########
	count_attribute_name = 'DUPCOUNT'
	temp_dirpath = '/Users/nstrauli/data/abr_hiv_coevo/temp_stuff'
	alignment_method = 'mafft'
	ref_seq_dirpath = '/Users/nstrauli/data/abr_hiv_coevo/seq_data/1st_time_point_consensus_seqs/hiv/most_abundant_method/'
	phylo_method = 'fasttree'
	path_to_fasttree = '/Users/nstrauli/tools/fastTree/FastTree'
	nuc_or_aa = 'nucleotide'
	include_freq_info = True
	freq_attribute_name = 'total_freq'
	outgroup_label = 'outgroup'
	info_included_in_trees = ['is_outlier']
	#tree plotting parameters
	outlier_attribute_name = 'is_outlier'
	tree_plot_file_suffix = '_full_tree_half_circ_plot.pdf'
	time_series_info = 'start_of_id'
	tree_style = 'half_circle'
	leaf_size_map_to = 'freq'
	show_leaf_names = False
	color_branches_by = 'is_outlier'
	line_width = 3
	start_color = 'red'
	end_color = 'blue'
	ladderize = True
	########### parameters ###########

	if input_seq_clusters_dirpath[-1] != '/':
		input_seq_clusters_dirpath += '/'
	if output_msa_dirpath[-1] != '/':
		output_msa_dirpath += '/'
	if not os.path.exists(output_msa_dirpath):
		os.makedirs(output_msa_dirpath)
	if output_newick_tree_dirpath[-1] != '/':
		output_newick_tree_dirpath += '/'
	if not os.path.exists(output_newick_tree_dirpath):
		os.makedirs(output_newick_tree_dirpath)
	if input_indiv_seq_fasta_dirpath[-1] != '/':
		input_indiv_seq_fasta_dirpath += '/'
	if output_outliers_removed_fasta_dirpath[-1] != '/':
		output_outliers_removed_fasta_dirpath += '/'
	if not os.path.exists(output_outliers_removed_fasta_dirpath):
		os.makedirs(output_outliers_removed_fasta_dirpath)
	if output_outlier_seqs_dirpath[-1] != '/':
		output_outlier_seqs_dirpath += '/'
	if not os.path.exists(output_outlier_seqs_dirpath):
		os.makedirs(output_outlier_seqs_dirpath)
	for patient in os.listdir(input_seq_clusters_dirpath):
		if patient[0] == '.' or patient[:6] == 'README':
			continue

		print ''
		print "#####################"
		print "patient:", patient
		print "#####################"
		print ''

		patient_alignment_filepath = output_msa_dirpath + patient + '.fasta'
		patient_ref_seq_filepath = ref_seq_dirpath + patient + '.fasta'
		newick_output_filepath = '%s%s.tree' % (output_newick_tree_dirpath, patient)

		#get seq ID for the consensus seq for the patient
		filein = open(patient_ref_seq_filepath, "r")
		ref_seq_id = filein.readline()[1:].split('|')[0]
		filein.close()

		#make MSA
		sample_time_series = sequence_time_series(dirpath=input_seq_clusters_dirpath+patient, count_attribute_name=count_attribute_name)
		sample_time_series.make_MSA(output_filepath=patient_alignment_filepath, temp_dirpath=temp_dirpath, method=alignment_method, add_ref_seq=None, write_seq_id_first='%s_%s' % (sample_time_series.timepoints[0], ref_seq_id))

		#make tree in newick format
		time_series_msa = msa(filepath=patient_alignment_filepath, count_attribute_name=count_attribute_name)
		time_series_msa.make_phylo_trees(output_filepath=newick_output_filepath, method=phylo_method, path_to_fasttree=path_to_fasttree, nuc_or_aa=nuc_or_aa, temp_dirpath=temp_dirpath, include_freq_info=include_freq_info, freq_attribute_name=freq_attribute_name, first_seq_is_outgroup=True, outgroup_label=outgroup_label, include_attributes=info_included_in_trees)

		#plot tree, and get outlier clade
		tree = phylo_tree(filepath=newick_output_filepath, count_attribute_name=count_attribute_name, freq_attribute_name=freq_attribute_name, outgroup_label=outgroup_label)
		if patient == '5' or patient == '6':
			outlier_seq_ids = tree.find_outlier_clade(outlier_attribute_name=outlier_attribute_name, get_ancestor_of_outier_clade=True)
		else:
			outlier_seq_ids = tree.find_outlier_clade(outlier_attribute_name=outlier_attribute_name)
		tree.plot_tree(output_filepath=newick_output_filepath[:-5]+tree_plot_file_suffix, time_series_info=time_series_info, tree_style=tree_style, leaf_size_map_to=leaf_size_map_to, show_leaf_names=show_leaf_names, color_branches_by=color_branches_by, line_width=line_width, start_color=start_color, end_color=end_color, ladderize=ladderize)

		#remove seqs that are in outlier clade from data
		sequence_clusters = sequence_sample(filepath=patient_alignment_filepath, count_attribute_name=count_attribute_name)
		outlier_cluster_to_seq_ids_dic = sequence_clusters.get_seqID_to_attribute_dic(query_attribute_name='indiv_seq_ids', only_for_seq_ids=outlier_seq_ids)
		#init dic
		tpoint_to_outlier_seq_ids_dic = {}
		for i in sample_time_series.timepoints:
			tpoint_to_outlier_seq_ids_dic[str(int(i))] = []
		for i in outlier_cluster_to_seq_ids_dic:
			tpoint = i.split('_')[0].split('.')[0]
			seq_ids = outlier_cluster_to_seq_ids_dic[i].split(',')
			tpoint_to_outlier_seq_ids_dic[tpoint] += seq_ids
		input_indiv_seqs_patient_dirpath = input_indiv_seq_fasta_dirpath + patient + '/'
		output_fasta_dirpath = output_outliers_removed_fasta_dirpath + patient + '/'
		if not os.path.exists(output_fasta_dirpath):
			os.makedirs(output_fasta_dirpath)
		outlier_seqs_patient_filepath = output_outlier_seqs_dirpath + patient + '.fasta'
		if os.path.exists(outlier_seqs_patient_filepath):
			subprocess.call(['rm', outlier_seqs_patient_filepath])
		for tpoint in tpoint_to_outlier_seq_ids_dic:
			input_idiv_seqs_filepath = input_indiv_seqs_patient_dirpath + tpoint + '.fasta'
			sample = sequence_sample(filepath=input_idiv_seqs_filepath, count_attribute_name=count_attribute_name)
			if tpoint_to_outlier_seq_ids_dic[tpoint]:
				sample.write_subset_of_seqs_to_disk(seq_indices=None, output_filepath=outlier_seqs_patient_filepath, append_to_file=True, seq_ids=tpoint_to_outlier_seq_ids_dic[tpoint], add_string_to_each_id_being_written=tpoint+'_')
			sample.remove_seq_entries(seq_indicators=tpoint_to_outlier_seq_ids_dic[tpoint], seq_ids=True)
			output_fasta_filepath = output_fasta_dirpath + tpoint + '.fasta'
			sample.write_full_data_to_disk_fasta(output_filepath=output_fasta_filepath, append_to_file=False, seq_id_fileout_dic=None)

	return
Example #11
    def cluster_sequences(self,
                          method='vsearch',
                          min_ident_within_cluster=0.90,
                          output_filepath=None,
                          path_to_vsearch=None,
                          temp_dirpath=None,
                          freqs_or_counts='freqs',
                          sort_lineages_by='range',
                          output_network_filepath=None,
                          output_node_attribute_filepath=None,
                          node_attributes_to_add=['count'],
                          names_for_node_attributes_in_header=None,
                          max_edit_distance=1,
                          path_to_needle=None,
                          freq_attribute_name=None):
        """
        This method will cluster all the sequences in all the time-points. BEWARE: in order to do this, all the data must be read into memory, so make sure the files aren't huge. By default it clusters the sequences using vsearch, and also gets the relative size of each cluster. Can write this information as expression trajectories for each of the clusters found in the data.
        method - This gives the method that will be used for clustering. Acceptable values are:
            'vsearch' - Default. This means that vsearch will be used to cluster the seqs
            'by_edit_distance' - This means that an in house algorithm of joining seqs to be in the same cluster if they have an edit distance at or below some given threshold. The 'needle' program is used for determining the edit distance between pairs of seqs.
        output_filepath - This gives the path to the file to which the lineage expression trajectories will be written. The format is tab delimited, where the first column gives the identity of the lineage (i.e. Vgene identity, Jgene identity, and centroid sequence, separated by '_'), and the following columns give the expression level of each lineage at each of the respective time-points. If None, then no output will be written.
        sort_lineages_by - This tells how the lineages should be sorted when written to file. Acceptable values are:
            'range' - This means the absolute range is used to sort them (in descending order). Range is defined as the tpoint with max expression minus the tpoint with min expression.
            'sum' - This means the lineages will be sorted by the summation of their expression values over the time-course.

        #### Below are parameters only considered if method=='vsearch' ####

        min_ident_within_cluster - This gives the clustering parameter for clustering seqs within a tpoint. See the immune_sample_class.cluster_clones for explanation.
        path_to_vsearch - This gives the path to the vsearch excecutable. If this is None (default) then the path is assumed to be 'vsearch' (i.e. in $PATH).                                          
        temp_dirpath - This gives the path to the directory for which temporary directories will be made within. The temporary directories will be deleted at the end of the script, but 'temp_dirpath' will not.
        freqs_or_counts - This indicates whether the relative frequency (default) or the absolute counts for each lineage should be reported at each of the time-points. If 'freqs' then gives relative frequency, if 'counts' then gives absolute counts.

        #### Below are parameters only considered if method=='by_edit_distance' ####

        output_network_filepath - If defined (default, None), this will 1) instruct the method to create a network file (as a simple interaction file ('.sif')), and 2) be the path to this file.
        output_node_attribute_filepath - This is the file that will contain all the node attributes for each of the seqs. This is for cytoscape. This parameter is only considered if 'output_network_filepath' is defined.
        node_attributes_to_add - This is a list of sequence attributes to include in the node attribute file. This parameter is only considered if 'output_node_attribute_filepath' is defined. The default is to only add the value for the 'count' attribute (i.e. the value for self.data[some_index]['count']). Acceptable values within this list of strings are:
            any string that is a key for one of the attributes for the sequences. So, could be 'count', 'total_freq', 'timepoint', etc. if those are keys to the dictionaries within self.data[some_index], or self.data[some_index]['other']
            'element_X_of_seq_id' - sometimes info about the seq is encoded in the seq ID, delimited by '_'. This means that the 'X'th element of the seq ID (when delimited by a '_') will be included in the node attributes file.
        names_for_node_attributes_in_header - This is an optional argument which, if defined (default, None), gives the names for each of the node attributes included in the 'output_node_attribute_filepath' header. This parameter is only considered if 'output_node_attribute_filepath' is defined. If 'output_node_attribute_filepath' is defined and this is not, then the default is to simply use the values in 'node_attributes_to_add'. The names in this list need to be in the same order as the attributes in 'node_attributes_to_add'.
        max_edit_distance - Int. Default, 1. This gives the maximum edit distance between any two seqs, for them to be assigned to the same cluster.
        path_to_needle - This gives the path to the needle executable. This is required if using the computational cluster for an array job, because the PATH variable is not propagated when sending jobs to parallel nodes. One can also use this if needle is not in the PATH.
        freq_attribute_name - If defined (default, None), this will give the name of the attribute that gives the frequency for each seq in the data set. If it is not defined then frequency is calculated by the seq 'count' / the total, and the freq_attribute_name is set to 'freq'.
        """
        #make sure freqs_or_counts has an acceptable value
        if freqs_or_counts != 'freqs' and freqs_or_counts != 'counts':
            print 'freqs_or_counts parameter does not have an acceptable value:', freqs_or_counts
            return
        #make sure sort_lineages_by has an acceptable value
        if sort_lineages_by != 'range' and sort_lineages_by != 'sum':
            print 'sort_lineages_by parameter does not have an acceptable value:', sort_lineages_by
            return
        if path_to_vsearch is None:
            path_to_vsearch = 'vsearch'
        if temp_dirpath is None:
            temp_dirpath = tempfile.mkdtemp(dir=os.getcwd()) + '/'
        else:
            temp_dirpath = tempfile.mkdtemp(dir=temp_dirpath) + '/'

        if method == 'vsearch':
            #write all the data to a temp fasta file
            random_string = str(random.random())[2:]
            temp_fasta_filepath = '%s%s.fasta' % (temp_dirpath, random_string)
            fileout = open(temp_fasta_filepath, "w")
            count = 0
            for i in xrange(len(self.sample_filepaths)):
                sample = sequence_sample(self.sample_filepaths[i],
                                         self.count_attribute_name)
                for j in sample.data:
                    count += 1
                    fileout.write('>%s;size=%s;tpoint=%s;freq=%s;\n%s\n' %
                                  (count, j['count'], i,
                                   j['count'] / sample.total, j['seq']))
            fileout.close()
            #cluster all this data
            msa_output_filepath = '%s%s_msa.fasta' % (temp_dirpath,
                                                      random_string)
            p = Popen([
                path_to_vsearch, '--cluster_size', temp_fasta_filepath, '--id',
                str(min_ident_within_cluster), '--sizein', '--sizeout',
                '--msaout', msa_output_filepath, '--fasta_width',
                str(0)
            ],
                      stdout=PIPE,
                      stderr=PIPE)
            out_err = p.communicate()
            #parse MSA output to get centroids and counts for each timepoint
            lineage_expr_trajs = {}
            filein = open(msa_output_filepath, "r")
            while True:
                line = filein.readline()
                #if EOF, break loop
                if not line:
                    break
                elif line[0] == '>':
                    #if this is the centroid sequence
                    if line[1] == '*':
                        line = line[:-1].split(';')
                        tpoint = int(line[2].split('=')[1])
                        #get centroid seq for index
                        centroid_seq = filein.readline()[:-1]
                        #remove gaps
                        centroid_seq = re.sub('-', '', centroid_seq)
                        index = centroid_seq
                        #initialize the dic entry
                        lineage_expr_trajs[index] = [
                            0.0 for j in self.timepoints
                        ]
                        if freqs_or_counts == 'freqs':
                            freq = float(line[3].split('=')[1])
                            lineage_expr_trajs[index][tpoint] += freq
                        else:
                            count = int(line[1].split('=')[1])
                            lineage_expr_trajs[index][tpoint] += count
                    elif line[:-1] != '>consensus':
                        line = line[:-1].split(';')
                        tpoint = int(line[2].split('=')[1])
                        if freqs_or_counts == 'freqs':
                            freq = float(line[3].split('=')[1])
                            lineage_expr_trajs[index][tpoint] += freq
                        else:
                            count = int(line[1].split('=')[1])
                            lineage_expr_trajs[index][tpoint] += count
            filein.close()
            #delete temp files
            subprocess.call(['rm', temp_fasta_filepath, msa_output_filepath])

        elif method == 'by_edit_distance':
            random_suffix = str(random.random())
            temp_fasta_filepath = '%stemp_seqfile_%s.fasta' % (temp_dirpath,
                                                               random_suffix)
            for i in self.sample_filepaths:
                sample = sequence_sample(
                    filepath=i, count_attribute_name=self.count_attribute_name)
                if not freq_attribute_name:
                    sample.add_freq_attribute(freq_attribute_name='freq')
                tpoint = float('.'.join(os.path.basename(i).split('.')[:-1]))
                sample.add_string_to_each_id(str(tpoint) + '_',
                                             add_to_start_or_end='start')
                sample.write_full_data_to_disk_fasta(
                    output_filepath=temp_fasta_filepath, append_to_file=True)
            if not freq_attribute_name:
                freq_attribute_name = 'freq'
            sample = sequence_sample(
                filepath=temp_fasta_filepath,
                count_attribute_name=self.count_attribute_name)
            print 'Total unique sequences in pooled data:', len(sample.data)
            cluster_dic, connections_dic = sample.cluster_seqs(
                method='by_edit_distance',
                temp_dirpath=temp_dirpath,
                max_edit_distance=max_edit_distance,
                overwrite_data=False,
                output_network_filepath=output_network_filepath,
                output_node_attribute_filepath=output_node_attribute_filepath,
                node_attributes_to_add=node_attributes_to_add,
                names_for_node_attributes_in_header=
                names_for_node_attributes_in_header,
                path_to_needle=path_to_needle)
            tpoint_to_index_dic = {}
            for index, i in enumerate(self.timepoints):
                tpoint_to_index_dic[i] = index
            lineage_expr_trajs = {}
            for cluster_id in cluster_dic:
                lineage_expr_trajs[cluster_id] = [0.0 for j in self.timepoints]
                for seq_index in cluster_dic[cluster_id]:
                    tpoint = float(sample.data[seq_index]['id'].split('_')[0])
                    traj_index = tpoint_to_index_dic[tpoint]
                    lineage_expr_trajs[cluster_id][traj_index] += float(
                        sample.data[seq_index]['other'][freq_attribute_name])
            subprocess.call(['rm', temp_fasta_filepath])

        if output_filepath:
            #turn lineage_expr_trajs into a list for sorting
            if sort_lineages_by == 'sum':
                lineage_expr_trajs_list = [[sum(lineage_expr_trajs[i]), i] +
                                           lineage_expr_trajs[i]
                                           for i in lineage_expr_trajs]
            elif sort_lineages_by == 'range':
                lineage_expr_trajs_list = [[
                    max(lineage_expr_trajs[i]) - min(lineage_expr_trajs[i]), i
                ] + lineage_expr_trajs[i] for i in lineage_expr_trajs]
            fileout = open(output_filepath, "w")
            #write header
            fileout.write('\t' + '\t'.join([str(i)
                                            for i in self.timepoints]) + '\n')
            for i in sorted(lineage_expr_trajs_list, reverse=True):
                fileout.write(i[1] + '\t' + '\t'.join([str(j)
                                                       for j in i[2:]]) + '\n')
            fileout.close()

        subprocess.call(['rm', '-r', temp_dirpath])

        return lineage_expr_trajs
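A hedged usage sketch for the vsearch route (paths are placeholders; the 'time_series' instance is assumed as above): cluster the pooled time-points at 90% identity and write frequency trajectories sorted by their range.

trajs = time_series.cluster_sequences(
    method='vsearch',
    min_ident_within_cluster=0.90,
    output_filepath='output/lineage_trajectories.txt',
    path_to_vsearch='vsearch',  # assumes vsearch is in PATH
    freqs_or_counts='freqs',
    sort_lineages_by='range')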
Example #12
    def cluster_over_grid_of_dists(self,
                                   path_to_vsearch=None,
                                   temp_dirpath=None):
        """
        This method will do the same clustering on the seqs in the time-series data as 'cluster_sequences', but it will do this over a grid of 'min_ident_within_cluster' values. The purpose of this is to figure out what a good distance is for a given sequence set. It will return the number of clusters for each of the distance parameters. The idea is that if there is some natural structure to the population then there will be a plateau where the number of clusters stays relatively consistent over a range of distance values.
        path_to_vsearch - This gives the path to the vsearch executable. If this is None (default) then the path is assumed to be 'vsearch' (i.e. in $PATH).
        temp_dirpath - This gives the path to the directory for which temporary directories will be made within. The temporary directories will be deleted at the end of the script, but 'temp_dirpath' will not.

        OUTPUT:
        Returns 'num_clusters' which is a list that represents the number of clusters found for each of the distance parameters. Also returns 'ident_values' which is a list of each of the identity values that were used to cluster, in ascending order, and order corresponds to 'num_clusters'. Also returns 'mean_cluster_sizes' which is the mean number of unique sequences across the clusters, for each of the ident_values.
        """

        ######## important parameters ########
        #this gives the grid for ident values
        ident_values = numpy.linspace(.8, 1, num=101)
        ######################################

        if path_to_vsearch is None:
            path_to_vsearch = 'vsearch'
        if temp_dirpath is None:
            temp_dirpath = tempfile.mkdtemp(dir=os.getcwd()) + '/'
        else:
            temp_dirpath = tempfile.mkdtemp(dir=temp_dirpath) + '/'
        num_clusters = []
        mean_cluster_sizes = []
        for i in ident_values:
            print 'clustering with min identity within cluster parameter:', i
            #write all the data to a temp fasta file
            random_string = str(random.random())[2:]
            temp_fasta_filepath = '%s%s.fasta' % (temp_dirpath, random_string)
            fileout = open(temp_fasta_filepath, "w")
            count = 0
            for j in xrange(len(self.sample_filepaths)):
                sample = sequence_sample(self.sample_filepaths[j],
                                         self.count_attribute_name)
                for k in sample.data:
                    count += 1
                    fileout.write('>%s;size=%s;tpoint=%s;freq=%s;\n%s\n' %
                                  (count, k['count'], j,
                                   k['count'] / sample.total, k['seq']))
            fileout.close()
            #cluster all this data
            msa_output_filepath = '%s%s_msa.fasta' % (temp_dirpath,
                                                      random_string)
            p = Popen([
                path_to_vsearch, '--cluster_size', temp_fasta_filepath, '--id',
                str(i), '--sizein', '--sizeout', '--msaout',
                msa_output_filepath, '--fasta_width',
                str(0)
            ],
                      stdout=PIPE,
                      stderr=PIPE)
            out_err = p.communicate()
            #parse MSA output to get centroids and counts for each timepoint
            cluster_sizes = []
            cluster_count = 0
            filein = open(msa_output_filepath, "r")
            while True:
                line = filein.readline()
                #if EOF, break loop
                if not line:
                    break
                elif line[0] == '>':
                    #if this is the centroid sequence
                    if line[1] == '*':
                        cluster_count += 1
                        cluster_size = 1
                    #else if this is a member seq of the cluster
                    elif line[:-1] != '>consensus':
                        cluster_size += 1
                    #else if this is the end of the cluster seqs
                    elif line[:-1] == '>consensus':
                        cluster_sizes.append(cluster_size)
            filein.close()
            #delete temp files
            subprocess.call(['rm', temp_fasta_filepath, msa_output_filepath])
            num_clusters.append(cluster_count)
            mean_cluster_sizes.append(
                float(sum(cluster_sizes)) / len(cluster_sizes))
        subprocess.call(['rm', '-r', temp_dirpath])
        return num_clusters, mean_cluster_sizes, ident_values
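A sketch of how the grid search could be inspected for the plateau the docstring describes. matplotlib is assumed to be available; it is not used by the method itself:

import matplotlib.pyplot as plt
num_clusters, mean_cluster_sizes, ident_values = \
    time_series.cluster_over_grid_of_dists(path_to_vsearch='vsearch',
                                           temp_dirpath='/tmp/')
plt.plot(ident_values, num_clusters)
plt.xlabel('min identity within cluster')
plt.ylabel('number of clusters')
plt.savefig('cluster_grid.pdf')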
Example #13
def QC_all_samples(input_sequence_dirpath, fwd_primer_fasta_filepath, rev_primer_fasta_filepath, output_dirpath, clean_data_output_dirpath, abr_or_hiv='hiv', fasta_dirpath=None, consensus_output_dirpath=None):
    """This script will walk through each of the patient/timepoint samples and run the QC script 'prelim_qc_using_presto.bash' in order to do basic quality control on the data.
    'clean_data_output_dirpath' = This is the directory to which only the final fastq file (after all the QC) will be written.
    '[fwd|rev]_primer_fasta_filepath' = Paths to the forward and reverse fasta formatted primer files. If these = None, then it is assumed that no primer information exists and the program 'prelim_qc_using_presto_noPrimerMask.bash' will be used (this is what we do for the AbR data).
    'abr_or_hiv' = Determines what data type we are dealing with. If 'hiv', then the cluster is not used, and the FWD and REV primer seqs are known. If 'abr' then we use the cluster (and all the complications that go with that), and the primer seqs are unknown.
    'fasta_dirpath' = If defined (default, None) then this should be the path to a directory that will contain the sequence data (post QC) in fasta format.
    'consensus_output_dirpath' - If defined (default, None), this will be the path to the directory that will contain the consensus seq for each of the patients. This is currently only applicable to HIV data.
    """

    ######################
    ##### parameters #####
    ######################

    #This is the name of the file that is produced at the
    #final step of the QC process
    final_QCed_filename = 'QC_atleast-2.fastq'
    #this is the basename of the read 1 and read 2 fastq input files
    # read1_basename = 'read1'
    read1_basename = '1'
    # read2_basename = 'read2'
    read2_basename = '2'
    count_attribute_name = 'DUPCOUNT'
    # path_to_blast = '/Users/nstrauli/tools/ncbi-blast-2.7.1+/bin'
    path_to_blast = '/netapp/home/nstrauli/tools_c/ncbi-blast-2.2.28+/bin'
    #path_to_blast_db = '/Users/nstrauli/tools/ncbi-blast-2.7.1+/databases/hxb2_env/env'
    # path_to_blast_db = '/Users/nstrauli/tools/ncbi-blast-2.7.1+/databases/hiv_subtype_ref_seqs/env'
    path_to_blast_db = '/netapp/home/nstrauli/tools_c/ncbi-blast-2.2.28+/databases/hiv_subtype_ref_seqs/env'
    # path_to_blast_db_all_env = '/Users/nstrauli/tools/ncbi-blast-2.7.1+/databases/all_hiv_env_lanl/all_env'
    path_to_blast_db_all_env = '/netapp/home/nstrauli/tools_c/ncbi-blast-2.2.28+/databases/all_hiv_env_lanl/all_env'
    percent_ident_to_ref_cutoff = '70'
    max_percent_ident_to_any_env_cutoff = '99'
    # path_to_hxb2_ref_seq = '/Users/nstrauli/data/abr_hiv_coevo/seq_data/ref_seqs/hiv/HXB2_env_nucleotide_reference_seq.fasta'
    path_to_hxb2_ref_seq = '/hernandez/mandrill/users/nstrauli/data/abr_hiv_coevo/seq_data/ref_seqs/hiv/HXB2_env_nucleotide_reference_seq.fasta'
    #Below are the boundaries to the target reference sequence (HXB2) that will be searched using a pairwise alignment program. These boundaries were found by aligning our fwd and rev primers to the HXB2 ref seq and finding the positions that these primers map to the seq. Did this manually using the EMBL gui for their 'water' program.
    start_of_hxb2_to_search = '704'
    end_of_hxb2_to_search = '1120'
    #vsearch path for rocinante
    # path_to_vsearch = '/Users/nstrauli/tools/vsearch-2.4.0-macos-x86_64/bin/vsearch'
    #vsearch path for gibbon
    # path_to_vsearch = '/Users/nstrauli/tools/vsearch-2.4.3-macos-x86_64/bin/vsearch'
    #vsearch path for cluster
    path_to_vsearch = '/netapp/home/nstrauli/tools_c/vsearch-2.4.3-linux-x86_64/bin/vsearch'
    ######################
    ##### parameters #####
    ######################

    if input_sequence_dirpath[-1] != '/':
        input_sequence_dirpath += '/'
    if output_dirpath[-1] != '/':
        output_dirpath += '/'
    if not os.path.exists(output_dirpath):
        os.makedirs(output_dirpath)
    if clean_data_output_dirpath[-1] != '/':
        clean_data_output_dirpath += '/'
    if not os.path.exists(clean_data_output_dirpath):
        os.makedirs(clean_data_output_dirpath)
    if fasta_dirpath:
        if fasta_dirpath[-1] != '/':
            fasta_dirpath += '/'
        if not os.path.exists(fasta_dirpath):
            os.makedirs(fasta_dirpath)
    if consensus_output_dirpath:
        if consensus_output_dirpath[-1] != '/':
            consensus_output_dirpath += '/'
        if not os.path.exists(consensus_output_dirpath):
            os.makedirs(consensus_output_dirpath)
    #do QC
    for i in os.listdir(input_sequence_dirpath):
        if i[0] == '.' or i == 'README':
            continue
        output_dirpath_patient = output_dirpath + i + '/'
        if not os.path.exists(output_dirpath_patient):
            os.makedirs(output_dirpath_patient)
        clean_output_dirpath_patient = clean_data_output_dirpath + i + '/'
        if not os.path.exists(clean_output_dirpath_patient):
            os.makedirs(clean_output_dirpath_patient)
        if fasta_dirpath:
            fasta_dirpath_patient = fasta_dirpath + i + '/'
            if not os.path.exists(fasta_dirpath_patient):
                os.makedirs(fasta_dirpath_patient)
        for j in os.listdir(input_sequence_dirpath + i):
            if j[0] == '.' or j[:6] == 'README':
                continue

            print ''
            print '################################'
            print 'patient:', i
            print 'time-point:', j
            print '################################'
            print ''

            output_dirpath_tpoint = output_dirpath_patient + j + '/'
            if not os.path.exists(output_dirpath_tpoint):
                os.makedirs(output_dirpath_tpoint)
            if fasta_dirpath:
                fasta_filepath_tpoint = fasta_dirpath_patient + j + '.fasta'
            input_filepath_read1 = input_sequence_dirpath + i + '/' + j + '/' + read1_basename + '.fastq'
            input_filepath_read2 = input_sequence_dirpath + i + '/' + j + '/' + read2_basename + '.fastq'
            if abr_or_hiv == 'hiv':
                clean_output_filepath_tpoint = clean_output_dirpath_patient + j + '.fasta'
                p = Popen(['bash', 'prelim_qc_using_presto.bash', input_filepath_read1, input_filepath_read2, fwd_primer_fasta_filepath, rev_primer_fasta_filepath, output_dirpath_tpoint, 'QC', clean_output_filepath_tpoint, path_to_blast, path_to_blast_db, percent_ident_to_ref_cutoff, path_to_hxb2_ref_seq, start_of_hxb2_to_search, end_of_hxb2_to_search, path_to_vsearch, path_to_blast_db_all_env, max_percent_ident_to_any_env_cutoff, '/netapp/home/nstrauli/tools_c/presto-0.5.4/bin'], stderr=PIPE, stdout=PIPE)
                out_err = p.communicate()
                print out_err[0]
                print out_err[1]
            elif abr_or_hiv == 'abr':
                clean_output_filepath_tpoint = clean_output_dirpath_patient + j + '.fastq'
                p = Popen(['qsub', 'prelim_qc_using_presto_abr.bash', input_filepath_read1, input_filepath_read2, output_dirpath_tpoint, 'QC', clean_output_filepath_tpoint, '/netapp/home/nstrauli/tools_c/presto-0.5.4/bin'], stderr=PIPE, stdout=PIPE)
                out_err = p.communicate()
                print out_err[0]
                print out_err[1]
                if fasta_dirpath:
                    sample = sequence_sample(clean_output_filepath_tpoint, count_attribute_name='DUPCOUNT')
                    sample.write_full_data_to_disk_fasta(output_filepath=fasta_filepath_tpoint)

    if abr_or_hiv == 'hiv' and consensus_output_dirpath:
        #get divergence and selection values for each seq using in-house scripts, and write to final output file. This script has a lot of hardcoded parameters of its own. See the script to view these parameters.
        get_divergence_and_dNdS_hiv_sample.run(input_fasta_dirpath=clean_data_output_dirpath, consensus_output_dirpath=consensus_output_dirpath, fasta_output_dirpath=fasta_dirpath)
    
    return
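A hypothetical invocation for the HIV arm of the pipeline; all paths are illustrative:

QC_all_samples(
    input_sequence_dirpath='raw_seq_data/hiv/',
    fwd_primer_fasta_filepath='primers/fwd_primers.fasta',
    rev_primer_fasta_filepath='primers/rev_primers.fasta',
    output_dirpath='qc_intermediate/hiv/',
    clean_data_output_dirpath='qc_clean/hiv/',
    abr_or_hiv='hiv',
    fasta_dirpath='qc_clean_fasta/hiv/',
    consensus_output_dirpath='consensus_seqs/hiv/')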
Example #14
def run(input_clustered_samples_fasta_dirpath, input_indiv_seq_fasta_dirpath, output_network_filepath, outlier_filepath, output_msa_dirpath, output_newick_tree_dirpath, output_outliers_removed_fasta_dirpath, output_seqs_in_outlier_clade_dirpath):
	"""This script clusters all the HIV data across all the patient/time-points. It does this by first clustering within a sample, and then clustering across samples. The purpose is to see if some sequences cluster more closely with other than there assigned patient."""

	########### parameters ###########
	count_attribute_name = 'DUPCOUNT'
	sample_clustering_method = 'by_edit_distance'
	max_edit_distance_within_sample = 4
	max_edit_distance_across_samples = 4
	temp_dirpath = '/Users/nstrauli/data/abr_hiv_coevo/temp_stuff'
	use_comp_cluster = True
	# path_to_needle = '/netapp/home/nstrauli/tools_c/EMBOSS-6.6.0/emboss/needle'
	path_to_needle = '/Users/nstrauli/tools/EMBOSS-6.6.0/emboss/needle'
	node_attributes_to_add = ['element_0_of_seq_id', 'element_1_of_seq_id', 'total_freq', 'count']
	names_for_node_attributes_in_header = ['patient_ID', 'time_point', 'frequency', 'count']
	outlier_freq_cutoff = 0.001
	freq_attribute_name = 'total_freq'
	########### parameters ###########

	if input_clustered_samples_fasta_dirpath[-1] != '/':
		input_clustered_samples_fasta_dirpath += '/'
	if input_indiv_seq_fasta_dirpath[-1] != '/':
		input_indiv_seq_fasta_dirpath += '/'
	if output_network_filepath[-1] != '/':
		output_network_filepath += '/'
	if outlier_filepath[-1] != '/':
		outlier_filepath += '/'
	if output_msa_dirpath[-1] != '/':
		output_msa_dirpath += '/'
	if output_newick_tree_dirpath[-1] != '/':
		output_newick_tree_dirpath += '/'
	if output_outliers_removed_fasta_dirpath[-1] != '/':
		output_outliers_removed_fasta_dirpath += '/'
	if output_seqs_in_outlier_clade_dirpath[-1] != '/':
		output_seqs_in_outlier_clade_dirpath += '/'
	#add appropriate basename to each of the file/dir paths
	base_name = 'max_edit_dist_witin_samp_%s_across_samps_%s' % (max_edit_distance_within_sample, max_edit_distance_across_samples)
	input_clustered_samples_fasta_dirpath += 'max_edit_dist_%s/' % max_edit_distance_within_sample
	output_network_filepath += base_name + '.sif'
	output_node_attribute_filepath = output_network_filepath[:-4] + '_node_attributes.txt'
	outlier_filepath += 'outliers_%s.fasta' % base_name
	output_msa_dirpath += base_name + '/'
	output_newick_tree_dirpath += base_name + '/'
	output_seqs_in_outlier_clade_dirpath += base_name + '/'
	if not os.path.exists(input_clustered_samples_fasta_dirpath):
		os.makedirs(input_clustered_samples_fasta_dirpath)
	if not os.path.exists(output_msa_dirpath):
		os.makedirs(output_msa_dirpath)
	if not os.path.exists(output_newick_tree_dirpath):
		os.makedirs(output_newick_tree_dirpath)
	if not os.path.exists(output_outliers_removed_fasta_dirpath):
		os.makedirs(output_outliers_removed_fasta_dirpath)
	if not os.path.exists(output_seqs_in_outlier_clade_dirpath):
		os.makedirs(output_seqs_in_outlier_clade_dirpath)
	
	#cluster across samples
	seq_sample_set = sequence_sample_set(dirpath=input_clustered_samples_fasta_dirpath, count_attribute_name=count_attribute_name)
	seq_sample_set.cluster_seqs_across_samples(output_network_filepath=output_network_filepath, output_node_attribute_filepath=output_node_attribute_filepath, node_attributes_to_add=node_attributes_to_add, names_for_node_attributes_in_header=names_for_node_attributes_in_header, temp_dirpath=temp_dirpath, add_string_to_seq_ids='dirname_and_filename', add_to_start_or_end='start', max_edit_distance=max_edit_distance_across_samples, path_to_needle=path_to_needle, id_outliers=outlier_filepath, outlier_def='clusters_with_dif_directory', outlier_freq_cutoff=outlier_freq_cutoff, freq_attribute_name=freq_attribute_name)

	#label outlier seqs in the sequence clusters data
	filein = open(outlier_filepath, "r")
	sample_filepath_outlier_dic_seq_clusts = {}
	seq_sample_set_seq_clusts = sequence_sample_set(dirpath=input_clustered_samples_fasta_dirpath, count_attribute_name=count_attribute_name)
	for i in seq_sample_set_seq_clusts.sample_filepaths:
		sample_filepath_outlier_dic_seq_clusts[i] = [] #initialize the dic
	for i in filein:
		if i[0] == '>':
			header = i[1:-1].split('|')
			seq_data = header[0].split('_')
			patient_ID = seq_data[0]
			tpoint = seq_data[1]
			seq_id = seq_data[2]
			#append cluster ID to cluster outlier dic
			sample_filepath_seq_clusts = '%s%s/%s.fasta' % (input_clustered_samples_fasta_dirpath, patient_ID, tpoint)
			sample_filepath_outlier_dic_seq_clusts[sample_filepath_seq_clusts].append(seq_id)
	filein.close()
	#label seqs in the seq clusters data
	for i in sample_filepath_outlier_dic_seq_clusts:
		sample = sequence_sample(filepath=i, count_attribute_name=count_attribute_name)
		sample.add_boolean_attribute_to_seqs(attribute_name='is_outlier', seq_ids_that_are_True=sample_filepath_outlier_dic_seq_clusts[i], overwrite_existing=True)
		sample.write_full_data_to_disk_fasta(output_filepath=i, append_to_file=False, seq_id_fileout_dic=None)

	#remove outlier seqs and seqs that are in 'outlier clade' using a phylogenetic approach
	remove_outlier_clade_from_hiv_seqs.run(input_seq_clusters_dirpath=input_clustered_samples_fasta_dirpath, output_msa_dirpath=output_msa_dirpath, output_newick_tree_dirpath=output_newick_tree_dirpath, input_indiv_seq_fasta_dirpath=input_indiv_seq_fasta_dirpath, output_outliers_removed_fasta_dirpath=output_outliers_removed_fasta_dirpath, output_outlier_seqs_dirpath=output_seqs_in_outlier_clade_dirpath)

	return
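A hypothetical top-level invocation; all paths are illustrative:

run(input_clustered_samples_fasta_dirpath='clustered_samples/hiv/',
    input_indiv_seq_fasta_dirpath='indiv_seqs/hiv/',
    output_network_filepath='networks/hiv/',
    outlier_filepath='outliers/hiv/',
    output_msa_dirpath='msas/hiv/',
    output_newick_tree_dirpath='trees/hiv/',
    output_outliers_removed_fasta_dirpath='outliers_removed/hiv/',
    output_seqs_in_outlier_clade_dirpath='outlier_clade_seqs/hiv/')

Note that 'output_network_filepath' and 'outlier_filepath' are passed as directories here because the function appends the basename (e.g. '.sif') itself.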