Code example #1
import os
import time
import tracemalloc

import sh  # the `sh` package wraps shell commands, e.g. sh.mkdir
from mhcnuggets.src.predict import predict


def pre(hla_arr, input_dir, output_dir):
    #pid = os.getpid()
    #p = psutil.Process(pid)
    #print ('Process info:')
    #print ('name: ', p.name())
    #print ('exe:  ', p.exe())

    files = os.listdir(input_dir)
    for file in files:
        output_file = '{0}{1}'.format(output_dir, file)
        input_file = '{0}{1}'.format(input_dir, file)
        sh.mkdir(output_file)
        for item in hla_arr:
            start = time.time()
            tracemalloc.start(10)
            predict(class_='I',
                    peptides_path=input_file,
                    mhc=item,
                    output='{0}{1}/{2}.csv'.format(output_dir, file, item))
            snapshot = tracemalloc.take_snapshot()
            top_stats = snapshot.statistics('traceback')
            end = time.time()
            stat = top_stats[0]
            #print("%s memory blocks: %.1f KiB" % (stat.count, stat.size / 1024))
            #for line in stat.traceback.format():
            #    print(line)
            print(end - start)
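
A minimal usage sketch for pre() above; the allele list and directory names are assumptions, and both directories must end with a trailing slash because the output paths are built by plain string concatenation:

pre(['HLA-A02:01', 'HLA-B07:02'], 'peptide_batches/', 'predictions/')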
Code example #2
import sys
import argparse
import tempfile
from collections import defaultdict

import pandas as pd
from Bio import SeqIO
from mhcnuggets.src.predict import predict

# find_neoepitopes() and mhcnuggets_allele() are helpers defined elsewhere
# in the same project.
def main(args_input=sys.argv[1:]):
    parser = argparse.ArgumentParser(
        'mhcnuggets', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_file', help="Input FASTA file")
    parser.add_argument('allele', help="Allele for which to make prediction")
    parser.add_argument('epitope_length',
                        type=int,
                        choices=[8, 9, 10, 11, 12, 13, 14, 15],
                        help="Length of subpeptides (epitopes) to predict")
    parser.add_argument('class_type',
                        choices=['I', 'II'],
                        help="Class I or class II")
    parser.add_argument('output_file', help="Output file from iedb")
    args = parser.parse_args(args_input)

    epitope_seq_nums = defaultdict(list)
    for record in SeqIO.parse(args.input_file, "fasta"):
        seq_num = record.id
        peptide = str(record.seq)
        epitopes = find_neoepitopes(peptide, args.epitope_length)
        for epitope, starts in epitopes.items():
            for start in starts:
                epitope_seq_nums[epitope].append((seq_num, start))

    tmp_file = tempfile.NamedTemporaryFile('w', delete=False)
    for epitope in epitope_seq_nums.keys():
        tmp_file.write("{}\n".format(epitope))
    tmp_file.close()

    tmp_output_file = tempfile.NamedTemporaryFile('r', delete=False)
    predict(args.class_type,
            tmp_file.name,
            mhcnuggets_allele(args.allele, args.class_type),
            output=tmp_output_file.name)
    tmp_output_file.close()
    df = pd.read_csv(tmp_output_file.name)
    processed_df = pd.DataFrame()
    for index, row in df.iterrows():
        seq_nums = epitope_seq_nums[row['peptide']]
        for seq_num, start in seq_nums:
            new_row = row.copy()
            new_row['seq_num'] = seq_num
            new_row['start'] = start
            new_row['allele'] = args.allele
            processed_df = processed_df.append(new_row)
    processed_df['start'] = pd.to_numeric(processed_df['start'],
                                          downcast='integer')
    processed_df = processed_df[[
        'peptide', 'ic50', 'seq_num', 'start', 'allele'
    ]]
    processed_df.to_csv(args.output_file, index=False)
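
Because main() takes the argument vector as a parameter, the wrapper above can be exercised directly from Python as well as from the shell; the file names here are assumptions:

main(['proteins.fasta', 'HLA-A*02:01', '9', 'I', 'predictions.csv'])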
Code example #3
 # Class II prediction method of an MHCnuggets wrapper class; it assumes
 # module-level imports of re, tempfile, pandas as pd, collections.defaultdict
 # and mhcnuggets.src.predict.predict, plus a find_neoepitopes() helper on self.
 def predict(self, input_file, allele, epitope_length, iedb_executable_path,
             iedb_retries):
     epitope_seq_nums = defaultdict(list)
     for line in input_file:
         match = re.search('^>([0-9]+)$', line)
         if match:
             seq_num = match.group(1)
         else:
             epitopes = self.find_neoepitopes(line.rstrip())
             for epitope, starts in epitopes.items():
                 for start in starts:
                     epitope_seq_nums[epitope].append((seq_num, start))
     tmp_file = tempfile.NamedTemporaryFile('w', delete=False)
     for epitope in epitope_seq_nums.keys():
         tmp_file.write("{}\n".format(epitope))
     tmp_file.close()
     tmp_output_file = tempfile.NamedTemporaryFile('r', delete=False)
     mhcnuggets_allele = "HLA-{}".format(allele).replace('*', '')
     predict('II',
             tmp_file.name,
             mhcnuggets_allele,
             output=tmp_output_file.name)
     tmp_output_file.close()
     df = pd.read_csv(tmp_output_file.name)
     processed_df = pd.DataFrame()
     for index, row in df.iterrows():
         seq_nums = epitope_seq_nums[row['peptide']]
         for seq_num, start in seq_nums:
             new_row = row.copy()
             new_row['seq_num'] = seq_num
             new_row['start'] = start
             new_row['allele'] = allele
             processed_df = processed_df.append(new_row)
     processed_df['start'] = pd.to_numeric(processed_df['start'],
                                           downcast='integer')
     processed_df = processed_df[[
         'peptide', 'ic50', 'seq_num', 'start', 'allele'
     ]]
     return (processed_df, 'pandas')
Code example #4
 # Class-type-parameterized prediction method of an MHCnuggets wrapper class;
 # it assumes module-level imports of tempfile, pandas as pd,
 # collections.defaultdict, Bio.SeqIO and mhcnuggets.src.predict.predict, plus
 # find_neoepitopes() and mhcnuggets_allele() helpers on self.
 def predict(self, input_file, allele, epitope_length, iedb_executable_path,
             iedb_retries, class_type):
     epitope_seq_nums = defaultdict(list)
     for record in SeqIO.parse(input_file, "fasta"):
         seq_num = record.id
         peptide = str(record.seq)
         epitopes = self.find_neoepitopes(peptide, epitope_length)
         for epitope, starts in epitopes.items():
             for start in starts:
                 epitope_seq_nums[epitope].append((seq_num, start))
     tmp_file = tempfile.NamedTemporaryFile('w', delete=False)
     for epitope in epitope_seq_nums.keys():
         tmp_file.write("{}\n".format(epitope))
     tmp_file.close()
     tmp_output_file = tempfile.NamedTemporaryFile('r', delete=False)
     predict(class_type,
             tmp_file.name,
             self.mhcnuggets_allele(allele),
             output=tmp_output_file.name)
     tmp_output_file.close()
     df = pd.read_csv(tmp_output_file.name)
     processed_df = pd.DataFrame()
     for index, row in df.iterrows():
         seq_nums = epitope_seq_nums[row['peptide']]
         for seq_num, start in seq_nums:
             new_row = row.copy()
             new_row['seq_num'] = seq_num
             new_row['start'] = start
             new_row['allele'] = allele
             processed_df = processed_df.append(new_row)
     processed_df['start'] = pd.to_numeric(processed_df['start'],
                                           downcast='integer')
     processed_df = processed_df[[
         'peptide', 'ic50', 'seq_num', 'start', 'allele'
     ]]
     return (processed_df, 'pandas')
Code example #5
import argparse

from mhcnuggets.src.predict import predict

# parse_alleles() is a helper defined elsewhere in the same project.
def main():
    model = argparse.ArgumentParser(
        description='MHCNuggets binding prediction')

    model.add_argument('-p', '--peptides', type=str, help='mhcnuggets input')

    model.add_argument('-a', '--alleles', type=str, help='class 2 alleles')

    model.add_argument('-o', '--output', type=str, help='mhcnuggets output')

    args = model.parse_args()

    if open(args.peptides).readlines() != []:
        supp_alleles = parse_alleles(args.alleles)

        for allele in supp_alleles:
            predict(class_='II',
                    peptides_path=args.peptides,
                    mhc=allele,
                    output=allele + args.output)

    else:
        op = open('predicted_neoepitopes_class_2', 'w')
        op.close()
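
The wrapper above is meant to be driven from the command line; a hypothetical invocation (script and file names are assumptions), which writes one CSV per supported allele, each output name prefixed with the allele:

python run_mhcnuggets_class2.py --peptides peptides.peps --alleles alleles.txt --output _predictions.csv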
Code example #6
import os
import pickle
import tempfile
import warnings

# neoepiscope_dir is assumed to point at the installed neoepiscope package
# (it is defined at module level in the project this function comes from).
def get_affinity_mhcnuggets(peptides, allele, version, remove_files=True):
    """ Obtains binding affinities from list of peptides

        peptides: peptides of interest (list of strings)
        allele: Allele to use for binding affinity (string)
        version: version of mhcnuggets
        remove_files: option to remove intermediate files

        Return value: affinities (a list of (peptide, binding affinity)
                        tuples; affinities are strings, "NA" if unscored)
    """
    from mhcnuggets.src.predict import predict

    files_to_remove = []
    try:
        # Check that allele is valid for method
        with open(
            os.path.join(neoepiscope_dir, "neoepiscope", "availableAlleles.pickle"),
            "rb",
        ) as allele_stream:
            avail_alleles = pickle.load(allele_stream)
        # Strip "*" from the allele name, as mhcnuggets expects e.g. HLA-A02:01
        allele = allele.replace("*", "")
        if allele in avail_alleles["mhcnuggets_mhcI"]:
            allele_class = "I"
            max_length = 15
        elif allele in avail_alleles["mhcnuggets_mhcII"]:
            allele_class = "II"
            max_length = 30
        else:
            warnings.warn(
                " ".join([allele, "is not a valid allele for mhcnuggets"]), Warning
            )
            return [(peptides[i], "NA") for i in range(0, len(peptides))]
        # Establish return list and sample id
        sample_id = ".".join(
            [peptides[0], str(len(peptides)), allele, "mhcnuggets", version]
        )
        affinities = []
        # Write one peptide per line to a temporary input file;
        #   peptides longer than the maximum length for this allele class
        #   are skipped and counted so they can be reported as "NA"
        peptide_file = tempfile.mkstemp(
            suffix=".txt", prefix="".join([sample_id, "."]), text=True
        )[1]
        files_to_remove.append(peptide_file)
        na_count = 0
        with open(peptide_file, "w") as f:
            for sequence in peptides:
                if len(sequence) > max_length:
                    na_count += 1
                else:
                    print(sequence, file=f)
        if na_count > 0:
            warnings.warn(
                " ".join(
                    [
                        str(na_count),
                        "peptides not compatible with",
                        "mhcnuggets will not receive score",
                    ]
                ),
                Warning,
            )
        # Establish temporary file to hold output
        mhc_out = tempfile.mkstemp(
            suffix=".mhcnuggets.out", prefix="".join([sample_id, "."]), text=True
        )[1]
        files_to_remove.append(mhc_out)
        # Run mhcnuggets
        predict(
            class_=allele_class, peptides_path=peptide_file, mhc=allele, output=mhc_out
        )
        # Retrieve scores for valid peptides
        score_dict = {}
        with open(mhc_out, "r") as f:
            # Skip headers
            f.readline()
            for line in f:
                tokens = line.strip("\n").split(",")
                score_dict[tokens[0]] = tokens[1]
        # Produce list of scores for valid peptides
        # Invalid peptides receive "NA" score
        for sequence in peptides:
            if sequence in score_dict:
                nM = (sequence, score_dict[sequence])
            else:
                nM = (sequence, "NA")
            affinities.append(nM)
        return affinities
    finally:
        if remove_files:
            for file_to_remove in files_to_remove:
                os.remove(file_to_remove)
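
A hypothetical call of get_affinity_mhcnuggets() above (peptide sequences, allele and version string are assumptions; the allele must appear in neoepiscope's availableAlleles.pickle). It returns a list of (peptide, affinity) tuples, with "NA" for peptides that could not be scored:

affinities = get_affinity_mhcnuggets(['SIINFEKLM', 'KVAELVHFLK'], 'HLA-A*02:01', '2.3.2')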
Code example #7
import os
import pickle
from collections import defaultdict

from mhcnuggets.src.predict import predict


def get_normal_binding_scores(blast_dict, alleles, available_alleles, output_dir, pat_id, remove_files=True):
	''' Creates dictionary linking matched normal epitopes to binding scores for 
		different HLA alleles

		blast_dict: dictionary that links epitopes to a list of
					[match E value, set of transcripts it comes from, set of
					genes it comes from, match peptide sequence] (from process_blast())
		alleles: list of HLA alleles to use for binding predictions
		available_alleles: path to pickled dictionary describing available HLA alleles
						   for different binding affinity predictors
		output_dir: path to output directory for writing temporary files
		pat_id: patient identifier
		remove_files: whether to delete the temporary files when finished

		Return value: nested dictionary, where keys are matched normal epitopes and 
					  values are dictionaries, where keys are HLA alleles and values
					  are binding scores for that epitope/allele combo
	'''
	# Create list of temporary files to remove
	files_to_remove = []
	# Extract matched normal peptide sequences
	normal_epitopes = set()
	for epitope in blast_dict:
		normal_epitopes.add(blast_dict[epitope][3])
	# Load available alleles
	with open(available_alleles, 'rb') as allele_stream:
		avail_alleles = pickle.load(allele_stream)
	# Initialize dictionary
	normal_dict = defaultdict(dict)
	for hla in alleles:
		# Determine if allele is valid for mhcnuggets
		if hla in avail_alleles["mhcnuggets_mhcI"]:
			# Class I allele
			allele_class = "I"
			max_length = 15
		elif hla in avail_alleles["mhcnuggets_mhcII"]:
			# Class II allele
			allele_class = "II"
			max_length = 30
		else:
			# Not a valid allele
			continue
		# Write relevant peptides to file
		peptide_file = os.path.join(output_dir, ''.join([pat_id, '.mhc.', hla, '.csv']))
		files_to_remove.append(peptide_file)
		with open(peptide_file, 'w') as f:
			for sequence in normal_epitopes:
				if len(sequence) <= max_length:
					print(sequence, file=f)
		# Run binding predictions
		mhc_out = os.path.join(output_dir, ''.join([pat_id, '.mhc.', hla, '.out']))
		files_to_remove.append(mhc_out)
		predict(class_=allele_class, peptides_path=peptide_file, mhc=hla, output=mhc_out)
		# Process mhcnuggets results
		score_dict = {}
		with open(mhc_out) as f:
			f.readline()
			for line in f:
				tokens = line.strip().split(',')
				score_dict[tokens[0]] = tokens[1]
		# Store score for each epitope if available
		for sequence in normal_epitopes:
			if sequence in score_dict:
				normal_dict[sequence][hla] = float(score_dict[sequence])
	# Remove temporary files
	if remove_files:
		for file_to_remove in files_to_remove:
			os.remove(file_to_remove)
	# Return dictionary
	return normal_dict
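
A hypothetical call of get_normal_binding_scores() above (all argument values are assumptions; blast_dict would come from the project's process_blast() step mentioned in the docstring):

normal_scores = get_normal_binding_scores(blast_dict, ['HLA-A02:01', 'HLA-DRB101:01'], 'availableAlleles.pickle', '/tmp/binding', 'patient01')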
Code example #8
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul  2 16:55:19 2018

@author: frank-lsy
"""

# importing the predict module
from mhcnuggets.src.predict import predict

# predicting newline-separated peptides listed in the peptides_path file
# for the MHC class I allele HLA-A*02:01
predict(class_='I',
        peptides_path='test.peps', 
        mhc='HLA-A02:01', output = 'new.csv')
print("\n")
# similarly doing the same prediction for MHC class_II allele HLA-DRB1*01:01
"""
predict(class_='II',
        peptides_path='mhcnuggets/mhcnuggets/data/test/test_peptides.peps', 
        mhc='HLA-DRB101:01', output = 'II.csv')
print("\n")
# as an example of prediction of rare alleles asking MHCnuggets to make predictions for HLA-A*02:60
# will make it search for the closest allele (HLA-A*02:01 in this case), and use the corresponding 
# network for prediction
predict(class_='I',
        peptides_path='mhcnuggets/mhcnuggets/data/test/test_peptides.peps', 
        mhc='HLA-A02:60', output = 'III.csv')
"""
Code example #9
File: predict.py  Project: Frank-LSY/mhcnuggets-2.0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jul  2 16:55:19 2018

@author: frank-lsy
"""

# importing the predict module
from mhcnuggets.src.predict import predict

# predicting newline-separated peptides listed in the peptides_path file
# for the MHC class I allele HLA-A*02:01
predict(class_='I',
        peptides_path=
        '2018summer/mhcnuggets-2.0/mhcnuggets/data/test/test_peptides.peps',
        mhc='HLA-A02:01',
        output='I.csv')
print("\n")
# similarly doing the same prediction for MHC class_II allele HLA-DRB1*01:01
"""
predict(class_='II',
        peptides_path='mhcnuggets/mhcnuggets/data/test/test_peptides.peps', 
        mhc='HLA-DRB101:01', output = 'II.csv')
print("\n")
# as an example of prediction of rare alleles asking MHCnuggets to make predictions for HLA-A*02:60
# will make it search for the closest allele (HLA-A*02:01 in this case), and use the corresponding 
# network for prediction
predict(class_='I',
        peptides_path='mhcnuggets/mhcnuggets/data/test/test_peptides.peps', 
        mhc='HLA-A02:60', output='III.csv')
"""