# NOTE(review): the next two statements are the tail of a guard whose `if`
# header lies above this excerpt; keep them nested under that header.
print("Error: imgt-data.fasta file not detected for '" + species +
      "'. Please generate and place it in the appropriate Data subdirectory.")
sys.exit(1)  # Non-zero status so calling shells/pipelines can see the failure

# If the file is present, take its modification time as a proxy for the
# last download date
input_imgt_file = species_dir + 'imgt-data.fasta'
mod_date = datetime.fromtimestamp(
    os.path.getmtime(input_imgt_file)).strftime('%Y-%m-%d')

# Then read through the FASTA and sort the entries into per-chain files.
# Plain 'r' is used instead of 'rU': universal newlines are already the
# default in text mode, and the 'U' flag is rejected by Python 3.11+.
with open(input_imgt_file, 'r') as in_file, \
        open(species_dir + 'TRA.fasta', 'w') as TRA, \
        open(species_dir + 'TRB.fasta', 'w') as TRB:

    for fasta_id, seq, _ in fxn.read_fa(in_file):
        # Gene name = second '|'-delimited header field, up to the first '*'
        # (whatever follows the '*' is not needed for chain sorting)
        gene = fasta_id.split('|')[1].split('*')[0]

        # NB: TRDV included with TRA genes due to the evidence that even
        # non 'TRAV/DV' genes can recombine with TRAJ
        if 'TRA' in gene or 'TRDV' in gene:
            TRA.write(fxn.fastafy(fasta_id, seq))
        elif 'TRB' in gene:
            TRB.write(fxn.fastafy(fasta_id, seq))

# Finally log the dates: when the input was last modified and when this
# split was run
log_txt = 'imgt-data.fasta_last_modified ' + mod_date + \
    '\nsplit-imgt-data.py_last_run ' + fxn.today()

with open(species_dir + 'data-production-date.txt', 'w') as log_file:
    log_file.write(log_txt)
fxn.check_scripts_dir()
sns.set(font="Arial", font_scale=1.5)

# Sort directories, get data: plots go into a dated subdirectory of the
# project plot directory, created on first use
plot_dir = fxn.plot_dir + fxn.get_date() + '-mouse-proteome-check/'
if not os.path.exists(plot_dir):
    os.mkdir(plot_dir)

# Read proteome into dict, keyed on the first space-delimited token of each
# FASTA header. The first directory entry containing '_mouse.fasta' is used.
mouse_proteome_file = [
    x for x in os.listdir(fxn.base_data_dir) if '_mouse.fasta' in x
][0]

# With no default factory this defaultdict behaves like a plain dict
mouse_proteins = coll.defaultdict()

# 'rt' (text mode) rather than 'rU': GzipFile rejects the 'U' flag on
# Python 3, and text mode hands str rather than bytes to the FASTA reader
with gzip.open(fxn.base_data_dir + mouse_proteome_file, 'rt') as in_file:
    for protein, seq, blank in fxn.read_fa(in_file):
        mouse_proteins[protein.split(' ')[0]] = seq

# Then scroll through non-predicted binder files, build an AC trie of all
# the peptides per file
data_dir = '../Data/NonPredictedBinders/'
matches = coll.defaultdict(fxn.nest_counter)
all_peptides = coll.defaultdict(list)

for f in [x for x in os.listdir(data_dir) if x.endswith('.txt')]:
    # Label for this file = file name up to the first '-'
    nam = f.split('-')[0]
    search_builder = AcoraBuilder()
    peptides = []

    # Build trie: one keyword per line of the file, trailing whitespace
    # removed. Plain 'r' replaces 'rU', which Python 3.11+ no longer accepts.
    with open(data_dir + f, 'r') as in_file:
        for line in in_file:
            search_builder.add(line.rstrip())